diff -Nru python-kafka-python-0.9.2/AUTHORS.md python-kafka-python-1.0.1/AUTHORS.md --- python-kafka-python-0.9.2/AUTHORS.md 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/AUTHORS.md 2016-01-23 22:22:32.000000000 +0000 @@ -0,0 +1,49 @@ +# Current Maintainer +* Dana Powers, [@dpkp](https://github.com/dpkp) + +# Original Author and First Commit +* David Arthur, [@mumrah](https://github.com/mumrah) + +# Contributors - 2015 (alpha by username) +* Alex Couture-Beil, [@alexcb](https://github.com/alexcb) +* Ali-Akber Saifee, [@alisaifee](https://github.com/alisaifee) +* Christophe-Marie Duquesne, [@chmduquesne](https://github.com/chmduquesne) +* Thomas Dimson, [@cosbynator](https://github.com/cosbynator) +* Kasper Jacobsen, [@Dinoshauer](https://github.com/Dinoshauer) +* Ross Duggan, [@duggan](https://github.com/duggan) +* Enrico Canzonieri, [@ecanzonieri](https://github.com/ecanzonieri) +* haosdent, [@haosdent](https://github.com/haosdent) +* Arturo Filastò, [@hellais](https://github.com/hellais) +* Job Evers‐Meltzer, [@jobevers](https://github.com/jobevers) +* Martin Olveyra, [@kalessin](https://github.com/kalessin) +* Kubilay Kocak, [@koobs](https://github.com/koobs) +* Matthew L Daniel +* Eric Hewitt, [@meandthewallaby](https://github.com/meandthewallaby) +* Oliver Jowett [@mutability](https://github.com/mutability) +* Shaolei Zhou, [@reAsOn2010](https://github.com/reAsOn2010) +* Oskari Saarenmaa, [@saaros](https://github.com/saaros) +* John Anderson, [@sontek](https://github.com/sontek) +* Eduard Iskandarov, [@toidi](https://github.com/toidi) +* Todd Palino, [@toddpalino](https://github.com/toddpalino) +* trbs, [@trbs](https://github.com/trbs) +* Viktor Shlapakov, [@vshlapakov](https://github.com/vshlapakov) +* Will Daly, [@wedaly](https://github.com/wedaly) +* Warren Kiser, [@wkiser](https://github.com/wkiser) +* William Ting, [@wting](https://github.com/wting) +* Zack Dever, [@zackdever](https://github.com/zackdever) + +# More Contributors +* Bruno Renié, [@brutasse](https://github.com/brutasse) +* Thomas Dimson, [@cosbynator](https://github.com/cosbynator) +* Jesse Myers, [@jessemyers](https://github.com/jessemyers) +* Mahendra M, [@mahendra](https://github.com/mahendra) +* Miguel Eduardo Gil Biraud, [@mgilbir](https://github.com/mgilbir) +* Marc Labbé, [@mrtheb](https://github.com/mrtheb) +* Patrick Lucas, [@patricklucas](https://github.com/patricklucas) +* Omar Ghishan, [@rdiomar](https://github.com/rdiomar) - RIP, Omar. 2014 +* Ivan Pouzyrevsky, [@sandello](https://github.com/sandello) +* Lou Marvin Caraig, [@se7entyse7en](https://github.com/se7entyse7en) +* waliaashish85, [@waliaashish85](https://github.com/waliaashish85) +* Mark Roberts, [@wizzat](https://github.com/wizzat) + +Thanks to all who have contributed! 
diff -Nru python-kafka-python-0.9.2/CHANGES.md python-kafka-python-1.0.1/CHANGES.md --- python-kafka-python-0.9.2/CHANGES.md 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/CHANGES.md 2016-02-19 17:02:17.000000000 +0000 @@ -0,0 +1,276 @@ +# 1.0.1 (Feb 19, 2016) + +Consumers +* Add RangePartitionAssignor (and use as default); add assignor tests (dpkp PR 550) +* Make sure all consumers are in same generation before stopping group test +* Verify node ready before sending offset fetch request from coordinator +* Improve warning when offset fetch request returns unknown topic / partition + +Producers +* Warn if pending batches failed during flush +* Fix concurrency bug in RecordAccumulator.ready() +* Fix bug in SimpleBufferPool memory condition waiting / timeout +* Support batch_size = 0 in producer buffers (dpkp PR 558) +* Catch duplicate batch.done() calls [e.g., maybe_expire then a response errback] + +Clients + +Documentation +* Improve kafka.cluster docstrings +* Migrate load_example.py to KafkaProducer / KafkaConsumer + +Internals +* Dont override system rcvbuf or sndbuf unless configured explicitly (dpkp PR 557) +* Some attributes may not exist in __del__ if we failed assertions +* Break up some circular references and close client wake pipes on __del__ (aisch PR 554) + + +# 1.0.0 (Feb 15, 2016) + +This release includes significant code changes. Users of older kafka-python +versions are encouraged to test upgrades before deploying to production as +some interfaces and configuration options have changed. + +Users of SimpleConsumer / SimpleProducer / SimpleClient (formerly KafkaClient) +from prior releases should migrate to KafkaConsumer / KafkaProducer. Low-level +APIs (Simple*) are no longer being actively maintained and will be removed in a +future release. + +For comprehensive API documentation, please see python help() / docstrings, +kafka-python.readthedocs.org, or run `tox -e docs` from source to build +documentation locally. + +Consumers +* KafkaConsumer re-written to emulate the new 0.9 kafka consumer (java client) + and support coordinated consumer groups (feature requires >= 0.9.0.0 brokers) + + * Methods no longer available: + + * configure [initialize a new consumer instead] + * set_topic_partitions [use subscribe() or assign()] + * fetch_messages [use poll() or iterator interface] + * get_partition_offsets + * offsets [use committed(partition)] + * task_done [handled internally by auto-commit; or commit offsets manually] + + * Configuration changes (consistent with updated java client): + + * lots of new configuration parameters -- see docs for details + * auto_offset_reset: previously values were 'smallest' or 'largest', now + values are 'earliest' or 'latest' + * fetch_wait_max_ms is now fetch_max_wait_ms + * fetch_message_max_bytes is now max_partition_fetch_bytes + * deserializer_class is now value_deserializer and key_deserializer + * auto_commit_enable is now enable_auto_commit + * auto_commit_interval_messages was removed + * socket_timeout_ms was removed + * refresh_leader_backoff_ms was removed + +* SimpleConsumer and MultiProcessConsumer are now deprecated and will be removed + in a future release. Users are encouraged to migrate to KafkaConsumer. + +Producers +* new producer class: KafkaProducer. Exposes the same interface as official java client. Async by default; returned future.get() can be called for synchronous blocking +* SimpleProducer is now deprecated and will be removed in a future release.
Users are + encouraged to migrate to KafkaProducer. + +Clients +* synchronous KafkaClient renamed to SimpleClient. For backwards compatibility, you + will get a SimpleClient via `from kafka import KafkaClient`. This will change in + a future release. +* All client calls use non-blocking IO under the hood. +* Add probe method check_version() to infer broker versions. + +Documentation +* Updated README and sphinx documentation to address new classes. +* Docstring improvements to make python help() easier to use. + +Internals +* Old protocol stack is deprecated. It has been moved to kafka.protocol.legacy + and may be removed in a future release. +* Protocol layer re-written using Type classes, Schemas and Structs (modeled on + the java client). +* Add support for LZ4 compression (including broken framing header checksum). + + +# 0.9.5 (Dec 6, 2015) + +Consumers +* Initial support for consumer coordinator: offsets only (toddpalino PR 420) +* Allow blocking until some messages are received in SimpleConsumer (saaros PR 457) +* Support subclass config changes in KafkaConsumer (zackdever PR 446) +* Support retry semantics in MultiProcessConsumer (barricadeio PR 456) +* Support partition_info in MultiProcessConsumer (scrapinghub PR 418) +* Enable seek() to an absolute offset in SimpleConsumer (haosdent PR 412) +* Add KafkaConsumer.close() (ucarion PR 426) + +Producers +* Catch client.reinit() exceptions in async producer (dpkp) +* Producer.stop() now blocks until async thread completes (dpkp PR 485) +* Catch errors during load_metadata_for_topics in async producer (bschopman PR 467) +* Add compression-level support for codecs that support it (trbs PR 454) +* Fix translation of Java murmur2 code, fix byte encoding for Python 3 (chrischamberlin PR 439) +* Only call stop() on not-stopped producer objects (docker-hub PR 435) +* Allow null payload for deletion feature (scrapinghub PR 409) + +Clients +* Use non-blocking io for broker aware requests (ecanzonieri PR 473) +* Use debug logging level for metadata request (ecanzonieri PR 415) +* Catch KafkaUnavailableError in _send_broker_aware_request (mutability PR 436) +* Lower logging level on replica not available and commit (ecanzonieri PR 415) + +Documentation +* Update docs and links wrt maintainer change (mumrah -> dpkp) + +Internals +* Add py35 to tox testing +* Update travis config to use container infrastructure +* Add 0.8.2.2 and 0.9.0.0 resources for integration tests; update default official releases +* new pylint disables for pylint 1.5.1 (zackdever PR 481) +* Fix python3 / python2 comments re queue/Queue (dpkp) +* Add Murmur2Partitioner to kafka __all__ imports (dpkp Issue 471) +* Include LICENSE in PyPI sdist (koobs PR 441) + +# 0.9.4 (June 11, 2015) + +Consumers +* Refactor SimpleConsumer internal fetch handling (dpkp PR 399) +* Handle exceptions in SimpleConsumer commit() and reset_partition_offset() (dpkp PR 404) +* Improve FailedPayloadsError handling in KafkaConsumer (dpkp PR 398) +* KafkaConsumer: avoid raising KeyError in task_done (dpkp PR 389) +* MultiProcessConsumer -- support configured partitions list (dpkp PR 380) +* Fix SimpleConsumer leadership change handling (dpkp PR 393) +* Fix SimpleConsumer connection error handling (reAsOn2010 PR 392) +* Improve Consumer handling of 'falsy' partition values (wting PR 342) +* Fix _offsets call error in KafkaConsumer (hellais PR 376) +* Fix str/bytes bug in KafkaConsumer (dpkp PR 365) +* Register atexit handlers for consumer and producer thread/multiprocess cleanup (dpkp PR 360) +* Always 
fetch commit offsets in base consumer unless group is None (dpkp PR 356) +* Stop consumer threads on delete (dpkp PR 357) +* Deprecate metadata_broker_list in favor of bootstrap_servers in KafkaConsumer (dpkp PR 340) +* Support pass-through parameters in multiprocess consumer (scrapinghub PR 336) +* Enable offset commit on SimpleConsumer.seek (ecanzonieri PR 350) +* Improve multiprocess consumer partition distribution (scrapinghub PR 335) +* Ignore messages with offset less than requested (wkiser PR 328) +* Handle OffsetOutOfRange in SimpleConsumer (ecanzonieri PR 296) + +Producers +* Add Murmur2Partitioner (dpkp PR 378) +* Log error types in SimpleProducer and SimpleConsumer (dpkp PR 405) +* SimpleProducer support configuration of fail_on_error (dpkp PR 396) +* Deprecate KeyedProducer.send() (dpkp PR 379) +* Further improvements to async producer code (dpkp PR 388) +* Add more configuration parameters for async producer (dpkp) +* Deprecate SimpleProducer batch_send=True in favor of async (dpkp) +* Improve async producer error handling and retry logic (vshlapakov PR 331) +* Support message keys in async producer (vshlapakov PR 329) +* Use threading instead of multiprocessing for Async Producer (vshlapakov PR 330) +* Stop threads on __del__ (chmduquesne PR 324) +* Fix leadership failover handling in KeyedProducer (dpkp PR 314) + +KafkaClient +* Add .topics property for list of known topics (dpkp) +* Fix request / response order guarantee bug in KafkaClient (dpkp PR 403) +* Improve KafkaClient handling of connection failures in _get_conn (dpkp) +* Client clears local metadata cache before updating from server (dpkp PR 367) +* KafkaClient should return a response or error for each request - enable better retry handling (dpkp PR 366) +* Improve str/bytes conversion in KafkaClient and KafkaConsumer (dpkp PR 332) +* Always return sorted partition ids in client.get_partition_ids_for_topic() (dpkp PR 315) + +Documentation +* Cleanup Usage Documentation +* Improve KafkaConsumer documentation (dpkp PR 341) +* Update consumer documentation (sontek PR 317) +* Add doc configuration for tox (sontek PR 316) +* Switch to .rst doc format (sontek PR 321) +* Fixup google groups link in README (sontek PR 320) +* Automate documentation at kafka-python.readthedocs.org + +Internals +* Switch integration testing from 0.8.2.0 to 0.8.2.1 (dpkp PR 402) +* Fix most flaky tests, improve debug logging, improve fixture handling (dpkp) +* General style cleanups (dpkp PR 394) +* Raise error on duplicate topic-partition payloads in protocol grouping (dpkp) +* Use module-level loggers instead of simply 'kafka' (dpkp) +* Remove pkg_resources check for __version__ at runtime (dpkp PR 387) +* Make external API consistently support python3 strings for topic (kecaps PR 361) +* Fix correlation id overflow (dpkp PR 355) +* Cleanup kafka/common structs (dpkp PR 338) +* Use context managers in gzip_encode / gzip_decode (dpkp PR 337) +* Save failed request as FailedPayloadsError attribute (jobevers PR 302) +* Remove unused kafka.queue (mumrah) + +# 0.9.3 (Feb 3, 2015) + +* Add coveralls.io support (sontek PR 307) +* Fix python2.6 threading.Event bug in ReentrantTimer (dpkp PR 312) +* Add kafka 0.8.2.0 to travis integration tests (dpkp PR 310) +* Auto-convert topics to utf-8 bytes in Producer (sontek PR 306) +* Fix reference cycle between SimpleConsumer and ReentrantTimer (zhaopengzp PR 309) +* Add Sphinx API docs (wedaly PR 282) +* Handle additional error cases exposed by 0.8.2.0 kafka server (dpkp PR 295) +* Refactor error class 
management (alexcb PR 289) +* Expose KafkaConsumer in __all__ for easy imports (Dinoshauer PR 286) +* SimpleProducer starts on random partition by default (alexcb PR 288) +* Add keys to compressed messages (meandthewallaby PR 281) +* Add new high-level KafkaConsumer class based on java client api (dpkp PR 234) +* Add KeyedProducer.send_messages api (pubnub PR 277) +* Fix consumer pending() method (jettify PR 276) +* Update low-level demo in README (sunisdown PR 274) +* Include key in KeyedProducer messages (se7entyse7en PR 268) +* Fix SimpleConsumer timeout behavior in get_messages (dpkp PR 238) +* Fix error in consumer.py test against max_buffer_size (rthille/wizzat PR 225/242) +* Improve string concat performance on pypy / py3 (dpkp PR 233) +* Reorg directory layout for consumer/producer/partitioners (dpkp/wizzat PR 232/243) +* Add OffsetCommitContext (locationlabs PR 217) +* Metadata Refactor (dpkp PR 223) +* Add Python 3 support (brutasse/wizzat - PR 227) +* Minor cleanups - imports / README / PyPI classifiers (dpkp - PR 221) +* Fix socket test (dpkp - PR 222) +* Fix exception catching bug in test_failover_integration (zever - PR 216) + +# 0.9.2 (Aug 26, 2014) + +* Warn users that async producer does not reliably handle failures (dpkp - PR 213) +* Fix spurious ConsumerFetchSizeTooSmall error in consumer (DataDog - PR 136) +* Use PyLint for static error checking (dpkp - PR 208) +* Strictly enforce str message type in producer.send_messages (dpkp - PR 211) +* Add test timers via nose-timer plugin; list 10 slowest timings by default (dpkp) +* Move fetching last known offset logic to a stand alone function (zever - PR 177) +* Improve KafkaConnection and add more tests (dpkp - PR 196) +* Raise TypeError if necessary when encoding strings (mdaniel - PR 204) +* Use Travis-CI to publish tagged releases to pypi (tkuhlman / mumrah) +* Use official binary tarballs for integration tests and parallelize travis tests (dpkp - PR 193) +* Improve new-topic creation handling (wizzat - PR 174) + +# 0.9.1 (Aug 10, 2014) + +* Add codec parameter to Producers to enable compression (patricklucas - PR 166) +* Support IPv6 hosts and network (snaury - PR 169) +* Remove dependency on distribute (patricklucas - PR 163) +* Fix connection error timeout and improve tests (wizzat - PR 158) +* SimpleProducer randomization of initial round robin ordering (alexcb - PR 139) +* Fix connection timeout in KafkaClient and KafkaConnection (maciejkula - PR 161) +* Fix seek + commit behavior (wizzat - PR 148) + + +# 0.9.0 (Mar 21, 2014) + +* Connection refactor and test fixes (wizzat - PR 134) +* Fix when partition has no leader (mrtheb - PR 109) +* Change Producer API to take topic as send argument, not as instance variable (rdiomar - PR 111) +* Substantial refactor and Test Fixing (rdiomar - PR 88) +* Fix Multiprocess Consumer on windows (mahendra - PR 62) +* Improve fault tolerance; add integration tests (jimjh) +* PEP8 / Flakes / Style cleanups (Vetoshkin Nikita; mrtheb - PR 59) +* Setup Travis CI (jimjh - PR 53/54) +* Fix import of BufferUnderflowError (jimjh - PR 49) +* Fix code examples in README (StevenLeRoux - PR 47/48) + +# 0.8.0 + +* Changing auto_commit to False in [SimpleConsumer](kafka/consumer.py), until 0.8.1 is release offset commits are unsupported +* Adding fetch_size_bytes to SimpleConsumer constructor to allow for user-configurable fetch sizes +* Allow SimpleConsumer to automatically increase the fetch size if a partial message is read and no other messages were read during that fetch request. 
The increase factor is 1.5 +* Exception classes moved to kafka.common diff -Nru python-kafka-python-0.9.2/debian/changelog python-kafka-python-1.0.1/debian/changelog --- python-kafka-python-0.9.2/debian/changelog 2015-06-22 18:43:58.000000000 +0000 +++ python-kafka-python-1.0.1/debian/changelog 2016-02-25 13:57:16.000000000 +0000 @@ -1,4 +1,4 @@ -python-kafka-python (0.9.2-0contrail0) precise; urgency=low +python-kafka-python (1.0.1-0contrail0) precise; urgency=low * Debian package for kafka diff -Nru python-kafka-python-0.9.2/kafka/client_async.py python-kafka-python-1.0.1/kafka/client_async.py --- python-kafka-python-0.9.2/kafka/client_async.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/client_async.py 2016-02-19 17:01:46.000000000 +0000 @@ -0,0 +1,753 @@ +from __future__ import absolute_import + +import copy +import heapq +import itertools +import logging +import os +import random +import select +import time + +import six + +import kafka.common as Errors # TODO: make Errors a separate class + +from .cluster import ClusterMetadata +from .conn import BrokerConnection, ConnectionStates, collect_hosts +from .future import Future +from .protocol.metadata import MetadataRequest +from .protocol.produce import ProduceRequest +from .version import __version__ + +if six.PY2: + ConnectionError = None + + +log = logging.getLogger('kafka.client') + + +class KafkaClient(object): + """ + A network client for asynchronous request/response network i/o. + This is an internal class used to implement the + user-facing producer and consumer clients. + + This class is not thread-safe! + """ + DEFAULT_CONFIG = { + 'bootstrap_servers': 'localhost', + 'client_id': 'kafka-python-' + __version__, + 'request_timeout_ms': 40000, + 'reconnect_backoff_ms': 50, + 'max_in_flight_requests_per_connection': 5, + 'receive_buffer_bytes': None, + 'send_buffer_bytes': None, + 'retry_backoff_ms': 100, + 'metadata_max_age_ms': 300000, + } + + def __init__(self, **configs): + """Initialize an asynchronous kafka client + + Keyword Arguments: + bootstrap_servers: 'host[:port]' string (or list of 'host[:port]' + strings) that the consumer should contact to bootstrap initial + cluster metadata. This does not have to be the full node list. + It just needs to have at least one broker that will respond to a + Metadata API Request. Default port is 9092. If no servers are + specified, will default to localhost:9092. + client_id (str): a name for this client. This string is passed in + each request to servers and can be used to identify specific + server-side log entries that correspond to this client. Also + submitted to GroupCoordinator for logging with respect to + consumer group administration. Default: 'kafka-python-{version}' + request_timeout_ms (int): Client request timeout in milliseconds. + Default: 40000. + reconnect_backoff_ms (int): The amount of time in milliseconds to + wait before attempting to reconnect to a given host. + Default: 50. + max_in_flight_requests_per_connection (int): Requests are pipelined + to kafka brokers up to this number of maximum requests per + broker connection. Default: 5. + send_buffer_bytes (int): The size of the TCP send buffer + (SO_SNDBUF) to use when sending data. Default: None (relies on + system defaults). Java client defaults to 131072. + receive_buffer_bytes (int): The size of the TCP receive buffer + (SO_RCVBUF) to use when reading data. Default: None (relies on + system defaults). Java client defaults to 32768. 
+ metadata_max_age_ms (int): The period of time in milliseconds after + which we force a refresh of metadata even if we haven't seen any + partition leadership changes to proactively discover any new + brokers or partitions. Default: 300000 + retry_backoff_ms (int): Milliseconds to backoff when retrying on + errors. Default: 100. + """ + self.config = copy.copy(self.DEFAULT_CONFIG) + for key in self.config: + if key in configs: + self.config[key] = configs[key] + + self.cluster = ClusterMetadata(**self.config) + self._topics = set() # empty set will fetch all topic metadata + self._metadata_refresh_in_progress = False + self._conns = {} + self._connecting = set() + self._delayed_tasks = DelayedTaskQueue() + self._last_bootstrap = 0 + self._bootstrap_fails = 0 + self._bootstrap(collect_hosts(self.config['bootstrap_servers'])) + self._wake_r, self._wake_w = os.pipe() + + def __del__(self): + os.close(self._wake_r) + os.close(self._wake_w) + + def _bootstrap(self, hosts): + # Exponential backoff if bootstrap fails + backoff_ms = self.config['reconnect_backoff_ms'] * 2 ** self._bootstrap_fails + next_at = self._last_bootstrap + backoff_ms / 1000.0 + now = time.time() + if next_at > now: + log.debug("Sleeping %0.4f before bootstrapping again", next_at - now) + time.sleep(next_at - now) + self._last_bootstrap = time.time() + + metadata_request = MetadataRequest([]) + for host, port in hosts: + log.debug("Attempting to bootstrap via node at %s:%s", host, port) + bootstrap = BrokerConnection(host, port, **self.config) + bootstrap.connect() + while bootstrap.state is ConnectionStates.CONNECTING: + bootstrap.connect() + if bootstrap.state is not ConnectionStates.CONNECTED: + bootstrap.close() + continue + future = bootstrap.send(metadata_request) + while not future.is_done: + bootstrap.recv() + if future.failed(): + bootstrap.close() + continue + self.cluster.update_metadata(future.value) + + # A cluster with no topics can return no broker metadata + # in that case, we should keep the bootstrap connection + if not len(self.cluster.brokers()): + self._conns['bootstrap'] = bootstrap + self._bootstrap_fails = 0 + break + # No bootstrap found... 
+ else: + log.error('Unable to bootstrap from %s', hosts) + # Max exponential backoff is 2^12, x4000 (50ms -> 200s) + self._bootstrap_fails = min(self._bootstrap_fails + 1, 12) + + def _can_connect(self, node_id): + if node_id not in self._conns: + if self.cluster.broker_metadata(node_id): + return True + return False + conn = self._conns[node_id] + return conn.state is ConnectionStates.DISCONNECTED and not conn.blacked_out() + + def _initiate_connect(self, node_id): + """Initiate a connection to the given node (must be in metadata)""" + if node_id not in self._conns: + broker = self.cluster.broker_metadata(node_id) + assert broker, 'Broker id %s not in current metadata' % node_id + + log.debug("Initiating connection to node %s at %s:%s", + node_id, broker.host, broker.port) + self._conns[node_id] = BrokerConnection(broker.host, broker.port, + **self.config) + return self._finish_connect(node_id) + + def _finish_connect(self, node_id): + assert node_id in self._conns, '%s is not in current conns' % node_id + state = self._conns[node_id].connect() + if state is ConnectionStates.CONNECTING: + self._connecting.add(node_id) + elif node_id in self._connecting: + log.debug("Node %s connection state is %s", node_id, state) + self._connecting.remove(node_id) + + if state is ConnectionStates.DISCONNECTED: + log.warning("Node %s connect failed -- refreshing metadata", node_id) + self.cluster.request_update() + + return state + + def ready(self, node_id): + """Check whether a node is connected and ok to send more requests. + + Arguments: + node_id (int): the id of the node to check + + Returns: + bool: True if we are ready to send to the given node + """ + if self.is_ready(node_id): + return True + + if self._can_connect(node_id): + # if we are interested in sending to a node + # and we don't have a connection to it, initiate one + self._initiate_connect(node_id) + + if node_id in self._connecting: + self._finish_connect(node_id) + + return self.is_ready(node_id) + + def close(self, node_id=None): + """Closes the connection to a particular node (if there is one). + + Arguments: + node_id (int): the id of the node to close + """ + if node_id is None: + for conn in self._conns.values(): + conn.close() + elif node_id in self._conns: + self._conns[node_id].close() + else: + log.warning("Node %s not found in current connection list; skipping", node_id) + return + + def is_disconnected(self, node_id): + """Check whether the node connection has been disconnected failed. + + A disconnected node has either been closed or has failed. Connection + failures are usually transient and can be resumed in the next ready() + call, but there are cases where transient failures need to be caught + and re-acted upon. + + Arguments: + node_id (int): the id of the node to check + + Returns: + bool: True iff the node exists and is disconnected + """ + if node_id not in self._conns: + return False + return self._conns[node_id].state is ConnectionStates.DISCONNECTED + + def connection_delay(self, node_id): + """ + Returns the number of milliseconds to wait, based on the connection + state, before attempting to send data. When disconnected, this respects + the reconnect backoff time. When connecting, returns 0 to allow + non-blocking connect to finish. When connected, returns a very large + number to handle slow/stalled connections. + + Arguments: + node_id (int): The id of the node to check + + Returns: + int: The number of milliseconds to wait. 
+ """ + if node_id not in self._conns: + return 0 + + conn = self._conns[node_id] + time_waited_ms = time.time() - (conn.last_attempt or 0) + if conn.state is ConnectionStates.DISCONNECTED: + return max(self.config['reconnect_backoff_ms'] - time_waited_ms, 0) + elif conn.state is ConnectionStates.CONNECTING: + return 0 + else: + return 999999999 + + def is_ready(self, node_id): + """Check whether a node is ready to send more requests. + + In addition to connection-level checks, this method also is used to + block additional requests from being sent during a metadata refresh. + + Arguments: + node_id (int): id of the node to check + + Returns: + bool: True if the node is ready and metadata is not refreshing + """ + # if we need to update our metadata now declare all requests unready to + # make metadata requests first priority + if not self._metadata_refresh_in_progress and not self.cluster.ttl() == 0: + if self._can_send_request(node_id): + return True + return False + + def _can_send_request(self, node_id): + if node_id not in self._conns: + return False + conn = self._conns[node_id] + return conn.connected() and conn.can_send_more() + + def send(self, node_id, request): + """Send a request to a specific node. + + Arguments: + node_id (int): destination node + request (Struct): request object (not-encoded) + + Raises: + NodeNotReadyError: if node_id is not ready + + Returns: + Future: resolves to Response struct + """ + if not self._can_send_request(node_id): + raise Errors.NodeNotReadyError("Attempt to send a request to node" + " which is not ready (node id %s)." + % node_id) + + # Every request gets a response, except one special case: + expect_response = True + if isinstance(request, ProduceRequest) and request.required_acks == 0: + expect_response = False + + return self._conns[node_id].send(request, expect_response=expect_response) + + def poll(self, timeout_ms=None, future=None, sleep=False): + """Try to read and write to sockets. + + This method will also attempt to complete node connections, refresh + stale metadata, and run previously-scheduled tasks. + + Arguments: + timeout_ms (int, optional): maximum amount of time to wait (in ms) + for at least one response. Must be non-negative. The actual + timeout will be the minimum of timeout, request timeout and + metadata timeout. Default: request_timeout_ms + future (Future, optional): if provided, blocks until future.is_done + sleep (bool): if True and there is nothing to do (no connections + or requests in flight), will sleep for duration timeout before + returning empty results. Default: False. 
+ + Returns: + list: responses received (can be empty) + """ + if timeout_ms is None: + timeout_ms = self.config['request_timeout_ms'] + + responses = [] + + # Loop for futures, break after first loop if None + while True: + + # Attempt to complete pending connections + for node_id in list(self._connecting): + self._finish_connect(node_id) + + # Send a metadata request if needed + metadata_timeout_ms = self._maybe_refresh_metadata() + + # Send scheduled tasks + for task, task_future in self._delayed_tasks.pop_ready(): + try: + result = task() + except Exception as e: + log.error("Task %s failed: %s", task, e) + task_future.failure(e) + else: + task_future.success(result) + + # If we got a future that is already done, dont block in _poll + if future and future.is_done: + timeout = 0 + else: + timeout = min( + timeout_ms, + metadata_timeout_ms, + self._delayed_tasks.next_at() * 1000, + self.config['request_timeout_ms']) + timeout = max(0, timeout / 1000.0) # avoid negative timeouts + + responses.extend(self._poll(timeout, sleep=sleep)) + + # If all we had was a timeout (future is None) - only do one poll + # If we do have a future, we keep looping until it is done + if not future or future.is_done: + break + + return responses + + def _poll(self, timeout, sleep=False): + # select on reads across all connected sockets, blocking up to timeout + sockets = dict([(conn._sock, conn) + for conn in six.itervalues(self._conns) + if conn.state is ConnectionStates.CONNECTED + and conn.in_flight_requests]) + if not sockets: + # if sockets are connecting, we can wake when they are writeable + if self._connecting: + sockets = [self._conns[node]._sock for node in self._connecting] + select.select([self._wake_r], sockets, [], timeout) + elif timeout: + if sleep: + log.debug('Sleeping at %s for %s', time.time(), timeout) + select.select([self._wake_r], [], [], timeout) + log.debug('Woke up at %s', time.time()) + else: + log.warning('_poll called with a non-zero timeout and' + ' sleep=False -- but there was nothing to do.' + ' This can cause high CPU usage during idle.') + self._clear_wake_fd() + return [] + + # Add a private pipe fd to allow external wakeups + fds = list(sockets.keys()) + fds.append(self._wake_r) + ready, _, _ = select.select(fds, [], [], timeout) + + responses = [] + for sock in ready: + if sock == self._wake_r: + continue + conn = sockets[sock] + while conn.in_flight_requests: + response = conn.recv() # Note: conn.recv runs callbacks / errbacks + if not response: + break + responses.append(response) + self._clear_wake_fd() + return responses + + def in_flight_request_count(self, node_id=None): + """Get the number of in-flight requests for a node or all nodes. + + Arguments: + node_id (int, optional): a specific node to check. If unspecified, + return the total for all nodes + + Returns: + int: pending in-flight requests for the node, or all nodes if None + """ + if node_id is not None: + if node_id not in self._conns: + return 0 + return len(self._conns[node_id].in_flight_requests) + else: + return sum([len(conn.in_flight_requests) for conn in self._conns.values()]) + + def least_loaded_node(self): + """Choose the node with fewest outstanding requests, with fallbacks. + + This method will prefer a node with an existing connection, but will + potentially choose a node for which we don't yet have a connection if + all existing connections are in use. This method will never choose a + node that was disconnected within the reconnect backoff period. 
+ If all else fails, the method will attempt to bootstrap again using the + bootstrap_servers list. + + Returns: + node_id or None if no suitable node was found + """ + nodes = list(self._conns.keys()) + random.shuffle(nodes) + + # If there's a lingering bootstrap node, always try it last + # really we should just kill this connection + if 'bootstrap' in nodes: + nodes.remove('bootstrap') + nodes.append('bootstrap') + + inflight = float('inf') + found = None + for node_id in nodes: + conn = self._conns[node_id] + curr_inflight = len(conn.in_flight_requests) + if curr_inflight == 0 and conn.connected(): + # if we find an established connection with no in-flight requests we can stop right away + return node_id + elif not conn.blacked_out() and curr_inflight < inflight: + # otherwise if this is the best we have found so far, record that + inflight = curr_inflight + found = node_id + + if found is not None: + return found + + # if we found no connected node, return a disconnected one + log.debug("No connected nodes found. Trying disconnected nodes.") + for node_id in nodes: + if not self._conns[node_id].blacked_out(): + return node_id + + # if still no luck, look for a node not in self._conns yet + log.debug("No luck. Trying all broker metadata") + for broker in self.cluster.brokers(): + if broker.nodeId not in self._conns: + return broker.nodeId + + # Last option: try to bootstrap again + log.error('No nodes found in metadata -- retrying bootstrap') + self._bootstrap(collect_hosts(self.config['bootstrap_servers'])) + return None + + def set_topics(self, topics): + """Set specific topics to track for metadata. + + Arguments: + topics (list of str): topics to check for metadata + + Returns: + Future: resolves after metadata request/response + """ + if set(topics).difference(self._topics): + future = self.cluster.request_update() + else: + future = Future().success(set(topics)) + self._topics = set(topics) + return future + + def add_topic(self, topic): + """Add a topic to the list of topics tracked via metadata. + + Arguments: + topic (str): topic to track + + Returns: + Future: resolves after metadata request/response + """ + if topic in self._topics: + return Future().success(set(self._topics)) + + self._topics.add(topic) + return self.cluster.request_update() + + # request metadata update on disconnect and timedout + def _maybe_refresh_metadata(self): + """Send a metadata request if needed. + + Returns: + int: milliseconds until next refresh + """ + ttl = self.cluster.ttl() + if ttl > 0: + return ttl + + if self._metadata_refresh_in_progress: + return 9999999999 + + node_id = self.least_loaded_node() + + topics = list(self._topics) + if self.cluster.need_all_topic_metadata: + topics = [] + + if self._can_send_request(node_id): + request = MetadataRequest(topics) + log.debug("Sending metadata request %s to node %s", request, node_id) + future = self.send(node_id, request) + future.add_callback(self.cluster.update_metadata) + future.add_errback(self.cluster.failed_update) + + self._metadata_refresh_in_progress = True + def refresh_done(val_or_error): + self._metadata_refresh_in_progress = False + future.add_callback(refresh_done) + future.add_errback(refresh_done) + + elif self._can_connect(node_id): + log.debug("Initializing connection to node %s for metadata request", node_id) + self._initiate_connect(node_id) + + return 0 + + def schedule(self, task, at): + """Schedule a new task to be executed at the given time. 
+ + This is "best-effort" scheduling and should only be used for coarse + synchronization. A task cannot be scheduled for multiple times + simultaneously; any previously scheduled instance of the same task + will be cancelled. + + Arguments: + task (callable): task to be scheduled + at (float or int): epoch seconds when task should run + + Returns: + Future: resolves to result of task call, or exception if raised + """ + return self._delayed_tasks.add(task, at) + + def unschedule(self, task): + """Unschedule a task. + + This will remove all instances of the task from the task queue. + This is a no-op if the task is not scheduled. + + Arguments: + task (callable): task to be unscheduled + """ + self._delayed_tasks.remove(task) + + def check_version(self, node_id=None, timeout=2, strict=False): + """Attempt to guess the broker version""" + if node_id is None: + node_id = self.least_loaded_node() + + def connect(node_id): + timeout_at = time.time() + timeout + # brokers < 0.9 do not return any broker metadata if there are no topics + # so we're left with a single bootstrap connection + while not self.ready(node_id): + if time.time() >= timeout_at: + raise Errors.NodeNotReadyError(node_id) + time.sleep(0.025) + + # Monkeypatch the connection request timeout + # Generally this timeout should not get triggered + # but in case it does, we want it to be reasonably short + self._conns[node_id].config['request_timeout_ms'] = timeout * 1000 + + # kafka kills the connection when it doesnt recognize an API request + # so we can send a test request and then follow immediately with a + # vanilla MetadataRequest. If the server did not recognize the first + # request, both will be failed with a ConnectionError that wraps + # socket.error (32, 54, or 104) + import socket + from .protocol.admin import ListGroupsRequest + from .protocol.commit import ( + OffsetFetchRequest_v0, GroupCoordinatorRequest) + from .protocol.metadata import MetadataRequest + + # Socket errors are logged as exceptions and can alarm users. Mute them + from logging import Filter + class ConnFilter(Filter): + def filter(self, record): + if record.funcName in ('recv', 'send'): + return False + return True + log_filter = ConnFilter() + + test_cases = [ + ('0.9', ListGroupsRequest()), + ('0.8.2', GroupCoordinatorRequest('kafka-python-default-group')), + ('0.8.1', OffsetFetchRequest_v0('kafka-python-default-group', [])), + ('0.8.0', MetadataRequest([])), + ] + + logging.getLogger('kafka.conn').addFilter(log_filter) + for version, request in test_cases: + connect(node_id) + f = self.send(node_id, request) + time.sleep(0.1) # HACK: sleeping to wait for socket to send bytes + metadata = self.send(node_id, MetadataRequest([])) + self.poll(future=f) + self.poll(future=metadata) + + assert f.is_done, 'Future is not done? Please file bug report' + + if f.succeeded(): + log.info('Broker version identifed as %s', version) + break + + # Only enable strict checking to verify that we understand failure + # modes. For most users, the fact that the request failed should be + # enough to rule out a particular broker version. + if strict: + # If the socket flush hack did not work (which should force the + # connection to close and fail all pending requests), then we + # get a basic Request Timeout. 
This is not ideal, but we'll deal + if isinstance(f.exception, Errors.RequestTimedOutError): + pass + elif six.PY2: + assert isinstance(f.exception.args[0], socket.error) + assert f.exception.args[0].errno in (32, 54, 104) + else: + assert isinstance(f.exception.args[0], ConnectionError) + log.info("Broker is not v%s -- it did not recognize %s", + version, request.__class__.__name__) + else: + + raise Errors.UnrecognizedBrokerVersion() + + logging.getLogger('kafka.conn').removeFilter(log_filter) + self._conns[node_id].config['request_timeout_ms'] = self.config['request_timeout_ms'] + return version + + def wakeup(self): + os.write(self._wake_w, b'x') + + def _clear_wake_fd(self): + while True: + fds, _, _ = select.select([self._wake_r], [], [], 0) + if not fds: + break + os.read(self._wake_r, 1) + + +class DelayedTaskQueue(object): + # see https://docs.python.org/2/library/heapq.html + def __init__(self): + self._tasks = [] # list of entries arranged in a heap + self._task_map = {} # mapping of tasks to entries + self._counter = itertools.count() # unique sequence count + + def add(self, task, at): + """Add a task to run at a later time. + + Arguments: + task: can be anything, but generally a callable + at (float or int): epoch seconds to schedule task + + Returns: + Future: a future that will be returned with the task when ready + """ + if task in self._task_map: + self.remove(task) + count = next(self._counter) + future = Future() + entry = [at, count, (task, future)] + self._task_map[task] = entry + heapq.heappush(self._tasks, entry) + return future + + def remove(self, task): + """Remove a previously scheduled task. + + Raises: + KeyError: if task is not found + """ + entry = self._task_map.pop(task) + task, future = entry[-1] + future.failure(Errors.Cancelled) + entry[-1] = 'REMOVED' + + def _drop_removed(self): + while self._tasks and self._tasks[0][-1] is 'REMOVED': + at, count, task = heapq.heappop(self._tasks) + + def _pop_next(self): + self._drop_removed() + if not self._tasks: + raise KeyError('pop from an empty DelayedTaskQueue') + _, _, maybe_task = heapq.heappop(self._tasks) + if maybe_task is 'REMOVED': + raise ValueError('popped a removed tasks from queue - bug') + else: + task, future = maybe_task + del self._task_map[task] + return (task, future) + + def next_at(self): + """Number of seconds until next task is ready.""" + self._drop_removed() + if not self._tasks: + return 9999999999 + else: + return max(self._tasks[0][0] - time.time(), 0) + + def pop_ready(self): + """Pop and return a list of all ready (task, future) tuples""" + ready_tasks = [] + while self._tasks and self._tasks[0][0] < time.time(): + try: + task = self._pop_next() + except KeyError: + break + ready_tasks.append(task) + return ready_tasks diff -Nru python-kafka-python-0.9.2/kafka/client.py python-kafka-python-1.0.1/kafka/client.py --- python-kafka-python-0.9.2/kafka/client.py 2014-08-27 21:24:39.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/client.py 2016-02-19 17:01:46.000000000 +0000 @@ -1,104 +1,178 @@ import collections import copy import functools -import itertools import logging +import random import time -import kafka.common -from kafka.common import (TopicAndPartition, - ConnectionError, FailedPayloadsError, - PartitionUnavailableError, LeaderUnavailableError, KafkaUnavailableError, - KafkaTimeoutError, - UnknownTopicOrPartitionError, NotLeaderForPartitionError) +import six -from kafka.conn import collect_hosts, KafkaConnection, DEFAULT_SOCKET_TIMEOUT_SECONDS +import kafka.common +from kafka.common import
(TopicPartition, BrokerMetadata, UnknownError, + ConnectionError, FailedPayloadsError, + KafkaTimeoutError, KafkaUnavailableError, + LeaderNotAvailableError, UnknownTopicOrPartitionError, + NotLeaderForPartitionError, ReplicaNotAvailableError) + +from kafka.conn import ( + collect_hosts, BrokerConnection, DEFAULT_SOCKET_TIMEOUT_SECONDS, + ConnectionStates) from kafka.protocol import KafkaProtocol -log = logging.getLogger("kafka") +# New KafkaClient +# this is not exposed in top-level imports yet, +# due to conflicts with legacy SimpleConsumer / SimpleProducer usage +from kafka.client_async import KafkaClient -class KafkaClient(object): +log = logging.getLogger(__name__) - CLIENT_ID = "kafka-python" - ID_GEN = itertools.count() + +# Legacy KafkaClient interface -- will be deprecated soon +class SimpleClient(object): + + CLIENT_ID = b'kafka-python' # NOTE: The timeout given to the client should always be greater than the # one passed to SimpleConsumer.get_message(), otherwise you can get a # socket timeout. def __init__(self, hosts, client_id=CLIENT_ID, - timeout=DEFAULT_SOCKET_TIMEOUT_SECONDS): + timeout=DEFAULT_SOCKET_TIMEOUT_SECONDS, + correlation_id=0): # We need one connection to bootstrap self.client_id = client_id self.timeout = timeout self.hosts = collect_hosts(hosts) + self.correlation_id = correlation_id - # create connections only when we need them - self.conns = {} + self._conns = {} self.brokers = {} # broker_id -> BrokerMetadata - self.topics_to_brokers = {} # topic_id -> broker_id - self.topic_partitions = {} # topic_id -> [0, 1, 2, ...] - self.load_metadata_for_topics() # bootstrap with all metadata + self.topics_to_brokers = {} # TopicPartition -> BrokerMetadata + self.topic_partitions = {} # topic -> partition -> leader + self.load_metadata_for_topics() # bootstrap with all metadata ################## # Private API # ################## def _get_conn(self, host, port): - "Get or create a connection to a broker using host and port" + """Get or create a connection to a broker using host and port""" host_key = (host, port) - if host_key not in self.conns: - self.conns[host_key] = KafkaConnection( - host, - port, - timeout=self.timeout + if host_key not in self._conns: + self._conns[host_key] = BrokerConnection( + host, port, + request_timeout_ms=self.timeout * 1000, + client_id=self.client_id ) - return self.conns[host_key] + conn = self._conns[host_key] + while conn.connect() == ConnectionStates.CONNECTING: + pass + return conn def _get_leader_for_partition(self, topic, partition): """ Returns the leader for a partition or None if the partition exists but has no leader. - PartitionUnavailableError will be raised if the topic or partition + UnknownTopicOrPartitionError will be raised if the topic or partition is not part of the metadata. 
+ + LeaderNotAvailableError is raised if server has metadata, but there is + no current leader """ - key = TopicAndPartition(topic, partition) - # reload metadata whether the partition is not available - # or has no leader (broker is None) - if self.topics_to_brokers.get(key) is None: - self.load_metadata_for_topics(topic) + key = TopicPartition(topic, partition) - if key not in self.topics_to_brokers: - raise PartitionUnavailableError("%s not available" % str(key)) + # Use cached metadata if it is there + if self.topics_to_brokers.get(key) is not None: + return self.topics_to_brokers[key] - return self.topics_to_brokers[key] + # Otherwise refresh metadata - def _next_id(self): + # If topic does not already exist, this will raise + # UnknownTopicOrPartitionError if not auto-creating + # LeaderNotAvailableError otherwise until partitions are created + self.load_metadata_for_topics(topic) + + # If the partition doesn't actually exist, raise + if partition not in self.topic_partitions.get(topic, []): + raise UnknownTopicOrPartitionError(key) + + # If there's no leader for the partition, raise + leader = self.topic_partitions[topic][partition] + if leader == -1: + raise LeaderNotAvailableError((topic, partition)) + + # Otherwise return the BrokerMetadata + return self.brokers[leader] + + def _get_coordinator_for_group(self, group): """ - Generate a new correlation id + Returns the coordinator broker for a consumer group. + + GroupCoordinatorNotAvailableError will be raised if the coordinator + does not currently exist for the group. + + GroupLoadInProgressError is raised if the coordinator is available + but is still loading offsets from the internal topic """ - return KafkaClient.ID_GEN.next() - def _send_broker_unaware_request(self, requestId, request): + resp = self.send_consumer_metadata_request(group) + + # If there's a problem with finding the coordinator, raise the + # provided error + kafka.common.check_error(resp) + + # Otherwise return the BrokerMetadata + return BrokerMetadata(resp.nodeId, resp.host, resp.port) + + def _next_id(self): + """Generate a new correlation id""" + # modulo to keep w/i int32 + self.correlation_id = (self.correlation_id + 1) % 2**31 + return self.correlation_id + + def _send_broker_unaware_request(self, payloads, encoder_fn, decoder_fn): """ Attempt to send a broker-agnostic request to one of the available brokers. Keep trying until you succeed. 
""" - for (host, port) in self.hosts: - try: - conn = self._get_conn(host, port) - conn.send(requestId, request) - response = conn.recv(requestId) - return response - except Exception as e: - log.warning("Could not send request [%r] to server %s:%i, " - "trying next server: %s" % (request, host, port, e)) + hosts = set([(broker.host, broker.port) for broker in self.brokers.values()]) + hosts.update(self.hosts) + hosts = list(hosts) + random.shuffle(hosts) + + for (host, port) in hosts: + conn = self._get_conn(host, port) + if not conn.connected(): + log.warning("Skipping unconnected connection: %s", conn) + continue + request = encoder_fn(payloads=payloads) + future = conn.send(request) + + # Block + while not future.is_done: + conn.recv() + + if future.failed(): + log.error("Request failed: %s", future.exception) + continue - raise KafkaUnavailableError("All servers failed to process request") + return decoder_fn(future.value) + + raise KafkaUnavailableError('All servers failed to process request: %s' % hosts) + + def _payloads_by_broker(self, payloads): + payloads_by_broker = collections.defaultdict(list) + for payload in payloads: + try: + leader = self._get_leader_for_partition(payload.topic, payload.partition) + except KafkaUnavailableError: + leader = None + payloads_by_broker[leader].append(payload) + return dict(payloads_by_broker) def _send_broker_aware_request(self, payloads, encoder_fn, decoder_fn): """ @@ -106,175 +180,385 @@ the leader broker for that partition using the supplied encode/decode functions - Params - ====== - payloads: list of object-like entities with a topic and - partition attribute + Arguments: + + payloads: list of object-like entities with a topic (str) and + partition (int) attribute; payloads with duplicate topic-partitions + are not supported. + encode_fn: a method to encode the list of payloads to a request body, - must accept client_id, correlation_id, and payloads as - keyword arguments + must accept client_id, correlation_id, and payloads as + keyword arguments + decode_fn: a method to decode a response body into response objects. 
- The response objects must be object-like and have topic - and partition attributes + The response objects must be object-like and have topic + and partition attributes + + Returns: - Return - ====== List of response objects in the same order as the supplied payloads """ + # encoders / decoders do not maintain ordering currently + # so we need to keep this so we can rebuild order before returning + original_ordering = [(p.topic, p.partition) for p in payloads] + + # Connection errors generally mean stale metadata + # although sometimes it means incorrect api request + # Unfortunately there is no good way to tell the difference + # so we'll just reset metadata on all errors to be safe + refresh_metadata = False - # Group the requests by topic+partition - original_keys = [] - payloads_by_broker = collections.defaultdict(list) + # For each broker, send the list of request payloads + # and collect the responses and errors + payloads_by_broker = self._payloads_by_broker(payloads) + responses = {} + + def failed_payloads(payloads): + for payload in payloads: + topic_partition = (str(payload.topic), payload.partition) + responses[(topic_partition)] = FailedPayloadsError(payload) + + # For each BrokerConnection keep the real socket so that we can use + # a select to perform unblocking I/O + connections_by_future = {} + for broker, broker_payloads in six.iteritems(payloads_by_broker): + if broker is None: + failed_payloads(broker_payloads) + continue - for payload in payloads: - leader = self._get_leader_for_partition(payload.topic, - payload.partition) - if leader is None: - raise LeaderUnavailableError( - "Leader not available for topic %s partition %s" % - (payload.topic, payload.partition)) + conn = self._get_conn(broker.host, broker.port) + conn.connect() + if not conn.connected(): + refresh_metadata = True + failed_payloads(broker_payloads) + continue - payloads_by_broker[leader].append(payload) - original_keys.append((payload.topic, payload.partition)) + request = encoder_fn(payloads=broker_payloads) + # decoder_fn=None signal that the server is expected to not + # send a response. 
This probably only applies to + # ProduceRequest w/ acks = 0 + expect_response = (decoder_fn is not None) + future = conn.send(request, expect_response=expect_response) + + if future.failed(): + refresh_metadata = True + failed_payloads(broker_payloads) + continue - # Accumulate the responses in a dictionary - acc = {} + if not expect_response: + for payload in broker_payloads: + topic_partition = (str(payload.topic), payload.partition) + responses[topic_partition] = None + continue - # keep a list of payloads that were failed to be sent to brokers - failed_payloads = [] + connections_by_future[future] = (conn, broker) - # For each broker, send the list of request payloads - for broker, payloads in payloads_by_broker.items(): + conn = None + while connections_by_future: + futures = list(connections_by_future.keys()) + for future in futures: + + if not future.is_done: + conn, _ = connections_by_future[future] + conn.recv() + continue + + _, broker = connections_by_future.pop(future) + if future.failed(): + refresh_metadata = True + failed_payloads(payloads_by_broker[broker]) + + else: + for payload_response in decoder_fn(future.value): + topic_partition = (str(payload_response.topic), + payload_response.partition) + responses[topic_partition] = payload_response + + if refresh_metadata: + self.reset_all_metadata() + + # Return responses in the same order as provided + return [responses[tp] for tp in original_ordering] + + def _send_consumer_aware_request(self, group, payloads, encoder_fn, decoder_fn): + """ + Send a list of requests to the consumer coordinator for the group + specified using the supplied encode/decode functions. As the payloads + that use consumer-aware requests do not contain the group (e.g. + OffsetFetchRequest), all payloads must be for a single group. + + Arguments: + + group: the name of the consumer group (str) the payloads are for + payloads: list of object-like entities with topic (str) and + partition (int) attributes; payloads with duplicate + topic+partition are not supported. + + encode_fn: a method to encode the list of payloads to a request body, + must accept client_id, correlation_id, and payloads as + keyword arguments + + decode_fn: a method to decode a response body into response objects. 
+ The response objects must be object-like and have topic + and partition attributes + + Returns: + + List of response objects in the same order as the supplied payloads + """ + # encoders / decoders do not maintain ordering currently + # so we need to keep this so we can rebuild order before returning + original_ordering = [(p.topic, p.partition) for p in payloads] + + broker = self._get_coordinator_for_group(group) + + # Send the list of request payloads and collect the responses and + # errors + responses = {} + requestId = self._next_id() + log.debug('Request %s to %s: %s', requestId, broker, payloads) + request = encoder_fn(client_id=self.client_id, + correlation_id=requestId, payloads=payloads) + + # Send the request, recv the response + try: conn = self._get_conn(broker.host, broker.port) - requestId = self._next_id() - request = encoder_fn(client_id=self.client_id, - correlation_id=requestId, payloads=payloads) + conn.send(requestId, request) + + except ConnectionError as e: + log.warning('ConnectionError attempting to send request %s ' + 'to server %s: %s', requestId, broker, e) + + for payload in payloads: + topic_partition = (payload.topic, payload.partition) + responses[topic_partition] = FailedPayloadsError(payload) + + # No exception, try to get response + else: + + # decoder_fn=None signal that the server is expected to not + # send a response. This probably only applies to + # ProduceRequest w/ acks = 0 + if decoder_fn is None: + log.debug('Request %s does not expect a response ' + '(skipping conn.recv)', requestId) + for payload in payloads: + topic_partition = (payload.topic, payload.partition) + responses[topic_partition] = None + return [] - failed = False - # Send the request, recv the response try: - conn.send(requestId, request) - if decoder_fn is None: - continue - try: - response = conn.recv(requestId) - except ConnectionError as e: - log.warning("Could not receive response to request [%s] " - "from server %s: %s", request, conn, e) - failed = True + response = conn.recv(requestId) except ConnectionError as e: - log.warning("Could not send request [%s] to server %s: %s", - request, conn, e) - failed = True - - if failed: - failed_payloads += payloads - self.reset_all_metadata() - continue + log.warning('ConnectionError attempting to receive a ' + 'response to request %s from server %s: %s', + requestId, broker, e) + + for payload in payloads: + topic_partition = (payload.topic, payload.partition) + responses[topic_partition] = FailedPayloadsError(payload) - for response in decoder_fn(response): - acc[(response.topic, response.partition)] = response - - if failed_payloads: - raise FailedPayloadsError(failed_payloads) + else: + _resps = [] + for payload_response in decoder_fn(response): + topic_partition = (payload_response.topic, + payload_response.partition) + responses[topic_partition] = payload_response + _resps.append(payload_response) + log.debug('Response %s: %s', requestId, _resps) - # Order the accumulated responses by the original key order - return (acc[k] for k in original_keys) if acc else () + # Return responses in the same order as provided + return [responses[tp] for tp in original_ordering] def __repr__(self): return '' % (self.client_id) def _raise_on_response_error(self, resp): + + # Response can be an unraised exception object (FailedPayloadsError) + if isinstance(resp, Exception): + raise resp + + # Or a server api error response try: kafka.common.check_error(resp) except (UnknownTopicOrPartitionError, NotLeaderForPartitionError): 
self.reset_topic_metadata(resp.topic) raise + # Return False if no error to enable list comprehensions + return False + ################# # Public API # ################# - def reset_topic_metadata(self, *topics): - for topic in topics: - try: - partitions = self.topic_partitions[topic] - except KeyError: - continue + def close(self): + for conn in self._conns.values(): + conn.close() + + def copy(self): + """ + Create an inactive copy of the client object, suitable for passing + to a separate thread. + + Note that the copied connections are not initialized, so reinit() must + be called on the returned copy. + """ + _conns = self._conns + self._conns = {} + c = copy.deepcopy(self) + self._conns = _conns + return c - for partition in partitions: - self.topics_to_brokers.pop(TopicAndPartition(topic, partition), None) + def reinit(self): + for conn in self._conns.values(): + conn.close() + while conn.connect() == ConnectionStates.CONNECTING: + pass - del self.topic_partitions[topic] + def reset_topic_metadata(self, *topics): + for topic in topics: + for topic_partition in list(self.topics_to_brokers.keys()): + if topic_partition.topic == topic: + del self.topics_to_brokers[topic_partition] + if topic in self.topic_partitions: + del self.topic_partitions[topic] def reset_all_metadata(self): self.topics_to_brokers.clear() self.topic_partitions.clear() def has_metadata_for_topic(self, topic): - return topic in self.topic_partitions + return ( + topic in self.topic_partitions + and len(self.topic_partitions[topic]) > 0 + ) + + def get_partition_ids_for_topic(self, topic): + if topic not in self.topic_partitions: + return [] + + return sorted(list(self.topic_partitions[topic])) + + @property + def topics(self): + return list(self.topic_partitions.keys()) def ensure_topic_exists(self, topic, timeout = 30): start_time = time.time() - self.load_metadata_for_topics(topic) while not self.has_metadata_for_topic(topic): if time.time() > start_time + timeout: - raise KafkaTimeoutError("Unable to create topic {0}".format(topic)) - self.load_metadata_for_topics(topic) + raise KafkaTimeoutError('Unable to create topic {0}'.format(topic)) + try: + self.load_metadata_for_topics(topic) + except LeaderNotAvailableError: + pass + except UnknownTopicOrPartitionError: + # Server is not configured to auto-create + # retrying in this case will not help + raise time.sleep(.5) - def close(self): - for conn in self.conns.values(): - conn.close() - - def copy(self): - """ - Create an inactive copy of the client object - A reinit() has to be done on the copy before it can be used again - """ - c = copy.deepcopy(self) - for k, v in c.conns.items(): - c.conns[k] = v.copy() - return c - - def reinit(self): - for conn in self.conns.values(): - conn.reinit() - def load_metadata_for_topics(self, *topics): - """ - Discover brokers and metadata for a set of topics. This function is called - lazily whenever metadata is unavailable. - """ - request_id = self._next_id() - request = KafkaProtocol.encode_metadata_request(self.client_id, - request_id, topics) - - response = self._send_broker_unaware_request(request_id, request) + """Fetch broker and topic-partition metadata from the server. - (brokers, topics) = KafkaProtocol.decode_metadata_response(response) - - log.debug("Broker metadata: %s", brokers) - log.debug("Topic metadata: %s", topics) + Updates internal data: broker list, topic/partition list, and + topic/parition -> broker map. This method should be called after + receiving any error. 
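A short usage sketch of the metadata helpers and the copy()/reinit() pattern documented above. It assumes the legacy client class this hunk belongs to is importable as SimpleClient and that a broker is listening on localhost:9092; adjust both to your installation.

    # Illustrative only -- class name, import path, and broker address are assumptions.
    from kafka import SimpleClient

    client = SimpleClient('localhost:9092')

    # Blocks (up to `timeout` seconds) until the topic appears in metadata;
    # relies on broker auto-creation or the topic being created out of band.
    client.ensure_topic_exists('my-topic', timeout=30)

    if client.has_metadata_for_topic('my-topic'):
        print(client.get_partition_ids_for_topic('my-topic'))  # e.g. [0, 1, 2]

    # Hand an inactive copy to a worker thread: the copy carries no live
    # sockets, so the thread must call reinit() before sending requests.
    worker_client = client.copy()
    # ... inside the worker thread:
    worker_client.reinit()
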
+ + Note: Exceptions *will not* be raised in a full refresh (i.e. no topic + list). In this case, error codes will be logged as errors. + Partition-level errors will also not be raised here (a single partition + w/o a leader, for example). + + Arguments: + *topics (optional): If a list of topics is provided, + the metadata refresh will be limited to the specified topics + only. + + Raises: + UnknownTopicOrPartitionError: Raised for topics that do not exist, + unless the broker is configured to auto-create topics. + LeaderNotAvailableError: Raised for topics that do not exist yet, + when the broker is configured to auto-create topics. Retry + after a short backoff (topics/partitions are initializing). + """ + if topics: + self.reset_topic_metadata(*topics) + else: + self.reset_all_metadata() - self.brokers = brokers + resp = self.send_metadata_request(topics) - for topic, partitions in topics.items(): - self.reset_topic_metadata(topic) + log.debug('Updating broker metadata: %s', resp.brokers) + log.debug('Updating topic metadata: %s', [topic for _, topic, _ in resp.topics]) - if not partitions: - log.warning('No partitions for %s', topic) - continue + self.brokers = dict([(nodeId, BrokerMetadata(nodeId, host, port)) + for nodeId, host, port in resp.brokers]) + + for error, topic, partitions in resp.topics: + # Errors expected for new topics + if error: + error_type = kafka.common.kafka_errors.get(error, UnknownError) + if error_type in (UnknownTopicOrPartitionError, LeaderNotAvailableError): + log.error('Error loading topic metadata for %s: %s (%s)', + topic, error_type, error) + if topic not in topics: + continue + raise error_type(topic) + + self.topic_partitions[topic] = {} + for error, partition, leader, _, _ in partitions: + + self.topic_partitions[topic][partition] = leader + + # Populate topics_to_brokers dict + topic_part = TopicPartition(topic, partition) + + # Check for partition errors + if error: + error_type = kafka.common.kafka_errors.get(error, UnknownError) + + # If No Leader, topics_to_brokers topic_partition -> None + if error_type is LeaderNotAvailableError: + log.error('No leader for topic %s partition %d', topic, partition) + self.topics_to_brokers[topic_part] = None + continue + + # If one of the replicas is unavailable -- ignore + # this error code is provided for admin purposes only + # we never talk to replicas, only the leader + elif error_type is ReplicaNotAvailableError: + log.debug('Some (non-leader) replicas not available for topic %s partition %d', topic, partition) + + else: + raise error_type(topic_part) + + # If Known Broker, topic_partition -> BrokerMetadata + if leader in self.brokers: + self.topics_to_brokers[topic_part] = self.brokers[leader] - self.topic_partitions[topic] = [] - for partition, meta in partitions.items(): - self.topic_partitions[topic].append(partition) - topic_part = TopicAndPartition(topic, partition) - if meta.leader == -1: - log.warning('No leader for topic %s partition %s', topic, partition) - self.topics_to_brokers[topic_part] = None + # If Unknown Broker, fake BrokerMetadata so we dont lose the id + # (not sure how this could happen. 
server could be in bad state) else: - self.topics_to_brokers[topic_part] = brokers[meta.leader] + self.topics_to_brokers[topic_part] = BrokerMetadata( + leader, None, None + ) + + def send_metadata_request(self, payloads=[], fail_on_error=True, + callback=None): + encoder = KafkaProtocol.encode_metadata_request + decoder = KafkaProtocol.decode_metadata_response + + return self._send_broker_unaware_request(payloads, encoder, decoder) + + def send_consumer_metadata_request(self, payloads=[], fail_on_error=True, + callback=None): + encoder = KafkaProtocol.encode_consumer_metadata_request + decoder = KafkaProtocol.decode_consumer_metadata_response + + return self._send_broker_unaware_request(payloads, encoder, decoder) def send_produce_request(self, payloads=[], acks=1, timeout=1000, fail_on_error=True, callback=None): @@ -285,18 +569,29 @@ sent to a specific broker. Output is a list of responses in the same order as the list of payloads specified - Params - ====== - payloads: list of ProduceRequest - fail_on_error: boolean, should we raise an Exception if we - encounter an API error? - callback: function, instead of returning the ProduceResponse, - first pass it through this function - - Return - ====== - list of ProduceResponse or callback(ProduceResponse), in the - order of input payloads + Arguments: + payloads (list of ProduceRequest): produce requests to send to kafka + ProduceRequest payloads must not contain duplicates for any + topic-partition. + acks (int, optional): how many acks the servers should receive from replica + brokers before responding to the request. If it is 0, the server + will not send any response. If it is 1, the server will wait + until the data is written to the local log before sending a + response. If it is -1, the server will wait until the message + is committed by all in-sync replicas before sending a response. + For any value > 1, the server will wait for this number of acks to + occur (but the server will never wait for more acknowledgements than + there are in-sync replicas). defaults to 1. + timeout (int, optional): maximum time in milliseconds the server can + await the receipt of the number of acks, defaults to 1000. + fail_on_error (bool, optional): raise exceptions on connection and + server response errors, defaults to True. + callback (function, optional): instead of returning the ProduceResponse, + first pass it through this function, defaults to None. 
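A produce-path sketch matching the acks semantics described above. It uses ProduceRequestPayload and the legacy Message namedtuple from kafka.common (magic=0, attributes=0 means an uncompressed message); the client class name, broker address, and topic are assumptions.

    from kafka import SimpleClient
    from kafka.common import Message, ProduceRequestPayload

    client = SimpleClient('localhost:9092')

    payload = ProduceRequestPayload(
        topic='my-topic',
        partition=0,
        messages=[Message(0, 0, None, b'hello'),
                  Message(0, 0, None, b'world')])

    # acks=1: wait for the partition leader to append to its local log.
    # acks=0 would skip the broker response entirely; acks=-1 waits for
    # all in-sync replicas. timeout is the broker-side wait in milliseconds.
    resps = client.send_produce_request(
        payloads=[payload], acks=1, timeout=1000, fail_on_error=True)

    for resp in resps:
        print(resp.topic, resp.partition, resp.offset)
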
+ + Returns: + list of ProduceResponses, or callback results if supplied, in the + order of input payloads """ encoder = functools.partial( @@ -311,16 +606,9 @@ resps = self._send_broker_aware_request(payloads, encoder, decoder) - out = [] - for resp in resps: - if fail_on_error is True: - self._raise_on_response_error(resp) - - if callback is not None: - out.append(callback(resp)) - else: - out.append(resp) - return out + return [resp if not callback else callback(resp) for resp in resps + if resp is not None and + (not fail_on_error or not self._raise_on_response_error(resp))] def send_fetch_request(self, payloads=[], fail_on_error=True, callback=None, max_wait_time=100, min_bytes=4096): @@ -339,16 +627,8 @@ payloads, encoder, KafkaProtocol.decode_fetch_response) - out = [] - for resp in resps: - if fail_on_error is True: - self._raise_on_response_error(resp) - - if callback is not None: - out.append(callback(resp)) - else: - out.append(resp) - return out + return [resp if not callback else callback(resp) for resp in resps + if not fail_on_error or not self._raise_on_response_error(resp)] def send_offset_request(self, payloads=[], fail_on_error=True, callback=None): @@ -357,15 +637,8 @@ KafkaProtocol.encode_offset_request, KafkaProtocol.decode_offset_response) - out = [] - for resp in resps: - if fail_on_error is True: - self._raise_on_response_error(resp) - if callback is not None: - out.append(callback(resp)) - else: - out.append(resp) - return out + return [resp if not callback else callback(resp) for resp in resps + if not fail_on_error or not self._raise_on_response_error(resp)] def send_offset_commit_request(self, group, payloads=[], fail_on_error=True, callback=None): @@ -374,16 +647,8 @@ decoder = KafkaProtocol.decode_offset_commit_response resps = self._send_broker_aware_request(payloads, encoder, decoder) - out = [] - for resp in resps: - if fail_on_error is True: - self._raise_on_response_error(resp) - - if callback is not None: - out.append(callback(resp)) - else: - out.append(resp) - return out + return [resp if not callback else callback(resp) for resp in resps + if not fail_on_error or not self._raise_on_response_error(resp)] def send_offset_fetch_request(self, group, payloads=[], fail_on_error=True, callback=None): @@ -393,12 +658,16 @@ decoder = KafkaProtocol.decode_offset_fetch_response resps = self._send_broker_aware_request(payloads, encoder, decoder) - out = [] - for resp in resps: - if fail_on_error is True: - self._raise_on_response_error(resp) - if callback is not None: - out.append(callback(resp)) - else: - out.append(resp) - return out + return [resp if not callback else callback(resp) for resp in resps + if not fail_on_error or not self._raise_on_response_error(resp)] + + def send_offset_fetch_request_kafka(self, group, payloads=[], + fail_on_error=True, callback=None): + + encoder = functools.partial(KafkaProtocol.encode_offset_fetch_request, + group=group, from_kafka=True) + decoder = KafkaProtocol.decode_offset_fetch_response + resps = self._send_consumer_aware_request(group, payloads, encoder, decoder) + + return [resp if not callback else callback(resp) for resp in resps + if not fail_on_error or not self._raise_on_response_error(resp)] diff -Nru python-kafka-python-0.9.2/kafka/cluster.py python-kafka-python-1.0.1/kafka/cluster.py --- python-kafka-python-0.9.2/kafka/cluster.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/cluster.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,294 @@ +from __future__ import absolute_import + 
+import collections +import copy +import logging +import random +import threading +import time + +import six + +import kafka.common as Errors +from kafka.common import BrokerMetadata, PartitionMetadata, TopicPartition +from .future import Future + +log = logging.getLogger(__name__) + + +class ClusterMetadata(object): + DEFAULT_CONFIG = { + 'retry_backoff_ms': 100, + 'metadata_max_age_ms': 300000, + } + + def __init__(self, **configs): + self._brokers = {} # node_id -> BrokerMetadata + self._partitions = {} # topic -> partition -> PartitionMetadata + self._broker_partitions = collections.defaultdict(set) # node_id -> {TopicPartition...} + self._groups = {} # group_name -> node_id + self._last_refresh_ms = 0 + self._last_successful_refresh_ms = 0 + self._need_update = False + self._future = None + self._listeners = set() + self._lock = threading.Lock() + self.need_all_topic_metadata = False + self.unauthorized_topics = set() + + self.config = copy.copy(self.DEFAULT_CONFIG) + for key in self.config: + if key in configs: + self.config[key] = configs[key] + + def brokers(self): + """Get all BrokerMetadata + + Returns: + set: {BrokerMetadata, ...} + """ + return set(self._brokers.values()) + + def broker_metadata(self, broker_id): + """Get BrokerMetadata + + Arguments: + broker_id (int): node_id for a broker to check + + Returns: + BrokerMetadata or None if not found + """ + return self._brokers.get(broker_id) + + def partitions_for_topic(self, topic): + """Return set of all partitions for topic (whether available or not) + + Arguments: + topic (str): topic to check for partitions + + Returns: + set: {partition (int), ...} + """ + if topic not in self._partitions: + return None + return set(self._partitions[topic].keys()) + + def available_partitions_for_topic(self, topic): + """Return set of partitions with known leaders + + Arguments: + topic (str): topic to check for partitions + + Returns: + set: {partition (int), ...} + """ + if topic not in self._partitions: + return None + return set([partition for partition, metadata + in six.iteritems(self._partitions[topic]) + if metadata.leader != -1]) + + def leader_for_partition(self, partition): + """Return node_id of leader, -1 unavailable, None if unknown.""" + if partition.topic not in self._partitions: + return None + elif partition.partition not in self._partitions[partition.topic]: + return None + return self._partitions[partition.topic][partition.partition].leader + + def partitions_for_broker(self, broker_id): + """Return TopicPartitions for which the broker is a leader. + + Arguments: + broker_id (int): node id for a broker + + Returns: + set: {TopicPartition, ...} + """ + return self._broker_partitions.get(broker_id) + + def coordinator_for_group(self, group): + """Return node_id of group coordinator. + + Arguments: + group (str): name of consumer group + + Returns: + int: node_id for group coordinator + """ + return self._groups.get(group) + + def ttl(self): + """Milliseconds until metadata should be refreshed""" + now = time.time() * 1000 + if self._need_update: + ttl = 0 + else: + metadata_age = now - self._last_successful_refresh_ms + ttl = self.config['metadata_max_age_ms'] - metadata_age + + retry_age = now - self._last_refresh_ms + next_retry = self.config['retry_backoff_ms'] - retry_age + + return max(ttl, next_retry, 0) + + def request_update(self): + """Flags metadata for update, return Future() + + Actual update must be handled separately. 
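The ttl() logic above weighs two clocks: metadata age against metadata_max_age_ms, and time since the last refresh attempt against retry_backoff_ms. A standalone sketch of the same arithmetic with made-up timestamps (constants mirror the defaults shown above):

    import time

    RETRY_BACKOFF_MS = 100
    METADATA_MAX_AGE_MS = 300000

    def ttl_ms(now, last_successful_refresh, last_refresh, need_update):
        # An explicit update request zeroes the age-based wait; only the
        # retry backoff can still delay the refresh.
        age_wait = 0 if need_update else (
            METADATA_MAX_AGE_MS - (now - last_successful_refresh))
        retry_wait = RETRY_BACKOFF_MS - (now - last_refresh)
        return max(age_wait, retry_wait, 0)

    now = time.time() * 1000
    # Fresh metadata, nothing requested: wait out most of the max age
    print(ttl_ms(now, now - 1000, now - 1000, need_update=False))  # 299000.0
    # Update requested, but the last attempt was 20ms ago: honor the backoff
    print(ttl_ms(now, now - 1000, now - 20, need_update=True))     # 80.0
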
This method will only + change the reported ttl() + + Returns: + kafka.future.Future (value will be the cluster object after update) + """ + with self._lock: + self._need_update = True + if not self._future or self._future.is_done: + self._future = Future() + return self._future + + def topics(self): + """Get set of known topics. + + Returns: + set: {topic (str), ...} + """ + return set(self._partitions.keys()) + + def failed_update(self, exception): + """Update cluster state given a failed MetadataRequest.""" + f = None + with self._lock: + if self._future: + f = self._future + self._future = None + if f: + f.failure(exception) + self._last_refresh_ms = time.time() * 1000 + + def update_metadata(self, metadata): + """Update cluster state given a MetadataResponse. + + Arguments: + metadata (MetadataResponse): broker response to a metadata request + + Returns: None + """ + # In the common case where we ask for a single topic and get back an + # error, we should fail the future + if len(metadata.topics) == 1 and metadata.topics[0][0] != 0: + error_code, topic, _ = metadata.topics[0] + error = Errors.for_code(error_code)(topic) + return self.failed_update(error) + + if not metadata.brokers: + log.warning("No broker metadata found in MetadataResponse") + + for node_id, host, port in metadata.brokers: + self._brokers.update({ + node_id: BrokerMetadata(node_id, host, port) + }) + + _new_partitions = {} + _new_broker_partitions = collections.defaultdict(set) + _new_unauthorized_topics = set() + + for error_code, topic, partitions in metadata.topics: + error_type = Errors.for_code(error_code) + if error_type is Errors.NoError: + _new_partitions[topic] = {} + for p_error, partition, leader, replicas, isr in partitions: + _new_partitions[topic][partition] = PartitionMetadata( + topic=topic, partition=partition, leader=leader, + replicas=replicas, isr=isr, error=p_error) + if leader != -1: + _new_broker_partitions[leader].add( + TopicPartition(topic, partition)) + + elif error_type is Errors.LeaderNotAvailableError: + log.warning("Topic %s is not available during auto-create" + " initialization", topic) + elif error_type is Errors.UnknownTopicOrPartitionError: + log.error("Topic %s not found in cluster metadata", topic) + elif error_type is Errors.TopicAuthorizationFailedError: + log.error("Topic %s is not authorized for this client", topic) + _new_unauthorized_topics.add(topic) + elif error_type is Errors.InvalidTopicError: + log.error("'%s' is not a valid topic name", topic) + else: + log.error("Error fetching metadata for topic %s: %s", + topic, error_type) + + with self._lock: + self._partitions = _new_partitions + self._broker_partitions = _new_broker_partitions + self.unauthorized_topics = _new_unauthorized_topics + f = None + if self._future: + f = self._future + self._future = None + self._need_update = False + + now = time.time() * 1000 + self._last_refresh_ms = now + self._last_successful_refresh_ms = now + + if f: + f.success(self) + log.debug("Updated cluster metadata to %s", self) + + for listener in self._listeners: + listener(self) + + def add_listener(self, listener): + """Add a callback function to be called on each metadata update""" + self._listeners.add(listener) + + def remove_listener(self, listener): + """Remove a previously added listener callback""" + self._listeners.remove(listener) + + def add_group_coordinator(self, group, response): + """Update with metadata for a group coordinator + + Arguments: + group (str): name of group from GroupCoordinatorRequest + response 
(GroupCoordinatorResponse): broker response + + Returns: + bool: True if metadata is updated, False on error + """ + log.debug("Updating coordinator for %s: %s", group, response) + error_type = Errors.for_code(response.error_code) + if error_type is not Errors.NoError: + log.error("GroupCoordinatorResponse error: %s", error_type) + self._groups[group] = -1 + return False + + node_id = response.coordinator_id + coordinator = BrokerMetadata( + response.coordinator_id, + response.host, + response.port) + + # Assume that group coordinators are just brokers + # (this is true now, but could diverge in future) + if node_id not in self._brokers: + self._brokers[node_id] = coordinator + + # If this happens, either brokers have moved without + # changing IDs, or our assumption above is wrong + elif coordinator != self._brokers[node_id]: + log.error("GroupCoordinator metadata conflicts with existing" + " broker metadata. Coordinator: %s, Broker: %s", + coordinator, self._brokers[node_id]) + self._groups[group] = node_id + return False + + log.info("Group coordinator for %s is %s", group, coordinator) + self._groups[group] = node_id + return True + + def __str__(self): + return 'Cluster(brokers: %d, topics: %d, groups: %d)' % \ + (len(self._brokers), len(self._partitions), len(self._groups)) diff -Nru python-kafka-python-0.9.2/kafka/codec.py python-kafka-python-1.0.1/kafka/codec.py --- python-kafka-python-0.9.2/kafka/codec.py 2014-08-27 21:24:39.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/codec.py 2016-02-17 18:37:58.000000000 +0000 @@ -1,90 +1,129 @@ -from cStringIO import StringIO import gzip +import io +import platform import struct -_XERIAL_V1_HEADER = (-126, 'S', 'N', 'A', 'P', 'P', 'Y', 0, 1, 1) +import six +from six.moves import xrange + +_XERIAL_V1_HEADER = (-126, b'S', b'N', b'A', b'P', b'P', b'Y', 0, 1, 1) _XERIAL_V1_FORMAT = 'bccccccBii' try: import snappy - _has_snappy = True except ImportError: - _has_snappy = False + snappy = None + +try: + import lz4f + import xxhash +except ImportError: + lz4f = None +PYPY = bool(platform.python_implementation() == 'PyPy') def has_gzip(): return True def has_snappy(): - return _has_snappy + return snappy is not None + + +def has_lz4(): + return lz4f is not None + + +def gzip_encode(payload, compresslevel=None): + if not compresslevel: + compresslevel = 9 + buf = io.BytesIO() -def gzip_encode(payload): - buffer = StringIO() - handle = gzip.GzipFile(fileobj=buffer, mode="w") - handle.write(payload) - handle.close() - buffer.seek(0) - result = buffer.read() - buffer.close() - return result + # Gzip context manager introduced in python 2.7 + # so old-fashioned way until we decide to not support 2.6 + gzipper = gzip.GzipFile(fileobj=buf, mode="w", compresslevel=compresslevel) + try: + gzipper.write(payload) + finally: + gzipper.close() + + return buf.getvalue() def gzip_decode(payload): - buffer = StringIO(payload) - handle = gzip.GzipFile(fileobj=buffer, mode='r') - result = handle.read() - handle.close() - buffer.close() - return result - - -def snappy_encode(payload, xerial_compatible=False, xerial_blocksize=32 * 1024): - """Encodes the given data with snappy if xerial_compatible is set then the - stream is encoded in a fashion compatible with the xerial snappy library + buf = io.BytesIO(payload) + + # Gzip context manager introduced in python 2.7 + # so old-fashioned way until we decide to not support 2.6 + gzipper = gzip.GzipFile(fileobj=buf, mode='r') + try: + return gzipper.read() + finally: + gzipper.close() + + +def snappy_encode(payload, 
xerial_compatible=True, xerial_blocksize=32*1024): + """Encodes the given data with snappy compression. + + If xerial_compatible is set then the stream is encoded in a fashion + compatible with the xerial snappy library. + + The block size (xerial_blocksize) controls how frequent the blocking occurs + 32k is the default in the xerial library. + + The format winds up being: - The block size (xerial_blocksize) controls how frequent the blocking occurs - 32k is the default in the xerial library. - The format winds up being +-------------+------------+--------------+------------+--------------+ | Header | Block1 len | Block1 data | Blockn len | Blockn data | - |-------------+------------+--------------+------------+--------------| + +-------------+------------+--------------+------------+--------------+ | 16 bytes | BE int32 | snappy bytes | BE int32 | snappy bytes | +-------------+------------+--------------+------------+--------------+ - It is important to not that the blocksize is the amount of uncompressed - data presented to snappy at each block, whereas the blocklen is the - number of bytes that will be present in the stream, that is the - length will always be <= blocksize. + + It is important to note that the blocksize is the amount of uncompressed + data presented to snappy at each block, whereas the blocklen is the number + of bytes that will be present in the stream; so the length will always be + <= blocksize. + """ - if not _has_snappy: + if not has_snappy(): raise NotImplementedError("Snappy codec is not available") - if xerial_compatible: - def _chunker(): - for i in xrange(0, len(payload), xerial_blocksize): - yield payload[i:i+xerial_blocksize] - - out = StringIO() - - header = ''.join([struct.pack('!' + fmt, dat) for fmt, dat - in zip(_XERIAL_V1_FORMAT, _XERIAL_V1_HEADER)]) - - out.write(header) - for chunk in _chunker(): - block = snappy.compress(chunk) - block_size = len(block) - out.write(struct.pack('!i', block_size)) - out.write(block) - - out.seek(0) - return out.read() + if not xerial_compatible: + return snappy.compress(payload) + out = io.BytesIO() + for fmt, dat in zip(_XERIAL_V1_FORMAT, _XERIAL_V1_HEADER): + out.write(struct.pack('!' + fmt, dat)) + + # Chunk through buffers to avoid creating intermediate slice copies + if PYPY: + # on pypy, snappy.compress() on a sliced buffer consumes the entire + # buffer... likely a python-snappy bug, so just use a slice copy + chunker = lambda payload, i, size: payload[i:size+i] + + elif six.PY2: + # Sliced buffer avoids additional copies + # pylint: disable-msg=undefined-variable + chunker = lambda payload, i, size: buffer(payload, i, size) else: - return snappy.compress(payload) + # snappy.compress does not like raw memoryviews, so we have to convert + # tobytes, which is a copy... oh well. it's the thought that counts. 
+ # pylint: disable-msg=undefined-variable + chunker = lambda payload, i, size: memoryview(payload)[i:size+i].tobytes() + + for chunk in (chunker(payload, i, xerial_blocksize) + for i in xrange(0, len(payload), xerial_blocksize)): + + block = snappy.compress(chunk) + block_size = len(block) + out.write(struct.pack('!i', block_size)) + out.write(block) + + return out.getvalue() def _detect_xerial_stream(payload): @@ -94,9 +133,9 @@ This mode writes a magic header of the format: +--------+--------------+------------+---------+--------+ | Marker | Magic String | Null / Pad | Version | Compat | - |--------+--------------+------------+---------+--------| + +--------+--------------+------------+---------+--------+ | byte | c-string | byte | int32 | int32 | - |--------+--------------+------------+---------+--------| + +--------+--------------+------------+---------+--------+ | -126 | 'SNAPPY' | \0 | | | +--------+--------------+------------+---------+--------+ @@ -110,19 +149,19 @@ """ if len(payload) > 16: - header = header = struct.unpack('!' + _XERIAL_V1_FORMAT, bytes(payload)[:16]) + header = struct.unpack('!' + _XERIAL_V1_FORMAT, bytes(payload)[:16]) return header == _XERIAL_V1_HEADER return False def snappy_decode(payload): - if not _has_snappy: + if not has_snappy(): raise NotImplementedError("Snappy codec is not available") if _detect_xerial_stream(payload): # TODO ? Should become a fileobj ? - out = StringIO() - byt = buffer(payload[16:]) + out = io.BytesIO() + byt = payload[16:] length = len(byt) cursor = 0 @@ -138,3 +177,50 @@ return out.read() else: return snappy.decompress(payload) + + +def lz4_encode(payload): + data = lz4f.compressFrame(payload) # pylint: disable-msg=no-member + # Kafka's LZ4 code has a bug in its header checksum implementation + header_size = 7 + if isinstance(data[4], int): + flg = data[4] + else: + flg = ord(data[4]) + content_size_bit = ((flg >> 3) & 1) + if content_size_bit: + header_size += 8 + + # This is the incorrect hc + hc = xxhash.xxh32(data[0:header_size-1]).digest()[-2:-1] # pylint: disable-msg=no-member + + return b''.join([ + data[0:header_size-1], + hc, + data[header_size:] + ]) + + +def lz4_decode(payload): + # Kafka's LZ4 code has a bug in its header checksum implementation + header_size = 7 + if isinstance(payload[4], int): + flg = payload[4] + else: + flg = ord(payload[4]) + content_size_bit = ((flg >> 3) & 1) + if content_size_bit: + header_size += 8 + + # This should be the correct hc + hc = xxhash.xxh32(payload[4:header_size-1]).digest()[-2:-1] # pylint: disable-msg=no-member + + munged_payload = b''.join([ + payload[0:header_size-1], + hc, + payload[header_size:] + ]) + + cCtx = lz4f.createCompContext() # pylint: disable-msg=no-member + data = lz4f.decompressFrame(munged_payload, cCtx) # pylint: disable-msg=no-member + return data['decomp'] diff -Nru python-kafka-python-0.9.2/kafka/common.py python-kafka-python-1.0.1/kafka/common.py --- python-kafka-python-0.9.2/kafka/common.py 2014-08-27 21:24:39.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/common.py 2016-02-17 18:37:58.000000000 +0000 @@ -1,51 +1,89 @@ +import inspect +import sys from collections import namedtuple -############### -# Structs # -############### -# Request payloads -ProduceRequest = namedtuple("ProduceRequest", - ["topic", "partition", "messages"]) +# SimpleClient Payload Structs - Deprecated -FetchRequest = namedtuple("FetchRequest", - ["topic", "partition", "offset", "max_bytes"]) +# 
https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-MetadataAPI +MetadataRequest = namedtuple("MetadataRequest", + ["topics"]) -OffsetRequest = namedtuple("OffsetRequest", - ["topic", "partition", "time", "max_offsets"]) +MetadataResponse = namedtuple("MetadataResponse", + ["brokers", "topics"]) -OffsetCommitRequest = namedtuple("OffsetCommitRequest", - ["topic", "partition", "offset", "metadata"]) +# https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-ConsumerMetadataRequest +ConsumerMetadataRequest = namedtuple("ConsumerMetadataRequest", + ["groups"]) -OffsetFetchRequest = namedtuple("OffsetFetchRequest", ["topic", "partition"]) +ConsumerMetadataResponse = namedtuple("ConsumerMetadataResponse", + ["error", "nodeId", "host", "port"]) -# Response payloads -ProduceResponse = namedtuple("ProduceResponse", - ["topic", "partition", "error", "offset"]) +# https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-ProduceAPI +ProduceRequestPayload = namedtuple("ProduceRequestPayload", + ["topic", "partition", "messages"]) -FetchResponse = namedtuple("FetchResponse", ["topic", "partition", "error", - "highwaterMark", "messages"]) +ProduceResponsePayload = namedtuple("ProduceResponsePayload", + ["topic", "partition", "error", "offset"]) -OffsetResponse = namedtuple("OffsetResponse", - ["topic", "partition", "error", "offsets"]) +# https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-FetchAPI +FetchRequestPayload = namedtuple("FetchRequestPayload", + ["topic", "partition", "offset", "max_bytes"]) -OffsetCommitResponse = namedtuple("OffsetCommitResponse", - ["topic", "partition", "error"]) +FetchResponsePayload = namedtuple("FetchResponsePayload", + ["topic", "partition", "error", "highwaterMark", "messages"]) -OffsetFetchResponse = namedtuple("OffsetFetchResponse", - ["topic", "partition", "offset", - "metadata", "error"]) +# https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-OffsetAPI +OffsetRequestPayload = namedtuple("OffsetRequestPayload", + ["topic", "partition", "time", "max_offsets"]) + +OffsetResponsePayload = namedtuple("OffsetResponsePayload", + ["topic", "partition", "error", "offsets"]) + +# https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-OffsetCommit/FetchAPI +OffsetCommitRequestPayload = namedtuple("OffsetCommitRequestPayload", + ["topic", "partition", "offset", "metadata"]) + +OffsetCommitResponsePayload = namedtuple("OffsetCommitResponsePayload", + ["topic", "partition", "error"]) + +OffsetFetchRequestPayload = namedtuple("OffsetFetchRequestPayload", + ["topic", "partition"]) + +OffsetFetchResponsePayload = namedtuple("OffsetFetchResponsePayload", + ["topic", "partition", "offset", "metadata", "error"]) -BrokerMetadata = namedtuple("BrokerMetadata", ["nodeId", "host", "port"]) -PartitionMetadata = namedtuple("PartitionMetadata", - ["topic", "partition", "leader", - "replicas", "isr"]) # Other useful structs -OffsetAndMessage = namedtuple("OffsetAndMessage", ["offset", "message"]) -Message = namedtuple("Message", ["magic", "attributes", "key", "value"]) -TopicAndPartition = namedtuple("TopicAndPartition", ["topic", "partition"]) +TopicPartition = namedtuple("TopicPartition", + ["topic", "partition"]) + +BrokerMetadata = namedtuple("BrokerMetadata", + ["nodeId", "host", 
"port"]) + +PartitionMetadata = namedtuple("PartitionMetadata", + ["topic", "partition", "leader", "replicas", "isr", "error"]) + +OffsetAndMetadata = namedtuple("OffsetAndMetadata", + ["offset", "metadata"]) + + +# Deprecated structs +OffsetAndMessage = namedtuple("OffsetAndMessage", + ["offset", "message"]) + +Message = namedtuple("Message", + ["magic", "attributes", "key", "value"]) + +KafkaMessage = namedtuple("KafkaMessage", + ["topic", "partition", "offset", "key", "value"]) + + +# Define retry policy for async producer +# Limit value: int >= 0, 0 means no retries +RetryOptions = namedtuple("RetryOptions", + ["limit", "backoff_ms", "retry_on_timeouts"]) ################# @@ -54,105 +92,326 @@ class KafkaError(RuntimeError): + retriable = False + # whether metadata should be refreshed on error + invalid_metadata = False + + +class IllegalStateError(KafkaError): pass -class BrokerResponseError(KafkaError): +class IllegalArgumentError(KafkaError): + pass + + +class NoBrokersAvailable(KafkaError): + retriable = True + invalid_metadata = True + + +class NodeNotReadyError(KafkaError): + retriable = True + + +class CorrelationIdError(KafkaError): + retriable = True + + +class Cancelled(KafkaError): + retriable = True + + +class TooManyInFlightRequests(KafkaError): + retriable = True + + +class StaleMetadata(KafkaError): + retriable = True + invalid_metadata = True + + +class UnrecognizedBrokerVersion(KafkaError): pass +class BrokerResponseError(KafkaError): + errno = None + message = None + description = None + + def __str__(self): + return '%s - %s - %s' % (self.__class__.__name__, self.errno, self.description) + + +class NoError(BrokerResponseError): + errno = 0 + message = 'NO_ERROR' + description = 'No error--it worked!' + + class UnknownError(BrokerResponseError): errno = -1 message = 'UNKNOWN' + description = 'An unexpected server error.' class OffsetOutOfRangeError(BrokerResponseError): errno = 1 message = 'OFFSET_OUT_OF_RANGE' + description = ('The requested offset is outside the range of offsets' + ' maintained by the server for the given topic/partition.') class InvalidMessageError(BrokerResponseError): errno = 2 message = 'INVALID_MESSAGE' + description = ('This indicates that a message contents does not match its' + ' CRC.') class UnknownTopicOrPartitionError(BrokerResponseError): errno = 3 message = 'UNKNOWN_TOPIC_OR_PARTITON' + description = ('This request is for a topic or partition that does not' + ' exist on this broker.') + invalid_metadata = True class InvalidFetchRequestError(BrokerResponseError): errno = 4 message = 'INVALID_FETCH_SIZE' + description = 'The message has a negative size.' class LeaderNotAvailableError(BrokerResponseError): errno = 5 message = 'LEADER_NOT_AVAILABLE' + description = ('This error is thrown if we are in the middle of a' + ' leadership election and there is currently no leader for' + ' this partition and hence it is unavailable for writes.') + retriable = True + invalid_metadata = True class NotLeaderForPartitionError(BrokerResponseError): errno = 6 message = 'NOT_LEADER_FOR_PARTITION' + description = ('This error is thrown if the client attempts to send' + ' messages to a replica that is not the leader for some' + ' partition. 
It indicates that the clients metadata is out' + ' of date.') + retriable = True + invalid_metadata = True class RequestTimedOutError(BrokerResponseError): errno = 7 message = 'REQUEST_TIMED_OUT' + description = ('This error is thrown if the request exceeds the' + ' user-specified time limit in the request.') + retriable = True class BrokerNotAvailableError(BrokerResponseError): errno = 8 message = 'BROKER_NOT_AVAILABLE' - + description = ('This is not a client facing error and is used mostly by' + ' tools when a broker is not alive.') class ReplicaNotAvailableError(BrokerResponseError): errno = 9 message = 'REPLICA_NOT_AVAILABLE' + description = ('If replica is expected on a broker, but is not (this can be' + ' safely ignored).') class MessageSizeTooLargeError(BrokerResponseError): errno = 10 message = 'MESSAGE_SIZE_TOO_LARGE' + description = ('The server has a configurable maximum message size to avoid' + ' unbounded memory allocation. This error is thrown if the' + ' client attempt to produce a message larger than this' + ' maximum.') class StaleControllerEpochError(BrokerResponseError): errno = 11 message = 'STALE_CONTROLLER_EPOCH' + description = 'Internal error code for broker-to-broker communication.' class OffsetMetadataTooLargeError(BrokerResponseError): errno = 12 message = 'OFFSET_METADATA_TOO_LARGE' + description = ('If you specify a string larger than configured maximum for' + ' offset metadata.') +# TODO is this deprecated? https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-ErrorCodes class StaleLeaderEpochCodeError(BrokerResponseError): errno = 13 message = 'STALE_LEADER_EPOCH_CODE' -class KafkaUnavailableError(KafkaError): - pass +class GroupLoadInProgressError(BrokerResponseError): + errno = 14 + message = 'OFFSETS_LOAD_IN_PROGRESS' + description = ('The broker returns this error code for an offset fetch' + ' request if it is still loading offsets (after a leader' + ' change for that offsets topic partition), or in response' + ' to group membership requests (such as heartbeats) when' + ' group metadata is being loaded by the coordinator.') + retriable = True + + +class GroupCoordinatorNotAvailableError(BrokerResponseError): + errno = 15 + message = 'CONSUMER_COORDINATOR_NOT_AVAILABLE' + description = ('The broker returns this error code for group coordinator' + ' requests, offset commits, and most group management' + ' requests if the offsets topic has not yet been created, or' + ' if the group coordinator is not active.') + retriable = True + + +class NotCoordinatorForGroupError(BrokerResponseError): + errno = 16 + message = 'NOT_COORDINATOR_FOR_CONSUMER' + description = ('The broker returns this error code if it receives an offset' + ' fetch or commit request for a group that it is not a' + ' coordinator for.') + retriable = True + + +class InvalidTopicError(BrokerResponseError): + errno = 17 + message = 'INVALID_TOPIC' + description = ('For a request which attempts to access an invalid topic' + ' (e.g. 
one which has an illegal name), or if an attempt' + ' is made to write to an internal topic (such as the' + ' consumer offsets topic).') + + +class RecordListTooLargeError(BrokerResponseError): + errno = 18 + message = 'RECORD_LIST_TOO_LARGE' + description = ('If a message batch in a produce request exceeds the maximum' + ' configured segment size.') + + +class NotEnoughReplicasError(BrokerResponseError): + errno = 19 + message = 'NOT_ENOUGH_REPLICAS' + description = ('Returned from a produce request when the number of in-sync' + ' replicas is lower than the configured minimum and' + ' requiredAcks is -1.') + + +class NotEnoughReplicasAfterAppendError(BrokerResponseError): + errno = 20 + message = 'NOT_ENOUGH_REPLICAS_AFTER_APPEND' + description = ('Returned from a produce request when the message was' + ' written to the log, but with fewer in-sync replicas than' + ' required.') + + +class InvalidRequiredAcksError(BrokerResponseError): + errno = 21 + message = 'INVALID_REQUIRED_ACKS' + description = ('Returned from a produce request if the requested' + ' requiredAcks is invalid (anything other than -1, 1, or 0).') + + +class IllegalGenerationError(BrokerResponseError): + errno = 22 + message = 'ILLEGAL_GENERATION' + description = ('Returned from group membership requests (such as heartbeats)' + ' when the generation id provided in the request is not the' + ' current generation.') + + +class InconsistentGroupProtocolError(BrokerResponseError): + errno = 23 + message = 'INCONSISTENT_GROUP_PROTOCOL' + description = ('Returned in join group when the member provides a protocol' + ' type or set of protocols which is not compatible with the current group.') + + +class InvalidGroupIdError(BrokerResponseError): + errno = 24 + message = 'INVALID_GROUP_ID' + description = 'Returned in join group when the groupId is empty or null.' + + +class UnknownMemberIdError(BrokerResponseError): + errno = 25 + message = 'UNKNOWN_MEMBER_ID' + description = ('Returned from group requests (offset commits/fetches,' + ' heartbeats, etc) when the memberId is not in the current' + ' generation.') + + +class InvalidSessionTimeoutError(BrokerResponseError): + errno = 26 + message = 'INVALID_SESSION_TIMEOUT' + description = ('Return in join group when the requested session timeout is' + ' outside of the allowed range on the broker') + + +class RebalanceInProgressError(BrokerResponseError): + errno = 27 + message = 'REBALANCE_IN_PROGRESS' + description = ('Returned in heartbeat requests when the coordinator has' + ' begun rebalancing the group. 
This indicates to the client' + ' that it should rejoin the group.') + + +class InvalidCommitOffsetSizeError(BrokerResponseError): + errno = 28 + message = 'INVALID_COMMIT_OFFSET_SIZE' + description = ('This error indicates that an offset commit was rejected' + ' because of oversize metadata.') + + +class TopicAuthorizationFailedError(BrokerResponseError): + errno = 29 + message = 'TOPIC_AUTHORIZATION_FAILED' + description = ('Returned by the broker when the client is not authorized to' + ' access the requested topic.') + + +class GroupAuthorizationFailedError(BrokerResponseError): + errno = 30 + message = 'GROUP_AUTHORIZATION_FAILED' + description = ('Returned by the broker when the client is not authorized to' + ' access a particular groupId.') + + +class ClusterAuthorizationFailedError(BrokerResponseError): + errno = 31 + message = 'CLUSTER_AUTHORIZATION_FAILED' + description = ('Returned by the broker when the client is not authorized to' + ' use an inter-broker or administrative API.') -class KafkaTimeoutError(KafkaError): - pass - - -class LeaderUnavailableError(KafkaError): +class KafkaUnavailableError(KafkaError): pass -class PartitionUnavailableError(KafkaError): +class KafkaTimeoutError(KafkaError): pass class FailedPayloadsError(KafkaError): - pass + def __init__(self, payload, *args): + super(FailedPayloadsError, self).__init__(*args) + self.payload = payload class ConnectionError(KafkaError): - pass + retriable = True + invalid_metadata = True class BufferUnderflowError(KafkaError): @@ -171,6 +430,10 @@ pass +class ConsumerTimeout(KafkaError): + pass + + class ProtocolError(KafkaError): pass @@ -179,26 +442,47 @@ pass -kafka_errors = { - -1 : UnknownError, - 1 : OffsetOutOfRangeError, - 2 : InvalidMessageError, - 3 : UnknownTopicOrPartitionError, - 4 : InvalidFetchRequestError, - 5 : LeaderNotAvailableError, - 6 : NotLeaderForPartitionError, - 7 : RequestTimedOutError, - 8 : BrokerNotAvailableError, - 9 : ReplicaNotAvailableError, - 10 : MessageSizeTooLargeError, - 11 : StaleControllerEpochError, - 12 : OffsetMetadataTooLargeError, - 13 : StaleLeaderEpochCodeError, -} +class KafkaConfigurationError(KafkaError): + pass + + +class AsyncProducerQueueFull(KafkaError): + def __init__(self, failed_msgs, *args): + super(AsyncProducerQueueFull, self).__init__(*args) + self.failed_msgs = failed_msgs + + +def _iter_broker_errors(): + for name, obj in inspect.getmembers(sys.modules[__name__]): + if inspect.isclass(obj) and issubclass(obj, BrokerResponseError) and obj != BrokerResponseError: + yield obj + + +kafka_errors = dict([(x.errno, x) for x in _iter_broker_errors()]) + + +def for_code(error_code): + return kafka_errors.get(error_code, UnknownError) def check_error(response): - error = kafka_errors.get(response.error) - if error: - raise error(response) + if isinstance(response, Exception): + raise response + if response.error: + error_class = kafka_errors.get(response.error, UnknownError) + raise error_class(response) + + +RETRY_BACKOFF_ERROR_TYPES = ( + KafkaUnavailableError, LeaderNotAvailableError, + ConnectionError, FailedPayloadsError +) + + +RETRY_REFRESH_ERROR_TYPES = ( + NotLeaderForPartitionError, UnknownTopicOrPartitionError, + LeaderNotAvailableError, ConnectionError +) + +RETRY_ERROR_TYPES = RETRY_BACKOFF_ERROR_TYPES + RETRY_REFRESH_ERROR_TYPES diff -Nru python-kafka-python-0.9.2/kafka/conn.py python-kafka-python-1.0.1/kafka/conn.py --- python-kafka-python-0.9.2/kafka/conn.py 2014-08-27 21:24:39.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/conn.py 2016-02-19 
17:01:46.000000000 +0000 @@ -1,25 +1,366 @@ +import collections import copy +import errno import logging +import io +from random import shuffle +from select import select import socket import struct -from random import shuffle from threading import local +import time +import warnings + +import six + +import kafka.common as Errors +from kafka.future import Future +from kafka.protocol.api import RequestHeader +from kafka.protocol.commit import GroupCoordinatorResponse +from kafka.protocol.types import Int32 +from kafka.version import __version__ -from kafka.common import ConnectionError -log = logging.getLogger("kafka") +if six.PY2: + ConnectionError = socket.error + BlockingIOError = Exception + +log = logging.getLogger(__name__) DEFAULT_SOCKET_TIMEOUT_SECONDS = 120 DEFAULT_KAFKA_PORT = 9092 +class ConnectionStates(object): + DISCONNECTED = '' + CONNECTING = '' + CONNECTED = '' + + +InFlightRequest = collections.namedtuple('InFlightRequest', + ['request', 'response_type', 'correlation_id', 'future', 'timestamp']) + + +class BrokerConnection(object): + DEFAULT_CONFIG = { + 'client_id': 'kafka-python-' + __version__, + 'request_timeout_ms': 40000, + 'reconnect_backoff_ms': 50, + 'max_in_flight_requests_per_connection': 5, + 'receive_buffer_bytes': None, + 'send_buffer_bytes': None, + 'api_version': (0, 8, 2), # default to most restrictive + } + + def __init__(self, host, port, **configs): + self.host = host + self.port = port + self.in_flight_requests = collections.deque() + + self.config = copy.copy(self.DEFAULT_CONFIG) + for key in self.config: + if key in configs: + self.config[key] = configs[key] + + self.state = ConnectionStates.DISCONNECTED + self._sock = None + self._rbuffer = io.BytesIO() + self._receiving = False + self._next_payload_bytes = 0 + self.last_attempt = 0 + self.last_failure = 0 + self._processing = False + self._correlation_id = 0 + + def connect(self): + """Attempt to connect and return ConnectionState""" + if self.state is ConnectionStates.DISCONNECTED: + self.close() + self._sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + if self.config['receive_buffer_bytes'] is not None: + self._sock.setsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF, + self.config['receive_buffer_bytes']) + if self.config['send_buffer_bytes'] is not None: + self._sock.setsockopt(socket.SOL_SOCKET, socket.SO_SNDBUF, + self.config['send_buffer_bytes']) + self._sock.setblocking(False) + try: + ret = self._sock.connect_ex((self.host, self.port)) + except socket.error as ret: + pass + self.last_attempt = time.time() + + if not ret or ret is errno.EISCONN: + self.state = ConnectionStates.CONNECTED + elif ret in (errno.EINPROGRESS, errno.EALREADY): + self.state = ConnectionStates.CONNECTING + else: + log.error('Connect attempt to %s returned error %s.' + ' Disconnecting.', self, ret) + self.close() + self.last_failure = time.time() + + if self.state is ConnectionStates.CONNECTING: + # in non-blocking mode, use repeated calls to socket.connect_ex + # to check connection status + request_timeout = self.config['request_timeout_ms'] / 1000.0 + if time.time() > request_timeout + self.last_attempt: + log.error('Connection attempt to %s timed out', self) + self.close() # error=TimeoutError ? + self.last_failure = time.time() + + else: + try: + ret = self._sock.connect_ex((self.host, self.port)) + except socket.error as ret: + pass + if not ret or ret is errno.EISCONN: + self.state = ConnectionStates.CONNECTED + elif ret is not errno.EALREADY: + log.error('Connect attempt to %s returned error %s.' 
+ ' Disconnecting.', self, ret) + self.close() + self.last_failure = time.time() + return self.state + + def blacked_out(self): + """ + Return true if we are disconnected from the given node and can't + re-establish a connection yet + """ + if self.state is ConnectionStates.DISCONNECTED: + backoff = self.config['reconnect_backoff_ms'] / 1000.0 + if time.time() < self.last_attempt + backoff: + return True + return False + + def connected(self): + """Return True iff socket is connected.""" + return self.state is ConnectionStates.CONNECTED + + def close(self, error=None): + """Close socket and fail all in-flight-requests. + + Arguments: + error (Exception, optional): pending in-flight-requests + will be failed with this exception. + Default: kafka.common.ConnectionError. + """ + if self._sock: + self._sock.close() + self._sock = None + self.state = ConnectionStates.DISCONNECTED + self._receiving = False + self._next_payload_bytes = 0 + self._rbuffer.seek(0) + self._rbuffer.truncate() + if error is None: + error = Errors.ConnectionError() + while self.in_flight_requests: + ifr = self.in_flight_requests.popleft() + ifr.future.failure(error) + + def send(self, request, expect_response=True): + """send request, return Future() + + Can block on network if request is larger than send_buffer_bytes + """ + future = Future() + if not self.connected(): + return future.failure(Errors.ConnectionError()) + if not self.can_send_more(): + return future.failure(Errors.TooManyInFlightRequests()) + correlation_id = self._next_correlation_id() + header = RequestHeader(request, + correlation_id=correlation_id, + client_id=self.config['client_id']) + message = b''.join([header.encode(), request.encode()]) + size = Int32.encode(len(message)) + try: + # In the future we might manage an internal write buffer + # and send bytes asynchronously. For now, just block + # sending each request payload + self._sock.setblocking(True) + sent_bytes = self._sock.send(size) + assert sent_bytes == len(size) + sent_bytes = self._sock.send(message) + assert sent_bytes == len(message) + self._sock.setblocking(False) + except (AssertionError, ConnectionError) as e: + log.exception("Error sending %s to %s", request, self) + error = Errors.ConnectionError(e) + self.close(error=error) + return future.failure(error) + log.debug('%s Request %d: %s', self, correlation_id, request) + + if expect_response: + ifr = InFlightRequest(request=request, + correlation_id=correlation_id, + response_type=request.RESPONSE_TYPE, + future=future, + timestamp=time.time()) + self.in_flight_requests.append(ifr) + else: + future.success(None) + + return future + + def can_send_more(self): + """Return True unless there are max_in_flight_requests.""" + max_ifrs = self.config['max_in_flight_requests_per_connection'] + return len(self.in_flight_requests) < max_ifrs + + def recv(self, timeout=0): + """Non-blocking network receive. + + Return response if available + """ + assert not self._processing, 'Recursion not supported' + if not self.connected(): + log.warning('%s cannot recv: socket not connected', self) + # If requests are pending, we should close the socket and + # fail all the pending request futures + if self.in_flight_requests: + self.close() + return None + + elif not self.in_flight_requests: + log.warning('%s: No in-flight-requests to recv', self) + return None + + elif self._requests_timed_out(): + log.warning('%s timed out after %s ms. 
Closing connection.', + self, self.config['request_timeout_ms']) + self.close(error=Errors.RequestTimedOutError( + 'Request timed out after %s ms' % + self.config['request_timeout_ms'])) + return None + + readable, _, _ = select([self._sock], [], [], timeout) + if not readable: + return None + + # Not receiving is the state of reading the payload header + if not self._receiving: + try: + # An extremely small, but non-zero, probability that there are + # more than 0 but not yet 4 bytes available to read + self._rbuffer.write(self._sock.recv(4 - self._rbuffer.tell())) + except ConnectionError as e: + if six.PY2 and e.errno == errno.EWOULDBLOCK: + # This shouldn't happen after selecting above + # but just in case + return None + log.exception('%s: Error receiving 4-byte payload header -' + ' closing socket', self) + self.close(error=Errors.ConnectionError(e)) + return None + except BlockingIOError: + if six.PY3: + return None + raise + + if self._rbuffer.tell() == 4: + self._rbuffer.seek(0) + self._next_payload_bytes = Int32.decode(self._rbuffer) + # reset buffer and switch state to receiving payload bytes + self._rbuffer.seek(0) + self._rbuffer.truncate() + self._receiving = True + elif self._rbuffer.tell() > 4: + raise Errors.KafkaError('this should not happen - are you threading?') + + if self._receiving: + staged_bytes = self._rbuffer.tell() + try: + self._rbuffer.write(self._sock.recv(self._next_payload_bytes - staged_bytes)) + except ConnectionError as e: + # Extremely small chance that we have exactly 4 bytes for a + # header, but nothing to read in the body yet + if six.PY2 and e.errno == errno.EWOULDBLOCK: + return None + log.exception('%s: Error in recv', self) + self.close(error=Errors.ConnectionError(e)) + return None + except BlockingIOError: + if six.PY3: + return None + raise + + staged_bytes = self._rbuffer.tell() + if staged_bytes > self._next_payload_bytes: + self.close(error=Errors.KafkaError('Receive buffer has more bytes than expected?')) + + if staged_bytes != self._next_payload_bytes: + return None + + self._receiving = False + self._next_payload_bytes = 0 + self._rbuffer.seek(0) + response = self._process_response(self._rbuffer) + self._rbuffer.seek(0) + self._rbuffer.truncate() + return response + + def _process_response(self, read_buffer): + assert not self._processing, 'Recursion not supported' + self._processing = True + ifr = self.in_flight_requests.popleft() + + # verify send/recv correlation ids match + recv_correlation_id = Int32.decode(read_buffer) + + # 0.8.2 quirk + if (self.config['api_version'] == (0, 8, 2) and + ifr.response_type is GroupCoordinatorResponse and + ifr.correlation_id != 0 and + recv_correlation_id == 0): + log.warning('Kafka 0.8.2 quirk -- GroupCoordinatorResponse' + ' coorelation id does not match request. 
This' + ' should go away once at least one topic has been' + ' initialized on the broker') + + elif ifr.correlation_id != recv_correlation_id: + + + error = Errors.CorrelationIdError( + 'Correlation ids do not match: sent %d, recv %d' + % (ifr.correlation_id, recv_correlation_id)) + ifr.future.fail(error) + self.close() + self._processing = False + return None + + # decode response + response = ifr.response_type.decode(read_buffer) + log.debug('%s Response %d: %s', self, ifr.correlation_id, response) + ifr.future.success(response) + self._processing = False + return response + + def _requests_timed_out(self): + if self.in_flight_requests: + oldest_at = self.in_flight_requests[0].timestamp + timeout = self.config['request_timeout_ms'] / 1000.0 + if time.time() >= oldest_at + timeout: + return True + return False + + def _next_correlation_id(self): + self._correlation_id = (self._correlation_id + 1) % 2**31 + return self._correlation_id + + def __repr__(self): + return "" % (self.host, self.port) + + def collect_hosts(hosts, randomize=True): """ Collects a comma-separated set of hosts (host:port) and optionally randomize the returned list. """ - if isinstance(hosts, basestring): + if isinstance(hosts, six.string_types): hosts = hosts.strip().split(',') result = [] @@ -37,20 +378,17 @@ class KafkaConnection(local): - """ - A socket connection to a single Kafka broker + """A socket connection to a single Kafka broker - This class is _not_ thread safe. Each call to `send` must be followed - by a call to `recv` in order to get the correct response. Eventually, - we can do something in here to facilitate multiplexed requests/responses - since the Kafka API includes a correlation id. - - host: the host name or IP address of a kafka broker - port: the port number the kafka broker is listening on - timeout: default 120. The socket timeout for sending and receiving data - in seconds. None means no timeout, so a request can block forever. + Arguments: + host: the host name or IP address of a kafka broker + port: the port number the kafka broker is listening on + timeout: default 120. The socket timeout for sending and receiving data + in seconds. None means no timeout, so a request can block forever. """ def __init__(self, host, port, timeout=DEFAULT_SOCKET_TIMEOUT_SECONDS): + warnings.warn('KafkaConnection has been deprecated and will be' + ' removed in a future release', DeprecationWarning) super(KafkaConnection, self).__init__() self.host = host self.port = port @@ -59,6 +397,9 @@ self.reinit() + def __getnewargs__(self): + return (self.host, self.port, self.timeout) + def __repr__(self): return "" % (self.host, self.port) @@ -72,7 +413,7 @@ self.close() # And then raise - raise ConnectionError("Kafka @ {0}:{1} went away".format(self.host, self.port)) + raise Errors.ConnectionError("Kafka @ {0}:{1} went away".format(self.host, self.port)) def _read_bytes(self, num_bytes): bytes_left = num_bytes @@ -87,12 +428,13 @@ while bytes_left: try: + # pylint: disable-msg=no-member data = self._sock.recv(min(bytes_left, 4096)) # Receiving empty string from recv signals # that the socket is in error. 
we will never get # more data from this socket - if data == '': + if data == b'': raise socket.error("Not enough data to read message -- did server kill socket?") except socket.error: @@ -103,7 +445,7 @@ log.debug("Read %d/%d bytes from Kafka", num_bytes - bytes_left, num_bytes) responses.append(data) - return ''.join(responses) + return b''.join(responses) ################## # Public API # @@ -111,11 +453,18 @@ # TODO multiplex socket communication to allow for multi-threaded clients + def get_connected_socket(self): + if not self._sock: + self.reinit() + return self._sock + def send(self, request_id, payload): """ Send a request to Kafka - param: request_id -- can be any int (used only for debug logging...) - param: payload -- an encoded kafka packet (see KafkaProtocol) + + Arguments:: + request_id (int): can be any int (used only for debug logging...) + payload: an encoded kafka packet (see KafkaProtocol) """ log.debug("About to send %d bytes to Kafka, request %d" % (len(payload), request_id)) @@ -125,6 +474,7 @@ self.reinit() try: + # pylint: disable-msg=no-member self._sock.sendall(payload) except socket.error: log.exception('Unable to send payload to Kafka') @@ -133,26 +483,40 @@ def recv(self, request_id): """ Get a response packet from Kafka - param: request_id -- can be any int (only used for debug logging...) - returns encoded kafka packet response from server as type str + + Arguments: + request_id: can be any int (only used for debug logging...) + + Returns: + str: Encoded kafka packet response from server """ log.debug("Reading response %d from Kafka" % request_id) + # Make sure we have a connection + if not self._sock: + self.reinit() + # Read the size off of the header resp = self._read_bytes(4) (size,) = struct.unpack('>i', resp) # Read the remainder of the response resp = self._read_bytes(size) - return str(resp) + return resp def copy(self): """ - Create an inactive copy of the connection object - A reinit() has to be done on the copy before it can be used again - return a new KafkaConnection object + Create an inactive copy of the connection object, suitable for + passing to a background thread. + + The returned copy is not connected; you must call reinit() before + using. 
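The switch from '' to b'' above matters because Python 3 sockets return bytes; a quick illustration with throwaway values (Python 3 semantics):

chunks = [b'abc', b'def']      # what socket.recv() returns on Python 3
assert b''.join(chunks) == b'abcdef'   # str ''.join() would raise TypeError here

closed = b''                   # recv() yields b'' once the peer closes the socket
assert closed != ''            # the old str comparison never detects this on Python 3
assert closed == b''           # the bytes comparison in the patch does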
""" c = copy.deepcopy(self) + # Python 3 doesn't copy custom attributes of the threadlocal subclass + c.host = copy.copy(self.host) + c.port = copy.copy(self.port) + c.timeout = copy.copy(self.timeout) c._sock = None return c @@ -166,6 +530,7 @@ # But expect an error if the socket has already been # closed by the server try: + # pylint: disable-msg=no-member self._sock.shutdown(socket.SHUT_RDWR) except socket.error: pass diff -Nru python-kafka-python-0.9.2/kafka/consumer/base.py python-kafka-python-1.0.1/kafka/consumer/base.py --- python-kafka-python-0.9.2/kafka/consumer/base.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/consumer/base.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,234 @@ +from __future__ import absolute_import + +import atexit +import logging +import numbers +from threading import Lock +import warnings + +import kafka.common +from kafka.common import ( + OffsetRequestPayload, OffsetCommitRequestPayload, OffsetFetchRequestPayload, + UnknownTopicOrPartitionError, check_error, KafkaError +) + +from kafka.util import ReentrantTimer + + +log = logging.getLogger('kafka.consumer') + +AUTO_COMMIT_MSG_COUNT = 100 +AUTO_COMMIT_INTERVAL = 5000 + +FETCH_DEFAULT_BLOCK_TIMEOUT = 1 +FETCH_MAX_WAIT_TIME = 100 +FETCH_MIN_BYTES = 4096 +FETCH_BUFFER_SIZE_BYTES = 4096 +MAX_FETCH_BUFFER_SIZE_BYTES = FETCH_BUFFER_SIZE_BYTES * 8 + +ITER_TIMEOUT_SECONDS = 60 +NO_MESSAGES_WAIT_TIME_SECONDS = 0.1 +FULL_QUEUE_WAIT_TIME_SECONDS = 0.1 + +MAX_BACKOFF_SECONDS = 60 + +class Consumer(object): + """ + Base class to be used by other consumers. Not to be used directly + + This base class provides logic for + + * initialization and fetching metadata of partitions + * Auto-commit logic + * APIs for fetching pending message count + + """ + def __init__(self, client, group, topic, partitions=None, auto_commit=True, + auto_commit_every_n=AUTO_COMMIT_MSG_COUNT, + auto_commit_every_t=AUTO_COMMIT_INTERVAL): + + warnings.warn('deprecated -- this class will be removed in a future' + ' release. 
Use KafkaConsumer instead.', + DeprecationWarning) + self.client = client + self.topic = topic + self.group = group + self.client.load_metadata_for_topics(topic) + self.offsets = {} + + if partitions is None: + partitions = self.client.get_partition_ids_for_topic(topic) + else: + assert all(isinstance(x, numbers.Integral) for x in partitions) + + # Variables for handling offset commits + self.commit_lock = Lock() + self.commit_timer = None + self.count_since_commit = 0 + self.auto_commit = auto_commit + self.auto_commit_every_n = auto_commit_every_n + self.auto_commit_every_t = auto_commit_every_t + + # Set up the auto-commit timer + if auto_commit is True and auto_commit_every_t is not None: + self.commit_timer = ReentrantTimer(auto_commit_every_t, + self.commit) + self.commit_timer.start() + + # Set initial offsets + if self.group is not None: + self.fetch_last_known_offsets(partitions) + else: + for partition in partitions: + self.offsets[partition] = 0 + + # Register a cleanup handler + def cleanup(obj): + obj.stop() + self._cleanup_func = cleanup + atexit.register(cleanup, self) + + self.partition_info = False # Do not return partition info in msgs + + def provide_partition_info(self): + """ + Indicates that partition info must be returned by the consumer + """ + self.partition_info = True + + def fetch_last_known_offsets(self, partitions=None): + if self.group is None: + raise ValueError('SimpleClient.group must not be None') + + if partitions is None: + partitions = self.client.get_partition_ids_for_topic(self.topic) + + responses = self.client.send_offset_fetch_request( + self.group, + [OffsetFetchRequestPayload(self.topic, p) for p in partitions], + fail_on_error=False + ) + + for resp in responses: + try: + check_error(resp) + # API spec says server wont set an error here + # but 0.8.1.1 does actually... + except UnknownTopicOrPartitionError: + pass + + # -1 offset signals no commit is currently stored + if resp.offset == -1: + self.offsets[resp.partition] = 0 + + # Otherwise we committed the stored offset + # and need to fetch the next one + else: + self.offsets[resp.partition] = resp.offset + + def commit(self, partitions=None): + """Commit stored offsets to Kafka via OffsetCommitRequest (v0) + + Keyword Arguments: + partitions (list): list of partitions to commit, default is to commit + all of them + + Returns: True on success, False on failure + """ + + # short circuit if nothing happened. 
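A rough sketch of the auto-commit bookkeeping set up in __init__ above: commit after auto_commit_every_n messages, or when the auto_commit_every_t timer fires. A plain one-shot threading.Timer stands in for kafka.util.ReentrantTimer; the class and names below are illustrative, not the library's API.

import threading

class AutoCommitCounter(object):
    """Illustrative stand-in, not part of kafka-python."""
    def __init__(self, commit_fn, every_n=100, every_t_ms=5000):
        self._commit_fn = commit_fn
        self._every_n = every_n
        self._count = 0
        if every_t_ms is not None:
            # ReentrantTimer re-arms itself; a single Timer shows the idea
            self._timer = threading.Timer(every_t_ms / 1000.0, self.commit)
            self._timer.start()

    def record_message(self):
        self._count += 1
        if self._every_n is not None and self._count >= self._every_n:
            self.commit()

    def commit(self):
        if self._count:
            self._commit_fn()
            self._count = 0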
This check is kept outside + # to prevent un-necessarily acquiring a lock for checking the state + if self.count_since_commit == 0: + return + + with self.commit_lock: + # Do this check again, just in case the state has changed + # during the lock acquiring timeout + if self.count_since_commit == 0: + return + + reqs = [] + if partitions is None: # commit all partitions + partitions = list(self.offsets.keys()) + + log.debug('Committing new offsets for %s, partitions %s', + self.topic, partitions) + for partition in partitions: + offset = self.offsets[partition] + log.debug('Commit offset %d in SimpleConsumer: ' + 'group=%s, topic=%s, partition=%s', + offset, self.group, self.topic, partition) + + reqs.append(OffsetCommitRequestPayload(self.topic, partition, + offset, None)) + + try: + self.client.send_offset_commit_request(self.group, reqs) + except KafkaError as e: + log.error('%s saving offsets: %s', e.__class__.__name__, e) + return False + else: + self.count_since_commit = 0 + return True + + def _auto_commit(self): + """ + Check if we have to commit based on number of messages and commit + """ + + # Check if we are supposed to do an auto-commit + if not self.auto_commit or self.auto_commit_every_n is None: + return + + if self.count_since_commit >= self.auto_commit_every_n: + self.commit() + + def stop(self): + if self.commit_timer is not None: + self.commit_timer.stop() + self.commit() + + if hasattr(self, '_cleanup_func'): + # Remove cleanup handler now that we've stopped + + # py3 supports unregistering + if hasattr(atexit, 'unregister'): + atexit.unregister(self._cleanup_func) # pylint: disable=no-member + + # py2 requires removing from private attribute... + else: + + # ValueError on list.remove() if the exithandler no longer + # exists is fine here + try: + atexit._exithandlers.remove( # pylint: disable=no-member + (self._cleanup_func, (self,), {})) + except ValueError: + pass + + del self._cleanup_func + + def pending(self, partitions=None): + """ + Gets the pending message count + + Keyword Arguments: + partitions (list): list of partitions to check for, default is to check all + """ + if partitions is None: + partitions = self.offsets.keys() + + total = 0 + reqs = [] + + for partition in partitions: + reqs.append(OffsetRequestPayload(self.topic, partition, -1, 1)) + + resps = self.client.send_offset_request(reqs) + for resp in resps: + partition = resp.partition + pending = resp.offsets[0] + offset = self.offsets[partition] + total += pending - offset + + return total diff -Nru python-kafka-python-0.9.2/kafka/consumer/fetcher.py python-kafka-python-1.0.1/kafka/consumer/fetcher.py --- python-kafka-python-0.9.2/kafka/consumer/fetcher.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/consumer/fetcher.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,691 @@ +from __future__ import absolute_import + +import collections +import copy +import logging + +import six + +import kafka.common as Errors +from kafka.common import TopicPartition +from kafka.future import Future +from kafka.protocol.fetch import FetchRequest +from kafka.protocol.message import PartialMessage +from kafka.protocol.offset import OffsetRequest, OffsetResetStrategy + +log = logging.getLogger(__name__) + + +ConsumerRecord = collections.namedtuple("ConsumerRecord", + ["topic", "partition", "offset", "key", "value"]) + + +class NoOffsetForPartitionError(Errors.KafkaError): + pass + + +class RecordTooLargeError(Errors.KafkaError): + pass + + +class Fetcher(six.Iterator): + DEFAULT_CONFIG = { + 
'key_deserializer': None, + 'value_deserializer': None, + 'fetch_min_bytes': 1, + 'fetch_max_wait_ms': 500, + 'max_partition_fetch_bytes': 1048576, + 'check_crcs': True, + 'iterator_refetch_records': 1, # undocumented -- interface may change + } + + def __init__(self, client, subscriptions, **configs): + """Initialize a Kafka Message Fetcher. + + Keyword Arguments: + key_deserializer (callable): Any callable that takes a + raw message key and returns a deserialized key. + value_deserializer (callable, optional): Any callable that takes a + raw message value and returns a deserialized value. + fetch_min_bytes (int): Minimum amount of data the server should + return for a fetch request, otherwise wait up to + fetch_max_wait_ms for more data to accumulate. Default: 1. + fetch_max_wait_ms (int): The maximum amount of time in milliseconds + the server will block before answering the fetch request if + there isn't sufficient data to immediately satisfy the + requirement given by fetch_min_bytes. Default: 500. + max_partition_fetch_bytes (int): The maximum amount of data + per-partition the server will return. The maximum total memory + used for a request = #partitions * max_partition_fetch_bytes. + This size must be at least as large as the maximum message size + the server allows or else it is possible for the producer to + send messages larger than the consumer can fetch. If that + happens, the consumer can get stuck trying to fetch a large + message on a certain partition. Default: 1048576. + check_crcs (bool): Automatically check the CRC32 of the records + consumed. This ensures no on-the-wire or on-disk corruption to + the messages occurred. This check adds some overhead, so it may + be disabled in cases seeking extreme performance. Default: True + """ + #metrics=None, + #metric_group_prefix='consumer', + self.config = copy.copy(self.DEFAULT_CONFIG) + for key in self.config: + if key in configs: + self.config[key] = configs[key] + + self._client = client + self._subscriptions = subscriptions + self._records = collections.deque() # (offset, topic_partition, messages) + self._unauthorized_topics = set() + self._offset_out_of_range_partitions = dict() # {topic_partition: offset} + self._record_too_large_partitions = dict() # {topic_partition: offset} + self._iterator = None + self._fetch_futures = collections.deque() + + #self.sensors = FetchManagerMetrics(metrics, metric_group_prefix) + + def init_fetches(self): + """Send FetchRequests asynchronously for all assigned partitions. 
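The deserializer settings above accept any callable that takes the raw bytes. A minimal sketch (hypothetical, assuming UTF-8 keys and JSON-encoded values):

import json

def decode_key(raw):
    # raw is the message key as bytes; may be None for unkeyed messages
    return raw.decode('utf-8') if raw is not None else None

def decode_value(raw):
    return json.loads(raw.decode('utf-8'))

# These callables would be passed as key_deserializer=decode_key and
# value_deserializer=decode_value when constructing the consumer.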
+ + Note: noop if there are unconsumed records internal to the fetcher + + Returns: + List of Futures: each future resolves to a FetchResponse + """ + # We need to be careful when creating fetch records during iteration + # so we verify that there are no records in the deque, or in an + # iterator + if self._records or self._iterator: + log.debug('Skipping init_fetches because there are unconsumed' + ' records internally') + return [] + return self._init_fetches() + + def _init_fetches(self): + futures = [] + for node_id, request in six.iteritems(self._create_fetch_requests()): + if self._client.ready(node_id): + log.debug("Sending FetchRequest to node %s", node_id) + future = self._client.send(node_id, request) + future.add_callback(self._handle_fetch_response, request) + future.add_errback(log.error, 'Fetch to node %s failed: %s', node_id) + futures.append(future) + self._fetch_futures.extend(futures) + self._clean_done_fetch_futures() + return futures + + def _clean_done_fetch_futures(self): + while True: + if not self._fetch_futures: + break + if not self._fetch_futures[0].is_done: + break + self._fetch_futures.popleft() + + def in_flight_fetches(self): + """Return True if there are any unprocessed FetchRequests in flight.""" + self._clean_done_fetch_futures() + return bool(self._fetch_futures) + + def update_fetch_positions(self, partitions): + """Update the fetch positions for the provided partitions. + + Arguments: + partitions (list of TopicPartitions): partitions to update + + Raises: + NoOffsetForPartitionError: if no offset is stored for a given + partition and no reset policy is available + """ + # reset the fetch position to the committed position + for tp in partitions: + if not self._subscriptions.is_assigned(tp): + log.warning("partition %s is not assigned - skipping offset" + " update", tp) + continue + elif self._subscriptions.is_fetchable(tp): + log.warning("partition %s is still fetchable -- skipping offset" + " update", tp) + continue + + # TODO: If there are several offsets to reset, + # we could submit offset requests in parallel + # for now, each call to _reset_offset will block + if self._subscriptions.is_offset_reset_needed(tp): + self._reset_offset(tp) + elif self._subscriptions.assignment[tp].committed is None: + # there's no committed position, so we need to reset with the + # default strategy + self._subscriptions.need_offset_reset(tp) + self._reset_offset(tp) + else: + committed = self._subscriptions.assignment[tp].committed + log.debug("Resetting offset for partition %s to the committed" + " offset %s", tp, committed) + self._subscriptions.seek(tp, committed) + + def _reset_offset(self, partition): + """Reset offsets for the given partition using the offset reset strategy. 
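The request plumbing above resolves kafka.future.Future objects: add_callback() handlers run when success() is called, add_errback() handlers when failure() is called, and is_done is what the pruning loop checks. A toy usage sketch with made-up values:

from kafka.future import Future

def on_response(response):
    print('got response: %s' % (response,))

def on_error(error):
    print('request failed: %s' % (error,))

future = Future()
future.add_callback(on_response)
future.add_errback(on_error)

# elsewhere, the I/O layer resolves the future exactly once
future.success({'example': 'response'})   # runs on_response
assert future.is_done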
+ + Arguments: + partition (TopicPartition): the partition that needs reset offset + + Raises: + NoOffsetForPartitionError: if no offset reset strategy is defined + """ + timestamp = self._subscriptions.assignment[partition].reset_strategy + if timestamp is OffsetResetStrategy.EARLIEST: + strategy = 'earliest' + elif timestamp is OffsetResetStrategy.LATEST: + strategy = 'latest' + else: + raise NoOffsetForPartitionError(partition) + + log.debug("Resetting offset for partition %s to %s offset.", + partition, strategy) + offset = self._offset(partition, timestamp) + + # we might lose the assignment while fetching the offset, + # so check it is still active + if self._subscriptions.is_assigned(partition): + self._subscriptions.seek(partition, offset) + + def _offset(self, partition, timestamp): + """Fetch a single offset before the given timestamp for the partition. + + Blocks until offset is obtained, or a non-retriable exception is raised + + Arguments: + partition The partition that needs fetching offset. + timestamp (int): timestamp for fetching offset. -1 for the latest + available, -2 for the earliest available. Otherwise timestamp + is treated as epoch seconds. + + Returns: + int: message offset + """ + while True: + future = self._send_offset_request(partition, timestamp) + self._client.poll(future=future) + + if future.succeeded(): + return future.value + + if not future.retriable(): + raise future.exception # pylint: disable-msg=raising-bad-type + + if future.exception.invalid_metadata: + refresh_future = self._client.cluster.request_update() + self._client.poll(future=refresh_future, sleep=True) + + def _raise_if_offset_out_of_range(self): + """Check FetchResponses for offset out of range. + + Raises: + OffsetOutOfRangeError: if any partition from previous FetchResponse + contains OffsetOutOfRangeError and the default_reset_policy is + None + """ + if not self._offset_out_of_range_partitions: + return + + current_out_of_range_partitions = {} + + # filter only the fetchable partitions + for partition, offset in self._offset_out_of_range_partitions: + if not self._subscriptions.is_fetchable(partition): + log.debug("Ignoring fetched records for %s since it is no" + " longer fetchable", partition) + continue + position = self._subscriptions.assignment[partition].position + # ignore partition if the current position != offset in FetchResponse + # e.g. after seek() + if position is not None and offset == position: + current_out_of_range_partitions[partition] = position + + self._offset_out_of_range_partitions.clear() + if current_out_of_range_partitions: + raise Errors.OffsetOutOfRangeError(current_out_of_range_partitions) + + def _raise_if_unauthorized_topics(self): + """Check FetchResponses for topic authorization failures. + + Raises: + TopicAuthorizationFailedError + """ + if self._unauthorized_topics: + topics = set(self._unauthorized_topics) + self._unauthorized_topics.clear() + raise Errors.TopicAuthorizationFailedError(topics) + + def _raise_if_record_too_large(self): + """Check FetchResponses for messages larger than the max per partition. + + Raises: + RecordTooLargeError: if there is a message larger than fetch size + """ + if not self._record_too_large_partitions: + return + + copied_record_too_large_partitions = dict(self._record_too_large_partitions) + self._record_too_large_partitions.clear() + + raise RecordTooLargeError( + "There are some messages at [Partition=Offset]: %s " + " whose size is larger than the fetch size %s" + " and hence cannot be ever returned." 
+ " Increase the fetch size, or decrease the maximum message" + " size the broker will allow.", + copied_record_too_large_partitions, + self.config['max_partition_fetch_bytes']) + + def fetched_records(self): + """Returns previously fetched records and updates consumed offsets. + + Incompatible with iterator interface - use one or the other, not both. + + Raises: + OffsetOutOfRangeError: if no subscription offset_reset_strategy + InvalidMessageError: if message crc validation fails (check_crcs + must be set to True) + RecordTooLargeError: if a message is larger than the currently + configured max_partition_fetch_bytes + TopicAuthorizationError: if consumer is not authorized to fetch + messages from the topic + AssertionError: if used with iterator (incompatible) + + Returns: + dict: {TopicPartition: [messages]} + """ + assert self._iterator is None, ( + 'fetched_records is incompatible with message iterator') + if self._subscriptions.needs_partition_assignment: + return {} + + drained = collections.defaultdict(list) + self._raise_if_offset_out_of_range() + self._raise_if_unauthorized_topics() + self._raise_if_record_too_large() + + # Loop over the records deque + while self._records: + (fetch_offset, tp, messages) = self._records.popleft() + + if not self._subscriptions.is_assigned(tp): + # this can happen when a rebalance happened before + # fetched records are returned to the consumer's poll call + log.debug("Not returning fetched records for partition %s" + " since it is no longer assigned", tp) + continue + + # note that the position should always be available + # as long as the partition is still assigned + position = self._subscriptions.assignment[tp].position + if not self._subscriptions.is_fetchable(tp): + # this can happen when a partition is paused before + # fetched records are returned to the consumer's poll call + log.debug("Not returning fetched records for assigned partition" + " %s since it is no longer fetchable", tp) + + elif fetch_offset == position: + next_offset = messages[-1][0] + 1 + log.log(0, "Returning fetched records at offset %d for assigned" + " partition %s and update position to %s", position, + tp, next_offset) + self._subscriptions.assignment[tp].position = next_offset + + for record in self._unpack_message_set(tp, messages): + # Fetched compressed messages may include additional records + if record.offset < fetch_offset: + continue + drained[tp].append(record) + else: + # these records aren't next in line based on the last consumed + # position, ignore them they must be from an obsolete request + log.debug("Ignoring fetched records for %s at offset %s since" + " the current position is %d", tp, fetch_offset, + position) + return dict(drained) + + def _unpack_message_set(self, tp, messages): + for offset, size, msg in messages: + if self.config['check_crcs'] and not msg.validate_crc(): + raise Errors.InvalidMessageError(msg) + elif msg.is_compressed(): + for record in self._unpack_message_set(tp, msg.decompress()): + yield record + else: + try: + key, value = self._deserialize(msg) + # If the deserializer raises StopIteration, it is erroneously + # caught by the generator. We want all exceptions to be raised + # back to the user. 
See Issue 545 + except StopIteration as e: + log.exception('Deserializer raised StopIteration: %s', e) + raise Exception('Deserializer raised StopIteration') + yield ConsumerRecord(tp.topic, tp.partition, offset, key, value) + + def _message_generator(self): + """Iterate over fetched_records""" + if self._subscriptions.needs_partition_assignment: + raise StopIteration('Subscription needs partition assignment') + + while self._records: + + # Check on each iteration since this is a generator + self._raise_if_offset_out_of_range() + self._raise_if_unauthorized_topics() + self._raise_if_record_too_large() + + # Send additional FetchRequests when the internal queue is low + # this should enable moderate pipelining + if len(self._records) <= self.config['iterator_refetch_records']: + self._init_fetches() + + (fetch_offset, tp, messages) = self._records.popleft() + + if not self._subscriptions.is_assigned(tp): + # this can happen when a rebalance happened before + # fetched records are returned + log.debug("Not returning fetched records for partition %s" + " since it is no longer assigned", tp) + continue + + # note that the consumed position should always be available + # as long as the partition is still assigned + position = self._subscriptions.assignment[tp].position + if not self._subscriptions.is_fetchable(tp): + # this can happen when a partition consumption paused before + # fetched records are returned + log.debug("Not returning fetched records for assigned partition" + " %s since it is no longer fetchable", tp) + + elif fetch_offset == position: + log.log(0, "Returning fetched records at offset %d for assigned" + " partition %s", position, tp) + for msg in self._unpack_message_set(tp, messages): + + # Because we are in a generator, it is possible for + # subscription state to change between yield calls + # so we need to re-check on each loop + # this should catch assignment changes, pauses + # and resets via seek_to_beginning / seek_to_end + if not self._subscriptions.is_fetchable(tp): + log.debug("Not returning fetched records for partition %s" + " since it is no longer fetchable", tp) + break + + # Compressed messagesets may include earlier messages + # It is also possible that the user called seek() + elif msg.offset != self._subscriptions.assignment[tp].position: + continue + + self._subscriptions.assignment[tp].position = msg.offset + 1 + yield msg + else: + # these records aren't next in line based on the last consumed + # position, ignore them they must be from an obsolete request + log.debug("Ignoring fetched records for %s at offset %s", + tp, fetch_offset) + + def __iter__(self): # pylint: disable=non-iterator-returned + return self + + def __next__(self): + if not self._iterator: + self._iterator = self._message_generator() + try: + return next(self._iterator) + except StopIteration: + self._iterator = None + raise + + def _deserialize(self, msg): + if self.config['key_deserializer']: + key = self.config['key_deserializer'](msg.key) # pylint: disable-msg=not-callable + else: + key = msg.key + if self.config['value_deserializer']: + value = self.config['value_deserializer'](msg.value) # pylint: disable-msg=not-callable + else: + value = msg.value + return key, value + + def _send_offset_request(self, partition, timestamp): + """Fetch a single offset before the given timestamp for the partition. 
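To make the guarded pitfall concrete: a StopIteration leaking out of a deserializer ends the surrounding generator rather than surfacing as an error on the Python versions this code targets, which is why the fetcher re-raises it as a plain Exception. A self-contained illustration with a deliberately broken, hypothetical deserializer:

def bad_deserializer(raw):
    raise StopIteration('broken deserializer')

def unpack(messages, deserialize):
    for raw in messages:
        yield deserialize(raw)

try:
    results = list(unpack([b'payload'], bad_deserializer))
except RuntimeError:
    # Python 3.7+ (PEP 479) converts the leaked StopIteration to RuntimeError
    results = 'RuntimeError'
# On Python 2 and pre-3.7 Python 3, results is just [] -- the error silently
# truncates iteration instead of reaching the caller.
print(results)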
+ + Arguments: + partition (TopicPartition): partition that needs fetching offset + timestamp (int): timestamp for fetching offset + + Returns: + Future: resolves to the corresponding offset + """ + node_id = self._client.cluster.leader_for_partition(partition) + if node_id is None: + log.debug("Partition %s is unknown for fetching offset," + " wait for metadata refresh", partition) + return Future().failure(Errors.StaleMetadata(partition)) + elif node_id == -1: + log.debug("Leader for partition %s unavailable for fetching offset," + " wait for metadata refresh", partition) + return Future().failure(Errors.LeaderNotAvailableError(partition)) + + request = OffsetRequest( + -1, [(partition.topic, [(partition.partition, timestamp, 1)])] + ) + # Client returns a future that only fails on network issues + # so create a separate future and attach a callback to update it + # based on response error codes + future = Future() + if not self._client.ready(node_id): + return future.failure(Errors.NodeNotReadyError(node_id)) + + _f = self._client.send(node_id, request) + _f.add_callback(self._handle_offset_response, partition, future) + _f.add_errback(lambda e: future.failure(e)) + return future + + def _handle_offset_response(self, partition, future, response): + """Callback for the response of the list offset call above. + + Arguments: + partition (TopicPartition): The partition that was fetched + future (Future): the future to update based on response + response (OffsetResponse): response from the server + + Raises: + AssertionError: if response does not match partition + """ + topic, partition_info = response.topics[0] + assert len(response.topics) == 1 and len(partition_info) == 1, ( + 'OffsetResponse should only be for a single topic-partition') + + part, error_code, offsets = partition_info[0] + assert topic == partition.topic and part == partition.partition, ( + 'OffsetResponse partition does not match OffsetRequest partition') + + error_type = Errors.for_code(error_code) + if error_type is Errors.NoError: + assert len(offsets) == 1, 'Expected OffsetResponse with one offset' + offset = offsets[0] + log.debug("Fetched offset %d for partition %s", offset, partition) + future.success(offset) + elif error_type in (Errors.NotLeaderForPartitionError, + Errors.UnknownTopicOrPartitionError): + log.warning("Attempt to fetch offsets for partition %s failed due" + " to obsolete leadership information, retrying.", + partition) + future.failure(error_type(partition)) + else: + log.error("Attempt to fetch offsets for partition %s failed due to:" + " %s", partition, error_type) + future.failure(error_type(partition)) + + def _create_fetch_requests(self): + """Create fetch requests for all assigned partitions, grouped by node. + + FetchRequests skipped if no leader, or node has requests in flight + + Returns: + dict: {node_id: FetchRequest, ...} + """ + # create the fetch info as a dict of lists of partition info tuples + # which can be passed to FetchRequest() via .items() + fetchable = collections.defaultdict(lambda: collections.defaultdict(list)) + + for partition in self._subscriptions.fetchable_partitions(): + node_id = self._client.cluster.leader_for_partition(partition) + if node_id is None or node_id == -1: + log.debug("No leader found for partition %s." 
+ " Requesting metadata update", partition) + self._client.cluster.request_update() + elif self._client.in_flight_request_count(node_id) == 0: + # fetch if there is a leader and no in-flight requests + position = self._subscriptions.assignment[partition].position + partition_info = ( + partition.partition, + position, + self.config['max_partition_fetch_bytes'] + ) + fetchable[node_id][partition.topic].append(partition_info) + log.debug("Adding fetch request for partition %s at offset %d", + partition, position) + + requests = {} + for node_id, partition_data in six.iteritems(fetchable): + requests[node_id] = FetchRequest( + -1, # replica_id + self.config['fetch_max_wait_ms'], + self.config['fetch_min_bytes'], + partition_data.items()) + return requests + + def _handle_fetch_response(self, request, response): + """The callback for fetch completion""" + #total_bytes = 0 + #total_count = 0 + + fetch_offsets = {} + for topic, partitions in request.topics: + for partition, offset, _ in partitions: + fetch_offsets[TopicPartition(topic, partition)] = offset + + for topic, partitions in response.topics: + for partition, error_code, highwater, messages in partitions: + tp = TopicPartition(topic, partition) + error_type = Errors.for_code(error_code) + if not self._subscriptions.is_fetchable(tp): + # this can happen when a rebalance happened or a partition + # consumption paused while fetch is still in-flight + log.debug("Ignoring fetched records for partition %s" + " since it is no longer fetchable", tp) + + elif error_type is Errors.NoError: + self._subscriptions.assignment[tp].highwater = highwater + + # we are interested in this fetch only if the beginning + # offset matches the current consumed position + fetch_offset = fetch_offsets[tp] + position = self._subscriptions.assignment[tp].position + if position is None or position != fetch_offset: + log.debug("Discarding fetch response for partition %s" + " since its offset %d does not match the" + " expected offset %d", tp, fetch_offset, + position) + continue + + partial = None + if messages and isinstance(messages[-1][-1], PartialMessage): + partial = messages.pop() + + if messages: + log.debug("Adding fetched record for partition %s with" + " offset %d to buffered record list", tp, + position) + self._records.append((fetch_offset, tp, messages)) + #last_offset, _, _ = messages[-1] + #self.sensors.records_fetch_lag.record(highwater - last_offset) + elif partial: + # we did not read a single message from a non-empty + # buffer because that message's size is larger than + # fetch size, in this case record this exception + self._record_too_large_partitions[tp] = fetch_offset + + # TODO: bytes metrics + #self.sensors.record_topic_fetch_metrics(tp.topic, num_bytes, parsed.size()); + #totalBytes += num_bytes; + #totalCount += parsed.size(); + elif error_type in (Errors.NotLeaderForPartitionError, + Errors.UnknownTopicOrPartitionError): + self._client.cluster.request_update() + elif error_type is Errors.OffsetOutOfRangeError: + fetch_offset = fetch_offsets[tp] + if self._subscriptions.has_default_offset_reset_policy(): + self._subscriptions.need_offset_reset(tp) + else: + self._offset_out_of_range_partitions[tp] = fetch_offset + log.info("Fetch offset %s is out of range, resetting offset", + fetch_offset) + elif error_type is Errors.TopicAuthorizationFailedError: + log.warn("Not authorized to read from topic %s.", tp.topic) + self._unauthorized_topics.add(tp.topic) + elif error_type is Errors.UnknownError: + log.warn("Unknown error fetching data for 
topic-partition %s", tp) + else: + raise error_type('Unexpected error while fetching data') + + """TOOD - metrics + self.sensors.bytesFetched.record(totalBytes) + self.sensors.recordsFetched.record(totalCount) + self.sensors.fetchThrottleTimeSensor.record(response.getThrottleTime()) + self.sensors.fetchLatency.record(resp.requestLatencyMs()) + + +class FetchManagerMetrics(object): + def __init__(self, metrics, prefix): + self.metrics = metrics + self.group_name = prefix + "-fetch-manager-metrics" + + self.bytes_fetched = metrics.sensor("bytes-fetched") + self.bytes_fetched.add(metrics.metricName("fetch-size-avg", self.group_name, + "The average number of bytes fetched per request"), metrics.Avg()) + self.bytes_fetched.add(metrics.metricName("fetch-size-max", self.group_name, + "The maximum number of bytes fetched per request"), metrics.Max()) + self.bytes_fetched.add(metrics.metricName("bytes-consumed-rate", self.group_name, + "The average number of bytes consumed per second"), metrics.Rate()) + + self.records_fetched = self.metrics.sensor("records-fetched") + self.records_fetched.add(metrics.metricName("records-per-request-avg", self.group_name, + "The average number of records in each request"), metrics.Avg()) + self.records_fetched.add(metrics.metricName("records-consumed-rate", self.group_name, + "The average number of records consumed per second"), metrics.Rate()) + + self.fetch_latency = metrics.sensor("fetch-latency") + self.fetch_latency.add(metrics.metricName("fetch-latency-avg", self.group_name, + "The average time taken for a fetch request."), metrics.Avg()) + self.fetch_latency.add(metrics.metricName("fetch-latency-max", self.group_name, + "The max time taken for any fetch request."), metrics.Max()) + self.fetch_latency.add(metrics.metricName("fetch-rate", self.group_name, + "The number of fetch requests per second."), metrics.Rate(metrics.Count())) + + self.records_fetch_lag = metrics.sensor("records-lag") + self.records_fetch_lag.add(metrics.metricName("records-lag-max", self.group_name, + "The maximum lag in terms of number of records for any partition in self window"), metrics.Max()) + + self.fetch_throttle_time_sensor = metrics.sensor("fetch-throttle-time") + self.fetch_throttle_time_sensor.add(metrics.metricName("fetch-throttle-time-avg", self.group_name, + "The average throttle time in ms"), metrics.Avg()) + self.fetch_throttle_time_sensor.add(metrics.metricName("fetch-throttle-time-max", self.group_name, + "The maximum throttle time in ms"), metrics.Max()) + + def record_topic_fetch_metrics(topic, num_bytes, num_records): + # record bytes fetched + name = '.'.join(["topic", topic, "bytes-fetched"]) + self.metrics[name].record(num_bytes); + + # record records fetched + name = '.'.join(["topic", topic, "records-fetched"]) + self.metrics[name].record(num_records) + """ diff -Nru python-kafka-python-0.9.2/kafka/consumer/group.py python-kafka-python-1.0.1/kafka/consumer/group.py --- python-kafka-python-0.9.2/kafka/consumer/group.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/consumer/group.py 2016-02-18 16:38:17.000000000 +0000 @@ -0,0 +1,789 @@ +from __future__ import absolute_import + +import copy +import logging +import time + +import six + +from kafka.client_async import KafkaClient +from kafka.consumer.fetcher import Fetcher +from kafka.consumer.subscription_state import SubscriptionState +from kafka.coordinator.consumer import ConsumerCoordinator +from kafka.coordinator.assignors.range import RangePartitionAssignor +from 
kafka.coordinator.assignors.roundrobin import RoundRobinPartitionAssignor +from kafka.protocol.offset import OffsetResetStrategy +from kafka.version import __version__ + +log = logging.getLogger(__name__) + + +class KafkaConsumer(six.Iterator): + """Consume records from a Kafka cluster. + + The consumer will transparently handle the failure of servers in the Kafka + cluster, and adapt as topic-partitions are created or migrate between + brokers. It also interacts with the assigned kafka Group Coordinator node + to allow multiple consumers to load balance consumption of topics (requires + kafka >= 0.9.0.0). + + Arguments: + *topics (str): optional list of topics to subscribe to. If not set, + call subscribe() or assign() before consuming records. + + Keyword Arguments: + bootstrap_servers: 'host[:port]' string (or list of 'host[:port]' + strings) that the consumer should contact to bootstrap initial + cluster metadata. This does not have to be the full node list. + It just needs to have at least one broker that will respond to a + Metadata API Request. Default port is 9092. If no servers are + specified, will default to localhost:9092. + client_id (str): a name for this client. This string is passed in + each request to servers and can be used to identify specific + server-side log entries that correspond to this client. Also + submitted to GroupCoordinator for logging with respect to + consumer group administration. Default: 'kafka-python-{version}' + group_id (str or None): name of the consumer group to join for dynamic + partition assignment (if enabled), and to use for fetching and + committing offsets. If None, auto-partition assignment (via + group coordinator) and offset commits are disabled. + Default: 'kafka-python-default-group' + key_deserializer (callable): Any callable that takes a + raw message key and returns a deserialized key. + value_deserializer (callable): Any callable that takes a + raw message value and returns a deserialized value. + fetch_min_bytes (int): Minimum amount of data the server should + return for a fetch request, otherwise wait up to + fetch_max_wait_ms for more data to accumulate. Default: 1. + fetch_max_wait_ms (int): The maximum amount of time in milliseconds + the server will block before answering the fetch request if + there isn't sufficient data to immediately satisfy the + requirement given by fetch_min_bytes. Default: 500. + max_partition_fetch_bytes (int): The maximum amount of data + per-partition the server will return. The maximum total memory + used for a request = #partitions * max_partition_fetch_bytes. + This size must be at least as large as the maximum message size + the server allows or else it is possible for the producer to + send messages larger than the consumer can fetch. If that + happens, the consumer can get stuck trying to fetch a large + message on a certain partition. Default: 1048576. + request_timeout_ms (int): Client request timeout in milliseconds. + Default: 40000. + retry_backoff_ms (int): Milliseconds to backoff when retrying on + errors. Default: 100. + reconnect_backoff_ms (int): The amount of time in milliseconds to + wait before attempting to reconnect to a given host. + Default: 50. + max_in_flight_requests_per_connection (int): Requests are pipelined + to kafka brokers up to this number of maximum requests per + broker connection. Default: 5. 
+ auto_offset_reset (str): A policy for resetting offsets on + OffsetOutOfRange errors: 'earliest' will move to the oldest + available message, 'latest' will move to the most recent. Any + ofther value will raise the exception. Default: 'latest'. + enable_auto_commit (bool): If true the consumer's offset will be + periodically committed in the background. Default: True. + auto_commit_interval_ms (int): milliseconds between automatic + offset commits, if enable_auto_commit is True. Default: 5000. + default_offset_commit_callback (callable): called as + callback(offsets, response) response will be either an Exception + or a OffsetCommitResponse struct. This callback can be used to + trigger custom actions when a commit request completes. + check_crcs (bool): Automatically check the CRC32 of the records + consumed. This ensures no on-the-wire or on-disk corruption to + the messages occurred. This check adds some overhead, so it may + be disabled in cases seeking extreme performance. Default: True + metadata_max_age_ms (int): The period of time in milliseconds after + which we force a refresh of metadata even if we haven't seen any + partition leadership changes to proactively discover any new + brokers or partitions. Default: 300000 + partition_assignment_strategy (list): List of objects to use to + distribute partition ownership amongst consumer instances when + group management is used. + Default: [RangePartitionAssignor, RoundRobinPartitionAssignor] + heartbeat_interval_ms (int): The expected time in milliseconds + between heartbeats to the consumer coordinator when using + Kafka's group management feature. Heartbeats are used to ensure + that the consumer's session stays active and to facilitate + rebalancing when new consumers join or leave the group. The + value must be set lower than session_timeout_ms, but typically + should be set no higher than 1/3 of that value. It can be + adjusted even lower to control the expected time for normal + rebalances. Default: 3000 + session_timeout_ms (int): The timeout used to detect failures when + using Kafka's group managementment facilities. Default: 30000 + send_buffer_bytes (int): The size of the TCP send buffer + (SO_SNDBUF) to use when sending data. Default: None (relies on + system defaults). The java client defaults to 131072. + receive_buffer_bytes (int): The size of the TCP receive buffer + (SO_RCVBUF) to use when reading data. Default: None (relies on + system defaults). The java client defaults to 32768. + consumer_timeout_ms (int): number of millisecond to throw a timeout + exception to the consumer if no message is available for + consumption. Default: -1 (dont throw exception) + api_version (str): specify which kafka API version to use. + 0.9 enables full group coordination features; 0.8.2 enables + kafka-storage offset commits; 0.8.1 enables zookeeper-storage + offset commits; 0.8.0 is what is left. If set to 'auto', will + attempt to infer the broker version by probing various APIs. 
+ Default: auto + + Note: + Configuration parameters are described in more detail at + https://kafka.apache.org/090/configuration.html#newconsumerconfigs + """ + DEFAULT_CONFIG = { + 'bootstrap_servers': 'localhost', + 'client_id': 'kafka-python-' + __version__, + 'group_id': 'kafka-python-default-group', + 'key_deserializer': None, + 'value_deserializer': None, + 'fetch_max_wait_ms': 500, + 'fetch_min_bytes': 1, + 'max_partition_fetch_bytes': 1 * 1024 * 1024, + 'request_timeout_ms': 40 * 1000, + 'retry_backoff_ms': 100, + 'reconnect_backoff_ms': 50, + 'max_in_flight_requests_per_connection': 5, + 'auto_offset_reset': 'latest', + 'enable_auto_commit': True, + 'auto_commit_interval_ms': 5000, + 'check_crcs': True, + 'metadata_max_age_ms': 5 * 60 * 1000, + 'partition_assignment_strategy': (RangePartitionAssignor, RoundRobinPartitionAssignor), + 'heartbeat_interval_ms': 3000, + 'session_timeout_ms': 30000, + 'send_buffer_bytes': None, + 'receive_buffer_bytes': None, + 'consumer_timeout_ms': -1, + 'api_version': 'auto', + 'connections_max_idle_ms': 9 * 60 * 1000, # not implemented yet + #'metric_reporters': None, + #'metrics_num_samples': 2, + #'metrics_sample_window_ms': 30000, + } + + def __init__(self, *topics, **configs): + self.config = copy.copy(self.DEFAULT_CONFIG) + for key in self.config: + if key in configs: + self.config[key] = configs.pop(key) + + # Only check for extra config keys in top-level class + assert not configs, 'Unrecognized configs: %s' % configs + + deprecated = {'smallest': 'earliest', 'largest': 'latest' } + if self.config['auto_offset_reset'] in deprecated: + new_config = deprecated[self.config['auto_offset_reset']] + log.warning('use auto_offset_reset=%s (%s is deprecated)', + new_config, self.config['auto_offset_reset']) + self.config['auto_offset_reset'] = new_config + + self._client = KafkaClient(**self.config) + + # Check Broker Version if not set explicitly + if self.config['api_version'] == 'auto': + self.config['api_version'] = self._client.check_version() + assert self.config['api_version'] in ('0.9', '0.8.2', '0.8.1', '0.8.0'), 'Unrecognized api version' + + # Convert api_version config to tuple for easy comparisons + self.config['api_version'] = tuple( + map(int, self.config['api_version'].split('.'))) + + self._subscription = SubscriptionState(self.config['auto_offset_reset']) + self._fetcher = Fetcher( + self._client, self._subscription, **self.config) + self._coordinator = ConsumerCoordinator( + self._client, self._subscription, + assignors=self.config['partition_assignment_strategy'], + **self.config) + self._closed = False + self._iterator = None + self._consumer_timeout = float('inf') + + #self.metrics = None + if topics: + self._subscription.subscribe(topics=topics) + self._client.set_topics(topics) + + def assign(self, partitions): + """Manually assign a list of TopicPartitions to this consumer. + + Arguments: + partitions (list of TopicPartition): assignment for this instance. + + Raises: + IllegalStateError: if consumer has already called subscribe() + + Warning: + It is not possible to use both manual partition assignment with + assign() and group assignment with subscribe(). + + Note: + This interface does not support incremental assignment and will + replace the previous assignment (if there was one). + + Note: + Manual topic assignment through this method does not use the + consumer's group management functionality. As such, there will be + no rebalance operation triggered when group membership or cluster + and topic metadata change. 
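A hedged usage sketch tying the constructor options documented above to manual assignment via assign(). Broker address, topic name and the UTF-8 value assumption are placeholders; TopicPartition is imported from kafka.common as elsewhere in this release.

from kafka import KafkaConsumer
from kafka.common import TopicPartition

consumer = KafkaConsumer(
    bootstrap_servers='localhost:9092',
    group_id=None,                  # manual assignment; no group coordination
    auto_offset_reset='earliest',   # 'smallest' / 'largest' are deprecated spellings
    enable_auto_commit=False,
    value_deserializer=lambda raw: raw.decode('utf-8'),  # assumes UTF-8 payloads
)
consumer.assign([TopicPartition('my-topic', 0), TopicPartition('my-topic', 1)])

for record in consumer:
    print(record.topic, record.partition, record.offset, record.value)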
+ """ + self._subscription.assign_from_user(partitions) + self._client.set_topics([tp.topic for tp in partitions]) + + def assignment(self): + """Get the TopicPartitions currently assigned to this consumer. + + If partitions were directly assigned using assign(), then this will + simply return the same partitions that were previously assigned. + If topics were subscribed using subscribe(), then this will give the + set of topic partitions currently assigned to the consumer (which may + be none if the assignment hasn't happened yet, or if the partitions are + in the process of being reassigned). + + Returns: + set: {TopicPartition, ...} + """ + return self._subscription.assigned_partitions() + + def close(self): + """Close the consumer, waiting indefinitely for any needed cleanup.""" + if self._closed: + return + log.debug("Closing the KafkaConsumer.") + self._closed = True + self._coordinator.close() + #self.metrics.close() + self._client.close() + try: + self.config['key_deserializer'].close() + except AttributeError: + pass + try: + self.config['value_deserializer'].close() + except AttributeError: + pass + log.debug("The KafkaConsumer has closed.") + + def commit_async(self, offsets=None, callback=None): + """Commit offsets to kafka asynchronously, optionally firing callback + + This commits offsets only to Kafka. The offsets committed using this API + will be used on the first fetch after every rebalance and also on + startup. As such, if you need to store offsets in anything other than + Kafka, this API should not be used. To avoid re-processing the last + message read if a consumer is restarted, the committed offset should be + the next message your application should consume, i.e.: last_offset + 1. + + This is an asynchronous call and will not block. Any errors encountered + are either passed to the callback (if provided) or discarded. + + Arguments: + offsets (dict, optional): {TopicPartition: OffsetAndMetadata} dict + to commit with the configured group_id. Defaults to current + consumed offsets for all subscribed partitions. + callback (callable, optional): called as callback(offsets, response) + with response as either an Exception or a OffsetCommitResponse + struct. This callback can be used to trigger custom actions when + a commit request completes. + + Returns: + kafka.future.Future + """ + assert self.config['api_version'] >= (0, 8, 1), 'Requires >= Kafka 0.8.1' + assert self.config['group_id'] is not None, 'Requires group_id' + if offsets is None: + offsets = self._subscription.all_consumed_offsets() + log.debug("Committing offsets: %s", offsets) + future = self._coordinator.commit_offsets_async( + offsets, callback=callback) + return future + + def commit(self, offsets=None): + """Commit offsets to kafka, blocking until success or error + + This commits offsets only to Kafka. The offsets committed using this API + will be used on the first fetch after every rebalance and also on + startup. As such, if you need to store offsets in anything other than + Kafka, this API should not be used. To avoid re-processing the last + message read if a consumer is restarted, the committed offset should be + the next message your application should consume, i.e.: last_offset + 1. + + Blocks until either the commit succeeds or an unrecoverable error is + encountered (in which case it is thrown to the caller). 
+ + Currently only supports kafka-topic offset storage (not zookeeper) + + Arguments: + offsets (dict, optional): {TopicPartition: OffsetAndMetadata} dict + to commit with the configured group_id. Defaults to current + consumed offsets for all subscribed partitions. + """ + assert self.config['api_version'] >= (0, 8, 1), 'Requires >= Kafka 0.8.1' + assert self.config['group_id'] is not None, 'Requires group_id' + if offsets is None: + offsets = self._subscription.all_consumed_offsets() + self._coordinator.commit_offsets_sync(offsets) + + def committed(self, partition): + """Get the last committed offset for the given partition + + This offset will be used as the position for the consumer + in the event of a failure. + + This call may block to do a remote call if the partition in question + isn't assigned to this consumer or if the consumer hasn't yet + initialized its cache of committed offsets. + + Arguments: + partition (TopicPartition): the partition to check + + Returns: + The last committed offset, or None if there was no prior commit. + """ + assert self.config['api_version'] >= (0, 8, 1), 'Requires >= Kafka 0.8.1' + assert self.config['group_id'] is not None, 'Requires group_id' + if self._subscription.is_assigned(partition): + committed = self._subscription.assignment[partition].committed + if committed is None: + self._coordinator.refresh_committed_offsets_if_needed() + committed = self._subscription.assignment[partition].committed + else: + commit_map = self._coordinator.fetch_committed_offsets([partition]) + if partition in commit_map: + committed = commit_map[partition].offset + else: + committed = None + return committed + + def topics(self): + """Get all topics the user is authorized to view. + + Returns: + set: topics + """ + cluster = self._client.cluster + if self._client._metadata_refresh_in_progress and self._client._topics: + future = cluster.request_update() + self._client.poll(future=future) + stash = cluster.need_all_topic_metadata + cluster.need_all_topic_metadata = True + future = cluster.request_update() + self._client.poll(future=future) + cluster.need_all_topic_metadata = stash + return cluster.topics() + + def partitions_for_topic(self, topic): + """Get metadata about the partitions for a given topic. + + Arguments: + topic (str): topic to check + + Returns: + set: partition ids + """ + return self._client.cluster.partitions_for_topic(topic) + + def poll(self, timeout_ms=0): + """Fetch data from assigned topics / partitions. + + Records are fetched and returned in batches by topic-partition. + On each poll, consumer will try to use the last consumed offset as the + starting offset and fetch sequentially. The last consumed offset can be + manually set through seek(partition, offset) or automatically set as + the last committed offset for the subscribed list of partitions. + + Incompatible with iterator interface -- use one or the other, not both. + + Arguments: + timeout_ms (int, optional): milliseconds spent waiting in poll if + data is not available in the buffer. If 0, returns immediately + with any records that are available currently in the buffer, + else returns empty. Must not be negative. 
Default: 0 + + Returns: + dict: topic to list of records since the last fetch for the + subscribed list of topics and partitions + """ + assert timeout_ms >= 0, 'Timeout must not be negative' + assert self._iterator is None, 'Incompatible with iterator interface' + + # poll for new data until the timeout expires + start = time.time() + remaining = timeout_ms + while True: + records = self._poll_once(remaining) + if records: + # before returning the fetched records, we can send off the + # next round of fetches and avoid block waiting for their + # responses to enable pipelining while the user is handling the + # fetched records. + self._fetcher.init_fetches() + return records + + elapsed_ms = (time.time() - start) * 1000 + remaining = timeout_ms - elapsed_ms + + if remaining <= 0: + return {} + + def _poll_once(self, timeout_ms): + """ + Do one round of polling. In addition to checking for new data, this does + any needed heart-beating, auto-commits, and offset updates. + + Arguments: + timeout_ms (int): The maximum time in milliseconds to block + + Returns: + dict: map of topic to list of records (may be empty) + """ + if self.config['group_id'] is not None: + if self.config['api_version'] >= (0, 8, 2): + self._coordinator.ensure_coordinator_known() + + if self.config['api_version'] >= (0, 9): + # ensure we have partitions assigned if we expect to + if self._subscription.partitions_auto_assigned(): + self._coordinator.ensure_active_group() + + # fetch positions if we have partitions we're subscribed to that we + # don't know the offset for + if not self._subscription.has_all_fetch_positions(): + self._update_fetch_positions(self._subscription.missing_fetch_positions()) + + # init any new fetches (won't resend pending fetches) + records = self._fetcher.fetched_records() + + # if data is available already, e.g. from a previous network client + # poll() call to commit, then just return it immediately + if records: + return records + + self._fetcher.init_fetches() + self._client.poll(timeout_ms) + return self._fetcher.fetched_records() + + def position(self, partition): + """Get the offset of the next record that will be fetched + + Arguments: + partition (TopicPartition): partition to check + + Returns: + int: offset + """ + assert self._subscription.is_assigned(partition), 'Partition is not assigned' + offset = self._subscription.assignment[partition].position + if offset is None: + self._update_fetch_positions(partition) + offset = self._subscription.assignment[partition].position + return offset + + def highwater(self, partition): + """Last known highwater offset for a partition + + A highwater offset is the offset that will be assigned to the next + message that is produced. It may be useful for calculating lag, by + comparing with the reported position. Note that both position and + highwater refer to the *next* offset -- i.e., highwater offset is + one greater than the newest available message. + + Highwater offsets are returned in FetchResponse messages, so will + not be available if not FetchRequests have been sent for this partition + yet. + + Arguments: + partition (TopicPartition): partition to check + + Returns: + int or None: offset if available + """ + assert self._subscription.is_assigned(partition), 'Partition is not assigned' + return self._subscription.assignment[partition].highwater + + def pause(self, *partitions): + """Suspend fetching from the requested partitions. 
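A hedged sketch combining poll(), assignment(), position() and highwater() as documented above to report per-partition lag; since both values refer to *next* offsets, their difference is the number of unread messages. Broker, topic and group names are placeholders.

from kafka import KafkaConsumer

consumer = KafkaConsumer('my-topic',
                         bootstrap_servers='localhost:9092',
                         group_id='my-group')

while True:
    records = consumer.poll(timeout_ms=500)   # {TopicPartition: [ConsumerRecord, ...]}
    for tp, messages in records.items():
        print('%d records from %s' % (len(messages), tp))
    for tp in consumer.assignment():
        highwater = consumer.highwater(tp)
        if highwater is not None:             # None until a FetchResponse has arrived
            print('%s lag: %d' % (tp, highwater - consumer.position(tp)))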
+ + Future calls to poll() will not return any records from these partitions + until they have been resumed using resume(). Note that this method does + not affect partition subscription. In particular, it does not cause a + group rebalance when automatic assignment is used. + + Arguments: + *partitions (TopicPartition): partitions to pause + """ + for partition in partitions: + log.debug("Pausing partition %s", partition) + self._subscription.pause(partition) + + def resume(self, *partitions): + """Resume fetching from the specified (paused) partitions. + + Arguments: + *partitions (TopicPartition): partitions to resume + """ + for partition in partitions: + log.debug("Resuming partition %s", partition) + self._subscription.resume(partition) + + def seek(self, partition, offset): + """Manually specify the fetch offset for a TopicPartition. + + Overrides the fetch offsets that the consumer will use on the next + poll(). If this API is invoked for the same partition more than once, + the latest offset will be used on the next poll(). Note that you may + lose data if this API is arbitrarily used in the middle of consumption, + to reset the fetch offsets. + + Arguments: + partition (TopicPartition): partition for seek operation + offset (int): message offset in partition + + Raises: + AssertionError: if offset is not an int >= 0; or if partition is not + currently assigned. + """ + assert isinstance(offset, int) and offset >= 0, 'Offset must be >= 0' + assert partition in self._subscription.assigned_partitions(), 'Unassigned partition' + log.debug("Seeking to offset %s for partition %s", offset, partition) + self._subscription.assignment[partition].seek(offset) + + def seek_to_beginning(self, *partitions): + """Seek to the oldest available offset for partitions. + + Arguments: + *partitions: optionally provide specific TopicPartitions, otherwise + default to all assigned partitions + + Raises: + AssertionError: if any partition is not currently assigned, or if + no partitions are assigned + """ + if not partitions: + partitions = self._subscription.assigned_partitions() + assert partitions, 'No partitions are currently assigned' + else: + for p in partitions: + assert p in self._subscription.assigned_partitions(), 'Unassigned partition' + + for tp in partitions: + log.debug("Seeking to beginning of partition %s", tp) + self._subscription.need_offset_reset(tp, OffsetResetStrategy.EARLIEST) + + def seek_to_end(self, *partitions): + """Seek to the most recent available offset for partitions. + + Arguments: + *partitions: optionally provide specific TopicPartitions, otherwise + default to all assigned partitions + + Raises: + AssertionError: if any partition is not currently assigned, or if + no partitions are assigned + """ + if not partitions: + partitions = self._subscription.assigned_partitions() + assert partitions, 'No partitions are currently assigned' + else: + for p in partitions: + assert p in self._subscription.assigned_partitions(), 'Unassigned partition' + + for tp in partitions: + log.debug("Seeking to end of partition %s", tp) + self._subscription.need_offset_reset(tp, OffsetResetStrategy.LATEST) + + def subscribe(self, topics=(), pattern=None, listener=None): + """Subscribe to a list of topics, or a topic regex pattern + + Partitions will be dynamically assigned via a group coordinator. + Topic subscriptions are not incremental: this list will replace the + current assignment (if there is one). 
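A short sketch of the seek APIs above: rewind an assigned partition, then jump to an arbitrary offset, which takes effect on the next fetch. It assumes the partition is already assigned and the topic has data; names are placeholders.

from kafka import KafkaConsumer
from kafka.common import TopicPartition

consumer = KafkaConsumer(bootstrap_servers='localhost:9092',
                         group_id=None, enable_auto_commit=False)
tp = TopicPartition('my-topic', 0)
consumer.assign([tp])

consumer.seek_to_beginning(tp)         # replay from the oldest available offset
first = next(iter(consumer))           # blocks until a record arrives
print('replay started at offset %d' % first.offset)

consumer.seek(tp, first.offset + 100)  # skip ahead; used on the next poll()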
+ + This method is incompatible with assign() + + Arguments: + topics (list): List of topics for subscription. + pattern (str): Pattern to match available topics. You must provide + either topics or pattern, but not both. + listener (ConsumerRebalanceListener): Optionally include listener + callback, which will be called before and after each rebalance + operation. + + As part of group management, the consumer will keep track of the + list of consumers that belong to a particular group and will + trigger a rebalance operation if one of the following events + trigger: + + * Number of partitions change for any of the subscribed topics + * Topic is created or deleted + * An existing member of the consumer group dies + * A new member is added to the consumer group + + When any of these events are triggered, the provided listener + will be invoked first to indicate that the consumer's assignment + has been revoked, and then again when the new assignment has + been received. Note that this listener will immediately override + any listener set in a previous call to subscribe. It is + guaranteed, however, that the partitions revoked/assigned + through this interface are from topics subscribed in this call. + + Raises: + IllegalStateError: if called after previously calling assign() + AssertionError: if neither topics or pattern is provided + TypeError: if listener is not a ConsumerRebalanceListener + """ + # SubscriptionState handles error checking + self._subscription.subscribe(topics=topics, + pattern=pattern, + listener=listener) + + # regex will need all topic metadata + if pattern is not None: + self._client.cluster.need_all_topic_metadata = True + self._client.set_topics([]) + log.debug("Subscribed to topic pattern: %s", pattern) + else: + self._client.cluster.need_all_topic_metadata = False + self._client.set_topics(self._subscription.group_subscription()) + log.debug("Subscribed to topic(s): %s", topics) + + def subscription(self): + """Get the current topic subscription. + + Returns: + set: {topic, ...} + """ + return self._subscription.subscription + + def unsubscribe(self): + """Unsubscribe from all topics and clear all assigned partitions.""" + self._subscription.unsubscribe() + self._coordinator.close() + self._client.cluster.need_all_topic_metadata = False + self._client.set_topics([]) + log.debug("Unsubscribed all topics or patterns and assigned partitions") + + def _update_fetch_positions(self, partitions): + """ + Set the fetch position to the committed position (if there is one) + or reset it using the offset reset policy the user has configured. 
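As a short usage sketch of the subscription APIs above (the topic names are made up):

    consumer.subscribe(topics=['orders', 'payments'])   # explicit topic list
    # consumer.subscribe(pattern='^metrics-.*')         # or a regex, but not both
    print(consumer.subscription())                      # set of subscribed topic names
    consumer.unsubscribe()                              # drop all topics and assignments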
+ + Arguments: + partitions (List[TopicPartition]): The partitions that need + updating fetch positions + + Raises: + NoOffsetForPartitionError: If no offset is stored for a given + partition and no offset reset policy is defined + """ + if (self.config['api_version'] >= (0, 8, 1) + and self.config['group_id'] is not None): + + # refresh commits for all assigned partitions + self._coordinator.refresh_committed_offsets_if_needed() + + # then do any offset lookups in case some positions are not known + self._fetcher.update_fetch_positions(partitions) + + def _message_generator(self): + assert self.assignment() or self.subscription() is not None, 'No topic subscription or manual partition assignment' + while time.time() < self._consumer_timeout: + if self.config['group_id'] is not None: + if self.config['api_version'] >= (0, 8, 2): + self._coordinator.ensure_coordinator_known() + + if self.config['api_version'] >= (0, 9): + # ensure we have partitions assigned if we expect to + if self._subscription.partitions_auto_assigned(): + self._coordinator.ensure_active_group() + + # fetch positions if we have partitions we're subscribed to that we + # don't know the offset for + if not self._subscription.has_all_fetch_positions(): + partitions = self._subscription.missing_fetch_positions() + self._update_fetch_positions(partitions) + + poll_ms = 1000 * (self._consumer_timeout - time.time()) + if not self._fetcher.in_flight_fetches(): + poll_ms = 0 + self._client.poll(poll_ms) + + # We need to make sure we at least keep up with scheduled tasks, + # like heartbeats, auto-commits, and metadata refreshes + timeout_at = self._next_timeout() + + if self.config['api_version'] >= (0, 9): + if self.config['group_id'] is not None and not self.assignment(): + sleep_time = max(timeout_at - time.time(), 0) + if sleep_time > 0 and not self._client.in_flight_request_count(): + log.debug('No partitions assigned; sleeping for %s', sleep_time) + time.sleep(sleep_time) + continue + + if time.time() > timeout_at: + continue + + for msg in self._fetcher: + yield msg + if time.time() > timeout_at: + log.debug("internal iterator timeout - breaking for poll") + break + + # an else block on a for loop only executes if there was no break + # so this should only be called on a StopIteration from the fetcher + # and we assume that it is safe to init_fetches when fetcher is done + # i.e., there are no more records stored internally + else: + self._fetcher.init_fetches() + + def _next_timeout(self): + return min(self._consumer_timeout, + self._client._delayed_tasks.next_at() + time.time(), + self._client.cluster.ttl() / 1000.0 + time.time()) + + def __iter__(self): # pylint: disable=non-iterator-returned + return self + + def __next__(self): + if not self._iterator: + self._iterator = self._message_generator() + + self._set_consumer_timeout() + try: + return next(self._iterator) + except StopIteration: + self._iterator = None + raise + + def _set_consumer_timeout(self): + # consumer_timeout_ms can be used to stop iteration early + if self.config['consumer_timeout_ms'] >= 0: + self._consumer_timeout = time.time() + ( + self.config['consumer_timeout_ms'] / 1000.0) + + # old KafkaConsumer methods are deprecated + def configure(self, **configs): + raise NotImplementedError( + 'deprecated -- initialize a new consumer') + + def set_topic_partitions(self, *topics): + raise NotImplementedError( + 'deprecated -- use subscribe() or assign()') + + def fetch_messages(self): + raise NotImplementedError( + 'deprecated -- use poll() or iterator 
interface') + + def get_partition_offsets(self, topic, partition, + request_time_ms, max_num_offsets): + raise NotImplementedError( + 'deprecated -- send an OffsetRequest with KafkaClient') + + def offsets(self, group=None): + raise NotImplementedError('deprecated -- use committed(partition)') + + def task_done(self, message): + raise NotImplementedError( + 'deprecated -- commit offsets manually if needed') diff -Nru python-kafka-python-0.9.2/kafka/consumer/__init__.py python-kafka-python-1.0.1/kafka/consumer/__init__.py --- python-kafka-python-0.9.2/kafka/consumer/__init__.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/consumer/__init__.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,7 @@ +from .simple import SimpleConsumer +from .multiprocess import MultiProcessConsumer +from .group import KafkaConsumer + +__all__ = [ + 'SimpleConsumer', 'MultiProcessConsumer', 'KafkaConsumer' +] diff -Nru python-kafka-python-0.9.2/kafka/consumer/multiprocess.py python-kafka-python-1.0.1/kafka/consumer/multiprocess.py --- python-kafka-python-0.9.2/kafka/consumer/multiprocess.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/consumer/multiprocess.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,295 @@ +from __future__ import absolute_import + +from collections import namedtuple +import logging +from multiprocessing import Process, Manager as MPManager +import time +import warnings + +from six.moves import queue + +from ..common import KafkaError +from .base import ( + Consumer, + AUTO_COMMIT_MSG_COUNT, AUTO_COMMIT_INTERVAL, + NO_MESSAGES_WAIT_TIME_SECONDS, + FULL_QUEUE_WAIT_TIME_SECONDS, + MAX_BACKOFF_SECONDS, +) +from .simple import SimpleConsumer + + +log = logging.getLogger(__name__) + +Events = namedtuple("Events", ["start", "pause", "exit"]) + + +def _mp_consume(client, group, topic, message_queue, size, events, **consumer_options): + """ + A child process worker which consumes messages based on the + notifications given by the controller process + + NOTE: Ideally, this should have been a method inside the Consumer + class. However, multiprocessing module has issues in windows. The + functionality breaks unless this function is kept outside of a class + """ + + # Initial interval for retries in seconds. + interval = 1 + while not events.exit.is_set(): + try: + # Make the child processes open separate socket connections + client.reinit() + + # We will start consumers without auto-commit. Auto-commit will be + # done by the master controller process. + consumer = SimpleConsumer(client, group, topic, + auto_commit=False, + auto_commit_every_n=None, + auto_commit_every_t=None, + **consumer_options) + + # Ensure that the consumer provides the partition information + consumer.provide_partition_info() + + while True: + # Wait till the controller indicates us to start consumption + events.start.wait() + + # If we are asked to quit, do so + if events.exit.is_set(): + break + + # Consume messages and add them to the queue. If the controller + # indicates a specific number of messages, follow that advice + count = 0 + + message = consumer.get_message() + if message: + while True: + try: + message_queue.put(message, timeout=FULL_QUEUE_WAIT_TIME_SECONDS) + break + except queue.Full: + if events.exit.is_set(): break + + count += 1 + + # We have reached the required size. The controller might have + # more than what he needs. Wait for a while. 
+ # Without this logic, it is possible that we run into a big + # loop consuming all available messages before the controller + # can reset the 'start' event + if count == size.value: + events.pause.wait() + + else: + # In case we did not receive any message, give up the CPU for + # a while before we try again + time.sleep(NO_MESSAGES_WAIT_TIME_SECONDS) + + consumer.stop() + + except KafkaError as e: + # Retry with exponential backoff + log.error("Problem communicating with Kafka (%s), retrying in %d seconds..." % (e, interval)) + time.sleep(interval) + interval = interval*2 if interval*2 < MAX_BACKOFF_SECONDS else MAX_BACKOFF_SECONDS + + +class MultiProcessConsumer(Consumer): + """ + A consumer implementation that consumes partitions for a topic in + parallel using multiple processes + + Arguments: + client: a connected SimpleClient + group: a name for this consumer, used for offset storage and must be unique + If you are connecting to a server that does not support offset + commit/fetch (any prior to 0.8.1.1), then you *must* set this to None + topic: the topic to consume + + Keyword Arguments: + partitions: An optional list of partitions to consume the data from + auto_commit: default True. Whether or not to auto commit the offsets + auto_commit_every_n: default 100. How many messages to consume + before a commit + auto_commit_every_t: default 5000. How much time (in milliseconds) to + wait before commit + num_procs: Number of processes to start for consuming messages. + The available partitions will be divided among these processes + partitions_per_proc: Number of partitions to be allocated per process + (overrides num_procs) + + Auto commit details: + If both auto_commit_every_n and auto_commit_every_t are set, they will + reset one another when one is triggered. These triggers simply call the + commit method on this class. A manual call to commit will also reset + these triggers + """ + def __init__(self, client, group, topic, + partitions=None, + auto_commit=True, + auto_commit_every_n=AUTO_COMMIT_MSG_COUNT, + auto_commit_every_t=AUTO_COMMIT_INTERVAL, + num_procs=1, + partitions_per_proc=0, + **simple_consumer_options): + + warnings.warn('This class has been deprecated and will be removed in a' + ' future release. Use KafkaConsumer instead', + DeprecationWarning) + + # Initiate the base consumer class + super(MultiProcessConsumer, self).__init__( + client, group, topic, + partitions=partitions, + auto_commit=auto_commit, + auto_commit_every_n=auto_commit_every_n, + auto_commit_every_t=auto_commit_every_t) + + # Variables for managing and controlling the data flow from + # consumer child process to master + manager = MPManager() + self.queue = manager.Queue(1024) # Child consumers dump messages into this + self.events = Events( + start = manager.Event(), # Indicates the consumers to start fetch + exit = manager.Event(), # Requests the consumers to shutdown + pause = manager.Event()) # Requests the consumers to pause fetch + self.size = manager.Value('i', 0) # Indicator of number of messages to fetch + + # dict.keys() returns a view in py3 + it's not a thread-safe operation + # http://blog.labix.org/2008/06/27/watch-out-for-listdictkeys-in-python-3 + # It's safer to copy dict as it only runs during the init. 
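For reference, the deprecated class being assembled here is typically driven as follows; a hedged sketch in which the broker address, group, topic and handle() callback are all made up:

    client = SimpleClient('localhost:9092')
    consumer = MultiProcessConsumer(client, 'my-group', 'my-topic', num_procs=2)
    for message in consumer:
        handle(message)
    consumer.stop()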
+ partitions = list(self.offsets.copy().keys()) + + # By default, start one consumer process for all partitions + # The logic below ensures that + # * we do not cross the num_procs limit + # * we have an even distribution of partitions among processes + + if partitions_per_proc: + num_procs = len(partitions) / partitions_per_proc + if num_procs * partitions_per_proc < len(partitions): + num_procs += 1 + + # The final set of chunks + chunks = [partitions[proc::num_procs] for proc in range(num_procs)] + + self.procs = [] + for chunk in chunks: + options = {'partitions': list(chunk)} + if simple_consumer_options: + simple_consumer_options.pop('partitions', None) + options.update(simple_consumer_options) + + args = (client.copy(), self.group, self.topic, self.queue, + self.size, self.events) + proc = Process(target=_mp_consume, args=args, kwargs=options) + proc.daemon = True + proc.start() + self.procs.append(proc) + + def __repr__(self): + return '' % \ + (self.group, self.topic, len(self.procs)) + + def stop(self): + # Set exit and start off all waiting consumers + self.events.exit.set() + self.events.pause.set() + self.events.start.set() + + for proc in self.procs: + proc.join() + proc.terminate() + + super(MultiProcessConsumer, self).stop() + + def __iter__(self): + """ + Iterator to consume the messages available on this consumer + """ + # Trigger the consumer procs to start off. + # We will iterate till there are no more messages available + self.size.value = 0 + self.events.pause.set() + + while True: + self.events.start.set() + try: + # We will block for a small while so that the consumers get + # a chance to run and put some messages in the queue + # TODO: This is a hack and will make the consumer block for + # at least one second. Need to find a better way of doing this + partition, message = self.queue.get(block=True, timeout=1) + except queue.Empty: + break + + # Count, check and commit messages if necessary + self.offsets[partition] = message.offset + 1 + self.events.start.clear() + self.count_since_commit += 1 + self._auto_commit() + yield message + + self.events.start.clear() + + def get_messages(self, count=1, block=True, timeout=10): + """ + Fetch the specified number of messages + + Keyword Arguments: + count: Indicates the maximum number of messages to be fetched + block: If True, the API will block till all messages are fetched. + If block is a positive integer the API will block until that + many messages are fetched. + timeout: When blocking is requested the function will block for + the specified time (in seconds) until count messages is + fetched. If None, it will block forever. + """ + messages = [] + + # Give a size hint to the consumers. Each consumer process will fetch + # a maximum of "count" messages. This will fetch more messages than + # necessary, but these will not be committed to kafka. 
Also, the extra + # messages can be provided in subsequent runs + self.size.value = count + self.events.pause.clear() + + if timeout is not None: + max_time = time.time() + timeout + + new_offsets = {} + while count > 0 and (timeout is None or timeout > 0): + # Trigger consumption only if the queue is empty + # By doing this, we will ensure that consumers do not + # go into overdrive and keep consuming thousands of + # messages when the user might need only a few + if self.queue.empty(): + self.events.start.set() + + block_next_call = block is True or block > len(messages) + try: + partition, message = self.queue.get(block_next_call, + timeout) + except queue.Empty: + break + + _msg = (partition, message) if self.partition_info else message + messages.append(_msg) + new_offsets[partition] = message.offset + 1 + count -= 1 + if timeout is not None: + timeout = max_time - time.time() + + self.size.value = 0 + self.events.start.clear() + self.events.pause.set() + + # Update and commit offsets if necessary + self.offsets.update(new_offsets) + self.count_since_commit += len(messages) + self._auto_commit() + + return messages diff -Nru python-kafka-python-0.9.2/kafka/consumer/simple.py python-kafka-python-1.0.1/kafka/consumer/simple.py --- python-kafka-python-0.9.2/kafka/consumer/simple.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/consumer/simple.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,444 @@ +from __future__ import absolute_import + +try: + from itertools import zip_longest as izip_longest, repeat # pylint: disable=E0611 +except ImportError: + from itertools import izip_longest as izip_longest, repeat # pylint: disable=E0611 +import logging +import sys +import time +import warnings + +import six +from six.moves import queue + +from .base import ( + Consumer, + FETCH_DEFAULT_BLOCK_TIMEOUT, + AUTO_COMMIT_MSG_COUNT, + AUTO_COMMIT_INTERVAL, + FETCH_MIN_BYTES, + FETCH_BUFFER_SIZE_BYTES, + MAX_FETCH_BUFFER_SIZE_BYTES, + FETCH_MAX_WAIT_TIME, + ITER_TIMEOUT_SECONDS, + NO_MESSAGES_WAIT_TIME_SECONDS +) +from ..common import ( + FetchRequestPayload, KafkaError, OffsetRequestPayload, + ConsumerFetchSizeTooSmall, ConsumerNoMoreData, + UnknownTopicOrPartitionError, NotLeaderForPartitionError, + OffsetOutOfRangeError, FailedPayloadsError, check_error +) +from kafka.protocol.message import PartialMessage + + +log = logging.getLogger(__name__) + + +class FetchContext(object): + """ + Class for managing the state of a consumer during fetch + """ + def __init__(self, consumer, block, timeout): + warnings.warn('deprecated - this class will be removed in a future' + ' release', DeprecationWarning) + self.consumer = consumer + self.block = block + + if block: + if not timeout: + timeout = FETCH_DEFAULT_BLOCK_TIMEOUT + self.timeout = timeout * 1000 + + def __enter__(self): + """Set fetch values based on blocking status""" + self.orig_fetch_max_wait_time = self.consumer.fetch_max_wait_time + self.orig_fetch_min_bytes = self.consumer.fetch_min_bytes + if self.block: + self.consumer.fetch_max_wait_time = self.timeout + self.consumer.fetch_min_bytes = 1 + else: + self.consumer.fetch_min_bytes = 0 + + def __exit__(self, type, value, traceback): + """Reset values""" + self.consumer.fetch_max_wait_time = self.orig_fetch_max_wait_time + self.consumer.fetch_min_bytes = self.orig_fetch_min_bytes + + +class SimpleConsumer(Consumer): + """ + A simple consumer implementation that consumes all/specified partitions + for a topic + + Arguments: + client: a connected SimpleClient + group: a 
name for this consumer, used for offset storage and must be unique + If you are connecting to a server that does not support offset + commit/fetch (any prior to 0.8.1.1), then you *must* set this to None + topic: the topic to consume + + Keyword Arguments: + partitions: An optional list of partitions to consume the data from + + auto_commit: default True. Whether or not to auto commit the offsets + + auto_commit_every_n: default 100. How many messages to consume + before a commit + + auto_commit_every_t: default 5000. How much time (in milliseconds) to + wait before commit + fetch_size_bytes: number of bytes to request in a FetchRequest + + buffer_size: default 4K. Initial number of bytes to tell kafka we + have available. This will double as needed. + + max_buffer_size: default 16K. Max number of bytes to tell kafka we have + available. None means no limit. + + iter_timeout: default None. How much time (in seconds) to wait for a + message in the iterator before exiting. None means no + timeout, so it will wait forever. + + auto_offset_reset: default largest. Reset partition offsets upon + OffsetOutOfRangeError. Valid values are largest and smallest. + Otherwise, do not reset the offsets and raise OffsetOutOfRangeError. + + Auto commit details: + If both auto_commit_every_n and auto_commit_every_t are set, they will + reset one another when one is triggered. These triggers simply call the + commit method on this class. A manual call to commit will also reset + these triggers + """ + def __init__(self, client, group, topic, auto_commit=True, partitions=None, + auto_commit_every_n=AUTO_COMMIT_MSG_COUNT, + auto_commit_every_t=AUTO_COMMIT_INTERVAL, + fetch_size_bytes=FETCH_MIN_BYTES, + buffer_size=FETCH_BUFFER_SIZE_BYTES, + max_buffer_size=MAX_FETCH_BUFFER_SIZE_BYTES, + iter_timeout=None, + auto_offset_reset='largest'): + warnings.warn('deprecated - this class will be removed in a future' + ' release. 
Use KafkaConsumer instead.', + DeprecationWarning) + super(SimpleConsumer, self).__init__( + client, group, topic, + partitions=partitions, + auto_commit=auto_commit, + auto_commit_every_n=auto_commit_every_n, + auto_commit_every_t=auto_commit_every_t) + + if max_buffer_size is not None and buffer_size > max_buffer_size: + raise ValueError('buffer_size (%d) is greater than ' + 'max_buffer_size (%d)' % + (buffer_size, max_buffer_size)) + self.buffer_size = buffer_size + self.max_buffer_size = max_buffer_size + self.fetch_max_wait_time = FETCH_MAX_WAIT_TIME + self.fetch_min_bytes = fetch_size_bytes + self.fetch_offsets = self.offsets.copy() + self.iter_timeout = iter_timeout + self.auto_offset_reset = auto_offset_reset + self.queue = queue.Queue() + + def __repr__(self): + return '' % \ + (self.group, self.topic, str(self.offsets.keys())) + + def reset_partition_offset(self, partition): + """Update offsets using auto_offset_reset policy (smallest|largest) + + Arguments: + partition (int): the partition for which offsets should be updated + + Returns: Updated offset on success, None on failure + """ + LATEST = -1 + EARLIEST = -2 + if self.auto_offset_reset == 'largest': + reqs = [OffsetRequestPayload(self.topic, partition, LATEST, 1)] + elif self.auto_offset_reset == 'smallest': + reqs = [OffsetRequestPayload(self.topic, partition, EARLIEST, 1)] + else: + # Let's raise an reasonable exception type if user calls + # outside of an exception context + if sys.exc_info() == (None, None, None): + raise OffsetOutOfRangeError('Cannot reset partition offsets without a ' + 'valid auto_offset_reset setting ' + '(largest|smallest)') + # Otherwise we should re-raise the upstream exception + # b/c it typically includes additional data about + # the request that triggered it, and we do not want to drop that + raise # pylint: disable=E0704 + + # send_offset_request + log.info('Resetting topic-partition offset to %s for %s:%d', + self.auto_offset_reset, self.topic, partition) + try: + (resp, ) = self.client.send_offset_request(reqs) + except KafkaError as e: + log.error('%s sending offset request for %s:%d', + e.__class__.__name__, self.topic, partition) + else: + self.offsets[partition] = resp.offsets[0] + self.fetch_offsets[partition] = resp.offsets[0] + return resp.offsets[0] + + def seek(self, offset, whence=None, partition=None): + """ + Alter the current offset in the consumer, similar to fseek + + Arguments: + offset: how much to modify the offset + whence: where to modify it from, default is None + + * None is an absolute offset + * 0 is relative to the earliest available offset (head) + * 1 is relative to the current offset + * 2 is relative to the latest known offset (tail) + + partition: modify which partition, default is None. + If partition is None, would modify all partitions. 
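Concretely, under the whence semantics documented above (a sketch only; consumer is assumed to be a SimpleConsumer):

    consumer.seek(0, 0)                    # rewind every partition to the earliest offset
    consumer.seek(0, 2)                    # jump every partition to the latest offset
    consumer.seek(10, 1)                   # move 10 messages forward on every partition
    consumer.seek(500, None, partition=3)  # absolute offset 500 on partition 3 only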
+ """ + + if whence is None: # set an absolute offset + if partition is None: + for tmp_partition in self.offsets: + self.offsets[tmp_partition] = offset + else: + self.offsets[partition] = offset + elif whence == 1: # relative to current position + if partition is None: + for tmp_partition, _offset in self.offsets.items(): + self.offsets[tmp_partition] = _offset + offset + else: + self.offsets[partition] += offset + elif whence in (0, 2): # relative to beginning or end + reqs = [] + deltas = {} + if partition is None: + # divide the request offset by number of partitions, + # distribute the remained evenly + (delta, rem) = divmod(offset, len(self.offsets)) + for tmp_partition, r in izip_longest(self.offsets.keys(), + repeat(1, rem), + fillvalue=0): + deltas[tmp_partition] = delta + r + + for tmp_partition in self.offsets.keys(): + if whence == 0: + reqs.append(OffsetRequestPayload(self.topic, tmp_partition, -2, 1)) + elif whence == 2: + reqs.append(OffsetRequestPayload(self.topic, tmp_partition, -1, 1)) + else: + pass + else: + deltas[partition] = offset + if whence == 0: + reqs.append(OffsetRequestPayload(self.topic, partition, -2, 1)) + elif whence == 2: + reqs.append(OffsetRequestPayload(self.topic, partition, -1, 1)) + else: + pass + + resps = self.client.send_offset_request(reqs) + for resp in resps: + self.offsets[resp.partition] = \ + resp.offsets[0] + deltas[resp.partition] + else: + raise ValueError('Unexpected value for `whence`, %d' % whence) + + # Reset queue and fetch offsets since they are invalid + self.fetch_offsets = self.offsets.copy() + self.count_since_commit += 1 + if self.auto_commit: + self.commit() + + self.queue = queue.Queue() + + def get_messages(self, count=1, block=True, timeout=0.1): + """ + Fetch the specified number of messages + + Keyword Arguments: + count: Indicates the maximum number of messages to be fetched + block: If True, the API will block till all messages are fetched. + If block is a positive integer the API will block until that + many messages are fetched. + timeout: When blocking is requested the function will block for + the specified time (in seconds) until count messages is + fetched. If None, it will block forever. + """ + messages = [] + if timeout is not None: + timeout += time.time() + + new_offsets = {} + log.debug('getting %d messages', count) + while len(messages) < count: + block_time = timeout - time.time() + log.debug('calling _get_message block=%s timeout=%s', block, block_time) + block_next_call = block is True or block > len(messages) + result = self._get_message(block_next_call, block_time, + get_partition_info=True, + update_offset=False) + log.debug('got %s from _get_messages', result) + if not result: + if block_next_call and (timeout is None or time.time() <= timeout): + continue + break + + partition, message = result + _msg = (partition, message) if self.partition_info else message + messages.append(_msg) + new_offsets[partition] = message.offset + 1 + + # Update and commit offsets if necessary + self.offsets.update(new_offsets) + self.count_since_commit += len(messages) + self._auto_commit() + log.debug('got %d messages: %s', len(messages), messages) + return messages + + def get_message(self, block=True, timeout=0.1, get_partition_info=None): + return self._get_message(block, timeout, get_partition_info) + + def _get_message(self, block=True, timeout=0.1, get_partition_info=None, + update_offset=True): + """ + If no messages can be fetched, returns None. 
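A small usage sketch of the batch API above, assuming consumer is a SimpleConsumer and handle() is a made-up callback:

    # Fetch up to 200 messages, blocking for at most 2 seconds overall.
    for message in consumer.get_messages(count=200, block=True, timeout=2):
        handle(message)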
+ If get_partition_info is None, it defaults to self.partition_info + If get_partition_info is True, returns (partition, message) + If get_partition_info is False, returns message + """ + start_at = time.time() + while self.queue.empty(): + # We're out of messages, go grab some more. + log.debug('internal queue empty, fetching more messages') + with FetchContext(self, block, timeout): + self._fetch() + + if not block or time.time() > (start_at + timeout): + break + + try: + partition, message = self.queue.get_nowait() + + if update_offset: + # Update partition offset + self.offsets[partition] = message.offset + 1 + + # Count, check and commit messages if necessary + self.count_since_commit += 1 + self._auto_commit() + + if get_partition_info is None: + get_partition_info = self.partition_info + if get_partition_info: + return partition, message + else: + return message + except queue.Empty: + log.debug('internal queue empty after fetch - returning None') + return None + + def __iter__(self): + if self.iter_timeout is None: + timeout = ITER_TIMEOUT_SECONDS + else: + timeout = self.iter_timeout + + while True: + message = self.get_message(True, timeout) + if message: + yield message + elif self.iter_timeout is None: + # We did not receive any message yet but we don't have a + # timeout, so give up the CPU for a while before trying again + time.sleep(NO_MESSAGES_WAIT_TIME_SECONDS) + else: + # Timed out waiting for a message + break + + def _fetch(self): + # Create fetch request payloads for all the partitions + partitions = dict((p, self.buffer_size) + for p in self.fetch_offsets.keys()) + while partitions: + requests = [] + for partition, buffer_size in six.iteritems(partitions): + requests.append(FetchRequestPayload(self.topic, partition, + self.fetch_offsets[partition], + buffer_size)) + # Send request + responses = self.client.send_fetch_request( + requests, + max_wait_time=int(self.fetch_max_wait_time), + min_bytes=self.fetch_min_bytes, + fail_on_error=False + ) + + retry_partitions = {} + for resp in responses: + + try: + check_error(resp) + except UnknownTopicOrPartitionError: + log.error('UnknownTopicOrPartitionError for %s:%d', + resp.topic, resp.partition) + self.client.reset_topic_metadata(resp.topic) + raise + except NotLeaderForPartitionError: + log.error('NotLeaderForPartitionError for %s:%d', + resp.topic, resp.partition) + self.client.reset_topic_metadata(resp.topic) + continue + except OffsetOutOfRangeError: + log.warning('OffsetOutOfRangeError for %s:%d. 
' + 'Resetting partition offset...', + resp.topic, resp.partition) + self.reset_partition_offset(resp.partition) + # Retry this partition + retry_partitions[resp.partition] = partitions[resp.partition] + continue + except FailedPayloadsError as e: + log.warning('FailedPayloadsError for %s:%d', + e.payload.topic, e.payload.partition) + # Retry this partition + retry_partitions[e.payload.partition] = partitions[e.payload.partition] + continue + + partition = resp.partition + buffer_size = partitions[partition] + + # Check for partial message + if resp.messages and isinstance(resp.messages[-1].message, PartialMessage): + + # If buffer is at max and all we got was a partial message + # raise ConsumerFetchSizeTooSmall + if (self.max_buffer_size is not None and + buffer_size == self.max_buffer_size and + len(resp.messages) == 1): + + log.error('Max fetch size %d too small', self.max_buffer_size) + raise ConsumerFetchSizeTooSmall() + + if self.max_buffer_size is None: + buffer_size *= 2 + else: + buffer_size = min(buffer_size * 2, self.max_buffer_size) + log.warning('Fetch size too small, increase to %d (2x) ' + 'and retry', buffer_size) + retry_partitions[partition] = buffer_size + resp.messages.pop() + + for message in resp.messages: + if message.offset < self.fetch_offsets[partition]: + log.debug('Skipping message %s because its offset is less than the consumer offset', + message) + continue + # Put the message in our queue + self.queue.put((partition, message)) + self.fetch_offsets[partition] = message.offset + 1 + partitions = retry_partitions diff -Nru python-kafka-python-0.9.2/kafka/consumer/subscription_state.py python-kafka-python-1.0.1/kafka/consumer/subscription_state.py --- python-kafka-python-0.9.2/kafka/consumer/subscription_state.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/consumer/subscription_state.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,454 @@ +from __future__ import absolute_import + +import abc +import logging +import re + +import six + +from kafka.common import IllegalStateError, OffsetAndMetadata +from kafka.protocol.offset import OffsetResetStrategy + +log = logging.getLogger(__name__) + + +class SubscriptionState(object): + """ + A class for tracking the topics, partitions, and offsets for the consumer. + A partition is "assigned" either directly with assign_from_user() (manual + assignment) or with assign_from_subscribed() (automatic assignment from + subscription). + + Once assigned, the partition is not considered "fetchable" until its initial + position has been set with seek(). Fetchable partitions track a fetch + position which is used to set the offset of the next fetch, and a consumed + position which is the last offset that has been returned to the user. You + can suspend fetching from a partition through pause() without affecting the + fetched/consumed offsets. The partition will remain unfetchable until the + resume() is used. You can also query the pause state independently with + is_paused(). + + Note that pause state as well as fetch/consumed positions are not preserved + when partition assignment is changed whether directly by the user or + through a group rebalance. + + This class also maintains a cache of the latest commit position for each of + the assigned partitions. This is updated through committed() and can be used + to set the initial fetch position (e.g. Fetcher._reset_offset() ). 
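A minimal sketch of the manual-assignment path described above (this is an internal API; TopicPartition is assumed to come from kafka.common and the topic name is made up):

    from kafka.common import TopicPartition

    state = SubscriptionState(offset_reset_strategy='earliest')
    tp = TopicPartition('my-topic', 0)
    state.assign_from_user([tp])     # manual assignment, no group coordination
    state.seek(tp, 0)                # a partition becomes fetchable once positioned
    assert state.is_fetchable(tp)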
+ """ + _SUBSCRIPTION_EXCEPTION_MESSAGE = ("Subscription to topics, partitions and" + " pattern are mutually exclusive") + + def __init__(self, offset_reset_strategy='earliest'): + """Initialize a SubscriptionState instance + + Keyword Arguments: + offset_reset_strategy: 'earliest' or 'latest', otherwise + exception will be raised when fetching an offset that is no + longer available. Default: 'earliest' + """ + try: + offset_reset_strategy = getattr(OffsetResetStrategy, + offset_reset_strategy.upper()) + except AttributeError: + log.warning('Unrecognized offset_reset_strategy, using NONE') + offset_reset_strategy = OffsetResetStrategy.NONE + self._default_offset_reset_strategy = offset_reset_strategy + + self.subscription = None # set() or None + self.subscribed_pattern = None # regex str or None + self._group_subscription = set() + self._user_assignment = set() + self.assignment = dict() + self.needs_partition_assignment = False + self.listener = None + + # initialize to true for the consumers to fetch offset upon starting up + self.needs_fetch_committed_offsets = True + + def subscribe(self, topics=(), pattern=None, listener=None): + """Subscribe to a list of topics, or a topic regex pattern. + + Partitions will be dynamically assigned via a group coordinator. + Topic subscriptions are not incremental: this list will replace the + current assignment (if there is one). + + This method is incompatible with assign_from_user() + + Arguments: + topics (list): List of topics for subscription. + pattern (str): Pattern to match available topics. You must provide + either topics or pattern, but not both. + listener (ConsumerRebalanceListener): Optionally include listener + callback, which will be called before and after each rebalance + operation. + + As part of group management, the consumer will keep track of the + list of consumers that belong to a particular group and will + trigger a rebalance operation if one of the following events + trigger: + + * Number of partitions change for any of the subscribed topics + * Topic is created or deleted + * An existing member of the consumer group dies + * A new member is added to the consumer group + + When any of these events are triggered, the provided listener + will be invoked first to indicate that the consumer's assignment + has been revoked, and then again when the new assignment has + been received. Note that this listener will immediately override + any listener set in a previous call to subscribe. It is + guaranteed, however, that the partitions revoked/assigned + through this interface are from topics subscribed in this call. + """ + if self._user_assignment or (topics and pattern): + raise IllegalStateError(self._SUBSCRIPTION_EXCEPTION_MESSAGE) + assert topics or pattern, 'Must provide topics or pattern' + + if pattern: + log.info('Subscribing to pattern: /%s/', pattern) + self.subscription = set() + self.subscribed_pattern = re.compile(pattern) + else: + self.change_subscription(topics) + + if listener and not isinstance(listener, ConsumerRebalanceListener): + raise TypeError('listener must be a ConsumerRebalanceListener') + self.listener = listener + + def change_subscription(self, topics): + """Change the topic subscription. 
+ + Arguments: + topics (list of str): topics for subscription + + Raises: + IllegalStateErrror: if assign_from_user has been used already + """ + if self._user_assignment: + raise IllegalStateError(self._SUBSCRIPTION_EXCEPTION_MESSAGE) + + if self.subscription == set(topics): + log.warning("subscription unchanged by change_subscription(%s)", + topics) + return + + log.info('Updating subscribed topics to: %s', topics) + self.subscription = set(topics) + self._group_subscription.update(topics) + self.needs_partition_assignment = True + + # Remove any assigned partitions which are no longer subscribed to + for tp in set(self.assignment.keys()): + if tp.topic not in self.subscription: + del self.assignment[tp] + + def group_subscribe(self, topics): + """Add topics to the current group subscription. + + This is used by the group leader to ensure that it receives metadata + updates for all topics that any member of the group is subscribed to. + + Arguments: + topics (list of str): topics to add to the group subscription + """ + if self._user_assignment: + raise IllegalStateError(self._SUBSCRIPTION_EXCEPTION_MESSAGE) + self._group_subscription.update(topics) + + def mark_for_reassignment(self): + if self._user_assignment: + raise IllegalStateError(self._SUBSCRIPTION_EXCEPTION_MESSAGE) + assert self.subscription is not None, 'Subscription required' + self._group_subscription.intersection_update(self.subscription) + self.needs_partition_assignment = True + + def assign_from_user(self, partitions): + """Manually assign a list of TopicPartitions to this consumer. + + This interface does not allow for incremental assignment and will + replace the previous assignment (if there was one). + + Manual topic assignment through this method does not use the consumer's + group management functionality. As such, there will be no rebalance + operation triggered when group membership or cluster and topic metadata + change. Note that it is not possible to use both manual partition + assignment with assign() and group assignment with subscribe(). + + Arguments: + partitions (list of TopicPartition): assignment for this instance. + + Raises: + IllegalStateError: if consumer has already called subscribe() + """ + if self.subscription is not None: + raise IllegalStateError(self._SUBSCRIPTION_EXCEPTION_MESSAGE) + + self._user_assignment.clear() + self._user_assignment.update(partitions) + + for partition in partitions: + if partition not in self.assignment: + self._add_assigned_partition(partition) + + for tp in set(self.assignment.keys()) - self._user_assignment: + del self.assignment[tp] + + self.needs_partition_assignment = False + + def assign_from_subscribed(self, assignments): + """Update the assignment to the specified partitions + + This method is called by the coordinator to dynamically assign + partitions based on the consumer's topic subscription. This is different + from assign_from_user() which directly sets the assignment from a + user-supplied TopicPartition list. + + Arguments: + assignments (list of TopicPartition): partitions to assign to this + consumer instance. + """ + if self.subscription is None: + raise IllegalStateError(self._SUBSCRIPTION_EXCEPTION_MESSAGE) + + for tp in assignments: + if tp.topic not in self.subscription: + raise ValueError("Assigned partition %s for non-subscribed topic." 
% tp) + self.assignment.clear() + for tp in assignments: + self._add_assigned_partition(tp) + self.needs_partition_assignment = False + log.info("Updated partition assignment: %s", assignments) + + def unsubscribe(self): + """Clear all topic subscriptions and partition assignments""" + self.subscription = None + self._user_assignment.clear() + self.assignment.clear() + self.needs_partition_assignment = True + self.subscribed_pattern = None + + def group_subscription(self): + """Get the topic subscription for the group. + + For the leader, this will include the union of all member subscriptions. + For followers, it is the member's subscription only. + + This is used when querying topic metadata to detect metadata changes + that would require rebalancing (the leader fetches metadata for all + topics in the group so that it can do partition assignment). + + Returns: + set: topics + """ + return self._group_subscription + + def seek(self, partition, offset): + """Manually specify the fetch offset for a TopicPartition. + + Overrides the fetch offsets that the consumer will use on the next + poll(). If this API is invoked for the same partition more than once, + the latest offset will be used on the next poll(). Note that you may + lose data if this API is arbitrarily used in the middle of consumption, + to reset the fetch offsets. + + Arguments: + partition (TopicPartition): partition for seek operation + offset (int): message offset in partition + """ + self.assignment[partition].seek(offset) + + def assigned_partitions(self): + """Return set of TopicPartitions in current assignment.""" + return set(self.assignment.keys()) + + def fetchable_partitions(self): + """Return set of TopicPartitions that should be Fetched.""" + fetchable = set() + for partition, state in six.iteritems(self.assignment): + if state.is_fetchable(): + fetchable.add(partition) + return fetchable + + def partitions_auto_assigned(self): + """Return True unless user supplied partitions manually.""" + return self.subscription is not None + + def all_consumed_offsets(self): + """Returns consumed offsets as {TopicPartition: OffsetAndMetadata}""" + all_consumed = {} + for partition, state in six.iteritems(self.assignment): + if state.has_valid_position: + all_consumed[partition] = OffsetAndMetadata(state.position, '') + return all_consumed + + def need_offset_reset(self, partition, offset_reset_strategy=None): + """Mark partition for offset reset using specified or default strategy. 
+ + Arguments: + partition (TopicPartition): partition to mark + offset_reset_strategy (OffsetResetStrategy, optional) + """ + if offset_reset_strategy is None: + offset_reset_strategy = self._default_offset_reset_strategy + self.assignment[partition].await_reset(offset_reset_strategy) + + def has_default_offset_reset_policy(self): + """Return True if default offset reset policy is Earliest or Latest""" + return self._default_offset_reset_strategy != OffsetResetStrategy.NONE + + def is_offset_reset_needed(self, partition): + return self.assignment[partition].awaiting_reset + + def has_all_fetch_positions(self): + for state in self.assignment.values(): + if not state.has_valid_position: + return False + return True + + def missing_fetch_positions(self): + missing = set() + for partition, state in six.iteritems(self.assignment): + if not state.has_valid_position: + missing.add(partition) + return missing + + def is_assigned(self, partition): + return partition in self.assignment + + def is_paused(self, partition): + return partition in self.assignment and self.assignment[partition].paused + + def is_fetchable(self, partition): + return partition in self.assignment and self.assignment[partition].is_fetchable() + + def pause(self, partition): + self.assignment[partition].pause() + + def resume(self, partition): + self.assignment[partition].resume() + + def _add_assigned_partition(self, partition): + self.assignment[partition] = TopicPartitionState() + + +class TopicPartitionState(object): + def __init__(self): + self.committed = None # last committed position + self.has_valid_position = False # whether we have valid position + self.paused = False # whether this partition has been paused by the user + self.awaiting_reset = False # whether we are awaiting reset + self.reset_strategy = None # the reset strategy if awaitingReset is set + self._position = None # offset exposed to the user + self.highwater = None + + def _set_position(self, offset): + assert self.has_valid_position, 'Valid position required' + self._position = offset + + def _get_position(self): + return self._position + + position = property(_get_position, _set_position, None, "last position") + + def await_reset(self, strategy): + self.awaiting_reset = True + self.reset_strategy = strategy + self._position = None + self.has_valid_position = False + + def seek(self, offset): + self._position = offset + self.awaiting_reset = False + self.reset_strategy = None + self.has_valid_position = True + + def pause(self): + self.paused = True + + def resume(self): + self.paused = False + + def is_fetchable(self): + return not self.paused and self.has_valid_position + + +class ConsumerRebalanceListener(object): + """ + A callback interface that the user can implement to trigger custom actions + when the set of partitions assigned to the consumer changes. + + This is applicable when the consumer is having Kafka auto-manage group + membership. If the consumer's directly assign partitions, those + partitions will never be reassigned and this callback is not applicable. + + When Kafka is managing the group membership, a partition re-assignment will + be triggered any time the members of the group changes or the subscription + of the members changes. This can occur when processes die, new process + instances are added or old instances come back to life after failure. + Rebalances can also be triggered by changes affecting the subscribed + topics (e.g. when then number of partitions is administratively adjusted). 
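For instance, a listener that checkpoints offsets to an external store might look like the sketch below; consumer is assumed to be a KafkaConsumer and the dict-backed store is purely illustrative:

    class CheckpointListener(ConsumerRebalanceListener):
        def __init__(self, consumer, store):
            self.consumer = consumer
            self.store = store                # e.g. a dict, a database table, ...

        def on_partitions_revoked(self, revoked):
            # Save positions before ownership can move to another group member.
            for tp in revoked:
                self.store[tp] = self.consumer.position(tp)

        def on_partitions_assigned(self, assigned):
            # Restore any previously saved positions before fetching resumes.
            for tp in assigned:
                if tp in self.store:
                    self.consumer.seek(tp, self.store[tp])

    offsets = {}
    consumer.subscribe(['page-views'], listener=CheckpointListener(consumer, offsets))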
+ + There are many uses for this functionality. One common use is saving offsets + in a custom store. By saving offsets in the on_partitions_revoked(), call we + can ensure that any time partition assignment changes the offset gets saved. + + Another use is flushing out any kind of cache of intermediate results the + consumer may be keeping. For example, consider a case where the consumer is + subscribed to a topic containing user page views, and the goal is to count + the number of page views per users for each five minute window. Let's say + the topic is partitioned by the user id so that all events for a particular + user will go to a single consumer instance. The consumer can keep in memory + a running tally of actions per user and only flush these out to a remote + data store when its cache gets too big. However if a partition is reassigned + it may want to automatically trigger a flush of this cache, before the new + owner takes over consumption. + + This callback will execute in the user thread as part of the Consumer.poll() + whenever partition assignment changes. + + It is guaranteed that all consumer processes will invoke + on_partitions_revoked() prior to any process invoking + on_partitions_assigned(). So if offsets or other state is saved in the + on_partitions_revoked() call, it should be saved by the time the process + taking over that partition has their on_partitions_assigned() callback + called to load the state. + """ + __metaclass__ = abc.ABCMeta + + @abc.abstractmethod + def on_partitions_revoked(self, revoked): + """ + A callback method the user can implement to provide handling of offset + commits to a customized store on the start of a rebalance operation. + This method will be called before a rebalance operation starts and + after the consumer stops fetching data. It is recommended that offsets + should be committed in this callback to either Kafka or a custom offset + store to prevent duplicate data. + + NOTE: This method is only called before rebalances. It is not called + prior to KafkaConsumer.close() + + Arguments: + revoked (list of TopicPartition): the partitions that were assigned + to the consumer on the last rebalance + """ + pass + + @abc.abstractmethod + def on_partitions_assigned(self, assigned): + """ + A callback method the user can implement to provide handling of + customized offsets on completion of a successful partition + re-assignment. This method will be called after an offset re-assignment + completes and before the consumer starts fetching data. + + It is guaranteed that all the processes in a consumer group will execute + their on_partitions_revoked() callback before any instance executes its + on_partitions_assigned() callback. 
+ + Arguments: + assigned (list of TopicPartition): the partitions assigned to the + consumer (may include partitions that were previously assigned) + """ + pass diff -Nru python-kafka-python-0.9.2/kafka/consumer.py python-kafka-python-1.0.1/kafka/consumer.py --- python-kafka-python-0.9.2/kafka/consumer.py 2014-08-27 21:24:39.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/consumer.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,690 +0,0 @@ -from __future__ import absolute_import - -from itertools import izip_longest, repeat -import logging -import time -import numbers -from threading import Lock -from multiprocessing import Process, Queue as MPQueue, Event, Value -from Queue import Empty, Queue - -import kafka -from kafka.common import ( - FetchRequest, - OffsetRequest, OffsetCommitRequest, - OffsetFetchRequest, - ConsumerFetchSizeTooSmall, ConsumerNoMoreData -) - -from kafka.util import ReentrantTimer - -log = logging.getLogger("kafka") - -AUTO_COMMIT_MSG_COUNT = 100 -AUTO_COMMIT_INTERVAL = 5000 - -FETCH_DEFAULT_BLOCK_TIMEOUT = 1 -FETCH_MAX_WAIT_TIME = 100 -FETCH_MIN_BYTES = 4096 -FETCH_BUFFER_SIZE_BYTES = 4096 -MAX_FETCH_BUFFER_SIZE_BYTES = FETCH_BUFFER_SIZE_BYTES * 8 - -ITER_TIMEOUT_SECONDS = 60 -NO_MESSAGES_WAIT_TIME_SECONDS = 0.1 - - -class FetchContext(object): - """ - Class for managing the state of a consumer during fetch - """ - def __init__(self, consumer, block, timeout): - self.consumer = consumer - self.block = block - - if block: - if not timeout: - timeout = FETCH_DEFAULT_BLOCK_TIMEOUT - self.timeout = timeout * 1000 - - def __enter__(self): - """Set fetch values based on blocking status""" - self.orig_fetch_max_wait_time = self.consumer.fetch_max_wait_time - self.orig_fetch_min_bytes = self.consumer.fetch_min_bytes - if self.block: - self.consumer.fetch_max_wait_time = self.timeout - self.consumer.fetch_min_bytes = 1 - else: - self.consumer.fetch_min_bytes = 0 - - def __exit__(self, type, value, traceback): - """Reset values""" - self.consumer.fetch_max_wait_time = self.orig_fetch_max_wait_time - self.consumer.fetch_min_bytes = self.orig_fetch_min_bytes - - -class Consumer(object): - """ - Base class to be used by other consumers. 
Not to be used directly - - This base class provides logic for - * initialization and fetching metadata of partitions - * Auto-commit logic - * APIs for fetching pending message count - """ - def __init__(self, client, group, topic, partitions=None, auto_commit=True, - auto_commit_every_n=AUTO_COMMIT_MSG_COUNT, - auto_commit_every_t=AUTO_COMMIT_INTERVAL): - - self.client = client - self.topic = topic - self.group = group - self.client.load_metadata_for_topics(topic) - self.offsets = {} - - if not partitions: - partitions = self.client.topic_partitions[topic] - else: - assert all(isinstance(x, numbers.Integral) for x in partitions) - - # Variables for handling offset commits - self.commit_lock = Lock() - self.commit_timer = None - self.count_since_commit = 0 - self.auto_commit = auto_commit - self.auto_commit_every_n = auto_commit_every_n - self.auto_commit_every_t = auto_commit_every_t - - # Set up the auto-commit timer - if auto_commit is True and auto_commit_every_t is not None: - self.commit_timer = ReentrantTimer(auto_commit_every_t, - self.commit) - self.commit_timer.start() - - if auto_commit: - self.fetch_last_known_offsets(partitions) - else: - for partition in partitions: - self.offsets[partition] = 0 - - def fetch_last_known_offsets(self, partitions=None): - if not partitions: - partitions = self.client.topic_partitions[self.topic] - - def get_or_init_offset_callback(resp): - try: - kafka.common.check_error(resp) - return resp.offset - except kafka.common.UnknownTopicOrPartitionError: - return 0 - - for partition in partitions: - req = OffsetFetchRequest(self.topic, partition) - (offset,) = self.client.send_offset_fetch_request(self.group, [req], - callback=get_or_init_offset_callback, - fail_on_error=False) - self.offsets[partition] = offset - self.fetch_offsets = self.offsets.copy() - - def commit(self, partitions=None): - """ - Commit offsets for this consumer - - partitions: list of partitions to commit, default is to commit - all of them - """ - - # short circuit if nothing happened. 
This check is kept outside - # to prevent un-necessarily acquiring a lock for checking the state - if self.count_since_commit == 0: - return - - with self.commit_lock: - # Do this check again, just in case the state has changed - # during the lock acquiring timeout - if self.count_since_commit == 0: - return - - reqs = [] - if not partitions: # commit all partitions - partitions = self.offsets.keys() - - for partition in partitions: - offset = self.offsets[partition] - log.debug("Commit offset %d in SimpleConsumer: " - "group=%s, topic=%s, partition=%s" % - (offset, self.group, self.topic, partition)) - - reqs.append(OffsetCommitRequest(self.topic, partition, - offset, None)) - - resps = self.client.send_offset_commit_request(self.group, reqs) - for resp in resps: - kafka.common.check_error(resp) - - self.count_since_commit = 0 - - def _auto_commit(self): - """ - Check if we have to commit based on number of messages and commit - """ - - # Check if we are supposed to do an auto-commit - if not self.auto_commit or self.auto_commit_every_n is None: - return - - if self.count_since_commit >= self.auto_commit_every_n: - self.commit() - - def stop(self): - if self.commit_timer is not None: - self.commit_timer.stop() - self.commit() - - def pending(self, partitions=None): - """ - Gets the pending message count - - partitions: list of partitions to check for, default is to check all - """ - if not partitions: - partitions = self.offsets.keys() - - total = 0 - reqs = [] - - for partition in partitions: - reqs.append(OffsetRequest(self.topic, partition, -1, 1)) - - resps = self.client.send_offset_request(reqs) - for resp in resps: - partition = resp.partition - pending = resp.offsets[0] - offset = self.offsets[partition] - total += pending - offset - (1 if offset > 0 else 0) - - return total - - -class SimpleConsumer(Consumer): - """ - A simple consumer implementation that consumes all/specified partitions - for a topic - - client: a connected KafkaClient - group: a name for this consumer, used for offset storage and must be unique - topic: the topic to consume - partitions: An optional list of partitions to consume the data from - - auto_commit: default True. Whether or not to auto commit the offsets - auto_commit_every_n: default 100. How many messages to consume - before a commit - auto_commit_every_t: default 5000. How much time (in milliseconds) to - wait before commit - fetch_size_bytes: number of bytes to request in a FetchRequest - buffer_size: default 4K. Initial number of bytes to tell kafka we - have available. This will double as needed. - max_buffer_size: default 16K. Max number of bytes to tell kafka we have - available. None means no limit. - iter_timeout: default None. How much time (in seconds) to wait for a - message in the iterator before exiting. None means no - timeout, so it will wait forever. - - Auto commit details: - If both auto_commit_every_n and auto_commit_every_t are set, they will - reset one another when one is triggered. These triggers simply call the - commit method on this class. 
A manual call to commit will also reset - these triggers - """ - def __init__(self, client, group, topic, auto_commit=True, partitions=None, - auto_commit_every_n=AUTO_COMMIT_MSG_COUNT, - auto_commit_every_t=AUTO_COMMIT_INTERVAL, - fetch_size_bytes=FETCH_MIN_BYTES, - buffer_size=FETCH_BUFFER_SIZE_BYTES, - max_buffer_size=MAX_FETCH_BUFFER_SIZE_BYTES, - iter_timeout=None): - super(SimpleConsumer, self).__init__( - client, group, topic, - partitions=partitions, - auto_commit=auto_commit, - auto_commit_every_n=auto_commit_every_n, - auto_commit_every_t=auto_commit_every_t) - - if max_buffer_size is not None and buffer_size > max_buffer_size: - raise ValueError("buffer_size (%d) is greater than " - "max_buffer_size (%d)" % - (buffer_size, max_buffer_size)) - self.buffer_size = buffer_size - self.max_buffer_size = max_buffer_size - self.partition_info = False # Do not return partition info in msgs - self.fetch_max_wait_time = FETCH_MAX_WAIT_TIME - self.fetch_min_bytes = fetch_size_bytes - self.fetch_offsets = self.offsets.copy() - self.iter_timeout = iter_timeout - self.queue = Queue() - - def __repr__(self): - return '' % \ - (self.group, self.topic, str(self.offsets.keys())) - - def provide_partition_info(self): - """ - Indicates that partition info must be returned by the consumer - """ - self.partition_info = True - - def seek(self, offset, whence): - """ - Alter the current offset in the consumer, similar to fseek - - offset: how much to modify the offset - whence: where to modify it from - 0 is relative to the earliest available offset (head) - 1 is relative to the current offset - 2 is relative to the latest known offset (tail) - """ - - if whence == 1: # relative to current position - for partition, _offset in self.offsets.items(): - self.offsets[partition] = _offset + offset - elif whence in (0, 2): # relative to beginning or end - # divide the request offset by number of partitions, - # distribute the remained evenly - (delta, rem) = divmod(offset, len(self.offsets)) - deltas = {} - for partition, r in izip_longest(self.offsets.keys(), - repeat(1, rem), fillvalue=0): - deltas[partition] = delta + r - - reqs = [] - for partition in self.offsets.keys(): - if whence == 0: - reqs.append(OffsetRequest(self.topic, partition, -2, 1)) - elif whence == 2: - reqs.append(OffsetRequest(self.topic, partition, -1, 1)) - else: - pass - - resps = self.client.send_offset_request(reqs) - for resp in resps: - self.offsets[resp.partition] = \ - resp.offsets[0] + deltas[resp.partition] - else: - raise ValueError("Unexpected value for `whence`, %d" % whence) - - # Reset queue and fetch offsets since they are invalid - self.fetch_offsets = self.offsets.copy() - if self.auto_commit: - self.count_since_commit += 1 - self.commit() - - self.queue = Queue() - - def get_messages(self, count=1, block=True, timeout=0.1): - """ - Fetch the specified number of messages - - count: Indicates the maximum number of messages to be fetched - block: If True, the API will block till some messages are fetched. - timeout: If block is True, the function will block for the specified - time (in seconds) until count messages is fetched. If None, - it will block forever. 
- """ - messages = [] - if timeout is not None: - max_time = time.time() + timeout - - new_offsets = {} - while count > 0 and (timeout is None or timeout > 0): - result = self._get_message(block, timeout, get_partition_info=True, - update_offset=False) - if result: - partition, message = result - if self.partition_info: - messages.append(result) - else: - messages.append(message) - new_offsets[partition] = message.offset + 1 - count -= 1 - else: - # Ran out of messages for the last request. - if not block: - # If we're not blocking, break. - break - if timeout is not None: - # If we're blocking and have a timeout, reduce it to the - # appropriate value - timeout = max_time - time.time() - - # Update and commit offsets if necessary - self.offsets.update(new_offsets) - self.count_since_commit += len(messages) - self._auto_commit() - return messages - - def get_message(self, block=True, timeout=0.1, get_partition_info=None): - return self._get_message(block, timeout, get_partition_info) - - def _get_message(self, block=True, timeout=0.1, get_partition_info=None, - update_offset=True): - """ - If no messages can be fetched, returns None. - If get_partition_info is None, it defaults to self.partition_info - If get_partition_info is True, returns (partition, message) - If get_partition_info is False, returns message - """ - if self.queue.empty(): - # We're out of messages, go grab some more. - with FetchContext(self, block, timeout): - self._fetch() - try: - partition, message = self.queue.get_nowait() - - if update_offset: - # Update partition offset - self.offsets[partition] = message.offset + 1 - - # Count, check and commit messages if necessary - self.count_since_commit += 1 - self._auto_commit() - - if get_partition_info is None: - get_partition_info = self.partition_info - if get_partition_info: - return partition, message - else: - return message - except Empty: - return None - - def __iter__(self): - if self.iter_timeout is None: - timeout = ITER_TIMEOUT_SECONDS - else: - timeout = self.iter_timeout - - while True: - message = self.get_message(True, timeout) - if message: - yield message - elif self.iter_timeout is None: - # We did not receive any message yet but we don't have a - # timeout, so give up the CPU for a while before trying again - time.sleep(NO_MESSAGES_WAIT_TIME_SECONDS) - else: - # Timed out waiting for a message - break - - def _fetch(self): - # Create fetch request payloads for all the partitions - partitions = dict((p, self.buffer_size) - for p in self.fetch_offsets.keys()) - while partitions: - requests = [] - for partition, buffer_size in partitions.iteritems(): - requests.append(FetchRequest(self.topic, partition, - self.fetch_offsets[partition], - buffer_size)) - # Send request - responses = self.client.send_fetch_request( - requests, - max_wait_time=int(self.fetch_max_wait_time), - min_bytes=self.fetch_min_bytes) - - retry_partitions = {} - for resp in responses: - partition = resp.partition - buffer_size = partitions[partition] - try: - for message in resp.messages: - # Put the message in our queue - self.queue.put((partition, message)) - self.fetch_offsets[partition] = message.offset + 1 - except ConsumerFetchSizeTooSmall: - if (self.max_buffer_size is not None and - buffer_size == self.max_buffer_size): - log.error("Max fetch size %d too small", - self.max_buffer_size) - raise - if self.max_buffer_size is None: - buffer_size *= 2 - else: - buffer_size = max(buffer_size * 2, - self.max_buffer_size) - log.warn("Fetch size too small, increase to %d (2x) " - "and 
retry", buffer_size) - retry_partitions[partition] = buffer_size - except ConsumerNoMoreData as e: - log.debug("Iteration was ended by %r", e) - except StopIteration: - # Stop iterating through this partition - log.debug("Done iterating over partition %s" % partition) - partitions = retry_partitions - -def _mp_consume(client, group, topic, chunk, queue, start, exit, pause, size): - """ - A child process worker which consumes messages based on the - notifications given by the controller process - - NOTE: Ideally, this should have been a method inside the Consumer - class. However, multiprocessing module has issues in windows. The - functionality breaks unless this function is kept outside of a class - """ - - # Make the child processes open separate socket connections - client.reinit() - - # We will start consumers without auto-commit. Auto-commit will be - # done by the master controller process. - consumer = SimpleConsumer(client, group, topic, - partitions=chunk, - auto_commit=False, - auto_commit_every_n=None, - auto_commit_every_t=None) - - # Ensure that the consumer provides the partition information - consumer.provide_partition_info() - - while True: - # Wait till the controller indicates us to start consumption - start.wait() - - # If we are asked to quit, do so - if exit.is_set(): - break - - # Consume messages and add them to the queue. If the controller - # indicates a specific number of messages, follow that advice - count = 0 - - message = consumer.get_message() - if message: - queue.put(message) - count += 1 - - # We have reached the required size. The controller might have - # more than what he needs. Wait for a while. - # Without this logic, it is possible that we run into a big - # loop consuming all available messages before the controller - # can reset the 'start' event - if count == size.value: - pause.wait() - - else: - # In case we did not receive any message, give up the CPU for - # a while before we try again - time.sleep(NO_MESSAGES_WAIT_TIME_SECONDS) - - consumer.stop() - - -class MultiProcessConsumer(Consumer): - """ - A consumer implementation that consumes partitions for a topic in - parallel using multiple processes - - client: a connected KafkaClient - group: a name for this consumer, used for offset storage and must be unique - topic: the topic to consume - - auto_commit: default True. Whether or not to auto commit the offsets - auto_commit_every_n: default 100. How many messages to consume - before a commit - auto_commit_every_t: default 5000. How much time (in milliseconds) to - wait before commit - num_procs: Number of processes to start for consuming messages. - The available partitions will be divided among these processes - partitions_per_proc: Number of partitions to be allocated per process - (overrides num_procs) - - Auto commit details: - If both auto_commit_every_n and auto_commit_every_t are set, they will - reset one another when one is triggered. These triggers simply call the - commit method on this class. 
A manual call to commit will also reset - these triggers - """ - def __init__(self, client, group, topic, auto_commit=True, - auto_commit_every_n=AUTO_COMMIT_MSG_COUNT, - auto_commit_every_t=AUTO_COMMIT_INTERVAL, - num_procs=1, partitions_per_proc=0): - - # Initiate the base consumer class - super(MultiProcessConsumer, self).__init__( - client, group, topic, - partitions=None, - auto_commit=auto_commit, - auto_commit_every_n=auto_commit_every_n, - auto_commit_every_t=auto_commit_every_t) - - # Variables for managing and controlling the data flow from - # consumer child process to master - self.queue = MPQueue(1024) # Child consumers dump messages into this - self.start = Event() # Indicates the consumers to start fetch - self.exit = Event() # Requests the consumers to shutdown - self.pause = Event() # Requests the consumers to pause fetch - self.size = Value('i', 0) # Indicator of number of messages to fetch - - partitions = self.offsets.keys() - - # If unspecified, start one consumer per partition - # The logic below ensures that - # * we do not cross the num_procs limit - # * we have an even distribution of partitions among processes - if not partitions_per_proc: - partitions_per_proc = round(len(partitions) * 1.0 / num_procs) - if partitions_per_proc < num_procs * 0.5: - partitions_per_proc += 1 - - # The final set of chunks - chunker = lambda *x: [] + list(x) - chunks = map(chunker, *[iter(partitions)] * int(partitions_per_proc)) - - self.procs = [] - for chunk in chunks: - chunk = filter(lambda x: x is not None, chunk) - args = (client.copy(), - group, topic, chunk, - self.queue, self.start, self.exit, - self.pause, self.size) - - proc = Process(target=_mp_consume, args=args) - proc.daemon = True - proc.start() - self.procs.append(proc) - - def __repr__(self): - return '' % \ - (self.group, self.topic, len(self.procs)) - - def stop(self): - # Set exit and start off all waiting consumers - self.exit.set() - self.pause.set() - self.start.set() - - for proc in self.procs: - proc.join() - proc.terminate() - - super(MultiProcessConsumer, self).stop() - - def __iter__(self): - """ - Iterator to consume the messages available on this consumer - """ - # Trigger the consumer procs to start off. - # We will iterate till there are no more messages available - self.size.value = 0 - self.pause.set() - - while True: - self.start.set() - try: - # We will block for a small while so that the consumers get - # a chance to run and put some messages in the queue - # TODO: This is a hack and will make the consumer block for - # at least one second. Need to find a better way of doing this - partition, message = self.queue.get(block=True, timeout=1) - except Empty: - break - - # Count, check and commit messages if necessary - self.offsets[partition] = message.offset + 1 - self.start.clear() - self.count_since_commit += 1 - self._auto_commit() - yield message - - self.start.clear() - - def get_messages(self, count=1, block=True, timeout=10): - """ - Fetch the specified number of messages - - count: Indicates the maximum number of messages to be fetched - block: If True, the API will block till some messages are fetched. - timeout: If block is True, the function will block for the specified - time (in seconds) until count messages is fetched. If None, - it will block forever. - """ - messages = [] - - # Give a size hint to the consumers. Each consumer process will fetch - # a maximum of "count" messages. This will fetch more messages than - # necessary, but these will not be committed to kafka. 
Also, the extra - # messages can be provided in subsequent runs - self.size.value = count - self.pause.clear() - - if timeout is not None: - max_time = time.time() + timeout - - new_offsets = {} - while count > 0 and (timeout is None or timeout > 0): - # Trigger consumption only if the queue is empty - # By doing this, we will ensure that consumers do not - # go into overdrive and keep consuming thousands of - # messages when the user might need only a few - if self.queue.empty(): - self.start.set() - - try: - partition, message = self.queue.get(block, timeout) - except Empty: - break - - messages.append(message) - new_offsets[partition] = message.offset + 1 - count -= 1 - if timeout is not None: - timeout = max_time - time.time() - - self.size.value = 0 - self.start.clear() - self.pause.set() - - # Update and commit offsets if necessary - self.offsets.update(new_offsets) - self.count_since_commit += len(messages) - self._auto_commit() - - return messages diff -Nru python-kafka-python-0.9.2/kafka/context.py python-kafka-python-1.0.1/kafka/context.py --- python-kafka-python-0.9.2/kafka/context.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/context.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,175 @@ +""" +Context manager to commit/rollback consumer offsets. +""" +from logging import getLogger + +from kafka.common import check_error, OffsetCommitRequestPayload, OffsetOutOfRangeError + + +class OffsetCommitContext(object): + """ + Provides commit/rollback semantics around a `SimpleConsumer`. + + Usage assumes that `auto_commit` is disabled, that messages are consumed in + batches, and that the consuming process will record its own successful + processing of each message. Both the commit and rollback operations respect + a "high-water mark" to ensure that last unsuccessfully processed message + will be retried. + + Example: + + .. code:: python + + consumer = SimpleConsumer(client, group, topic, auto_commit=False) + consumer.provide_partition_info() + consumer.fetch_last_known_offsets() + + while some_condition: + with OffsetCommitContext(consumer) as context: + messages = consumer.get_messages(count, block=False) + + for partition, message in messages: + if can_process(message): + context.mark(partition, message.offset) + else: + break + + if not context: + sleep(delay) + + + These semantics allow for deferred message processing (e.g. if `can_process` + compares message time to clock time) and for repeated processing of the last + unsuccessful message (until some external error is resolved). + """ + + def __init__(self, consumer): + """ + :param consumer: an instance of `SimpleConsumer` + """ + self.consumer = consumer + self.initial_offsets = None + self.high_water_mark = None + self.logger = getLogger("kafka.context") + + def mark(self, partition, offset): + """ + Set the high-water mark in the current context. + + In order to know the current partition, it is helpful to initialize + the consumer to provide partition info via: + + .. code:: python + + consumer.provide_partition_info() + + """ + max_offset = max(offset + 1, self.high_water_mark.get(partition, 0)) + + self.logger.debug("Setting high-water mark to: %s", + {partition: max_offset}) + + self.high_water_mark[partition] = max_offset + + def __nonzero__(self): + """ + Return whether any operations were marked in the context. 
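        For example, callers can use the truthiness of the context after the
        block exits to decide whether to back off before polling again
        (mirroring the class-level example above):

        .. code:: python

            with OffsetCommitContext(consumer) as context:
                for partition, message in consumer.get_messages(count, block=False):
                    context.mark(partition, message.offset)

            if not context:   # nothing was marked, so nothing was committed
                sleep(delay)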
+ """ + return bool(self.high_water_mark) + + def __enter__(self): + """ + Start a new context: + + - Record the initial offsets for rollback + - Reset the high-water mark + """ + self.initial_offsets = dict(self.consumer.offsets) + self.high_water_mark = dict() + + self.logger.debug("Starting context at: %s", self.initial_offsets) + + return self + + def __exit__(self, exc_type, exc_value, traceback): + """ + End a context. + + - If there was no exception, commit up to the current high-water mark. + - If there was an offset of range error, attempt to find the correct + initial offset. + - If there was any other error, roll back to the initial offsets. + """ + if exc_type is None: + self.commit() + elif isinstance(exc_value, OffsetOutOfRangeError): + self.handle_out_of_range() + return True + else: + self.rollback() + + def commit(self): + """ + Commit this context's offsets: + + - If the high-water mark has moved, commit up to and position the + consumer at the high-water mark. + - Otherwise, reset to the consumer to the initial offsets. + """ + if self.high_water_mark: + self.logger.info("Committing offsets: %s", self.high_water_mark) + self.commit_partition_offsets(self.high_water_mark) + self.update_consumer_offsets(self.high_water_mark) + else: + self.update_consumer_offsets(self.initial_offsets) + + def rollback(self): + """ + Rollback this context: + + - Position the consumer at the initial offsets. + """ + self.logger.info("Rolling back context: %s", self.initial_offsets) + self.update_consumer_offsets(self.initial_offsets) + + def commit_partition_offsets(self, partition_offsets): + """ + Commit explicit partition/offset pairs. + """ + self.logger.debug("Committing partition offsets: %s", partition_offsets) + + commit_requests = [ + OffsetCommitRequestPayload(self.consumer.topic, partition, offset, None) + for partition, offset in partition_offsets.items() + ] + commit_responses = self.consumer.client.send_offset_commit_request( + self.consumer.group, + commit_requests, + ) + for commit_response in commit_responses: + check_error(commit_response) + + def update_consumer_offsets(self, partition_offsets): + """ + Update consumer offsets to explicit positions. + """ + self.logger.debug("Updating consumer offsets to: %s", partition_offsets) + + for partition, offset in partition_offsets.items(): + self.consumer.offsets[partition] = offset + + # consumer keeps other offset states beyond its `offsets` dictionary, + # a relative seek with zero delta forces the consumer to reset to the + # current value of the `offsets` dictionary + self.consumer.seek(0, 1) + + def handle_out_of_range(self): + """ + Handle out of range condition by seeking to the beginning of valid + ranges. + + This assumes that an out of range doesn't happen by seeking past the end + of valid ranges -- which is far less likely. 
+ """ + self.logger.info("Seeking beginning of partition on out of range error") + self.consumer.seek(0, 0) diff -Nru python-kafka-python-0.9.2/kafka/coordinator/assignors/abstract.py python-kafka-python-1.0.1/kafka/coordinator/assignors/abstract.py --- python-kafka-python-0.9.2/kafka/coordinator/assignors/abstract.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/coordinator/assignors/abstract.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,54 @@ +import abc +import logging + +log = logging.getLogger(__name__) + + +class AbstractPartitionAssignor(object): + """ + Abstract assignor implementation which does some common grunt work (in particular collecting + partition counts which are always needed in assignors). + """ + + @abc.abstractproperty + def name(self): + """.name should be a string identifying the assignor""" + pass + + @abc.abstractmethod + def assign(self, cluster, members): + """Perform group assignment given cluster metadata and member subscriptions + + Arguments: + cluster (ClusterMetadata): metadata for use in assignment + members (dict of {member_id: MemberMetadata}): decoded metadata for + each member in the group. + + Returns: + dict: {member_id: MemberAssignment} + """ + pass + + @abc.abstractmethod + def metadata(self, topics): + """Generate ProtocolMetadata to be submitted via JoinGroupRequest. + + Arguments: + topics (set): a member's subscribed topics + + Returns: + MemberMetadata struct + """ + pass + + @abc.abstractmethod + def on_assignment(self, assignment): + """Callback that runs on each assignment. + + This method can be used to update internal state, if any, of the + partition assignor. + + Arguments: + assignment (MemberAssignment): the member's assignment + """ + pass diff -Nru python-kafka-python-0.9.2/kafka/coordinator/assignors/range.py python-kafka-python-1.0.1/kafka/coordinator/assignors/range.py --- python-kafka-python-0.9.2/kafka/coordinator/assignors/range.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/coordinator/assignors/range.py 2016-02-18 16:38:17.000000000 +0000 @@ -0,0 +1,77 @@ +import collections +import logging + +import six + +from .abstract import AbstractPartitionAssignor +from ..protocol import ConsumerProtocolMemberMetadata, ConsumerProtocolMemberAssignment + +log = logging.getLogger(__name__) + + +class RangePartitionAssignor(AbstractPartitionAssignor): + """ + The range assignor works on a per-topic basis. For each topic, we lay out + the available partitions in numeric order and the consumers in + lexicographic order. We then divide the number of partitions by the total + number of consumers to determine the number of partitions to assign to each + consumer. If it does not evenly divide, then the first few consumers will + have one extra partition. + + For example, suppose there are two consumers C0 and C1, two topics t0 and + t1, and each topic has 3 partitions, resulting in partitions t0p0, t0p1, + t0p2, t1p0, t1p1, and t1p2. 
+ + The assignment will be: + C0: [t0p0, t0p1, t1p0, t1p1] + C1: [t0p2, t1p2] + """ + name = 'range' + version = 0 + + @classmethod + def assign(cls, cluster, member_metadata): + consumers_per_topic = collections.defaultdict(list) + for member, metadata in six.iteritems(member_metadata): + for topic in metadata.subscription: + consumers_per_topic[topic].append(member) + + # construct {member_id: {topic: [partition, ...]}} + assignment = collections.defaultdict(dict) + + for topic, consumers_for_topic in six.iteritems(consumers_per_topic): + partitions = cluster.partitions_for_topic(topic) + if partitions is None: + log.warning('No partition metadata for topic %s', topic) + continue + partitions = sorted(list(partitions)) + partitions_for_topic = len(partitions) + consumers_for_topic.sort() + + partitions_per_consumer = len(partitions) // len(consumers_for_topic) + consumers_with_extra = len(partitions) % len(consumers_for_topic) + + for i in range(len(consumers_for_topic)): + start = partitions_per_consumer * i + start += min(i, consumers_with_extra) + length = partitions_per_consumer + if not i + 1 > consumers_with_extra: + length += 1 + member = consumers_for_topic[i] + assignment[member][topic] = partitions[start:start+length] + + protocol_assignment = {} + for member_id in member_metadata: + protocol_assignment[member_id] = ConsumerProtocolMemberAssignment( + cls.version, + sorted(assignment[member_id].items()), + b'') + return protocol_assignment + + @classmethod + def metadata(cls, topics): + return ConsumerProtocolMemberMetadata(cls.version, list(topics), b'') + + @classmethod + def on_assignment(cls, assignment): + pass diff -Nru python-kafka-python-0.9.2/kafka/coordinator/assignors/roundrobin.py python-kafka-python-1.0.1/kafka/coordinator/assignors/roundrobin.py --- python-kafka-python-0.9.2/kafka/coordinator/assignors/roundrobin.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/coordinator/assignors/roundrobin.py 2016-02-18 16:38:17.000000000 +0000 @@ -0,0 +1,79 @@ +import collections +import itertools +import logging + +import six + +from .abstract import AbstractPartitionAssignor +from ...common import TopicPartition +from ..protocol import ConsumerProtocolMemberMetadata, ConsumerProtocolMemberAssignment + +log = logging.getLogger(__name__) + + +class RoundRobinPartitionAssignor(AbstractPartitionAssignor): + """ + The roundrobin assignor lays out all the available partitions and all the + available consumers. It then proceeds to do a roundrobin assignment from + partition to consumer. If the subscriptions of all consumer instances are + identical, then the partitions will be uniformly distributed. (i.e., the + partition ownership counts will be within a delta of exactly one across all + consumers.) + + For example, suppose there are two consumers C0 and C1, two topics t0 and + t1, and each topic has 3 partitions, resulting in partitions t0p0, t0p1, + t0p2, t1p0, t1p1, and t1p2. 
+ + The assignment will be: + C0: [t0p0, t0p2, t1p1] + C1: [t0p1, t1p0, t1p2] + """ + name = 'roundrobin' + version = 0 + + @classmethod + def assign(cls, cluster, member_metadata): + all_topics = set() + for metadata in six.itervalues(member_metadata): + all_topics.update(metadata.subscription) + + all_topic_partitions = [] + for topic in all_topics: + partitions = cluster.partitions_for_topic(topic) + if partitions is None: + log.warning('No partition metadata for topic %s', topic) + continue + for partition in partitions: + all_topic_partitions.append(TopicPartition(topic, partition)) + all_topic_partitions.sort() + + # construct {member_id: {topic: [partition, ...]}} + assignment = collections.defaultdict(lambda: collections.defaultdict(list)) + + member_iter = itertools.cycle(sorted(member_metadata.keys())) + for partition in all_topic_partitions: + member_id = next(member_iter) + + # Because we constructed all_topic_partitions from the set of + # member subscribed topics, we should be safe assuming that + # each topic in all_topic_partitions is in at least one member + # subscription; otherwise this could yield an infinite loop + while partition.topic not in member_metadata[member_id].subscription: + member_id = next(member_iter) + assignment[member_id][partition.topic].append(partition.partition) + + protocol_assignment = {} + for member_id in member_metadata: + protocol_assignment[member_id] = ConsumerProtocolMemberAssignment( + cls.version, + sorted(assignment[member_id].items()), + b'') + return protocol_assignment + + @classmethod + def metadata(cls, topics): + return ConsumerProtocolMemberMetadata(cls.version, list(topics), b'') + + @classmethod + def on_assignment(cls, assignment): + pass diff -Nru python-kafka-python-0.9.2/kafka/coordinator/base.py python-kafka-python-1.0.1/kafka/coordinator/base.py --- python-kafka-python-0.9.2/kafka/coordinator/base.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/coordinator/base.py 2016-02-18 16:38:17.000000000 +0000 @@ -0,0 +1,690 @@ +import abc +import copy +import logging +import time +import weakref + +import six + +import kafka.common as Errors +from kafka.future import Future +from kafka.protocol.commit import (GroupCoordinatorRequest, + OffsetCommitRequest_v2 as OffsetCommitRequest) +from kafka.protocol.group import (HeartbeatRequest, JoinGroupRequest, + LeaveGroupRequest, SyncGroupRequest) +from .heartbeat import Heartbeat + +log = logging.getLogger('kafka.coordinator') + + +class BaseCoordinator(object): + """ + BaseCoordinator implements group management for a single group member + by interacting with a designated Kafka broker (the coordinator). Group + semantics are provided by extending this class. See ConsumerCoordinator + for example usage. + + From a high level, Kafka's group management protocol consists of the + following sequence of actions: + + 1. Group Registration: Group members register with the coordinator providing + their own metadata (such as the set of topics they are interested in). + + 2. Group/Leader Selection: The coordinator select the members of the group + and chooses one member as the leader. + + 3. State Assignment: The leader collects the metadata from all the members + of the group and assigns state. + + 4. Group Stabilization: Each member receives the state assigned by the + leader and begins processing. 
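    A minimal sketch (illustrative names only; ConsumerCoordinator is the
    concrete implementation shipped with this package) of how those steps map
    onto the abstract hooks a subclass provides; the paragraph below describes
    the same contract in prose:

    .. code:: python

        class EchoCoordinator(BaseCoordinator):

            def protocol_type(self):
                return 'echo'

            def group_protocols(self):
                # Step 1: metadata submitted with the JoinGroupRequest
                return [('echo-v0', b'member-metadata')]

            def _on_join_prepare(self, generation, member_id):
                # Cleanup hook before (re)joining; step 2 (leader selection)
                # happens broker-side in the group coordinator.
                pass

            def _perform_assignment(self, leader_id, protocol, members):
                # Step 3: only the elected leader runs this
                return dict((member_id, b'assignment-bytes')
                            for member_id, metadata in members)

            def _on_join_complete(self, generation, member_id, protocol,
                                  member_assignment_bytes):
                # Step 4: every member applies the propagated assignment
                pass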
+ + To leverage this protocol, an implementation must define the format of + metadata provided by each member for group registration in group_protocols() + and the format of the state assignment provided by the leader in + _perform_assignment() and which becomes available to members in + _on_join_complete(). + """ + + DEFAULT_CONFIG = { + 'group_id': 'kafka-python-default-group', + 'session_timeout_ms': 30000, + 'heartbeat_interval_ms': 3000, + 'retry_backoff_ms': 100, + } + + def __init__(self, client, **configs): + """ + Keyword Arguments: + group_id (str): name of the consumer group to join for dynamic + partition assignment (if enabled), and to use for fetching and + committing offsets. Default: 'kafka-python-default-group' + session_timeout_ms (int): The timeout used to detect failures when + using Kafka's group managementment facilities. Default: 30000 + heartbeat_interval_ms (int): The expected time in milliseconds + between heartbeats to the consumer coordinator when using + Kafka's group management feature. Heartbeats are used to ensure + that the consumer's session stays active and to facilitate + rebalancing when new consumers join or leave the group. The + value must be set lower than session_timeout_ms, but typically + should be set no higher than 1/3 of that value. It can be + adjusted even lower to control the expected time for normal + rebalances. Default: 3000 + retry_backoff_ms (int): Milliseconds to backoff when retrying on + errors. Default: 100. + """ + self.config = copy.copy(self.DEFAULT_CONFIG) + for key in self.config: + if key in configs: + self.config[key] = configs[key] + + self._client = client + self.generation = OffsetCommitRequest.DEFAULT_GENERATION_ID + self.member_id = JoinGroupRequest.UNKNOWN_MEMBER_ID + self.group_id = self.config['group_id'] + self.coordinator_id = None + self.rejoin_needed = True + self.needs_join_prepare = True + self.heartbeat = Heartbeat(**self.config) + self.heartbeat_task = HeartbeatTask(weakref.proxy(self)) + #self.sensors = GroupCoordinatorMetrics(metrics, metric_group_prefix, metric_tags) + + def __del__(self): + self.heartbeat_task.disable() + + @abc.abstractmethod + def protocol_type(self): + """ + Unique identifier for the class of protocols implements + (e.g. "consumer" or "connect"). + + Returns: + str: protocol type name + """ + pass + + @abc.abstractmethod + def group_protocols(self): + """Return the list of supported group protocols and metadata. + + This list is submitted by each group member via a JoinGroupRequest. + The order of the protocols in the list indicates the preference of the + protocol (the first entry is the most preferred). The coordinator takes + this preference into account when selecting the generation protocol + (generally more preferred protocols will be selected as long as all + members support them and there is no disagreement on the preference). + + Note: metadata must be type bytes or support an encode() method + + Returns: + list: [(protocol, metadata), ...] + """ + pass + + @abc.abstractmethod + def _on_join_prepare(self, generation, member_id): + """Invoked prior to each group join or rejoin. 
+ + This is typically used to perform any cleanup from the previous + generation (such as committing offsets for the consumer) + + Arguments: + generation (int): The previous generation or -1 if there was none + member_id (str): The identifier of this member in the previous group + or '' if there was none + """ + pass + + @abc.abstractmethod + def _perform_assignment(self, leader_id, protocol, members): + """Perform assignment for the group. + + This is used by the leader to push state to all the members of the group + (e.g. to push partition assignments in the case of the new consumer) + + Arguments: + leader_id (str): The id of the leader (which is this member) + protocol (str): the chosen group protocol (assignment strategy) + members (list): [(member_id, metadata_bytes)] from + JoinGroupResponse. metadata_bytes are associated with the chosen + group protocol, and the Coordinator subclass is responsible for + decoding metadata_bytes based on that protocol. + + Returns: + dict: {member_id: assignment}; assignment must either be bytes + or have an encode() method to convert to bytes + """ + pass + + @abc.abstractmethod + def _on_join_complete(self, generation, member_id, protocol, + member_assignment_bytes): + """Invoked when a group member has successfully joined a group. + + Arguments: + generation (int): the generation that was joined + member_id (str): the identifier for the local member in the group + protocol (str): the protocol selected by the coordinator + member_assignment_bytes (bytes): the protocol-encoded assignment + propagated from the group leader. The Coordinator instance is + responsible for decoding based on the chosen protocol. + """ + pass + + def coordinator_unknown(self): + """Check if we know who the coordinator is and have an active connection + + Side-effect: reset coordinator_id to None if connection failed + + Returns: + bool: True if the coordinator is unknown + """ + if self.coordinator_id is None: + return True + + if self._client.is_disconnected(self.coordinator_id): + self.coordinator_dead() + return True + + return not self._client.ready(self.coordinator_id) + + def ensure_coordinator_known(self): + """Block until the coordinator for this group is known + (and we have an active connection -- java client uses unsent queue). + """ + while self.coordinator_unknown(): + + # Dont look for a new coordinator node if we are just waiting + # for connection to finish + if self.coordinator_id is not None: + self._client.poll() + continue + + future = self._send_group_metadata_request() + self._client.poll(future=future) + + if future.failed(): + if isinstance(future.exception, + Errors.GroupCoordinatorNotAvailableError): + continue + elif future.retriable(): + metadata_update = self._client.cluster.request_update() + self._client.poll(future=metadata_update, sleep=True) + else: + raise future.exception # pylint: disable-msg=raising-bad-type + + def need_rejoin(self): + """Check whether the group should be rejoined (e.g. if metadata changes) + + Returns: + bool: True if it should, False otherwise + """ + return self.rejoin_needed + + def ensure_active_group(self): + """Ensure that the group is active (i.e. 
joined and synced)""" + if not self.need_rejoin(): + return + + if self.needs_join_prepare: + self._on_join_prepare(self.generation, self.member_id) + self.needs_join_prepare = False + + while self.need_rejoin(): + self.ensure_coordinator_known() + + future = self._perform_group_join() + self._client.poll(future=future) + + if future.succeeded(): + member_assignment_bytes = future.value + self._on_join_complete(self.generation, self.member_id, + self.protocol, member_assignment_bytes) + self.needs_join_prepare = True + self.heartbeat_task.reset() + else: + assert future.failed() + exception = future.exception + if isinstance(exception, (Errors.UnknownMemberIdError, + Errors.RebalanceInProgressError, + Errors.IllegalGenerationError)): + continue + elif not future.retriable(): + raise exception # pylint: disable-msg=raising-bad-type + time.sleep(self.config['retry_backoff_ms'] / 1000.0) + + def _perform_group_join(self): + """Join the group and return the assignment for the next generation. + + This function handles both JoinGroup and SyncGroup, delegating to + _perform_assignment() if elected leader by the coordinator. + + Returns: + Future: resolves to the encoded-bytes assignment returned from the + group leader + """ + if self.coordinator_unknown(): + e = Errors.GroupCoordinatorNotAvailableError(self.coordinator_id) + return Future().failure(e) + + # send a join group request to the coordinator + log.debug("(Re-)joining group %s", self.group_id) + request = JoinGroupRequest( + self.group_id, + self.config['session_timeout_ms'], + self.member_id, + self.protocol_type(), + [(protocol, + metadata if isinstance(metadata, bytes) else metadata.encode()) + for protocol, metadata in self.group_protocols()]) + + # create the request for the coordinator + log.debug("Issuing request (%s) to coordinator %s", request, self.coordinator_id) + future = Future() + _f = self._client.send(self.coordinator_id, request) + _f.add_callback(self._handle_join_group_response, future) + _f.add_errback(self._failed_request, self.coordinator_id, + request, future) + return future + + def _failed_request(self, node_id, request, future, error): + log.error('Error sending %s to node %s [%s] -- marking coordinator dead', + request.__class__.__name__, node_id, error) + self.coordinator_dead() + future.failure(error) + + def _handle_join_group_response(self, future, response): + error_type = Errors.for_code(response.error_code) + if error_type is Errors.NoError: + self.member_id = response.member_id + self.generation = response.generation_id + self.rejoin_needed = False + self.protocol = response.group_protocol + log.info("Joined group '%s' (generation %s) with member_id %s", + self.group_id, self.generation, self.member_id) + #self.sensors.join_latency.record(response.requestLatencyMs()) + if response.leader_id == response.member_id: + log.info("Elected group leader -- performing partition" + " assignments using %s", self.protocol) + self._on_join_leader(response).chain(future) + else: + self._on_join_follower().chain(future) + + elif error_type is Errors.GroupLoadInProgressError: + log.debug("Attempt to join group %s rejected since coordinator is" + " loading the group.", self.group_id) + # backoff and retry + future.failure(error_type(response)) + elif error_type is Errors.UnknownMemberIdError: + # reset the member id and retry immediately + error = error_type(self.member_id) + self.member_id = JoinGroupRequest.UNKNOWN_MEMBER_ID + log.info("Attempt to join group %s failed due to unknown member id," + " resetting and 
retrying.", self.group_id) + future.failure(error) + elif error_type in (Errors.GroupCoordinatorNotAvailableError, + Errors.NotCoordinatorForGroupError): + # re-discover the coordinator and retry with backoff + self.coordinator_dead() + log.info("Attempt to join group %s failed due to obsolete " + "coordinator information, retrying.", self.group_id) + future.failure(error_type()) + elif error_type in (Errors.InconsistentGroupProtocolError, + Errors.InvalidSessionTimeoutError, + Errors.InvalidGroupIdError): + # log the error and re-throw the exception + error = error_type(response) + log.error("Attempt to join group %s failed due to: %s", + self.group_id, error) + future.failure(error) + elif error_type is Errors.GroupAuthorizationFailedError: + future.failure(error_type(self.group_id)) + else: + # unexpected error, throw the exception + error = error_type() + log.error("Unexpected error in join group response: %s", error) + future.failure(error) + + def _on_join_follower(self): + # send follower's sync group with an empty assignment + request = SyncGroupRequest( + self.group_id, + self.generation, + self.member_id, + {}) + log.debug("Issuing follower SyncGroup (%s) to coordinator %s", + request, self.coordinator_id) + return self._send_sync_group_request(request) + + def _on_join_leader(self, response): + """ + Perform leader synchronization and send back the assignment + for the group via SyncGroupRequest + + Arguments: + response (JoinResponse): broker response to parse + + Returns: + Future: resolves to member assignment encoded-bytes + """ + try: + group_assignment = self._perform_assignment(response.leader_id, + response.group_protocol, + response.members) + except Exception as e: + return Future().failure(e) + + request = SyncGroupRequest( + self.group_id, + self.generation, + self.member_id, + [(member_id, + assignment if isinstance(assignment, bytes) else assignment.encode()) + for member_id, assignment in six.iteritems(group_assignment)]) + + log.debug("Issuing leader SyncGroup (%s) to coordinator %s", + request, self.coordinator_id) + return self._send_sync_group_request(request) + + def _send_sync_group_request(self, request): + if self.coordinator_unknown(): + return Future().failure(Errors.GroupCoordinatorNotAvailableError()) + future = Future() + _f = self._client.send(self.coordinator_id, request) + _f.add_callback(self._handle_sync_group_response, future) + _f.add_errback(self._failed_request, self.coordinator_id, + request, future) + return future + + def _handle_sync_group_response(self, future, response): + error_type = Errors.for_code(response.error_code) + if error_type is Errors.NoError: + log.debug("Received successful sync group response for group %s: %s", + self.group_id, response) + #self.sensors.syncLatency.record(response.requestLatencyMs()) + future.success(response.member_assignment) + return + + # Always rejoin on error + self.rejoin_needed = True + if error_type is Errors.GroupAuthorizationFailedError: + future.failure(error_type(self.group_id)) + elif error_type is Errors.RebalanceInProgressError: + log.info("SyncGroup for group %s failed due to coordinator" + " rebalance, rejoining the group", self.group_id) + future.failure(error_type(self.group_id)) + elif error_type in (Errors.UnknownMemberIdError, + Errors.IllegalGenerationError): + error = error_type() + log.info("SyncGroup for group %s failed due to %s," + " rejoining the group", self.group_id, error) + self.member_id = JoinGroupRequest.UNKNOWN_MEMBER_ID + future.failure(error) + elif error_type in 
(Errors.GroupCoordinatorNotAvailableError, + Errors.NotCoordinatorForGroupError): + error = error_type() + log.info("SyncGroup for group %s failed due to %s, will find new" + " coordinator and rejoin", self.group_id, error) + self.coordinator_dead() + future.failure(error) + else: + error = error_type() + log.error("Unexpected error from SyncGroup: %s", error) + future.failure(error) + + def _send_group_metadata_request(self): + """Discover the current coordinator for the group. + + Returns: + Future: resolves to the node id of the coordinator + """ + node_id = self._client.least_loaded_node() + if node_id is None or not self._client.ready(node_id): + return Future().failure(Errors.NoBrokersAvailable()) + + log.debug("Issuing group metadata request to broker %s", node_id) + request = GroupCoordinatorRequest(self.group_id) + future = Future() + _f = self._client.send(node_id, request) + _f.add_callback(self._handle_group_coordinator_response, future) + _f.add_errback(self._failed_request, node_id, request, future) + return future + + def _handle_group_coordinator_response(self, future, response): + log.debug("Group metadata response %s", response) + if not self.coordinator_unknown(): + # We already found the coordinator, so ignore the request + log.debug("Coordinator already known -- ignoring metadata response") + future.success(self.coordinator_id) + return + + error_type = Errors.for_code(response.error_code) + if error_type is Errors.NoError: + ok = self._client.cluster.add_group_coordinator(self.group_id, response) + if not ok: + # This could happen if coordinator metadata is different + # than broker metadata + future.failure(Errors.IllegalStateError()) + return + + self.coordinator_id = response.coordinator_id + self._client.ready(self.coordinator_id) + + # start sending heartbeats only if we have a valid generation + if self.generation > 0: + self.heartbeat_task.reset() + future.success(self.coordinator_id) + elif error_type is Errors.GroupCoordinatorNotAvailableError: + log.debug("Group Coordinator Not Available; retry") + future.failure(error_type()) + elif error_type is Errors.GroupAuthorizationFailedError: + error = error_type(self.group_id) + log.error("Group Coordinator Request failed: %s", error) + future.failure(error) + else: + error = error_type() + log.error("Unrecognized failure in Group Coordinator Request: %s", + error) + future.failure(error) + + def coordinator_dead(self, error=None): + """Mark the current coordinator as dead.""" + if self.coordinator_id is not None: + log.info("Marking the coordinator dead (node %s): %s.", + self.coordinator_id, error) + self.coordinator_id = None + + def close(self): + """Close the coordinator, leave the current group + and reset local generation/memberId.""" + try: + self._client.unschedule(self.heartbeat_task) + except KeyError: + pass + if not self.coordinator_unknown() and self.generation > 0: + # this is a minimal effort attempt to leave the group. we do not + # attempt any resending if the request fails or times out. 
+ request = LeaveGroupRequest(self.group_id, self.member_id) + future = self._client.send(self.coordinator_id, request) + future.add_callback(self._handle_leave_group_response) + future.add_errback(log.error, "LeaveGroup request failed: %s") + self._client.poll(future=future) + + self.generation = OffsetCommitRequest.DEFAULT_GENERATION_ID + self.member_id = JoinGroupRequest.UNKNOWN_MEMBER_ID + self.rejoin_needed = True + + def _handle_leave_group_response(self, response): + error_type = Errors.for_code(response.error_code) + if error_type is Errors.NoError: + log.info("LeaveGroup request succeeded") + else: + log.error("LeaveGroup request failed: %s", error_type()) + + def _send_heartbeat_request(self): + """Send a heartbeat request""" + request = HeartbeatRequest(self.group_id, self.generation, self.member_id) + log.debug("Heartbeat: %s[%s] %s", request.group, request.generation_id, request.member_id) #pylint: disable-msg=no-member + future = Future() + _f = self._client.send(self.coordinator_id, request) + _f.add_callback(self._handle_heartbeat_response, future) + _f.add_errback(self._failed_request, self.coordinator_id, + request, future) + return future + + def _handle_heartbeat_response(self, future, response): + #self.sensors.heartbeat_latency.record(response.requestLatencyMs()) + error_type = Errors.for_code(response.error_code) + if error_type is Errors.NoError: + log.debug("Received successful heartbeat response.") + future.success(None) + elif error_type in (Errors.GroupCoordinatorNotAvailableError, + Errors.NotCoordinatorForGroupError): + log.info("Heartbeat failed: coordinator is either not started or" + " not valid; will refresh metadata and retry") + self.coordinator_dead() + future.failure(error_type()) + elif error_type is Errors.RebalanceInProgressError: + log.info("Heartbeat failed: group is rebalancing; re-joining group") + self.rejoin_needed = True + future.failure(error_type()) + elif error_type is Errors.IllegalGenerationError: + log.info("Heartbeat failed: local generation id is not current;" + " re-joining group") + self.rejoin_needed = True + future.failure(error_type()) + elif error_type is Errors.UnknownMemberIdError: + log.info("Heartbeat failed: local member_id was not recognized;" + " resetting and re-joining group") + self.member_id = JoinGroupRequest.UNKNOWN_MEMBER_ID + self.rejoin_needed = True + future.failure(error_type) + elif error_type is Errors.GroupAuthorizationFailedError: + error = error_type(self.group_id) + log.error("Heartbeat failed: authorization error: %s", error) + future.failure(error) + else: + error = error_type() + log.error("Heartbeat failed: Unhandled error: %s", error) + future.failure(error) + + +class HeartbeatTask(object): + def __init__(self, coordinator): + self._coordinator = coordinator + self._heartbeat = coordinator.heartbeat + self._client = coordinator._client + self._request_in_flight = False + + def disable(self): + try: + self._client.unschedule(self) + except KeyError: + pass + + def reset(self): + # start or restart the heartbeat task to be executed at the next chance + self._heartbeat.reset_session_timeout() + try: + self._client.unschedule(self) + except KeyError: + pass + if not self._request_in_flight: + self._client.schedule(self, time.time()) + + def __call__(self): + if (self._coordinator.generation < 0 or + self._coordinator.need_rejoin() or + self._coordinator.coordinator_unknown()): + # no need to send the heartbeat we're not using auto-assignment + # or if we are awaiting a rebalance + log.debug("Skipping 
heartbeat: no auto-assignment" + " or waiting on rebalance") + return + + if self._heartbeat.session_expired(): + # we haven't received a successful heartbeat in one session interval + # so mark the coordinator dead + log.error("Heartbeat session expired - marking coordinator dead") + self._coordinator.coordinator_dead() + return + + if not self._heartbeat.should_heartbeat(): + # we don't need to heartbeat now, so reschedule for when we do + ttl = self._heartbeat.ttl() + log.debug("Heartbeat task unneeded now, retrying in %s", ttl) + self._client.schedule(self, time.time() + ttl) + else: + self._heartbeat.sent_heartbeat() + self._request_in_flight = True + future = self._coordinator._send_heartbeat_request() + future.add_callback(self._handle_heartbeat_success) + future.add_errback(self._handle_heartbeat_failure) + + def _handle_heartbeat_success(self, v): + log.debug("Received successful heartbeat") + self._request_in_flight = False + self._heartbeat.received_heartbeat() + ttl = self._heartbeat.ttl() + self._client.schedule(self, time.time() + ttl) + + def _handle_heartbeat_failure(self, e): + log.debug("Heartbeat failed; retrying") + self._request_in_flight = False + etd = time.time() + self._coordinator.config['retry_backoff_ms'] / 1000.0 + self._client.schedule(self, etd) + +''' +class GroupCoordinatorMetrics(object): + def __init__(self, metrics, prefix, tags=None): + self.metrics = metrics + self.group_name = prefix + "-coordinator-metrics" + + self.heartbeat_latency = metrics.sensor("heartbeat-latency") + self.heartbeat_latency.add(metrics.metricName( + "heartbeat-response-time-max", self.group_name, + "The max time taken to receive a response to a heartbeat request", + tags), metrics.Max()) + self.heartbeat_latency.add(metrics.metricName( + "heartbeat-rate", self.group_name, + "The average number of heartbeats per second", + tags), metrics.Rate(metrics.Count())) + + self.join_latency = metrics.sensor("join-latency") + self.join_latency.add(metrics.metricName( + "join-time-avg", self.group_name, + "The average time taken for a group rejoin", + tags), metrics.Avg()) + self.join_latency.add(metrics.metricName( + "join-time-max", self.group_name, + "The max time taken for a group rejoin", + tags), metrics.Avg()) + self.join_latency.add(metrics.metricName( + "join-rate", self.group_name, + "The number of group joins per second", + tags), metrics.Rate(metrics.Count())) + + self.sync_latency = metrics.sensor("sync-latency") + self.sync_latency.add(metrics.metricName( + "sync-time-avg", self.group_name, + "The average time taken for a group sync", + tags), metrics.Avg()) + self.sync_latency.add(metrics.MetricName( + "sync-time-max", self.group_name, + "The max time taken for a group sync", + tags), metrics.Avg()) + self.sync_latency.add(metrics.metricName( + "sync-rate", self.group_name, + "The number of group syncs per second", + tags), metrics.Rate(metrics.Count())) + + """ + lastHeartbeat = Measurable( + measure=lambda _, value: value - heartbeat.last_heartbeat_send() + ) + metrics.addMetric(metrics.metricName( + "last-heartbeat-seconds-ago", self.group_name, + "The number of seconds since the last controller heartbeat", + tags), lastHeartbeat) + """ +''' diff -Nru python-kafka-python-0.9.2/kafka/coordinator/consumer.py python-kafka-python-1.0.1/kafka/coordinator/consumer.py --- python-kafka-python-0.9.2/kafka/coordinator/consumer.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/coordinator/consumer.py 2016-02-19 06:54:56.000000000 +0000 @@ -0,0 +1,734 @@ +from 
__future__ import absolute_import + +import copy +import collections +import logging +import time +import weakref + +import six + +from .base import BaseCoordinator +from .assignors.range import RangePartitionAssignor +from .assignors.roundrobin import RoundRobinPartitionAssignor +from .protocol import ConsumerProtocol +from ..common import OffsetAndMetadata, TopicPartition +from ..future import Future +from ..protocol.commit import ( + OffsetCommitRequest_v2, OffsetCommitRequest_v1, OffsetCommitRequest_v0, + OffsetFetchRequest_v0, OffsetFetchRequest_v1) +from ..util import WeakMethod + +import kafka.common as Errors + +log = logging.getLogger(__name__) + + +class ConsumerCoordinator(BaseCoordinator): + """This class manages the coordination process with the consumer coordinator.""" + DEFAULT_CONFIG = { + 'group_id': 'kafka-python-default-group', + 'enable_auto_commit': True, + 'auto_commit_interval_ms': 5000, + 'default_offset_commit_callback': lambda offsets, response: True, + 'assignors': (RangePartitionAssignor, RoundRobinPartitionAssignor), + 'session_timeout_ms': 30000, + 'heartbeat_interval_ms': 3000, + 'retry_backoff_ms': 100, + 'api_version': (0, 9), + } + + def __init__(self, client, subscription, **configs): + """Initialize the coordination manager. + + Keyword Arguments: + group_id (str): name of the consumer group to join for dynamic + partition assignment (if enabled), and to use for fetching and + committing offsets. Default: 'kafka-python-default-group' + enable_auto_commit (bool): If true the consumer's offset will be + periodically committed in the background. Default: True. + auto_commit_interval_ms (int): milliseconds between automatic + offset commits, if enable_auto_commit is True. Default: 5000. + default_offset_commit_callback (callable): called as + callback(offsets, response) response will be either an Exception + or a OffsetCommitResponse struct. This callback can be used to + trigger custom actions when a commit request completes. + assignors (list): List of objects to use to distribute partition + ownership amongst consumer instances when group management is + used. Default: [RangePartitionAssignor, RoundRobinPartitionAssignor] + heartbeat_interval_ms (int): The expected time in milliseconds + between heartbeats to the consumer coordinator when using + Kafka's group management feature. Heartbeats are used to ensure + that the consumer's session stays active and to facilitate + rebalancing when new consumers join or leave the group. The + value must be set lower than session_timeout_ms, but typically + should be set no higher than 1/3 of that value. It can be + adjusted even lower to control the expected time for normal + rebalances. Default: 3000 + session_timeout_ms (int): The timeout used to detect failures when + using Kafka's group managementment facilities. Default: 30000 + retry_backoff_ms (int): Milliseconds to backoff when retrying on + errors. Default: 100. 
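        Note: this class is normally constructed for you by ``KafkaConsumer``;
        the settings above are simply forwarded from the consumer constructor,
        e.g. (illustrative values):

        .. code:: python

            consumer = KafkaConsumer('my-topic',
                                     group_id='my-group',
                                     enable_auto_commit=True,
                                     auto_commit_interval_ms=5000,
                                     session_timeout_ms=30000,
                                     heartbeat_interval_ms=3000)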
+ """ + super(ConsumerCoordinator, self).__init__(client, **configs) + self.config = copy.copy(self.DEFAULT_CONFIG) + for key in self.config: + if key in configs: + self.config[key] = configs[key] + + if self.config['api_version'] >= (0, 9) and self.config['group_id'] is not None: + assert self.config['assignors'], 'Coordinator requires assignors' + + self._subscription = subscription + self._partitions_per_topic = {} + self._cluster = client.cluster + self._cluster.request_update() + self._cluster.add_listener(WeakMethod(self._handle_metadata_update)) + + self._auto_commit_task = None + if self.config['enable_auto_commit']: + if self.config['api_version'] < (0, 8, 1): + log.warning('Broker version (%s) does not support offset' + ' commits; disabling auto-commit.', + self.config['api_version']) + elif self.config['group_id'] is None: + log.warning('group_id is None: disabling auto-commit.') + else: + interval = self.config['auto_commit_interval_ms'] / 1000.0 + self._auto_commit_task = AutoCommitTask(weakref.proxy(self), interval) + + # metrics=None, + # metric_group_prefix=None, + # metric_tags=None, + # self.sensors = ConsumerCoordinatorMetrics(metrics, metric_group_prefix, metric_tags) + + def __del__(self): + if hasattr(self, '_auto_commit_task') and self._auto_commit_task: + self._auto_commit_task.disable() + self._cluster.remove_listener(WeakMethod(self._handle_metadata_update)) + + def protocol_type(self): + return ConsumerProtocol.PROTOCOL_TYPE + + def group_protocols(self): + """Returns list of preferred (protocols, metadata)""" + topics = self._subscription.subscription + assert topics is not None, 'Consumer has not subscribed to topics' + metadata_list = [] + for assignor in self.config['assignors']: + metadata = assignor.metadata(topics) + group_protocol = (assignor.name, metadata) + metadata_list.append(group_protocol) + return metadata_list + + def _handle_metadata_update(self, cluster): + # if we encounter any unauthorized topics, raise an exception + # TODO + #if self._cluster.unauthorized_topics: + # raise TopicAuthorizationError(self._cluster.unauthorized_topics) + + if self._subscription.subscribed_pattern: + topics = [] + for topic in cluster.topics(): + if self._subscription.subscribed_pattern.match(topic): + topics.append(topic) + + self._subscription.change_subscription(topics) + self._client.set_topics(self._subscription.group_subscription()) + + # check if there are any changes to the metadata which should trigger + # a rebalance + if self._subscription_metadata_changed(): + + if (self.config['api_version'] >= (0, 9) + and self.config['group_id'] is not None): + + self._subscription.mark_for_reassignment() + + # If we haven't got group coordinator support, + # just assign all partitions locally + else: + self._subscription.assign_from_subscribed([ + TopicPartition(topic, partition) + for topic in self._subscription.subscription + for partition in self._partitions_per_topic[topic] + ]) + + def _subscription_metadata_changed(self): + if not self._subscription.partitions_auto_assigned(): + return False + + old_partitions_per_topic = self._partitions_per_topic + self._partitions_per_topic = {} + for topic in self._subscription.group_subscription(): + partitions = self._cluster.partitions_for_topic(topic) or [] + self._partitions_per_topic[topic] = set(partitions) + + if self._partitions_per_topic != old_partitions_per_topic: + return True + return False + + def _lookup_assignor(self, name): + for assignor in self.config['assignors']: + if assignor.name == name: + return 
assignor + return None + + def _on_join_complete(self, generation, member_id, protocol, + member_assignment_bytes): + assignor = self._lookup_assignor(protocol) + assert assignor, 'invalid assignment protocol: %s' % protocol + + assignment = ConsumerProtocol.ASSIGNMENT.decode(member_assignment_bytes) + + # set the flag to refresh last committed offsets + self._subscription.needs_fetch_committed_offsets = True + + # update partition assignment + self._subscription.assign_from_subscribed(assignment.partitions()) + + # give the assignor a chance to update internal state + # based on the received assignment + assignor.on_assignment(assignment) + + # restart the autocommit task if needed + if self.config['enable_auto_commit']: + self._auto_commit_task.enable() + + assigned = set(self._subscription.assigned_partitions()) + log.debug("Set newly assigned partitions %s", assigned) + + # execute the user's callback after rebalance + if self._subscription.listener: + try: + self._subscription.listener.on_partitions_assigned(assigned) + except Exception: + log.exception("User provided listener failed on partition" + " assignment: %s", assigned) + + def _perform_assignment(self, leader_id, assignment_strategy, members): + assignor = self._lookup_assignor(assignment_strategy) + assert assignor, 'Invalid assignment protocol: %s' % assignment_strategy + member_metadata = {} + all_subscribed_topics = set() + for member_id, metadata_bytes in members: + metadata = ConsumerProtocol.METADATA.decode(metadata_bytes) + member_metadata[member_id] = metadata + all_subscribed_topics.update(metadata.subscription) # pylint: disable-msg=no-member + + # the leader will begin watching for changes to any of the topics + # the group is interested in, which ensures that all metadata changes + # will eventually be seen + # Because assignment typically happens within response callbacks, + # we cannot block on metadata updates here (no recursion into poll()) + self._subscription.group_subscribe(all_subscribed_topics) + self._client.set_topics(self._subscription.group_subscription()) + + log.debug("Performing %s assignment for subscriptions %s", + assignor.name, member_metadata) + + assignments = assignor.assign(self._cluster, member_metadata) + + log.debug("Finished assignment: %s", assignments) + + group_assignment = {} + for member_id, assignment in six.iteritems(assignments): + group_assignment[member_id] = assignment + return group_assignment + + def _on_join_prepare(self, generation, member_id): + # commit offsets prior to rebalance if auto-commit enabled + self._maybe_auto_commit_offsets_sync() + + # execute the user's callback before rebalance + log.debug("Revoking previously assigned partitions %s", + self._subscription.assigned_partitions()) + if self._subscription.listener: + try: + revoked = set(self._subscription.assigned_partitions()) + self._subscription.listener.on_partitions_revoked(revoked) + except Exception: + log.exception("User provided subscription listener failed" + " on_partitions_revoked") + + self._subscription.mark_for_reassignment() + + def need_rejoin(self): + """Check whether the group should be rejoined + + Returns: + bool: True if consumer should rejoin group, False otherwise + """ + return (self._subscription.partitions_auto_assigned() and + (super(ConsumerCoordinator, self).need_rejoin() or + self._subscription.needs_partition_assignment)) + + def refresh_committed_offsets_if_needed(self): + """Fetch committed offsets for assigned partitions.""" + if 
self._subscription.needs_fetch_committed_offsets: + offsets = self.fetch_committed_offsets(self._subscription.assigned_partitions()) + for partition, offset in six.iteritems(offsets): + # verify assignment is still active + if self._subscription.is_assigned(partition): + self._subscription.assignment[partition].committed = offset.offset + self._subscription.needs_fetch_committed_offsets = False + + def fetch_committed_offsets(self, partitions): + """Fetch the current committed offsets for specified partitions + + Arguments: + partitions (list of TopicPartition): partitions to fetch + + Returns: + dict: {TopicPartition: OffsetAndMetadata} + """ + if not partitions: + return {} + + while True: + if self.config['api_version'] >= (0, 8, 2): + self.ensure_coordinator_known() + + # contact coordinator to fetch committed offsets + future = self._send_offset_fetch_request(partitions) + self._client.poll(future=future) + + if future.succeeded(): + return future.value + + if not future.retriable(): + raise future.exception # pylint: disable-msg=raising-bad-type + + time.sleep(self.config['retry_backoff_ms'] / 1000.0) + + def close(self): + try: + self._maybe_auto_commit_offsets_sync() + finally: + super(ConsumerCoordinator, self).close() + + def commit_offsets_async(self, offsets, callback=None): + """Commit specific offsets asynchronously. + + Arguments: + offsets (dict {TopicPartition: OffsetAndMetadata}): what to commit + callback (callable, optional): called as callback(offsets, response) + response will be either an Exception or a OffsetCommitResponse + struct. This callback can be used to trigger custom actions when + a commit request completes. + Returns: + Future: indicating whether the commit was successful or not + """ + assert self.config['api_version'] >= (0, 8, 1), 'Unsupported Broker API' + assert all(map(lambda k: isinstance(k, TopicPartition), offsets)) + assert all(map(lambda v: isinstance(v, OffsetAndMetadata), + offsets.values())) + if callback is None: + callback = self.config['default_offset_commit_callback'] + self._subscription.needs_fetch_committed_offsets = True + future = self._send_offset_commit_request(offsets) + future.add_both(callback, offsets) + return future + + def commit_offsets_sync(self, offsets): + """Commit specific offsets synchronously. + + This method will retry until the commit completes successfully or an + unrecoverable error is encountered. + + Arguments: + offsets (dict {TopicPartition: OffsetAndMetadata}): what to commit + + Raises error on failure + """ + assert self.config['api_version'] >= (0, 8, 1), 'Unsupported Broker API' + assert all(map(lambda k: isinstance(k, TopicPartition), offsets)) + assert all(map(lambda v: isinstance(v, OffsetAndMetadata), + offsets.values())) + if not offsets: + return + + while True: + if self.config['api_version'] >= (0, 8, 2): + self.ensure_coordinator_known() + + future = self._send_offset_commit_request(offsets) + self._client.poll(future=future) + + if future.succeeded(): + return future.value + + if not future.retriable(): + raise future.exception # pylint: disable-msg=raising-bad-type + + time.sleep(self.config['retry_backoff_ms'] / 1000.0) + + def _maybe_auto_commit_offsets_sync(self): + if self.config['api_version'] < (0, 8, 1): + return + if self.config['enable_auto_commit']: + # disable periodic commits prior to committing synchronously. 
note that they will + # be re-enabled after a rebalance completes + self._auto_commit_task.disable() + + try: + self.commit_offsets_sync(self._subscription.all_consumed_offsets()) + + # The three main group membership errors are known and should not + # require a stacktrace -- just a warning + except (Errors.UnknownMemberIdError, + Errors.IllegalGenerationError, + Errors.RebalanceInProgressError): + log.warning("Offset commit failed: group membership out of date" + " This is likely to cause duplicate message" + " delivery.") + except Exception: + log.exception("Offset commit failed: This is likely to cause" + " duplicate message delivery") + + def _send_offset_commit_request(self, offsets): + """Commit offsets for the specified list of topics and partitions. + + This is a non-blocking call which returns a request future that can be + polled in the case of a synchronous commit or ignored in the + asynchronous case. + + Arguments: + offsets (dict of {TopicPartition: OffsetAndMetadata}): what should + be committed + + Returns: + Future: indicating whether the commit was successful or not + """ + assert self.config['api_version'] >= (0, 8, 1), 'Unsupported Broker API' + assert all(map(lambda k: isinstance(k, TopicPartition), offsets)) + assert all(map(lambda v: isinstance(v, OffsetAndMetadata), + offsets.values())) + if not offsets: + log.debug('No offsets to commit') + return Future().success(True) + + if self.config['api_version'] >= (0, 8, 2): + if self.coordinator_unknown(): + return Future().failure(Errors.GroupCoordinatorNotAvailableError) + node_id = self.coordinator_id + else: + node_id = self._client.least_loaded_node() + + # create the offset commit request + offset_data = collections.defaultdict(dict) + for tp, offset in six.iteritems(offsets): + offset_data[tp.topic][tp.partition] = offset + + if self.config['api_version'] >= (0, 9): + request = OffsetCommitRequest_v2( + self.group_id, + self.generation, + self.member_id, + OffsetCommitRequest_v2.DEFAULT_RETENTION_TIME, + [( + topic, [( + partition, + offset.offset, + offset.metadata + ) for partition, offset in six.iteritems(partitions)] + ) for topic, partitions in six.iteritems(offset_data)] + ) + elif self.config['api_version'] >= (0, 8, 2): + request = OffsetCommitRequest_v1( + self.group_id, -1, '', + [( + topic, [( + partition, + offset.offset, + -1, + offset.metadata + ) for partition, offset in six.iteritems(partitions)] + ) for topic, partitions in six.iteritems(offset_data)] + ) + elif self.config['api_version'] >= (0, 8, 1): + request = OffsetCommitRequest_v0( + self.group_id, + [( + topic, [( + partition, + offset.offset, + offset.metadata + ) for partition, offset in six.iteritems(partitions)] + ) for topic, partitions in six.iteritems(offset_data)] + ) + + log.debug("Sending offset-commit request with %s to %s", + offsets, node_id) + + future = Future() + _f = self._client.send(node_id, request) + _f.add_callback(self._handle_offset_commit_response, offsets, future) + _f.add_errback(self._failed_request, node_id, request, future) + return future + + def _handle_offset_commit_response(self, offsets, future, response): + #self.sensors.commit_latency.record(response.requestLatencyMs()) + unauthorized_topics = set() + + for topic, partitions in response.topics: + for partition, error_code in partitions: + tp = TopicPartition(topic, partition) + offset = offsets[tp] + + error_type = Errors.for_code(error_code) + if error_type is Errors.NoError: + log.debug("Committed offset %s for partition %s", offset, tp) + if 
self._subscription.is_assigned(tp): + self._subscription.assignment[tp].committed = offset.offset + elif error_type is Errors.GroupAuthorizationFailedError: + log.error("OffsetCommit failed for group %s - %s", + self.group_id, error_type.__name__) + future.failure(error_type(self.group_id)) + return + elif error_type is Errors.TopicAuthorizationFailedError: + unauthorized_topics.add(topic) + elif error_type in (Errors.OffsetMetadataTooLargeError, + Errors.InvalidCommitOffsetSizeError): + # raise the error to the user + log.info("OffsetCommit failed for group %s on partition %s" + " due to %s, will retry", self.group_id, tp, + error_type.__name__) + future.failure(error_type()) + return + elif error_type is Errors.GroupLoadInProgressError: + # just retry + log.info("OffsetCommit failed for group %s because group is" + " initializing (%s), will retry", self.group_id, + error_type.__name__) + future.failure(error_type(self.group_id)) + return + elif error_type in (Errors.GroupCoordinatorNotAvailableError, + Errors.NotCoordinatorForGroupError, + Errors.RequestTimedOutError): + log.info("OffsetCommit failed for group %s due to a" + " coordinator error (%s), will find new coordinator" + " and retry", self.group_id, error_type.__name__) + self.coordinator_dead() + future.failure(error_type(self.group_id)) + return + elif error_type in (Errors.UnknownMemberIdError, + Errors.IllegalGenerationError, + Errors.RebalanceInProgressError): + # need to re-join group + error = error_type(self.group_id) + log.error("OffsetCommit failed for group %s due to group" + " error (%s), will rejoin", self.group_id, error) + self._subscription.mark_for_reassignment() + # Errors.CommitFailedError("Commit cannot be completed due to group rebalance")) + future.failure(error) + return + else: + log.error("OffsetCommit failed for group %s on partition %s" + " with offset %s: %s", self.group_id, tp, offset, + error_type.__name__) + future.failure(error_type()) + return + + if unauthorized_topics: + log.error("OffsetCommit failed for unauthorized topics %s", + unauthorized_topics) + future.failure(Errors.TopicAuthorizationFailedError(unauthorized_topics)) + else: + future.success(True) + + def _send_offset_fetch_request(self, partitions): + """Fetch the committed offsets for a set of partitions. + + This is a non-blocking call. The returned future can be polled to get + the actual offsets returned from the broker.
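        For illustration, a minimal sketch of driving such a request future,
        mirroring the poll-and-check loop used in fetch_committed_offsets()
        above; the `coordinator` and `partitions` names are placeholders:

            future = coordinator._send_offset_fetch_request(partitions)  # placeholder objects
            coordinator._client.poll(future=future)
            if future.succeeded():
                committed = future.value      # {TopicPartition: OffsetAndMetadata}
            elif not future.retriable():
                raise future.exception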
+ + Arguments: + partitions (list of TopicPartition): the partitions to fetch + + Returns: + Future: resolves to dict of offsets: {TopicPartition: int} + """ + assert self.config['api_version'] >= (0, 8, 1), 'Unsupported Broker API' + assert all(map(lambda k: isinstance(k, TopicPartition), partitions)) + if not partitions: + return Future().success({}) + + if self.config['api_version'] >= (0, 8, 2): + if self.coordinator_unknown(): + return Future().failure(Errors.GroupCoordinatorNotAvailableError) + node_id = self.coordinator_id + else: + node_id = self._client.least_loaded_node() + + # Verify node is ready + if not self._client.ready(node_id): + log.debug("Node %s not ready -- failing offset fetch request", + node_id) + return Future().failure(Errors.NodeNotReadyError) + + log.debug("Fetching committed offsets for partitions: %s", partitions) + # construct the request + topic_partitions = collections.defaultdict(set) + for tp in partitions: + topic_partitions[tp.topic].add(tp.partition) + + if self.config['api_version'] >= (0, 8, 2): + request = OffsetFetchRequest_v1( + self.group_id, + list(topic_partitions.items()) + ) + else: + request = OffsetFetchRequest_v0( + self.group_id, + list(topic_partitions.items()) + ) + + # send the request with a callback + future = Future() + _f = self._client.send(node_id, request) + _f.add_callback(self._handle_offset_fetch_response, future) + _f.add_errback(self._failed_request, node_id, request, future) + return future + + def _handle_offset_fetch_response(self, future, response): + offsets = {} + for topic, partitions in response.topics: + for partition, offset, metadata, error_code in partitions: + tp = TopicPartition(topic, partition) + error_type = Errors.for_code(error_code) + if error_type is not Errors.NoError: + error = error_type() + log.debug("Error fetching offset for %s: %s", tp, error_type()) + if error_type is Errors.GroupLoadInProgressError: + # just retry + future.failure(error) + elif error_type is Errors.NotCoordinatorForGroupError: + # re-discover the coordinator and retry + self.coordinator_dead() + future.failure(error) + elif error_type in (Errors.UnknownMemberIdError, + Errors.IllegalGenerationError): + # need to re-join group + self._subscription.mark_for_reassignment() + future.failure(error) + elif error_type is Errors.UnknownTopicOrPartitionError: + log.warning("OffsetFetchRequest -- unknown topic %s" + " (have you committed any offsets yet?)", + topic) + continue + else: + log.error("Unknown error fetching offsets for %s: %s", + tp, error) + future.failure(error) + return + elif offset >= 0: + # record the position with the offset (-1 indicates no committed offset to fetch) + offsets[tp] = OffsetAndMetadata(offset, metadata) + else: + log.debug("No committed offset for partition %s", tp) + future.success(offsets) + + +class AutoCommitTask(object): + def __init__(self, coordinator, interval): + self._coordinator = coordinator + self._client = coordinator._client + self._interval = interval + self._enabled = False + self._request_in_flight = False + + def enable(self): + if self._enabled: + log.warning("AutoCommitTask is already enabled") + return + + self._enabled = True + if not self._request_in_flight: + self._client.schedule(self, time.time() + self._interval) + + def disable(self): + self._enabled = False + try: + self._client.unschedule(self) + except KeyError: + pass + + def _reschedule(self, at): + assert self._enabled, 'AutoCommitTask not enabled' + self._client.schedule(self, at) + + def __call__(self): + if not 
self._enabled: + return + + if self._coordinator.coordinator_unknown(): + log.debug("Cannot auto-commit offsets because the coordinator is" + " unknown, will retry after backoff") + backoff = self._coordinator.config['retry_backoff_ms'] / 1000.0 + self._client.schedule(self, time.time() + backoff) + return + + self._request_in_flight = True + self._coordinator.commit_offsets_async( + self._coordinator._subscription.all_consumed_offsets(), + self._handle_commit_response) + + def _handle_commit_response(self, offsets, result): + self._request_in_flight = False + if result is True: + log.debug("Successfully auto-committed offsets") + next_at = time.time() + self._interval + elif not isinstance(result, BaseException): + raise Errors.IllegalStateError( + 'Unrecognized result in _handle_commit_response: %s' + % result) + elif hasattr(result, 'retriable') and result.retriable: + log.debug("Failed to auto-commit offsets: %s, will retry" + " immediately", result) + next_at = time.time() + else: + log.warning("Auto offset commit failed: %s", result) + next_at = time.time() + self._interval + + if not self._enabled: + log.warning("Skipping auto-commit reschedule -- it is disabled") + return + self._reschedule(next_at) + + +# TODO +""" +class ConsumerCoordinatorMetrics(object): + def __init__(self, metrics, prefix, tags): + self.metrics = metrics + self.group_name = prefix + "-coordinator-metrics" + + self.commit_latency = metrics.sensor("commit-latency") + self.commit_latency.add(metrics.MetricName( + "commit-latency-avg", self.group_name, + "The average time taken for a commit request", + tags), metrics.Avg()) + self.commit_latency.add(metrics.MetricName( + "commit-latency-max", self.group_name, + "The max time taken for a commit request", + tags), metrics.Max()) + self.commit_latency.add(metrics.MetricName( + "commit-rate", self.group_name, + "The number of commit calls per second", + tags), metrics.Rate(metrics.Count())) + + ''' + def _num_partitions(config, now): + new Measurable() { + public double measure(MetricConfig config, long now) { + return subscriptions.assignedPartitions().size(); + } + }; + metrics.addMetric(new MetricName("assigned-partitions", + this.metricGrpName, + "The number of partitions currently assigned to this consumer", + tags), + numParts); + ''' +""" diff -Nru python-kafka-python-0.9.2/kafka/coordinator/heartbeat.py python-kafka-python-1.0.1/kafka/coordinator/heartbeat.py --- python-kafka-python-0.9.2/kafka/coordinator/heartbeat.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/coordinator/heartbeat.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,47 @@ +import copy +import time + +import kafka.common as Errors + + +class Heartbeat(object): + DEFAULT_CONFIG = { + 'heartbeat_interval_ms': 3000, + 'session_timeout_ms': 30000, + } + + def __init__(self, **configs): + self.config = copy.copy(self.DEFAULT_CONFIG) + for key in self.config: + if key in configs: + self.config[key] = configs[key] + + assert (self.config['heartbeat_interval_ms'] + <= self.config['session_timeout_ms']), ( + 'Heartbeat interval must be lower than the session timeout') + + self.interval = self.config['heartbeat_interval_ms'] / 1000.0 + self.timeout = self.config['session_timeout_ms'] / 1000.0 + self.last_send = 0 + self.last_receive = 0 + self.last_reset = time.time() + + def sent_heartbeat(self): + self.last_send = time.time() + + def received_heartbeat(self): + self.last_receive = time.time() + + def ttl(self): + last_beat = max(self.last_send, self.last_reset) + return 
max(0, last_beat + self.interval - time.time()) + + def should_heartbeat(self): + return self.ttl() == 0 + + def session_expired(self): + last_recv = max(self.last_receive, self.last_reset) + return (time.time() - last_recv) > self.timeout + + def reset_session_timeout(self): + self.last_reset = time.time() diff -Nru python-kafka-python-0.9.2/kafka/coordinator/protocol.py python-kafka-python-1.0.1/kafka/coordinator/protocol.py --- python-kafka-python-0.9.2/kafka/coordinator/protocol.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/coordinator/protocol.py 2016-02-18 16:38:17.000000000 +0000 @@ -0,0 +1,33 @@ +from __future__ import absolute_import + +from kafka.common import TopicPartition +from kafka.protocol.struct import Struct +from kafka.protocol.types import Array, Bytes, Int16, Int32, Schema, String + + +class ConsumerProtocolMemberMetadata(Struct): + SCHEMA = Schema( + ('version', Int16), + ('subscription', Array(String('utf-8'))), + ('user_data', Bytes)) + + +class ConsumerProtocolMemberAssignment(Struct): + SCHEMA = Schema( + ('version', Int16), + ('assignment', Array( + ('topic', String('utf-8')), + ('partitions', Array(Int32)))), + ('user_data', Bytes)) + + def partitions(self): + return [TopicPartition(topic, partition) + for topic, partitions in self.assignment # pylint: disable-msg=no-member + for partition in partitions] + + +class ConsumerProtocol(object): + PROTOCOL_TYPE = 'consumer' + ASSIGNMENT_STRATEGIES = ('range', 'roundrobin') + METADATA = ConsumerProtocolMemberMetadata + ASSIGNMENT = ConsumerProtocolMemberAssignment diff -Nru python-kafka-python-0.9.2/kafka/future.py python-kafka-python-1.0.1/kafka/future.py --- python-kafka-python-0.9.2/kafka/future.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/future.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,79 @@ +import functools +import logging + +import kafka.common as Errors + +log = logging.getLogger(__name__) + + +class Future(object): + def __init__(self): + self.is_done = False + self.value = None + self.exception = None + self._callbacks = [] + self._errbacks = [] + + def succeeded(self): + return self.is_done and not self.exception + + def failed(self): + return self.is_done and self.exception + + def retriable(self): + try: + return self.exception.retriable + except AttributeError: + return False + + def success(self, value): + assert not self.is_done, 'Future is already complete' + self.value = value + self.is_done = True + for f in self._callbacks: + try: + f(value) + except Exception: + log.exception('Error processing callback') + return self + + def failure(self, e): + assert not self.is_done, 'Future is already complete' + self.exception = e if type(e) is not type else e() + assert isinstance(self.exception, BaseException), ( + 'future failed without an exception') + self.is_done = True + for f in self._errbacks: + try: + f(self.exception) + except Exception: + log.exception('Error processing errback') + return self + + def add_callback(self, f, *args, **kwargs): + if args or kwargs: + f = functools.partial(f, *args, **kwargs) + if self.is_done and not self.exception: + f(self.value) + else: + self._callbacks.append(f) + return self + + def add_errback(self, f, *args, **kwargs): + if args or kwargs: + f = functools.partial(f, *args, **kwargs) + if self.is_done and self.exception: + f(self.exception) + else: + self._errbacks.append(f) + return self + + def add_both(self, f, *args, **kwargs): + self.add_callback(f, *args, **kwargs) + self.add_errback(f, 
*args, **kwargs) + return self + + def chain(self, future): + self.add_callback(future.success) + self.add_errback(future.failure) + return self diff -Nru python-kafka-python-0.9.2/kafka/__init__.py python-kafka-python-1.0.1/kafka/__init__.py --- python-kafka-python-0.9.2/kafka/__init__.py 2014-08-27 21:24:39.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/__init__.py 2016-02-17 18:37:58.000000000 +0000 @@ -1,23 +1,50 @@ __title__ = 'kafka' -# Use setuptools to get version from setup.py -import pkg_resources -__version__ = pkg_resources.require('kafka-python')[0].version -__author__ = 'David Arthur' +from .version import __version__ +__author__ = 'Dana Powers' __license__ = 'Apache License 2.0' -__copyright__ = 'Copyright 2014, David Arthur under Apache License, v2.0' +__copyright__ = 'Copyright 2016 Dana Powers, David Arthur, and Contributors' -from kafka.client import KafkaClient -from kafka.conn import KafkaConnection +# Set default logging handler to avoid "No handler found" warnings. +import logging +try: # Python 2.7+ + from logging import NullHandler +except ImportError: + class NullHandler(logging.Handler): + def emit(self, record): + pass + +logging.getLogger(__name__).addHandler(NullHandler()) + + +from kafka.consumer import KafkaConsumer +from kafka.producer import KafkaProducer +from kafka.conn import BrokerConnection from kafka.protocol import ( - create_message, create_gzip_message, create_snappy_message -) + create_message, create_gzip_message, create_snappy_message) +from kafka.partitioner import RoundRobinPartitioner, HashedPartitioner, Murmur2Partitioner +from kafka.common import TopicPartition + +# To be deprecated when KafkaProducer interface is released +from kafka.client import SimpleClient from kafka.producer import SimpleProducer, KeyedProducer -from kafka.partitioner import RoundRobinPartitioner, HashedPartitioner + +# deprecated in favor of KafkaConsumer from kafka.consumer import SimpleConsumer, MultiProcessConsumer + +import warnings +class KafkaClient(SimpleClient): + def __init__(self, *args, **kwargs): + warnings.warn('The legacy KafkaClient interface has been moved to' + ' kafka.SimpleClient - this import will break in a' + ' future release', DeprecationWarning) + super(KafkaClient, self).__init__(*args, **kwargs) + + __all__ = [ - 'KafkaClient', 'KafkaConnection', 'SimpleProducer', 'KeyedProducer', - 'RoundRobinPartitioner', 'HashedPartitioner', 'SimpleConsumer', - 'MultiProcessConsumer', 'create_message', 'create_gzip_message', - 'create_snappy_message' + 'KafkaConsumer', 'KafkaProducer', 'KafkaClient', 'BrokerConnection', + 'SimpleClient', 'SimpleProducer', 'KeyedProducer', + 'RoundRobinPartitioner', 'HashedPartitioner', + 'create_message', 'create_gzip_message', 'create_snappy_message', + 'SimpleConsumer', 'MultiProcessConsumer', ] diff -Nru python-kafka-python-0.9.2/kafka/partitioner/base.py python-kafka-python-1.0.1/kafka/partitioner/base.py --- python-kafka-python-0.9.2/kafka/partitioner/base.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/partitioner/base.py 2016-01-23 22:22:32.000000000 +0000 @@ -0,0 +1,24 @@ + +class Partitioner(object): + """ + Base class for a partitioner + """ + def __init__(self, partitions): + """ + Initialize the partitioner + + Arguments: + partitions: A list of available partitions (during startup) + """ + self.partitions = partitions + + def partition(self, key, partitions=None): + """ + Takes a string key and num_partitions as argument and returns + a partition to be used for the message + + 
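        A minimal sketch of a concrete subclass honoring this contract; the
        hash-modulo strategy simply mirrors the LegacyPartitioner in
        kafka/partitioner/hashed.py and the class name is illustrative:

            class ModuloPartitioner(Partitioner):
                def partition(self, key, partitions=None):
                    # fall back to the partition list captured at startup
                    if not partitions:
                        partitions = self.partitions
                    return partitions[hash(key) % len(partitions)]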
Arguments: + key: the key to use for partitioning + partitions: (optional) a list of partitions. + """ + raise NotImplementedError('partition function has to be implemented') diff -Nru python-kafka-python-0.9.2/kafka/partitioner/default.py python-kafka-python-1.0.1/kafka/partitioner/default.py --- python-kafka-python-0.9.2/kafka/partitioner/default.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/partitioner/default.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,23 @@ +import random + +from .hashed import murmur2 + + +class DefaultPartitioner(object): + """Default partitioner. + + Hashes key to partition using murmur2 hashing (from java client) + If key is None, selects partition randomly from available, + or from all partitions if none are currently available + """ + @classmethod + def __call__(cls, key, all_partitions, available): + if key is None: + if available: + return random.choice(available) + return random.choice(all_partitions) + + idx = murmur2(key) + idx &= 0x7fffffff + idx %= len(all_partitions) + return all_partitions[idx] diff -Nru python-kafka-python-0.9.2/kafka/partitioner/hashed.py python-kafka-python-1.0.1/kafka/partitioner/hashed.py --- python-kafka-python-0.9.2/kafka/partitioner/hashed.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/partitioner/hashed.py 2016-01-23 22:22:32.000000000 +0000 @@ -0,0 +1,110 @@ +import six + +from .base import Partitioner + + +class Murmur2Partitioner(Partitioner): + """ + Implements a partitioner which selects the target partition based on + the hash of the key. Attempts to apply the same hashing + function as mainline java client. + """ + def partition(self, key, partitions=None): + if not partitions: + partitions = self.partitions + + # https://github.com/apache/kafka/blob/0.8.2/clients/src/main/java/org/apache/kafka/clients/producer/internals/Partitioner.java#L69 + idx = (murmur2(key) & 0x7fffffff) % len(partitions) + + return partitions[idx] + + +class LegacyPartitioner(Partitioner): + """DEPRECATED -- See Issue 374 + + Implements a partitioner which selects the target partition based on + the hash of the key + """ + def partition(self, key, partitions=None): + if not partitions: + partitions = self.partitions + size = len(partitions) + idx = hash(key) % size + + return partitions[idx] + + +# Default will change to Murmur2 in 0.10 release +HashedPartitioner = LegacyPartitioner + + +# https://github.com/apache/kafka/blob/0.8.2/clients/src/main/java/org/apache/kafka/common/utils/Utils.java#L244 +def murmur2(key): + """Pure-python Murmur2 implementation. + + Based on java client, see org.apache.kafka.common.utils.Utils.murmur2 + + Args: + key: if not a bytes type, encoded using default encoding + + Returns: MurmurHash2 of key bytearray + """ + + # Convert key to bytes or bytearray + if isinstance(key, bytearray) or (six.PY3 and isinstance(key, bytes)): + data = key + else: + data = bytearray(str(key).encode()) + + length = len(data) + seed = 0x9747b28c + # 'm' and 'r' are mixing constants generated offline. + # They're not really 'magic', they just happen to work well. 
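    # Note: Python ints do not overflow, so the repeated '& 0xffffffff' masks
    # below emulate Java's 32-bit arithmetic, and '(x % 0x100000000) >> n'
    # stands in for Java's unsigned right shift 'x >>> n'.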
+ m = 0x5bd1e995 + r = 24 + + # Initialize the hash to a random value + h = seed ^ length + length4 = length // 4 + + for i in range(length4): + i4 = i * 4 + k = ((data[i4 + 0] & 0xff) + + ((data[i4 + 1] & 0xff) << 8) + + ((data[i4 + 2] & 0xff) << 16) + + ((data[i4 + 3] & 0xff) << 24)) + k &= 0xffffffff + k *= m + k &= 0xffffffff + k ^= (k % 0x100000000) >> r # k ^= k >>> r + k &= 0xffffffff + k *= m + k &= 0xffffffff + + h *= m + h &= 0xffffffff + h ^= k + h &= 0xffffffff + + # Handle the last few bytes of the input array + extra_bytes = length % 4 + if extra_bytes >= 3: + h ^= (data[(length & ~3) + 2] & 0xff) << 16 + h &= 0xffffffff + if extra_bytes >= 2: + h ^= (data[(length & ~3) + 1] & 0xff) << 8 + h &= 0xffffffff + if extra_bytes >= 1: + h ^= (data[length & ~3] & 0xff) + h &= 0xffffffff + h *= m + h &= 0xffffffff + + h ^= (h % 0x100000000) >> 13 # h >>> 13; + h &= 0xffffffff + h *= m + h &= 0xffffffff + h ^= (h % 0x100000000) >> 15 # h >>> 15; + h &= 0xffffffff + + return h diff -Nru python-kafka-python-0.9.2/kafka/partitioner/__init__.py python-kafka-python-1.0.1/kafka/partitioner/__init__.py --- python-kafka-python-0.9.2/kafka/partitioner/__init__.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/partitioner/__init__.py 2016-01-23 22:22:32.000000000 +0000 @@ -0,0 +1,7 @@ +from .roundrobin import RoundRobinPartitioner +from .hashed import HashedPartitioner, Murmur2Partitioner, LegacyPartitioner + +__all__ = [ + 'RoundRobinPartitioner', 'HashedPartitioner', 'Murmur2Partitioner', + 'LegacyPartitioner' +] diff -Nru python-kafka-python-0.9.2/kafka/partitioner/roundrobin.py python-kafka-python-1.0.1/kafka/partitioner/roundrobin.py --- python-kafka-python-0.9.2/kafka/partitioner/roundrobin.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/partitioner/roundrobin.py 2016-01-23 22:22:32.000000000 +0000 @@ -0,0 +1,23 @@ +from itertools import cycle + +from .base import Partitioner + +class RoundRobinPartitioner(Partitioner): + """ + Implements a round robin partitioner which sends data to partitions + in a round robin fashion + """ + def __init__(self, partitions): + super(RoundRobinPartitioner, self).__init__(partitions) + self.iterpart = cycle(partitions) + + def _set_partitions(self, partitions): + self.partitions = partitions + self.iterpart = cycle(partitions) + + def partition(self, key, partitions=None): + # Refresh the partition list if necessary + if partitions and self.partitions != partitions: + self._set_partitions(partitions) + + return next(self.iterpart) diff -Nru python-kafka-python-0.9.2/kafka/partitioner.py python-kafka-python-1.0.1/kafka/partitioner.py --- python-kafka-python-0.9.2/kafka/partitioner.py 2014-08-27 21:24:39.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/partitioner.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,58 +0,0 @@ -from itertools import cycle - - -class Partitioner(object): - """ - Base class for a partitioner - """ - def __init__(self, partitions): - """ - Initialize the partitioner - - partitions - A list of available partitions (during startup) - """ - self.partitions = partitions - - def partition(self, key, partitions): - """ - Takes a string key and num_partitions as argument and returns - a partition to be used for the message - - partitions - The list of partitions is passed in every call. 
This - may look like an overhead, but it will be useful - (in future) when we handle cases like rebalancing - """ - raise NotImplementedError('partition function has to be implemented') - - -class RoundRobinPartitioner(Partitioner): - """ - Implements a round robin partitioner which sends data to partitions - in a round robin fashion - """ - def __init__(self, partitions): - super(RoundRobinPartitioner, self).__init__(partitions) - self.iterpart = cycle(partitions) - - def _set_partitions(self, partitions): - self.partitions = partitions - self.iterpart = cycle(partitions) - - def partition(self, key, partitions): - # Refresh the partition list if necessary - if self.partitions != partitions: - self._set_partitions(partitions) - - return self.iterpart.next() - - -class HashedPartitioner(Partitioner): - """ - Implements a partitioner which selects the target partition based on - the hash of the key - """ - def partition(self, key, partitions): - size = len(partitions) - idx = hash(key) % size - - return partitions[idx] diff -Nru python-kafka-python-0.9.2/kafka/producer/base.py python-kafka-python-1.0.1/kafka/producer/base.py --- python-kafka-python-0.9.2/kafka/producer/base.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/producer/base.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,475 @@ +from __future__ import absolute_import + +import atexit +import logging +import time + +try: + from queue import Empty, Full, Queue # pylint: disable=import-error +except ImportError: + from Queue import Empty, Full, Queue # pylint: disable=import-error +from collections import defaultdict + +from threading import Thread, Event + +import six + +from kafka.common import ( + ProduceRequestPayload, ProduceResponsePayload, TopicPartition, RetryOptions, + kafka_errors, UnsupportedCodecError, FailedPayloadsError, + RequestTimedOutError, AsyncProducerQueueFull, UnknownError, + RETRY_ERROR_TYPES, RETRY_BACKOFF_ERROR_TYPES, RETRY_REFRESH_ERROR_TYPES +) + +from kafka.protocol import CODEC_NONE, ALL_CODECS, create_message_set + +log = logging.getLogger('kafka.producer') + +BATCH_SEND_DEFAULT_INTERVAL = 20 +BATCH_SEND_MSG_COUNT = 20 + +# unlimited +ASYNC_QUEUE_MAXSIZE = 0 +ASYNC_QUEUE_PUT_TIMEOUT = 0 +# unlimited retries by default +ASYNC_RETRY_LIMIT = None +ASYNC_RETRY_BACKOFF_MS = 100 +ASYNC_RETRY_ON_TIMEOUTS = True +ASYNC_LOG_MESSAGES_ON_ERROR = True + +STOP_ASYNC_PRODUCER = -1 +ASYNC_STOP_TIMEOUT_SECS = 30 + +SYNC_FAIL_ON_ERROR_DEFAULT = True + + +def _send_upstream(queue, client, codec, batch_time, batch_size, + req_acks, ack_timeout, retry_options, stop_event, + log_messages_on_error=ASYNC_LOG_MESSAGES_ON_ERROR, + stop_timeout=ASYNC_STOP_TIMEOUT_SECS, + codec_compresslevel=None): + """Private method to manage producing messages asynchronously + + Listens on the queue for a specified number of messages or until + a specified timeout and then sends messages to the brokers in grouped + requests (one per broker). + + Messages placed on the queue should be tuples that conform to this format: + ((topic, partition), message, key) + + Currently does not mark messages with task_done. Do not attempt to join()! 
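    For illustration, the two tuple shapes placed on this queue elsewhere in
    this module (the topic name, payload, and key below are placeholders):

        # regular payload, as enqueued by Producer._send_messages()
        queue.put((TopicPartition('my-topic', 0), b'payload', b'key'))  # key may be None

        # shutdown sentinel, as enqueued by Producer.stop()
        queue.put((STOP_ASYNC_PRODUCER, None, None))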
+ + Arguments: + queue (threading.Queue): the queue from which to get messages + client (kafka.SimpleClient): instance to use for communicating + with brokers + codec (kafka.protocol.ALL_CODECS): compression codec to use + batch_time (int): interval in seconds to send message batches + batch_size (int): count of messages that will trigger an immediate send + req_acks: required acks to use with ProduceRequests. see server protocol + ack_timeout: timeout to wait for required acks. see server protocol + retry_options (RetryOptions): settings for retry limits, backoff etc + stop_event (threading.Event): event to monitor for shutdown signal. + when this event is 'set', the producer will stop sending messages. + log_messages_on_error (bool, optional): log stringified message-contents + on any produce error, otherwise only log a hash() of the contents, + defaults to True. + stop_timeout (int or float, optional): number of seconds to continue + retrying messages after stop_event is set, defaults to 30. + """ + request_tries = {} + + while not stop_event.is_set(): + try: + client.reinit() + except Exception as e: + log.warn('Async producer failed to connect to brokers; backoff for %s(ms) before retrying', retry_options.backoff_ms) + time.sleep(float(retry_options.backoff_ms) / 1000) + else: + break + + stop_at = None + while not (stop_event.is_set() and queue.empty() and not request_tries): + + # Handle stop_timeout + if stop_event.is_set(): + if not stop_at: + stop_at = stop_timeout + time.time() + if time.time() > stop_at: + log.debug('Async producer stopping due to stop_timeout') + break + + timeout = batch_time + count = batch_size + send_at = time.time() + timeout + msgset = defaultdict(list) + + # Merging messages will require a bit more work to manage correctly + # for now, dont look for new batches if we have old ones to retry + if request_tries: + count = 0 + log.debug('Skipping new batch collection to handle retries') + else: + log.debug('Batching size: %s, timeout: %s', count, timeout) + + # Keep fetching till we gather enough messages or a + # timeout is reached + while count > 0 and timeout >= 0: + try: + topic_partition, msg, key = queue.get(timeout=timeout) + except Empty: + break + + # Check if the controller has requested us to stop + if topic_partition == STOP_ASYNC_PRODUCER: + stop_event.set() + break + + # Adjust the timeout to match the remaining period + count -= 1 + timeout = send_at - time.time() + msgset[topic_partition].append((msg, key)) + + # Send collected requests upstream + for topic_partition, msg in msgset.items(): + messages = create_message_set(msg, codec, key, codec_compresslevel) + req = ProduceRequestPayload( + topic_partition.topic, + topic_partition.partition, + tuple(messages)) + request_tries[req] = 0 + + if not request_tries: + continue + + reqs_to_retry, error_cls = [], None + retry_state = { + 'do_backoff': False, + 'do_refresh': False + } + + def _handle_error(error_cls, request): + if issubclass(error_cls, RETRY_ERROR_TYPES) or (retry_options.retry_on_timeouts and issubclass(error_cls, RequestTimedOutError)): + reqs_to_retry.append(request) + if issubclass(error_cls, RETRY_BACKOFF_ERROR_TYPES): + retry_state['do_backoff'] |= True + if issubclass(error_cls, RETRY_REFRESH_ERROR_TYPES): + retry_state['do_refresh'] |= True + + requests = list(request_tries.keys()) + log.debug('Sending: %s', requests) + responses = client.send_produce_request(requests, + acks=req_acks, + timeout=ack_timeout, + fail_on_error=False) + + log.debug('Received: %s', responses) + 
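        # Each entry in `responses` is either a ProduceResponsePayload (whose
        # .error field carries a protocol error code) or a FailedPayloadsError
        # wrapping the original request; the loop below normalizes both cases
        # into an error class and original request for the retry logic.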
for i, response in enumerate(responses): + error_cls = None + if isinstance(response, FailedPayloadsError): + error_cls = response.__class__ + orig_req = response.payload + + elif isinstance(response, ProduceResponsePayload) and response.error: + error_cls = kafka_errors.get(response.error, UnknownError) + orig_req = requests[i] + + if error_cls: + _handle_error(error_cls, orig_req) + log.error('%s sending ProduceRequestPayload (#%d of %d) ' + 'to %s:%d with msgs %s', + error_cls.__name__, (i + 1), len(requests), + orig_req.topic, orig_req.partition, + orig_req.messages if log_messages_on_error + else hash(orig_req.messages)) + + if not reqs_to_retry: + request_tries = {} + continue + + # doing backoff before next retry + if retry_state['do_backoff'] and retry_options.backoff_ms: + log.warn('Async producer backoff for %s(ms) before retrying', retry_options.backoff_ms) + time.sleep(float(retry_options.backoff_ms) / 1000) + + # refresh topic metadata before next retry + if retry_state['do_refresh']: + log.warn('Async producer forcing metadata refresh metadata before retrying') + try: + client.load_metadata_for_topics() + except Exception: + log.exception("Async producer couldn't reload topic metadata.") + + # Apply retry limit, dropping messages that are over + request_tries = dict( + (key, count + 1) + for (key, count) in request_tries.items() + if key in reqs_to_retry + and (retry_options.limit is None + or (count < retry_options.limit)) + ) + + # Log messages we are going to retry + for orig_req in request_tries.keys(): + log.info('Retrying ProduceRequestPayload to %s:%d with msgs %s', + orig_req.topic, orig_req.partition, + orig_req.messages if log_messages_on_error + else hash(orig_req.messages)) + + if request_tries or not queue.empty(): + log.error('Stopped producer with {0} unsent messages' + .format(len(request_tries) + queue.qsize())) + + +class Producer(object): + """ + Base class to be used by producers + + Arguments: + client (kafka.SimpleClient): instance to use for broker + communications. If async=True, the background thread will use + client.copy(), which is expected to return a thread-safe object. + codec (kafka.protocol.ALL_CODECS): compression codec to use. + req_acks (int, optional): A value indicating the acknowledgements that + the server must receive before responding to the request, + defaults to 1 (local ack). + ack_timeout (int, optional): millisecond timeout to wait for the + configured req_acks, defaults to 1000. + sync_fail_on_error (bool, optional): whether sync producer should + raise exceptions (True), or just return errors (False), + defaults to True. + async (bool, optional): send message using a background thread, + defaults to False. + batch_send_every_n (int, optional): If async is True, messages are + sent in batches of this size, defaults to 20. + batch_send_every_t (int or float, optional): If async is True, + messages are sent immediately after this timeout in seconds, even + if there are fewer than batch_send_every_n, defaults to 20. + async_retry_limit (int, optional): number of retries for failed messages + or None for unlimited, defaults to None / unlimited. + async_retry_backoff_ms (int, optional): milliseconds to backoff on + failed messages, defaults to 100. + async_retry_on_timeouts (bool, optional): whether to retry on + RequestTimedOutError, defaults to True. + async_queue_maxsize (int, optional): limit to the size of the + internal message queue in number of messages (not size), defaults + to 0 (no limit). 
+ async_queue_put_timeout (int or float, optional): timeout seconds + for queue.put in send_messages for async producers -- will only + apply if async_queue_maxsize > 0 and the queue is Full, + defaults to 0 (fail immediately on full queue). + async_log_messages_on_error (bool, optional): set to False and the + async producer will only log hash() contents on failed produce + requests, defaults to True (log full messages). Hash logging + will not allow you to identify the specific message that failed, + but it will allow you to match failures with retries. + async_stop_timeout (int or float, optional): seconds to continue + attempting to send queued messages after producer.stop(), + defaults to 30. + + Deprecated Arguments: + batch_send (bool, optional): If True, messages are sent by a background + thread in batches, defaults to False. Deprecated, use 'async' + """ + ACK_NOT_REQUIRED = 0 # No ack is required + ACK_AFTER_LOCAL_WRITE = 1 # Send response after it is written to log + ACK_AFTER_CLUSTER_COMMIT = -1 # Send response after data is committed + DEFAULT_ACK_TIMEOUT = 1000 + + def __init__(self, client, + req_acks=ACK_AFTER_LOCAL_WRITE, + ack_timeout=DEFAULT_ACK_TIMEOUT, + codec=None, + codec_compresslevel=None, + sync_fail_on_error=SYNC_FAIL_ON_ERROR_DEFAULT, + async=False, + batch_send=False, # deprecated, use async + batch_send_every_n=BATCH_SEND_MSG_COUNT, + batch_send_every_t=BATCH_SEND_DEFAULT_INTERVAL, + async_retry_limit=ASYNC_RETRY_LIMIT, + async_retry_backoff_ms=ASYNC_RETRY_BACKOFF_MS, + async_retry_on_timeouts=ASYNC_RETRY_ON_TIMEOUTS, + async_queue_maxsize=ASYNC_QUEUE_MAXSIZE, + async_queue_put_timeout=ASYNC_QUEUE_PUT_TIMEOUT, + async_log_messages_on_error=ASYNC_LOG_MESSAGES_ON_ERROR, + async_stop_timeout=ASYNC_STOP_TIMEOUT_SECS): + + if async: + assert batch_send_every_n > 0 + assert batch_send_every_t > 0 + assert async_queue_maxsize >= 0 + + self.client = client + self.async = async + self.req_acks = req_acks + self.ack_timeout = ack_timeout + self.stopped = False + + if codec is None: + codec = CODEC_NONE + elif codec not in ALL_CODECS: + raise UnsupportedCodecError("Codec 0x%02x unsupported" % codec) + + self.codec = codec + self.codec_compresslevel = codec_compresslevel + + if self.async: + # Messages are sent through this queue + self.queue = Queue(async_queue_maxsize) + self.async_queue_put_timeout = async_queue_put_timeout + async_retry_options = RetryOptions( + limit=async_retry_limit, + backoff_ms=async_retry_backoff_ms, + retry_on_timeouts=async_retry_on_timeouts) + self.thread_stop_event = Event() + self.thread = Thread( + target=_send_upstream, + args=(self.queue, self.client.copy(), self.codec, + batch_send_every_t, batch_send_every_n, + self.req_acks, self.ack_timeout, + async_retry_options, self.thread_stop_event), + kwargs={'log_messages_on_error': async_log_messages_on_error, + 'stop_timeout': async_stop_timeout, + 'codec_compresslevel': self.codec_compresslevel} + ) + + # Thread will die if main thread exits + self.thread.daemon = True + self.thread.start() + + def cleanup(obj): + if not obj.stopped: + obj.stop() + self._cleanup_func = cleanup + atexit.register(cleanup, self) + else: + self.sync_fail_on_error = sync_fail_on_error + + def send_messages(self, topic, partition, *msg): + """Helper method to send produce requests. + + Note that msg type *must* be encoded to bytes by user. 
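        A minimal synchronous usage sketch based on the constructor arguments
        documented above; the broker address and topic are placeholders, and
        the base class is instantiated directly purely for illustration:

            from kafka import SimpleClient
            from kafka.producer.base import Producer

            client = SimpleClient('localhost:9092')        # placeholder broker
            producer = Producer(client)                    # sync mode (async=False)
            producer.send_messages('my-topic', 0, b'payload must be bytes')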
Passing unicode + message will not work, for example you should encode before calling + send_messages via something like `unicode_message.encode('utf-8')` + All messages will set the message 'key' to None. + + Arguments: + topic (str): name of topic for produce request + partition (int): partition number for produce request + *msg (bytes): one or more message payloads + + Returns: + ResponseRequest returned by server + + Raises: + FailedPayloadsError: low-level connection error, can be caused by + networking failures, or a malformed request. + ConnectionError: + KafkaUnavailableError: all known brokers are down when attempting + to refresh metadata. + LeaderNotAvailableError: topic or partition is initializing or + a broker failed and leadership election is in progress. + NotLeaderForPartitionError: metadata is out of sync; the broker + that the request was sent to is not the leader for the topic + or partition. + UnknownTopicOrPartitionError: the topic or partition has not + been created yet and auto-creation is not available. + AsyncProducerQueueFull: in async mode, if too many messages are + unsent and remain in the internal queue. + """ + return self._send_messages(topic, partition, *msg) + + def _send_messages(self, topic, partition, *msg, **kwargs): + key = kwargs.pop('key', None) + + # Guarantee that msg is actually a list or tuple (should always be true) + if not isinstance(msg, (list, tuple)): + raise TypeError("msg is not a list or tuple!") + + for m in msg: + # The protocol allows to have key & payload with null values both, + # (https://goo.gl/o694yN) but having (null,null) pair doesn't make sense. + if m is None: + if key is None: + raise TypeError("key and payload can't be null in one") + # Raise TypeError if any non-null message is not encoded as bytes + elif not isinstance(m, six.binary_type): + raise TypeError("all produce message payloads must be null or type bytes") + + # Raise TypeError if the key is not encoded as bytes + if key is not None and not isinstance(key, six.binary_type): + raise TypeError("the key must be type bytes") + + if self.async: + for idx, m in enumerate(msg): + try: + item = (TopicPartition(topic, partition), m, key) + if self.async_queue_put_timeout == 0: + self.queue.put_nowait(item) + else: + self.queue.put(item, True, self.async_queue_put_timeout) + except Full: + raise AsyncProducerQueueFull( + msg[idx:], + 'Producer async queue overfilled. ' + 'Current queue size %d.' % self.queue.qsize()) + resp = [] + else: + messages = create_message_set([(m, key) for m in msg], self.codec, key, self.codec_compresslevel) + req = ProduceRequestPayload(topic, partition, messages) + try: + resp = self.client.send_produce_request( + [req], acks=self.req_acks, timeout=self.ack_timeout, + fail_on_error=self.sync_fail_on_error + ) + except Exception: + log.exception("Unable to send messages") + raise + return resp + + def stop(self, timeout=None): + """ + Stop the producer (async mode). Blocks until async thread completes. 
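        A sketch of the async mode this applies to, reusing the SimpleClient
        from the sketch above; the option values and topic are arbitrary:

            producer = Producer(client, async=True,
                                batch_send_every_n=50,   # flush after 50 queued messages
                                batch_send_every_t=5)    # ...or after 5 seconds
            producer.send_messages('my-topic', 0, b'queued for the background thread')
            producer.stop()   # enqueues STOP_ASYNC_PRODUCER and joins the sender thread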
+ """ + if timeout is not None: + log.warning('timeout argument to stop() is deprecated - ' + 'it will be removed in future release') + + if not self.async: + log.warning('producer.stop() called, but producer is not async') + return + + if self.stopped: + log.warning('producer.stop() called, but producer is already stopped') + return + + if self.async: + self.queue.put((STOP_ASYNC_PRODUCER, None, None)) + self.thread_stop_event.set() + self.thread.join() + + if hasattr(self, '_cleanup_func'): + # Remove cleanup handler now that we've stopped + + # py3 supports unregistering + if hasattr(atexit, 'unregister'): + atexit.unregister(self._cleanup_func) # pylint: disable=no-member + + # py2 requires removing from private attribute... + else: + + # ValueError on list.remove() if the exithandler no longer exists + # but that is fine here + try: + atexit._exithandlers.remove( # pylint: disable=no-member + (self._cleanup_func, (self,), {})) + except ValueError: + pass + + del self._cleanup_func + + self.stopped = True + + def __del__(self): + if not self.stopped: + self.stop() diff -Nru python-kafka-python-0.9.2/kafka/producer/buffer.py python-kafka-python-1.0.1/kafka/producer/buffer.py --- python-kafka-python-0.9.2/kafka/producer/buffer.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/producer/buffer.py 2016-02-19 05:47:44.000000000 +0000 @@ -0,0 +1,392 @@ +from __future__ import absolute_import + +import collections +import io +import threading +import time + +from ..codec import (has_gzip, has_snappy, has_lz4, + gzip_encode, snappy_encode, lz4_encode) +from ..protocol.types import Int32, Int64 +from ..protocol.message import MessageSet, Message + +import kafka.common as Errors + + +class MessageSetBuffer(object): + """Wrap a buffer for writing MessageSet batches. + + Arguments: + buf (IO stream): a buffer for writing data. Typically BytesIO. + batch_size (int): maximum number of bytes to write to the buffer. + + Keyword Arguments: + compression_type ('gzip', 'snappy', None): compress messages before + publishing. Default: None. + """ + _COMPRESSORS = { + 'gzip': (has_gzip, gzip_encode, Message.CODEC_GZIP), + 'snappy': (has_snappy, snappy_encode, Message.CODEC_SNAPPY), + 'lz4': (has_lz4, lz4_encode, Message.CODEC_LZ4), + } + def __init__(self, buf, batch_size, compression_type=None): + if compression_type is not None: + assert compression_type in self._COMPRESSORS, 'Unrecognized compression type' + checker, encoder, attributes = self._COMPRESSORS[compression_type] + assert checker(), 'Compression Libraries Not Found' + self._compressor = encoder + self._compression_attributes = attributes + else: + self._compressor = None + self._compression_attributes = None + + self._buffer = buf + # Init MessageSetSize to 0 -- update on close + self._buffer.seek(0) + self._buffer.write(Int32.encode(0)) + self._batch_size = batch_size + self._closed = False + self._messages = 0 + + def append(self, offset, message): + """Apend a Message to the MessageSet. 
+ + Arguments: + offset (int): offset of the message + message (Message or bytes): message struct or encoded bytes + """ + if isinstance(message, Message): + encoded = message.encode() + else: + encoded = bytes(message) + msg = Int64.encode(offset) + Int32.encode(len(encoded)) + encoded + self._buffer.write(msg) + self._messages += 1 + + def has_room_for(self, key, value): + if self._closed: + return False + if not self._messages: + return True + needed_bytes = MessageSet.HEADER_SIZE + Message.HEADER_SIZE + if key is not None: + needed_bytes += len(key) + if value is not None: + needed_bytes += len(value) + return self._buffer.tell() + needed_bytes < self._batch_size + + def is_full(self): + if self._closed: + return True + return self._buffer.tell() >= self._batch_size + + def close(self): + if self._compressor: + # TODO: avoid copies with bytearray / memoryview + self._buffer.seek(4) + msg = Message(self._compressor(self._buffer.read()), + attributes=self._compression_attributes) + encoded = msg.encode() + self._buffer.seek(4) + self._buffer.write(Int64.encode(0)) # offset 0 for wrapper msg + self._buffer.write(Int32.encode(len(encoded))) + self._buffer.write(encoded) + + # Update the message set size, and return ready for full read() + size = self._buffer.tell() - 4 + self._buffer.seek(0) + self._buffer.write(Int32.encode(size)) + self._buffer.seek(0) + self._closed = True + + def size_in_bytes(self): + return self._buffer.tell() + + def buffer(self): + return self._buffer + + +class SimpleBufferPool(object): + """A simple pool of BytesIO objects with a weak memory ceiling.""" + def __init__(self, memory, poolable_size): + """Create a new buffer pool. + + Arguments: + memory (int): maximum memory that this buffer pool can allocate + poolable_size (int): memory size per buffer to cache in the free + list rather than deallocating + """ + self._poolable_size = poolable_size + self._lock = threading.RLock() + + buffers = int(memory / poolable_size) if poolable_size else 0 + self._free = collections.deque([io.BytesIO() for _ in range(buffers)]) + + self._waiters = collections.deque() + #self.metrics = metrics; + #self.waitTime = this.metrics.sensor("bufferpool-wait-time"); + #MetricName metricName = metrics.metricName("bufferpool-wait-ratio", metricGrpName, "The fraction of time an appender waits for space allocation."); + #this.waitTime.add(metricName, new Rate(TimeUnit.NANOSECONDS)); + + def allocate(self, size, max_time_to_block_ms): + """ + Allocate a buffer of the given size. This method blocks if there is not + enough memory and the buffer pool is configured with blocking mode. 
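        A usage sketch of the pool contract described here (sizes are
        placeholders):

            pool = SimpleBufferPool(memory=32 * 1024 * 1024, poolable_size=16384)
            buf = pool.allocate(16384, max_time_to_block_ms=1000)   # io.BytesIO
            try:
                buf.write(b'batch bytes')
            finally:
                pool.deallocate(buf)   # returns the buffer to the free list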
+ + Arguments: + size (int): The buffer size to allocate in bytes [ignored] + max_time_to_block_ms (int): The maximum time in milliseconds to + block for buffer memory to be available + + Returns: + io.BytesIO + """ + with self._lock: + # check if we have a free buffer of the right size pooled + if self._free: + return self._free.popleft() + + elif self._poolable_size == 0: + return io.BytesIO() + + else: + # we are out of buffers and will have to block + buf = None + more_memory = threading.Condition(self._lock) + self._waiters.append(more_memory) + # loop over and over until we have a buffer or have reserved + # enough memory to allocate one + while buf is None: + start_wait = time.time() + more_memory.wait(max_time_to_block_ms / 1000.0) + end_wait = time.time() + #this.waitTime.record(endWait - startWait, time.milliseconds()); + + if self._free: + buf = self._free.popleft() + else: + raise Errors.KafkaTimeoutError( + "Failed to allocate memory within the configured" + " max blocking time") + + # remove the condition for this thread to let the next thread + # in line start getting memory + removed = self._waiters.popleft() + assert removed is more_memory, 'Wrong condition' + + # signal any additional waiters if there is more memory left + # over for them + if self._free and self._waiters: + self._waiters[0].notify() + + # unlock and return the buffer + return buf + + def deallocate(self, buf): + """ + Return buffers to the pool. If they are of the poolable size add them + to the free list, otherwise just mark the memory as free. + + Arguments: + buffer_ (io.BytesIO): The buffer to return + """ + with self._lock: + capacity = buf.seek(0, 2) + + # free extra memory if needed + if capacity > self._poolable_size: + # BytesIO (cpython) only frees memory if 2x reduction or more + trunc_to = int(min(capacity / 2, self._poolable_size)) + buf.truncate(trunc_to) + + buf.seek(0) + #buf.write(bytearray(12)) + #buf.seek(0) + self._free.append(buf) + + if self._waiters: + self._waiters[0].notify() + + def queued(self): + """The number of threads blocked waiting on memory.""" + with self._lock: + return len(self._waiters) + +''' +class BufferPool(object): + """ + A pool of ByteBuffers kept under a given memory limit. This class is fairly + specific to the needs of the producer. In particular it has the following + properties: + + * There is a special "poolable size" and buffers of this size are kept in a + free list and recycled + * It is fair. That is all memory is given to the longest waiting thread + until it has sufficient memory. This prevents starvation or deadlock when + a thread asks for a large chunk of memory and needs to block until + multiple buffers are deallocated. + """ + def __init__(self, memory, poolable_size): + """Create a new buffer pool. 
+ + Arguments: + memory (int): maximum memory that this buffer pool can allocate + poolable_size (int): memory size per buffer to cache in the free + list rather than deallocating + """ + self._poolable_size = poolable_size + self._lock = threading.RLock() + self._free = collections.deque() + self._waiters = collections.deque() + self._total_memory = memory + self._available_memory = memory + #self.metrics = metrics; + #self.waitTime = this.metrics.sensor("bufferpool-wait-time"); + #MetricName metricName = metrics.metricName("bufferpool-wait-ratio", metricGrpName, "The fraction of time an appender waits for space allocation."); + #this.waitTime.add(metricName, new Rate(TimeUnit.NANOSECONDS)); + + def allocate(self, size, max_time_to_block_ms): + """ + Allocate a buffer of the given size. This method blocks if there is not + enough memory and the buffer pool is configured with blocking mode. + + Arguments: + size (int): The buffer size to allocate in bytes + max_time_to_block_ms (int): The maximum time in milliseconds to + block for buffer memory to be available + + Returns: + buffer + + Raises: + InterruptedException If the thread is interrupted while blocked + IllegalArgumentException if size is larger than the total memory + controlled by the pool (and hence we would block forever) + """ + assert size <= self._total_memory, ( + "Attempt to allocate %d bytes, but there is a hard limit of %d on" + " memory allocations." % (size, self._total_memory)) + + with self._lock: + # check if we have a free buffer of the right size pooled + if (size == self._poolable_size and len(self._free) > 0): + return self._free.popleft() + + # now check if the request is immediately satisfiable with the + # memory on hand or if we need to block + free_list_size = len(self._free) * self._poolable_size + if self._available_memory + free_list_size >= size: + # we have enough unallocated or pooled memory to immediately + # satisfy the request + self._free_up(size) + self._available_memory -= size + raise NotImplementedError() + #return ByteBuffer.allocate(size) + else: + # we are out of memory and will have to block + accumulated = 0 + buf = None + more_memory = threading.Condition(self._lock) + self._waiters.append(more_memory) + # loop over and over until we have a buffer or have reserved + # enough memory to allocate one + while (accumulated < size): + start_wait = time.time() + if not more_memory.wait(max_time_to_block_ms / 1000.0): + raise Errors.KafkaTimeoutError( + "Failed to allocate memory within the configured" + " max blocking time") + end_wait = time.time() + #this.waitTime.record(endWait - startWait, time.milliseconds()); + + # check if we can satisfy this request from the free list, + # otherwise allocate memory + if (accumulated == 0 + and size == self._poolable_size + and self._free): + + # just grab a buffer from the free list + buf = self._free.popleft() + accumulated = size + else: + # we'll need to allocate memory, but we may only get + # part of what we need on this iteration + self._free_up(size - accumulated) + got = min(size - accumulated, self._available_memory) + self._available_memory -= got + accumulated += got + + # remove the condition for this thread to let the next thread + # in line start getting memory + removed = self._waiters.popleft() + assert removed is more_memory, 'Wrong condition' + + # signal any additional waiters if there is more memory left + # over for them + if (self._available_memory > 0 or len(self._free) > 0): + if len(self._waiters) > 0: + 
self._waiters[0].notify() + + # unlock and return the buffer + if buf is None: + raise NotImplementedError() + #return ByteBuffer.allocate(size) + else: + return buf + + def _free_up(self, size): + """ + Attempt to ensure we have at least the requested number of bytes of + memory for allocation by deallocating pooled buffers (if needed) + """ + while self._free and self._available_memory < size: + self._available_memory += self._free.pop().capacity + + def deallocate(self, buffer_, size=None): + """ + Return buffers to the pool. If they are of the poolable size add them + to the free list, otherwise just mark the memory as free. + + Arguments: + buffer (io.BytesIO): The buffer to return + size (int): The size of the buffer to mark as deallocated, note + that this maybe smaller than buffer.capacity since the buffer + may re-allocate itself during in-place compression + """ + with self._lock: + if size is None: + size = buffer_.capacity + if (size == self._poolable_size and size == buffer_.capacity): + buffer_.seek(0) + buffer_.truncate() + self._free.append(buffer_) + else: + self._available_memory += size + + if self._waiters: + more_mem = self._waiters[0] + more_mem.notify() + + def available_memory(self): + """The total free memory both unallocated and in the free list.""" + with self._lock: + return self._available_memory + len(self._free) * self._poolable_size + + def unallocated_memory(self): + """Get the unallocated memory (not in the free list or in use).""" + with self._lock: + return self._available_memory + + def queued(self): + """The number of threads blocked waiting on memory.""" + with self._lock: + return len(self._waiters) + + def poolable_size(self): + """The buffer size that will be retained in the free list after use.""" + return self._poolable_size + + def total_memory(self): + """The total memory managed by this pool.""" + return self._total_memory +''' diff -Nru python-kafka-python-0.9.2/kafka/producer/future.py python-kafka-python-1.0.1/kafka/producer/future.py --- python-kafka-python-0.9.2/kafka/producer/future.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/producer/future.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,66 @@ +from __future__ import absolute_import + +import collections +import threading + +from ..future import Future + +import kafka.common as Errors + + +class FutureProduceResult(Future): + def __init__(self, topic_partition): + super(FutureProduceResult, self).__init__() + self.topic_partition = topic_partition + self._latch = threading.Event() + + def success(self, value): + ret = super(FutureProduceResult, self).success(value) + self._latch.set() + return ret + + def failure(self, error): + ret = super(FutureProduceResult, self).failure(error) + self._latch.set() + return ret + + def await(self, timeout=None): + return self._latch.wait(timeout) + + +class FutureRecordMetadata(Future): + def __init__(self, produce_future, relative_offset): + super(FutureRecordMetadata, self).__init__() + self._produce_future = produce_future + self.relative_offset = relative_offset + produce_future.add_callback(self._produce_success) + produce_future.add_errback(self.failure) + + def _produce_success(self, base_offset): + self.success(RecordMetadata(self._produce_future.topic_partition, + base_offset, self.relative_offset)) + + def get(self, timeout=None): + if not self.is_done and not self._produce_future.await(timeout): + raise Errors.KafkaTimeoutError( + "Timeout after waiting for %s secs." 
% timeout) + assert self.is_done + if self.failed(): + raise self.exception # pylint: disable-msg=raising-bad-type + return self.value + + +class RecordMetadata(collections.namedtuple( + 'RecordMetadata', 'topic partition topic_partition offset')): + def __new__(cls, tp, base_offset, relative_offset=None): + offset = base_offset + if relative_offset is not None and base_offset != -1: + offset += relative_offset + return super(RecordMetadata, cls).__new__(cls, tp.topic, tp.partition, tp, offset) + + def __str__(self): + return 'RecordMetadata(topic=%s, partition=%s, offset=%s)' % ( + self.topic, self.partition, self.offset) + + def __repr__(self): + return str(self) diff -Nru python-kafka-python-0.9.2/kafka/producer/__init__.py python-kafka-python-1.0.1/kafka/producer/__init__.py --- python-kafka-python-0.9.2/kafka/producer/__init__.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/producer/__init__.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,8 @@ +from .kafka import KafkaProducer +from .simple import SimpleProducer +from .keyed import KeyedProducer + +__all__ = [ + 'KafkaProducer', + 'SimpleProducer', 'KeyedProducer' # deprecated +] diff -Nru python-kafka-python-0.9.2/kafka/producer/kafka.py python-kafka-python-1.0.1/kafka/producer/kafka.py --- python-kafka-python-0.9.2/kafka/producer/kafka.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/producer/kafka.py 2016-02-19 05:47:44.000000000 +0000 @@ -0,0 +1,491 @@ +from __future__ import absolute_import + +import atexit +import copy +import logging +import signal +import threading +import time + +from ..client_async import KafkaClient +from ..common import TopicPartition +from ..partitioner.default import DefaultPartitioner +from ..protocol.message import Message, MessageSet +from .future import FutureRecordMetadata, FutureProduceResult +from .record_accumulator import AtomicInteger, RecordAccumulator +from .sender import Sender + +import kafka.common as Errors + +log = logging.getLogger(__name__) +PRODUCER_CLIENT_ID_SEQUENCE = AtomicInteger() + + +class KafkaProducer(object): + """A Kafka client that publishes records to the Kafka cluster. + + The producer is thread safe and sharing a single producer instance across + threads will generally be faster than having multiple instances. + + The producer consists of a pool of buffer space that holds records that + haven't yet been transmitted to the server as well as a background I/O + thread that is responsible for turning these records into requests and + transmitting them to the cluster. + + The send() method is asynchronous. When called it adds the record to a + buffer of pending record sends and immediately returns. This allows the + producer to batch together individual records for efficiency. + + The 'acks' config controls the criteria under which requests are considered + complete. The "all" setting will result in blocking on the full commit of + the record, the slowest but most durable setting. + + If the request fails, the producer can automatically retry, unless + 'retries' is configured to 0. Enabling retries also opens up the + possibility of duplicates (see the documentation on message + delivery semantics for details: + http://kafka.apache.org/documentation.html#semantics + ). + + The producer maintains buffers of unsent records for each partition. These + buffers are of a size specified by the 'batch_size' config. 
Making this + larger can result in more batching, but requires more memory (since we will + generally have one of these buffers for each active partition). + + By default a buffer is available to send immediately even if there is + additional unused space in the buffer. However if you want to reduce the + number of requests you can set 'linger_ms' to something greater than 0. + This will instruct the producer to wait up to that number of milliseconds + before sending a request in hope that more records will arrive to fill up + the same batch. This is analogous to Nagle's algorithm in TCP. Note that + records that arrive close together in time will generally batch together + even with linger_ms=0 so under heavy load batching will occur regardless of + the linger configuration; however setting this to something larger than 0 + can lead to fewer, more efficient requests when not under maximal load at + the cost of a small amount of latency. + + The buffer_memory controls the total amount of memory available to the + producer for buffering. If records are sent faster than they can be + transmitted to the server then this buffer space will be exhausted. When + the buffer space is exhausted additional send calls will block. + + The key_serializer and value_serializer instruct how to turn the key and + value objects the user provides into bytes. + + Keyword Arguments: + bootstrap_servers: 'host[:port]' string (or list of 'host[:port]' + strings) that the producer should contact to bootstrap initial + cluster metadata. This does not have to be the full node list. + It just needs to have at least one broker that will respond to a + Metadata API Request. Default port is 9092. If no servers are + specified, will default to localhost:9092. + client_id (str): a name for this client. This string is passed in + each request to servers and can be used to identify specific + server-side log entries that correspond to this client. + Default: 'kafka-python-producer-#' (appended with a unique number + per instance) + key_serializer (callable): used to convert user-supplied keys to bytes + If not None, called as f(key), should return bytes. Default: None. + value_serializer (callable): used to convert user-supplied message + values to bytes. If not None, called as f(value), should return + bytes. Default: None. + acks (0, 1, 'all'): The number of acknowledgments the producer requires + the leader to have received before considering a request complete. + This controls the durability of records that are sent. The + following settings are common: + + 0: Producer will not wait for any acknowledgment from the server. + The message will immediately be added to the socket + buffer and considered sent. No guarantee can be made that the + server has received the record in this case, and the retries + configuration will not take effect (as the client won't + generally know of any failures). The offset given back for each + record will always be set to -1. + 1: Wait for leader to write the record to its local log only. + Broker will respond without awaiting full acknowledgement from + all followers. In this case should the leader fail immediately + after acknowledging the record but before the followers have + replicated it then the record will be lost. + all: Wait for the full set of in-sync replicas to write the record. + This guarantees that the record will not be lost as long as at + least one in-sync replica remains alive. This is the strongest + available guarantee. + If unset, defaults to acks=1. 
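As a concrete illustration of the acks trade-off described above, the sketch below builds a producer that waits for the full in-sync replica set and then blocks on the returned future for a synchronous send; the broker address and topic name are placeholders, and the error class assumes kafka.common.KafkaError as the common base exception.

from kafka import KafkaProducer
from kafka.common import KafkaError

producer = KafkaProducer(bootstrap_servers='localhost:9092', acks='all')
future = producer.send('my-topic', b'raw-bytes-payload')
try:
    metadata = future.get(timeout=10)   # seconds; blocks until acked or failed
    print(metadata.topic, metadata.partition, metadata.offset)
except KafkaError as exc:
    print('send failed:', exc)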
+ compression_type (str): The compression type for all data generated by + the producer. Valid values are 'gzip', 'snappy', 'lz4', or None. + Compression is of full batches of data, so the efficacy of batching + will also impact the compression ratio (more batching means better + compression). Default: None. + retries (int): Setting a value greater than zero will cause the client + to resend any record whose send fails with a potentially transient + error. Note that this retry is no different than if the client + resent the record upon receiving the error. Allowing retries will + potentially change the ordering of records because if two records + are sent to a single partition, and the first fails and is retried + but the second succeeds, then the second record may appear first. + Default: 0. + batch_size (int): Requests sent to brokers will contain multiple + batches, one for each partition with data available to be sent. + A small batch size will make batching less common and may reduce + throughput (a batch size of zero will disable batching entirely). + Default: 16384 + linger_ms (int): The producer groups together any records that arrive + in between request transmissions into a single batched request. + Normally this occurs only under load when records arrive faster + than they can be sent out. However in some circumstances the client + may want to reduce the number of requests even under moderate load. + This setting accomplishes this by adding a small amount of + artificial delay; that is, rather than immediately sending out a + record the producer will wait for up to the given delay to allow + other records to be sent so that the sends can be batched together. + This can be thought of as analogous to Nagle's algorithm in TCP. + This setting gives the upper bound on the delay for batching: once + we get batch_size worth of records for a partition it will be sent + immediately regardless of this setting, however if we have fewer + than this many bytes accumulated for this partition we will + 'linger' for the specified time waiting for more records to show + up. This setting defaults to 0 (i.e. no delay). Setting linger_ms=5 + would have the effect of reducing the number of requests sent but + would add up to 5ms of latency to records sent in the absense of + load. Default: 0. + partitioner (callable): Callable used to determine which partition + each message is assigned to. Called (after key serialization): + partitioner(key_bytes, all_partitions, available_partitions). + The default partitioner implementation hashes each non-None key + using the same murmur2 algorithm as the java client so that + messages with the same key are assigned to the same partition. + When a key is None, the message is delivered to a random partition + (filtered to partitions with available leaders only, if possible). + buffer_memory (int): The total bytes of memory the producer should use + to buffer records waiting to be sent to the server. If records are + sent faster than they can be delivered to the server the producer + will block up to max_block_ms, raising an exception on timeout. + In the current implementation, this setting is an approximation. + Default: 33554432 (32MB) + max_block_ms (int): Number of milliseconds to block during send() + when attempting to allocate additional memory before raising an + exception. Default: 60000. + max_request_size (int): The maximum size of a request. This is also + effectively a cap on the maximum record size. 
Note that the server + has its own cap on record size which may be different from this. + This setting will limit the number of record batches the producer + will send in a single request to avoid sending huge requests. + Default: 1048576. + metadata_max_age_ms (int): The period of time in milliseconds after + which we force a refresh of metadata even if we haven't seen any + partition leadership changes to proactively discover any new + brokers or partitions. Default: 300000 + retry_backoff_ms (int): Milliseconds to backoff when retrying on + errors. Default: 100. + request_timeout_ms (int): Client request timeout in milliseconds. + Default: 30000. + receive_buffer_bytes (int): The size of the TCP receive buffer + (SO_RCVBUF) to use when reading data. Default: None (relies on + system defaults). Java client defaults to 32768. + send_buffer_bytes (int): The size of the TCP send buffer + (SO_SNDBUF) to use when sending data. Default: None (relies on + system defaults). Java client defaults to 131072. + reconnect_backoff_ms (int): The amount of time in milliseconds to + wait before attempting to reconnect to a given host. + Default: 50. + max_in_flight_requests_per_connection (int): Requests are pipelined + to kafka brokers up to this number of maximum requests per + broker connection. Default: 5. + api_version (str): specify which kafka API version to use. + If set to 'auto', will attempt to infer the broker version by + probing various APIs. Default: auto + + Note: + Configuration parameters are described in more detail at + https://kafka.apache.org/090/configuration.html#producerconfigs + """ + _DEFAULT_CONFIG = { + 'bootstrap_servers': 'localhost', + 'client_id': None, + 'key_serializer': None, + 'value_serializer': None, + 'acks': 1, + 'compression_type': None, + 'retries': 0, + 'batch_size': 16384, + 'linger_ms': 0, + 'partitioner': DefaultPartitioner(), + 'buffer_memory': 33554432, + 'connections_max_idle_ms': 600000, # not implemented yet + 'max_block_ms': 60000, + 'max_request_size': 1048576, + 'metadata_max_age_ms': 300000, + 'retry_backoff_ms': 100, + 'request_timeout_ms': 30000, + 'receive_buffer_bytes': None, + 'send_buffer_bytes': None, + 'reconnect_backoff_ms': 50, + 'max_in_flight_requests_per_connection': 5, + 'api_version': 'auto', + } + + def __init__(self, **configs): + log.debug("Starting the Kafka producer") # trace + self.config = copy.copy(self._DEFAULT_CONFIG) + for key in self.config: + if key in configs: + self.config[key] = configs.pop(key) + + # Only check for extra config keys in top-level class + assert not configs, 'Unrecognized configs: %s' % configs + + if self.config['client_id'] is None: + self.config['client_id'] = 'kafka-python-producer-%s' % \ + PRODUCER_CLIENT_ID_SEQUENCE.increment() + + if self.config['acks'] == 'all': + self.config['acks'] = -1 + + client = KafkaClient(**self.config) + + # Check Broker Version if not set explicitly + if self.config['api_version'] == 'auto': + self.config['api_version'] = client.check_version() + assert self.config['api_version'] in ('0.9', '0.8.2', '0.8.1', '0.8.0') + + # Convert api_version config to tuple for easy comparisons + self.config['api_version'] = tuple( + map(int, self.config['api_version'].split('.'))) + + if self.config['compression_type'] == 'lz4': + assert self.config['api_version'] >= (0, 8, 2), 'LZ4 Requires >= Kafka 0.8.2 Brokers' + + self._accumulator = RecordAccumulator(**self.config) + self._metadata = client.cluster + self._sender = Sender(client, self._metadata, self._accumulator, + 
**self.config) + self._sender.daemon = True + self._sender.start() + self._closed = False + atexit.register(self.close, timeout=0) + log.debug("Kafka producer started") + + def __del__(self): + self.close(timeout=0) + + def close(self, timeout=None): + """Close this producer.""" + if not hasattr(self, '_closed') or self._closed: + log.info('Kafka producer closed') + return + if timeout is None: + timeout = 999999999 + assert timeout >= 0 + + log.info("Closing the Kafka producer with %s secs timeout.", timeout) + #first_exception = AtomicReference() # this will keep track of the first encountered exception + invoked_from_callback = bool(threading.current_thread() is self._sender) + if timeout > 0: + if invoked_from_callback: + log.warning("Overriding close timeout %s secs to 0 in order to" + " prevent useless blocking due to self-join. This" + " means you have incorrectly invoked close with a" + " non-zero timeout from the producer call-back.", + timeout) + else: + # Try to close gracefully. + if self._sender is not None: + self._sender.initiate_close() + self._sender.join(timeout) + + if self._sender is not None and self._sender.is_alive(): + + log.info("Proceeding to force close the producer since pending" + " requests could not be completed within timeout %s.", + timeout) + self._sender.force_close() + # Only join the sender thread when not calling from callback. + if not invoked_from_callback: + self._sender.join() + + try: + self.config['key_serializer'].close() + except AttributeError: + pass + try: + self.config['value_serializer'].close() + except AttributeError: + pass + self._closed = True + log.debug("The Kafka producer has closed.") + + def partitions_for(self, topic): + """Returns set of all known partitions for the topic.""" + max_wait = self.config['max_block_ms'] / 1000.0 + return self._wait_on_metadata(topic, max_wait) + + def send(self, topic, value=None, key=None, partition=None): + """Publish a message to a topic. + + Arguments: + topic (str): topic where the message will be published + value (optional): message value. Must be type bytes, or be + serializable to bytes via configured value_serializer. If value + is None, key is required and message acts as a 'delete'. + See kafka compaction documentation for more details: + http://kafka.apache.org/documentation.html#compaction + (compaction requires kafka >= 0.8.1) + partition (int, optional): optionally specify a partition. If not + set, the partition will be selected using the configured + 'partitioner'. + key (optional): a key to associate with the message. Can be used to + determine which partition to send the message to. If partition + is None (and producer's partitioner config is left as default), + then messages with the same key will be delivered to the same + partition (but if key is None, partition is chosen randomly). + Must be type bytes, or be serializable to bytes via configured + key_serializer. 
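To make the key semantics above concrete, here is a hedged sketch (bootstrap server, topic name, and serializer lambdas are placeholders) showing that records sharing a key are routed to one partition by the default partitioner, while records with key=None are spread across available partitions.

from kafka import KafkaProducer

producer = KafkaProducer(
    bootstrap_servers='localhost:9092',
    key_serializer=lambda k: k.encode('utf-8') if k is not None else None,
    value_serializer=lambda v: v.encode('utf-8'))

producer.send('user-events', key='user-42', value='clicked')
producer.send('user-events', key='user-42', value='purchased')  # same partition
producer.send('user-events', value='anonymous page view')       # random partition
producer.flush()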
+ + Returns: + FutureRecordMetadata: resolves to RecordMetadata + + Raises: + KafkaTimeoutError: if unable to fetch topic metadata, or unable + to obtain memory buffer prior to configured max_block_ms + """ + assert value is not None or self.config['api_version'] >= (0, 8, 1), ( + 'Null messages require kafka >= 0.8.1') + assert not (value is None and key is None), 'Need at least one: key or value' + try: + # first make sure the metadata for the topic is + # available + self._wait_on_metadata(topic, self.config['max_block_ms'] / 1000.0) + + key_bytes, value_bytes = self._serialize(topic, key, value) + partition = self._partition(topic, partition, key, value, + key_bytes, value_bytes) + + message_size = MessageSet.HEADER_SIZE + Message.HEADER_SIZE + if key_bytes is not None: + message_size += len(key_bytes) + if value_bytes is not None: + message_size += len(value_bytes) + self._ensure_valid_record_size(message_size) + + tp = TopicPartition(topic, partition) + log.debug("Sending (key=%s value=%s) to %s", key, value, tp) + result = self._accumulator.append(tp, key_bytes, value_bytes, + self.config['max_block_ms']) + future, batch_is_full, new_batch_created = result + if batch_is_full or new_batch_created: + log.debug("Waking up the sender since %s is either full or" + " getting a new batch", tp) + self._sender.wakeup() + + return future + # handling exceptions and record the errors; + # for API exceptions return them in the future, + # for other exceptions raise directly + except Errors.KafkaTimeoutError: + raise + except AssertionError: + raise + except Exception as e: + log.debug("Exception occurred during message send: %s", e) + return FutureRecordMetadata( + FutureProduceResult(TopicPartition(topic, partition)), + -1).failure(e) + + def flush(self): + """ + Invoking this method makes all buffered records immediately available + to send (even if linger_ms is greater than 0) and blocks on the + completion of the requests associated with these records. The + post-condition of flush() is that any previously sent record will have + completed (e.g. Future.is_done() == True). A request is considered + completed when either it is successfully acknowledged according to the + 'acks' configuration for the producer, or it results in an error. + + Other threads can continue sending messages while one thread is blocked + waiting for a flush call to complete; however, no guarantee is made + about the completion of messages sent after the flush call begins. + """ + log.debug("Flushing accumulated records in producer.") # trace + self._accumulator.begin_flush() + self._sender.wakeup() + self._accumulator.await_flush_completion() + + def _ensure_valid_record_size(self, size): + """Validate that the record size isn't too large.""" + if size > self.config['max_request_size']: + raise Errors.MessageSizeTooLargeError( + "The message is %d bytes when serialized which is larger than" + " the maximum request size you have configured with the" + " max_request_size configuration" % size) + if size > self.config['buffer_memory']: + raise Errors.MessageSizeTooLargeError( + "The message is %d bytes when serialized which is larger than" + " the total memory buffer you have configured with the" + " buffer_memory configuration." % size) + + def _wait_on_metadata(self, topic, max_wait): + """ + Wait for cluster metadata including partitions for the given topic to + be available. 
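The flush() contract described above (every previously sent record completes, successfully or with an error, before flush() returns) pairs naturally with callback-style sends. A hedged sketch, using only the Future callback chaining shown elsewhere in this diff; the broker address and topic are placeholders.

from kafka import KafkaProducer

producer = KafkaProducer(bootstrap_servers='localhost:9092')

def on_delivery(record_metadata):
    print('delivered:', record_metadata.topic,
          record_metadata.partition, record_metadata.offset)

def on_error(exc):
    print('delivery failed:', exc)

for i in range(100):
    payload = ('message %d' % i).encode('utf-8')
    (producer.send('my-topic', payload)
             .add_callback(on_delivery)
             .add_errback(on_error))

producer.flush()   # returns only after all 100 futures above are resolved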
+ + Arguments: + topic (str): topic we want metadata for + max_wait (float): maximum time in secs for waiting on the metadata + + Returns: + set: partition ids for the topic + + Raises: + TimeoutException: if partitions for topic were not obtained before + specified max_wait timeout + """ + # add topic to metadata topic list if it is not there already. + self._sender.add_topic(topic) + begin = time.time() + elapsed = 0.0 + metadata_event = threading.Event() + while True: + partitions = self._metadata.partitions_for_topic(topic) + if partitions is not None: + return partitions + + log.debug("Requesting metadata update for topic %s", topic) + + metadata_event.clear() + future = self._metadata.request_update() + future.add_both(lambda e, *args: e.set(), metadata_event) + self._sender.wakeup() + metadata_event.wait(max_wait - elapsed) + elapsed = time.time() - begin + if not metadata_event.is_set(): + raise Errors.KafkaTimeoutError( + "Failed to update metadata after %s secs.", max_wait) + elif topic in self._metadata.unauthorized_topics: + raise Errors.TopicAuthorizationFailedError(topic) + else: + log.debug("_wait_on_metadata woke after %s secs.", elapsed) + + def _serialize(self, topic, key, value): + # pylint: disable-msg=not-callable + if self.config['key_serializer']: + serialized_key = self.config['key_serializer'](key) + else: + serialized_key = key + if self.config['value_serializer']: + serialized_value = self.config['value_serializer'](value) + else: + serialized_value = value + return serialized_key, serialized_value + + def _partition(self, topic, partition, key, value, + serialized_key, serialized_value): + if partition is not None: + assert partition >= 0 + assert partition in self._metadata.partitions_for_topic(topic), 'Unrecognized partition' + return partition + + all_partitions = list(self._metadata.partitions_for_topic(topic)) + available = list(self._metadata.available_partitions_for_topic(topic)) + return self.config['partitioner'](serialized_key, + all_partitions, + available) diff -Nru python-kafka-python-0.9.2/kafka/producer/keyed.py python-kafka-python-1.0.1/kafka/producer/keyed.py --- python-kafka-python-0.9.2/kafka/producer/keyed.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/producer/keyed.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,49 @@ +from __future__ import absolute_import + +import logging +import warnings + +from .base import Producer +from ..partitioner import HashedPartitioner + + +log = logging.getLogger(__name__) + + +class KeyedProducer(Producer): + """ + A producer which distributes messages to partitions based on the key + + See Producer class for Arguments + + Additional Arguments: + partitioner: A partitioner class that will be used to get the partition + to send the message to. Must be derived from Partitioner. + Defaults to HashedPartitioner. 
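For completeness, a hedged sketch of the deprecated keyed API described above; the changelog recommends KafkaProducer for new code, and the host and topic here are placeholders assuming the 1.0.x top-level imports.

from kafka import SimpleClient, KeyedProducer

client = SimpleClient('localhost:9092')
producer = KeyedProducer(client)   # uses HashedPartitioner by default
producer.send_messages(b'my-topic', b'user-42', b'clicked', b'purchased')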
+ """ + def __init__(self, *args, **kwargs): + self.partitioner_class = kwargs.pop('partitioner', HashedPartitioner) + self.partitioners = {} + super(KeyedProducer, self).__init__(*args, **kwargs) + + def _next_partition(self, topic, key): + if topic not in self.partitioners: + if not self.client.has_metadata_for_topic(topic): + self.client.load_metadata_for_topics(topic) + + self.partitioners[topic] = self.partitioner_class(self.client.get_partition_ids_for_topic(topic)) + + partitioner = self.partitioners[topic] + return partitioner.partition(key) + + def send_messages(self, topic, key, *msg): + partition = self._next_partition(topic, key) + return self._send_messages(topic, partition, *msg, key=key) + + # DEPRECATED + def send(self, topic, key, msg): + warnings.warn("KeyedProducer.send is deprecated in favor of send_messages", DeprecationWarning) + return self.send_messages(topic, key, msg) + + def __repr__(self): + return '' % self.async diff -Nru python-kafka-python-0.9.2/kafka/producer/record_accumulator.py python-kafka-python-1.0.1/kafka/producer/record_accumulator.py --- python-kafka-python-0.9.2/kafka/producer/record_accumulator.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/producer/record_accumulator.py 2016-02-19 06:27:26.000000000 +0000 @@ -0,0 +1,520 @@ +from __future__ import absolute_import + +import collections +import copy +import logging +import threading +import time + +import six + +from ..common import TopicPartition +from ..protocol.message import Message, MessageSet +from .buffer import MessageSetBuffer, SimpleBufferPool +from .future import FutureRecordMetadata, FutureProduceResult + +import kafka.common as Errors + + +log = logging.getLogger(__name__) + + +class AtomicInteger(object): + def __init__(self, val=0): + self._lock = threading.Lock() + self._val = val + + def increment(self): + with self._lock: + self._val += 1 + return self._val + + def decrement(self): + with self._lock: + self._val -= 1 + return self._val + + def get(self): + return self._val + + +class RecordBatch(object): + def __init__(self, tp, records): + self.record_count = 0 + #self.max_record_size = 0 # for metrics only + now = time.time() + #self.created = now # for metrics only + self.drained = None + self.attempts = 0 + self.last_attempt = now + self.last_append = now + self.records = records + self.topic_partition = tp + self.produce_future = FutureProduceResult(tp) + self._retry = False + + def try_append(self, key, value): + if not self.records.has_room_for(key, value): + return None + + self.records.append(self.record_count, Message(value, key=key)) + # self.max_record_size = max(self.max_record_size, Record.record_size(key, value)) # for metrics only + self.last_append = time.time() + future = FutureRecordMetadata(self.produce_future, self.record_count) + self.record_count += 1 + return future + + def done(self, base_offset=None, exception=None): + log.debug("Produced messages to topic-partition %s with base offset" + " %s and error %s.", self.topic_partition, base_offset, + exception) # trace + if self.produce_future.is_done: + log.warning('Batch is already closed -- ignoring batch.done()') + return + elif exception is None: + self.produce_future.success(base_offset) + else: + self.produce_future.failure(exception) + + def maybe_expire(self, request_timeout_ms, linger_ms): + since_append_ms = 1000 * (time.time() - self.last_append) + if ((self.records.is_full() and request_timeout_ms < since_append_ms) + or (request_timeout_ms < (since_append_ms + 
linger_ms))): + self.records.close() + self.done(-1, Errors.KafkaTimeoutError('Batch Expired')) + return True + return False + + def in_retry(self): + return self._retry + + def set_retry(self): + self._retry = True + + def __str__(self): + return 'RecordBatch(topic_partition=%s, record_count=%d)' % ( + self.topic_partition, self.record_count) + + +class RecordAccumulator(object): + """ + This class maintains a dequeue per TopicPartition that accumulates messages + into MessageSets to be sent to the server. + + The accumulator attempts to bound memory use, and append calls will block + when that memory is exhausted. + + Keyword Arguments: + batch_size (int): Requests sent to brokers will contain multiple + batches, one for each partition with data available to be sent. + A small batch size will make batching less common and may reduce + throughput (a batch size of zero will disable batching entirely). + Default: 16384 + buffer_memory (int): The total bytes of memory the producer should use + to buffer records waiting to be sent to the server. If records are + sent faster than they can be delivered to the server the producer + will block up to max_block_ms, raising an exception on timeout. + In the current implementation, this setting is an approximation. + Default: 33554432 (32MB) + compression_type (str): The compression type for all data generated by + the producer. Valid values are 'gzip', 'snappy', 'lz4', or None. + Compression is of full batches of data, so the efficacy of batching + will also impact the compression ratio (more batching means better + compression). Default: None. + linger_ms (int): An artificial delay time to add before declaring a + messageset (that isn't full) ready for sending. This allows + time for more records to arrive. Setting a non-zero linger_ms + will trade off some latency for potentially better throughput + due to more batching (and hence fewer, larger requests). + Default: 0 + retry_backoff_ms (int): An artificial delay time to retry the + produce request upon receiving an error. This avoids exhausting + all retries in a short period of time. Default: 100 + """ + _DEFAULT_CONFIG = { + 'buffer_memory': 33554432, + 'batch_size': 16384, + 'compression_type': None, + 'linger_ms': 0, + 'retry_backoff_ms': 100, + } + + def __init__(self, **configs): + self.config = copy.copy(self._DEFAULT_CONFIG) + for key in self.config: + if key in configs: + self.config[key] = configs.pop(key) + + self._closed = False + self._drain_index = 0 + self._flushes_in_progress = AtomicInteger() + self._appends_in_progress = AtomicInteger() + self._batches = collections.defaultdict(collections.deque) # TopicPartition: [RecordBatch] + self._tp_locks = {None: threading.Lock()} # TopicPartition: Lock, plus a lock to add entries + self._free = SimpleBufferPool(self.config['buffer_memory'], + self.config['batch_size']) + self._incomplete = IncompleteRecordBatches() + + def append(self, tp, key, value, max_time_to_block_ms): + """Add a record to the accumulator, return the append result. 
+ + The append result will contain the future metadata, and flag for + whether the appended batch is full or a new batch is created + + Arguments: + tp (TopicPartition): The topic/partition to which this record is + being sent + key (bytes): The key for the record + value (bytes): The value for the record + max_time_to_block_ms (int): The maximum time in milliseconds to + block for buffer memory to be available + + Returns: + tuple: (future, batch_is_full, new_batch_created) + """ + assert isinstance(tp, TopicPartition), 'not TopicPartition' + assert not self._closed, 'RecordAccumulator is closed' + # We keep track of the number of appending thread to make sure we do not miss batches in + # abortIncompleteBatches(). + self._appends_in_progress.increment() + try: + if tp not in self._tp_locks: + with self._tp_locks[None]: + if tp not in self._tp_locks: + self._tp_locks[tp] = threading.Lock() + + with self._tp_locks[tp]: + # check if we have an in-progress batch + dq = self._batches[tp] + if dq: + last = dq[-1] + future = last.try_append(key, value) + if future is not None: + batch_is_full = len(dq) > 1 or last.records.is_full() + return future, batch_is_full, False + + # we don't have an in-progress record batch try to allocate a new batch + message_size = MessageSet.HEADER_SIZE + Message.HEADER_SIZE + if key is not None: + message_size += len(key) + if value is not None: + message_size += len(value) + assert message_size <= self.config['buffer_memory'], 'message too big' + + size = max(self.config['batch_size'], message_size) + log.debug("Allocating a new %d byte message buffer for %s", size, tp) # trace + buf = self._free.allocate(size, max_time_to_block_ms) + with self._tp_locks[tp]: + # Need to check if producer is closed again after grabbing the + # dequeue lock. + assert not self._closed, 'RecordAccumulator is closed' + + if dq: + last = dq[-1] + future = last.try_append(key, value) + if future is not None: + # Somebody else found us a batch, return the one we + # waited for! Hopefully this doesn't happen often... + self._free.deallocate(buf) + batch_is_full = len(dq) > 1 or last.records.is_full() + return future, batch_is_full, False + + records = MessageSetBuffer(buf, self.config['batch_size'], + self.config['compression_type']) + batch = RecordBatch(tp, records) + future = batch.try_append(key, value) + if not future: + raise Exception() + + dq.append(batch) + self._incomplete.add(batch) + batch_is_full = len(dq) > 1 or batch.records.is_full() + return future, batch_is_full, True + finally: + self._appends_in_progress.decrement() + + def abort_expired_batches(self, request_timeout_ms, cluster): + """Abort the batches that have been sitting in RecordAccumulator for + more than the configured request_timeout due to metadata being + unavailable. 
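The append() path above is careful never to block on buffer allocation while holding the per-partition lock, and it re-checks for a batch created by a racing thread once the allocation returns. A stripped-down, runnable illustration of that pattern follows; the names are illustrative stand-ins, not the library's API.

import threading

tp_lock = threading.Lock()         # stands in for self._tp_locks[tp]
batches = []                       # stands in for the per-partition deque
free_buffers = [bytearray(16384)]  # stands in for SimpleBufferPool

def append(record):
    with tp_lock:
        if batches:                       # fast path: an open batch exists
            batches[-1].append(record)
            return

    # Slow path: allocate outside the lock, because allocation may block
    # waiting for memory and must not stall other appending threads.
    buf = free_buffers.pop() if free_buffers else bytearray(16384)

    with tp_lock:
        if batches:                       # another thread created a batch
            free_buffers.append(buf)      # give back the buffer we no longer need
            batches[-1].append(record)
            return
        batches.append([record])          # buf would back the real batch here

append(b'hello')
append(b'world')
print(batches)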
+ + Arguments: + request_timeout_ms (int): milliseconds to timeout + cluster (ClusterMetadata): current metadata for kafka cluster + + Returns: + list of RecordBatch that were expired + """ + expired_batches = [] + to_remove = [] + count = 0 + for tp, dq in six.iteritems(self._batches): + assert tp in self._tp_locks, 'TopicPartition not in locks dict' + with self._tp_locks[tp]: + # iterate over the batches and expire them if they have stayed + # in accumulator for more than request_timeout_ms + for batch in dq: + # check if the batch is expired + if batch.maybe_expire(request_timeout_ms, + self.config['linger_ms']): + expired_batches.append(batch) + to_remove.append(batch) + count += 1 + self.deallocate(batch) + elif not batch.in_retry(): + break + + # Python does not allow us to mutate the dq during iteration + # Assuming expired batches are infrequent, this is better than + # creating a new copy of the deque for iteration on every loop + if to_remove: + for batch in to_remove: + dq.remove(batch) + to_remove = [] + + if expired_batches: + log.debug("Expired %d batches in accumulator", count) # trace + + return expired_batches + + def reenqueue(self, batch): + """Re-enqueue the given record batch in the accumulator to retry.""" + now = time.time() + batch.attempts += 1 + batch.last_attempt = now + batch.last_append = now + batch.set_retry() + assert batch.topic_partition in self._tp_locks, 'TopicPartition not in locks dict' + assert batch.topic_partition in self._batches, 'TopicPartition not in batches' + dq = self._batches[batch.topic_partition] + with self._tp_locks[batch.topic_partition]: + dq.appendleft(batch) + + def ready(self, cluster): + """ + Get a list of nodes whose partitions are ready to be sent, and the + earliest time at which any non-sendable partition will be ready; + Also return the flag for whether there are any unknown leaders for the + accumulated partition batches. + + A destination node is ready to send data if ANY one of its partition is + not backing off the send and ANY of the following are true: + + * The record set is full + * The record set has sat in the accumulator for at least linger_ms + milliseconds + * The accumulator is out of memory and threads are blocking waiting + for data (in this case all partitions are immediately considered + ready). 
+ * The accumulator has been closed + + Arguments: + cluster (ClusterMetadata): + + Returns: + tuple: + ready_nodes (set): node_ids that have ready batches + next_ready_check (float): secs until next ready after backoff + unknown_leaders_exist (bool): True if metadata refresh needed + """ + ready_nodes = set() + next_ready_check = 9999999.99 + unknown_leaders_exist = False + now = time.time() + + exhausted = bool(self._free.queued() > 0) + # several threads are accessing self._batches -- to simplify + # concurrent access, we iterate over a snapshot of partitions + # and lock each partition separately as needed + partitions = list(self._batches.keys()) + for tp in partitions: + leader = cluster.leader_for_partition(tp) + if leader is None or leader == -1: + unknown_leaders_exist = True + continue + elif leader in ready_nodes: + continue + + with self._tp_locks[tp]: + dq = self._batches[tp] + if not dq: + continue + batch = dq[0] + retry_backoff = self.config['retry_backoff_ms'] / 1000.0 + linger = self.config['linger_ms'] / 1000.0 + backing_off = bool(batch.attempts > 0 and + batch.last_attempt + retry_backoff > now) + waited_time = now - batch.last_attempt + time_to_wait = retry_backoff if backing_off else linger + time_left = max(time_to_wait - waited_time, 0) + full = bool(len(dq) > 1 or batch.records.is_full()) + expired = bool(waited_time >= time_to_wait) + + sendable = (full or expired or exhausted or self._closed or + self._flush_in_progress()) + + if sendable and not backing_off: + ready_nodes.add(leader) + else: + # Note that this results in a conservative estimate since + # an un-sendable partition may have a leader that will + # later be found to have sendable data. However, this is + # good enough since we'll just wake up and then sleep again + # for the remaining time. + next_ready_check = min(time_left, next_ready_check) + + return ready_nodes, next_ready_check, unknown_leaders_exist + + def has_unsent(self): + """Return whether there is any unsent record in the accumulator.""" + for tp, dq in six.iteritems(self._batches): + with self._tp_locks[tp]: + if len(dq): + return True + return False + + def drain(self, cluster, nodes, max_size): + """ + Drain all the data for the given nodes and collate them into a list of + batches that will fit within the specified size on a per-node basis. + This method attempts to avoid choosing the same topic-node repeatedly. + + Arguments: + cluster (ClusterMetadata): The current cluster metadata + nodes (list): list of node_ids to drain + max_size (int): maximum number of bytes to drain + + Returns: + dict: {node_id: list of RecordBatch} with total size less than the + requested max_size. 
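The drain() docstring above notes that it "attempts to avoid choosing the same topic-node repeatedly"; it does so by remembering a rotating start index between calls. A tiny illustrative sketch of that idea (simplified, not the library's code, which advances the index as it consumes partitions):

partitions = ['tp0', 'tp1', 'tp2', 'tp3']
drain_index = 0   # persisted across calls, like self._drain_index

def drain_order():
    """Return partitions starting at a rotating offset so the same
    partition is not always drained first."""
    global drain_index
    order = partitions[drain_index:] + partitions[:drain_index]
    drain_index = (drain_index + 1) % len(partitions)
    return order

print(drain_order())   # ['tp0', 'tp1', 'tp2', 'tp3']
print(drain_order())   # ['tp1', 'tp2', 'tp3', 'tp0']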
+ """ + if not nodes: + return {} + + now = time.time() + batches = {} + for node_id in nodes: + size = 0 + partitions = list(cluster.partitions_for_broker(node_id)) + ready = [] + # to make starvation less likely this loop doesn't start at 0 + self._drain_index %= len(partitions) + start = self._drain_index + while True: + tp = partitions[self._drain_index] + if tp in self._batches: + with self._tp_locks[tp]: + dq = self._batches[tp] + if dq: + first = dq[0] + backoff = ( + bool(first.attempts > 0) and + bool(first.last_attempt + + self.config['retry_backoff_ms'] / 1000.0 + > now) + ) + # Only drain the batch if it is not during backoff + if not backoff: + if (size + first.records.size_in_bytes() > max_size + and len(ready) > 0): + # there is a rare case that a single batch + # size is larger than the request size due + # to compression; in this case we will + # still eventually send this batch in a + # single request + break + else: + batch = dq.popleft() + batch.records.close() + size += batch.records.size_in_bytes() + ready.append(batch) + batch.drained = now + + self._drain_index += 1 + self._drain_index %= len(partitions) + if start == self._drain_index: + break + + batches[node_id] = ready + return batches + + def deallocate(self, batch): + """Deallocate the record batch.""" + self._incomplete.remove(batch) + self._free.deallocate(batch.records.buffer()) + + def _flush_in_progress(self): + """Are there any threads currently waiting on a flush?""" + return self._flushes_in_progress.get() > 0 + + def begin_flush(self): + """ + Initiate the flushing of data from the accumulator...this makes all + requests immediately ready + """ + self._flushes_in_progress.increment() + + def await_flush_completion(self): + """ + Mark all partitions as ready to send and block until the send is complete + """ + for batch in self._incomplete.all(): + batch.produce_future.await() + assert batch.produce_future.is_done + if batch.produce_future.failed(): + log.warning(batch.produce_future.exception) + self._flushes_in_progress.decrement() + + def abort_incomplete_batches(self): + """ + This function is only called when sender is closed forcefully. It will fail all the + incomplete batches and return. + """ + # We need to keep aborting the incomplete batch until no thread is trying to append to + # 1. Avoid losing batches. + # 2. Free up memory in case appending threads are blocked on buffer full. + # This is a tight loop but should be able to get through very quickly. + while True: + self._abort_batches() + if not self._appends_in_progress.get(): + break + # After this point, no thread will append any messages because they will see the close + # flag set. We need to do the last abort after no thread was appending in case the there was a new + # batch appended by the last appending thread. 
+ self._abort_batches() + self._batches.clear() + + def _abort_batches(self): + """Go through incomplete batches and abort them.""" + error = Errors.IllegalStateError("Producer is closed forcefully.") + for batch in self._incomplete.all(): + tp = batch.topic_partition + # Close the batch before aborting + with self._tp_locks[tp]: + batch.records.close() + batch.done(exception=error) + self.deallocate(batch) + + def close(self): + """Close this accumulator and force all the record buffers to be drained.""" + self._closed = True + + +class IncompleteRecordBatches(object): + """A threadsafe helper class to hold RecordBatches that haven't been ack'd yet""" + + def __init__(self): + self._incomplete = set() + self._lock = threading.Lock() + + def add(self, batch): + with self._lock: + return self._incomplete.add(batch) + + def remove(self, batch): + with self._lock: + return self._incomplete.remove(batch) + + def all(self): + with self._lock: + return list(self._incomplete) diff -Nru python-kafka-python-0.9.2/kafka/producer/sender.py python-kafka-python-1.0.1/kafka/producer/sender.py --- python-kafka-python-0.9.2/kafka/producer/sender.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/producer/sender.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,271 @@ +from __future__ import absolute_import + +import collections +import copy +import logging +import threading +import time + +import six + +from ..common import TopicPartition +from ..version import __version__ +from ..protocol.produce import ProduceRequest + +import kafka.common as Errors + + +log = logging.getLogger(__name__) + + +class Sender(threading.Thread): + """ + The background thread that handles the sending of produce requests to the + Kafka cluster. This thread makes metadata requests to renew its view of the + cluster and then sends produce requests to the appropriate nodes. + """ + _DEFAULT_CONFIG = { + 'max_request_size': 1048576, + 'acks': 1, + 'retries': 0, + 'request_timeout_ms': 30000, + 'client_id': 'kafka-python-' + __version__, + } + + def __init__(self, client, metadata, accumulator, **configs): + super(Sender, self).__init__() + self.config = copy.copy(self._DEFAULT_CONFIG) + for key in self.config: + if key in configs: + self.config[key] = configs.pop(key) + + self.name = self.config['client_id'] + '-network-thread' + self._client = client + self._accumulator = accumulator + self._metadata = client.cluster + self._running = True + self._force_close = False + self._topics_to_add = [] + + def run(self): + """The main run loop for the sender thread.""" + log.debug("Starting Kafka producer I/O thread.") + + # main loop, runs until close is called + while self._running: + try: + self.run_once() + except Exception: + log.exception("Uncaught error in kafka producer I/O thread") + + log.debug("Beginning shutdown of Kafka producer I/O thread, sending" + " remaining records.") + + # okay we stopped accepting requests but there may still be + # requests in the accumulator or waiting for acknowledgment, + # wait until these are completed. + while (not self._force_close + and (self._accumulator.has_unsent() + or self._client.in_flight_request_count() > 0)): + try: + self.run_once() + except Exception: + log.exception("Uncaught error in kafka producer I/O thread") + + if self._force_close: + # We need to fail all the incomplete batches and wake up the + # threads waiting on the futures. 
+ self._accumulator.abort_incomplete_batches() + + try: + self._client.close() + except Exception: + log.exception("Failed to close network client") + + log.debug("Shutdown of Kafka producer I/O thread has completed.") + + def run_once(self): + """Run a single iteration of sending.""" + while self._topics_to_add: + self._client.add_topic(self._topics_to_add.pop()) + + # get the list of partitions with data ready to send + result = self._accumulator.ready(self._metadata) + ready_nodes, next_ready_check_delay, unknown_leaders_exist = result + + # if there are any partitions whose leaders are not known yet, force + # metadata update + if unknown_leaders_exist: + log.debug('Unknown leaders exist, requesting metadata update') + self._metadata.request_update() + + # remove any nodes we aren't ready to send to + not_ready_timeout = 999999999 + for node in list(ready_nodes): + if not self._client.ready(node): + log.debug('Node %s not ready; delaying produce of accumulated batch', node) + ready_nodes.remove(node) + not_ready_timeout = min(not_ready_timeout, + self._client.connection_delay(node)) + + # create produce requests + batches_by_node = self._accumulator.drain( + self._metadata, ready_nodes, self.config['max_request_size']) + + expired_batches = self._accumulator.abort_expired_batches( + self.config['request_timeout_ms'], self._metadata) + + requests = self._create_produce_requests(batches_by_node) + # If we have any nodes that are ready to send + have sendable data, + # poll with 0 timeout so this can immediately loop and try sending more + # data. Otherwise, the timeout is determined by nodes that have + # partitions with data that isn't yet sendable (e.g. lingering, backing + # off). Note that this specifically does not include nodes with + # sendable data that aren't ready to send since they would cause busy + # looping. 
+ poll_timeout_ms = min(next_ready_check_delay * 1000, not_ready_timeout) + if ready_nodes: + log.debug("Nodes with data ready to send: %s", ready_nodes) # trace + log.debug("Created %d produce requests: %s", len(requests), requests) # trace + poll_timeout_ms = 0 + + for node_id, request in six.iteritems(requests): + batches = batches_by_node[node_id] + log.debug('Sending Produce Request: %r', request) + (self._client.send(node_id, request) + .add_callback( + self._handle_produce_response, batches) + .add_errback( + self._failed_produce, batches, node_id)) + + # if some partitions are already ready to be sent, the select time + # would be 0; otherwise if some partition already has some data + # accumulated but not ready yet, the select time will be the time + # difference between now and its linger expiry time; otherwise the + # select time will be the time difference between now and the + # metadata expiry time + self._client.poll(poll_timeout_ms, sleep=True) + + def initiate_close(self): + """Start closing the sender (won't complete until all data is sent).""" + self._running = False + self._accumulator.close() + self.wakeup() + + def force_close(self): + """Closes the sender without sending out any pending messages.""" + self._force_close = True + self.initiate_close() + + def add_topic(self, topic): + self._topics_to_add.append(topic) + self.wakeup() + + def _failed_produce(self, batches, node_id, error): + log.debug("Error sending produce request to node %d: %s", node_id, error) # trace + for batch in batches: + self._complete_batch(batch, error, -1) + + def _handle_produce_response(self, batches, response): + """Handle a produce response.""" + # if we have a response, parse it + log.debug('Parsing produce response: %r', response) + if response: + batches_by_partition = dict([(batch.topic_partition, batch) + for batch in batches]) + + for topic, partitions in response.topics: + for partition, error_code, offset in partitions: + tp = TopicPartition(topic, partition) + error = Errors.for_code(error_code) + batch = batches_by_partition[tp] + self._complete_batch(batch, error, offset) + + else: + # this is the acks = 0 case, just complete all requests + for batch in batches: + self._complete_batch(batch, None, -1) + + def _complete_batch(self, batch, error, base_offset): + """Complete or retry the given batch of records. + + Arguments: + batch (RecordBatch): The record batch + error (Exception): The error (or None if none) + base_offset (int): The base offset assigned to the records if successful + """ + # Standardize no-error to None + if error is Errors.NoError: + error = None + + if error is not None and self._can_retry(batch, error): + # retry + log.warning("Got error produce response on topic-partition %s," + " retrying (%d attempts left). 
Error: %s", + batch.topic_partition, + self.config['retries'] - batch.attempts - 1, + error) + self._accumulator.reenqueue(batch) + else: + if error is Errors.TopicAuthorizationFailedError: + error = error(batch.topic_partition.topic) + + # tell the user the result of their request + batch.done(base_offset, error) + self._accumulator.deallocate(batch) + + if getattr(error, 'invalid_metadata', False): + self._metadata.request_update() + + def _can_retry(self, batch, error): + """ + We can retry a send if the error is transient and the number of + attempts taken is fewer than the maximum allowed + """ + return (batch.attempts < self.config['retries'] + and getattr(error, 'retriable', False)) + + def _create_produce_requests(self, collated): + """ + Transfer the record batches into a list of produce requests on a + per-node basis. + + Arguments: + collated: {node_id: [RecordBatch]} + + Returns: + dict: {node_id: ProduceRequest} + """ + requests = {} + for node_id, batches in six.iteritems(collated): + requests[node_id] = self._produce_request( + node_id, self.config['acks'], + self.config['request_timeout_ms'], batches) + return requests + + def _produce_request(self, node_id, acks, timeout, batches): + """Create a produce request from the given record batches. + + Returns: + ProduceRequest + """ + produce_records_by_partition = collections.defaultdict(dict) + for batch in batches: + topic = batch.topic_partition.topic + partition = batch.topic_partition.partition + + # TODO: bytearray / memoryview + buf = batch.records.buffer() + produce_records_by_partition[topic][partition] = buf + + return ProduceRequest( + required_acks=acks, + timeout=timeout, + topics=[(topic, list(partition_info.items())) + for topic, partition_info + in six.iteritems(produce_records_by_partition)] + ) + + def wakeup(self): + """Wake up the selector associated with this send thread.""" + self._client.wakeup() diff -Nru python-kafka-python-0.9.2/kafka/producer/simple.py python-kafka-python-1.0.1/kafka/producer/simple.py --- python-kafka-python-0.9.2/kafka/producer/simple.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/producer/simple.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,55 @@ +from __future__ import absolute_import + +from itertools import cycle +import logging +import random +import six + +from six.moves import xrange + +from .base import Producer + + +log = logging.getLogger(__name__) + + +class SimpleProducer(Producer): + """A simple, round-robin producer. + + See Producer class for Base Arguments + + Additional Arguments: + random_start (bool, optional): randomize the initial partition which + the first message block will be published to, otherwise + if false, the first message block will always publish + to partition 0 before cycling through each partition, + defaults to True. 
+ """ + def __init__(self, *args, **kwargs): + self.partition_cycles = {} + self.random_start = kwargs.pop('random_start', True) + super(SimpleProducer, self).__init__(*args, **kwargs) + + def _next_partition(self, topic): + if topic not in self.partition_cycles: + if not self.client.has_metadata_for_topic(topic): + self.client.ensure_topic_exists(topic) + + self.partition_cycles[topic] = cycle(self.client.get_partition_ids_for_topic(topic)) + + # Randomize the initial partition that is returned + if self.random_start: + num_partitions = len(self.client.get_partition_ids_for_topic(topic)) + for _ in xrange(random.randint(0, num_partitions-1)): + next(self.partition_cycles[topic]) + + return next(self.partition_cycles[topic]) + + def send_messages(self, topic, *msg): + partition = self._next_partition(topic) + return super(SimpleProducer, self).send_messages( + topic, partition, *msg + ) + + def __repr__(self): + return '' % self.async diff -Nru python-kafka-python-0.9.2/kafka/producer.py python-kafka-python-1.0.1/kafka/producer.py --- python-kafka-python-0.9.2/kafka/producer.py 2014-08-27 21:24:39.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/producer.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,314 +0,0 @@ -from __future__ import absolute_import - -import logging -import time -import random - -from Queue import Empty -from collections import defaultdict -from itertools import cycle -from multiprocessing import Queue, Process - -from kafka.common import ( - ProduceRequest, TopicAndPartition, UnsupportedCodecError, UnknownTopicOrPartitionError -) -from kafka.partitioner import HashedPartitioner -from kafka.protocol import CODEC_NONE, ALL_CODECS, create_message_set - -log = logging.getLogger("kafka") - -BATCH_SEND_DEFAULT_INTERVAL = 20 -BATCH_SEND_MSG_COUNT = 20 - -STOP_ASYNC_PRODUCER = -1 - - -def _send_upstream(queue, client, codec, batch_time, batch_size, - req_acks, ack_timeout): - """ - Listen on the queue for a specified number of messages or till - a specified timeout and send them upstream to the brokers in one - request - - NOTE: Ideally, this should have been a method inside the Producer - class. However, multiprocessing module has issues in windows. The - functionality breaks unless this function is kept outside of a class - """ - stop = False - client.reinit() - - while not stop: - timeout = batch_time - count = batch_size - send_at = time.time() + timeout - msgset = defaultdict(list) - - # Keep fetching till we gather enough messages or a - # timeout is reached - while count > 0 and timeout >= 0: - try: - topic_partition, msg = queue.get(timeout=timeout) - - except Empty: - break - - # Check if the controller has requested us to stop - if topic_partition == STOP_ASYNC_PRODUCER: - stop = True - break - - # Adjust the timeout to match the remaining period - count -= 1 - timeout = send_at - time.time() - msgset[topic_partition].append(msg) - - # Send collected requests upstream - reqs = [] - for topic_partition, msg in msgset.items(): - messages = create_message_set(msg, codec) - req = ProduceRequest(topic_partition.topic, - topic_partition.partition, - messages) - reqs.append(req) - - try: - client.send_produce_request(reqs, - acks=req_acks, - timeout=ack_timeout) - except Exception: - log.exception("Unable to send message") - - -class Producer(object): - """ - Base class to be used by producers - - Params: - client - The Kafka client instance to use - async - If set to true, the messages are sent asynchronously via another - thread (process). 
We will not wait for a response to these - WARNING!!! current implementation of async producer does not - guarantee message delivery. Use at your own risk! Or help us - improve with a PR! - req_acks - A value indicating the acknowledgements that the server must - receive before responding to the request - ack_timeout - Value (in milliseconds) indicating a timeout for waiting - for an acknowledgement - batch_send - If True, messages are send in batches - batch_send_every_n - If set, messages are send in batches of this size - batch_send_every_t - If set, messages are send after this timeout - """ - - ACK_NOT_REQUIRED = 0 # No ack is required - ACK_AFTER_LOCAL_WRITE = 1 # Send response after it is written to log - ACK_AFTER_CLUSTER_COMMIT = -1 # Send response after data is committed - - DEFAULT_ACK_TIMEOUT = 1000 - - def __init__(self, client, async=False, - req_acks=ACK_AFTER_LOCAL_WRITE, - ack_timeout=DEFAULT_ACK_TIMEOUT, - codec=None, - batch_send=False, - batch_send_every_n=BATCH_SEND_MSG_COUNT, - batch_send_every_t=BATCH_SEND_DEFAULT_INTERVAL): - - if batch_send: - async = True - assert batch_send_every_n > 0 - assert batch_send_every_t > 0 - else: - batch_send_every_n = 1 - batch_send_every_t = 3600 - - self.client = client - self.async = async - self.req_acks = req_acks - self.ack_timeout = ack_timeout - - if codec is None: - codec = CODEC_NONE - elif codec not in ALL_CODECS: - raise UnsupportedCodecError("Codec 0x%02x unsupported" % codec) - - self.codec = codec - - if self.async: - log.warning("async producer does not guarantee message delivery!") - log.warning("Current implementation does not retry Failed messages") - log.warning("Use at your own risk! (or help improve with a PR!)") - self.queue = Queue() # Messages are sent through this queue - self.proc = Process(target=_send_upstream, - args=(self.queue, - self.client.copy(), - self.codec, - batch_send_every_t, - batch_send_every_n, - self.req_acks, - self.ack_timeout)) - - # Process will die if main thread exits - self.proc.daemon = True - self.proc.start() - - def send_messages(self, topic, partition, *msg): - """ - Helper method to send produce requests - @param: topic, name of topic for produce request -- type str - @param: partition, partition number for produce request -- type int - @param: *msg, one or more message payloads -- type str - @returns: ResponseRequest returned by server - raises on error - - Note that msg type *must* be encoded to str by user. - Passing unicode message will not work, for example - you should encode before calling send_messages via - something like `unicode_message.encode('utf-8')` - - All messages produced via this method will set the message 'key' to Null - """ - - # Guarantee that msg is actually a list or tuple (should always be true) - if not isinstance(msg, (list, tuple)): - raise TypeError("msg is not a list or tuple!") - - # Raise TypeError if any message is not encoded as a str - if any(not isinstance(m, str) for m in msg): - raise TypeError("all produce message payloads must be type str") - - if self.async: - for m in msg: - self.queue.put((TopicAndPartition(topic, partition), m)) - resp = [] - else: - messages = create_message_set(msg, self.codec) - req = ProduceRequest(topic, partition, messages) - try: - resp = self.client.send_produce_request([req], acks=self.req_acks, - timeout=self.ack_timeout) - except Exception: - log.exception("Unable to send messages") - raise - return resp - - def stop(self, timeout=1): - """ - Stop the producer. 
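A hypothetical sketch of the req_acks and ack_timeout settings documented in this (removed) 0.9.2 module; it is Python 2 only and assumes a broker at localhost:9092:

    from kafka import KafkaClient
    from kafka.producer import Producer, SimpleProducer

    client = KafkaClient('localhost:9092')
    producer = SimpleProducer(
        client,
        req_acks=Producer.ACK_AFTER_CLUSTER_COMMIT,  # -1: wait for the full ISR
        ack_timeout=2000,                            # ms the broker may wait for those acks
    )
    responses = producer.send_messages('example-topic', b'payload')  # synchronous; returns broker responses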
Optionally wait for the specified timeout before - forcefully cleaning up. - """ - if self.async: - self.queue.put((STOP_ASYNC_PRODUCER, None)) - self.proc.join(timeout) - - if self.proc.is_alive(): - self.proc.terminate() - - -class SimpleProducer(Producer): - """ - A simple, round-robin producer. Each message goes to exactly one partition - - Params: - client - The Kafka client instance to use - async - If True, the messages are sent asynchronously via another - thread (process). We will not wait for a response to these - req_acks - A value indicating the acknowledgements that the server must - receive before responding to the request - ack_timeout - Value (in milliseconds) indicating a timeout for waiting - for an acknowledgement - batch_send - If True, messages are send in batches - batch_send_every_n - If set, messages are send in batches of this size - batch_send_every_t - If set, messages are send after this timeout - random_start - If true, randomize the initial partition which the - the first message block will be published to, otherwise - if false, the first message block will always publish - to partition 0 before cycling through each partition - """ - def __init__(self, client, async=False, - req_acks=Producer.ACK_AFTER_LOCAL_WRITE, - ack_timeout=Producer.DEFAULT_ACK_TIMEOUT, - codec=None, - batch_send=False, - batch_send_every_n=BATCH_SEND_MSG_COUNT, - batch_send_every_t=BATCH_SEND_DEFAULT_INTERVAL, - random_start=False): - self.partition_cycles = {} - self.random_start = random_start - super(SimpleProducer, self).__init__(client, async, req_acks, - ack_timeout, codec, batch_send, - batch_send_every_n, - batch_send_every_t) - - def _next_partition(self, topic): - if topic not in self.partition_cycles: - if topic not in self.client.topic_partitions: - self.client.load_metadata_for_topics(topic) - try: - self.partition_cycles[topic] = cycle(self.client.topic_partitions[topic]) - except KeyError: - raise UnknownTopicOrPartitionError(topic) - - # Randomize the initial partition that is returned - if self.random_start: - num_partitions = len(self.client.topic_partitions[topic]) - for _ in xrange(random.randint(0, num_partitions-1)): - self.partition_cycles[topic].next() - - return self.partition_cycles[topic].next() - - def send_messages(self, topic, *msg): - partition = self._next_partition(topic) - return super(SimpleProducer, self).send_messages(topic, partition, *msg) - - def __repr__(self): - return '' % self.async - - -class KeyedProducer(Producer): - """ - A producer which distributes messages to partitions based on the key - - Args: - client - The kafka client instance - partitioner - A partitioner class that will be used to get the partition - to send the message to. Must be derived from Partitioner - async - If True, the messages are sent asynchronously via another - thread (process). 
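A hypothetical sketch of the key-based routing this class provides (its send() method appears just below), again against the removed 0.9.2 module; messages sharing a key always hash to the same partition:

    from kafka import KafkaClient
    from kafka.partitioner import HashedPartitioner
    from kafka.producer import KeyedProducer

    client = KafkaClient('localhost:9092')
    producer = KeyedProducer(client, partitioner=HashedPartitioner)
    # 'user-42' is hashed to choose the partition, so this user's events stay ordered
    producer.send('example-topic', 'user-42', 'event payload')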
We will not wait for a response to these - ack_timeout - Value (in milliseconds) indicating a timeout for waiting - for an acknowledgement - batch_send - If True, messages are send in batches - batch_send_every_n - If set, messages are send in batches of this size - batch_send_every_t - If set, messages are send after this timeout - """ - def __init__(self, client, partitioner=None, async=False, - req_acks=Producer.ACK_AFTER_LOCAL_WRITE, - ack_timeout=Producer.DEFAULT_ACK_TIMEOUT, - codec=None, - batch_send=False, - batch_send_every_n=BATCH_SEND_MSG_COUNT, - batch_send_every_t=BATCH_SEND_DEFAULT_INTERVAL): - if not partitioner: - partitioner = HashedPartitioner - self.partitioner_class = partitioner - self.partitioners = {} - - super(KeyedProducer, self).__init__(client, async, req_acks, - ack_timeout, codec, batch_send, - batch_send_every_n, - batch_send_every_t) - - def _next_partition(self, topic, key): - if topic not in self.partitioners: - if topic not in self.client.topic_partitions: - self.client.load_metadata_for_topics(topic) - self.partitioners[topic] = \ - self.partitioner_class(self.client.topic_partitions[topic]) - partitioner = self.partitioners[topic] - return partitioner.partition(key, self.client.topic_partitions[topic]) - - def send(self, topic, key, msg): - partition = self._next_partition(topic, key) - return self.send_messages(topic, partition, msg) - - def __repr__(self): - return '' % self.async diff -Nru python-kafka-python-0.9.2/kafka/protocol/abstract.py python-kafka-python-1.0.1/kafka/protocol/abstract.py --- python-kafka-python-0.9.2/kafka/protocol/abstract.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/protocol/abstract.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,17 @@ +import abc + + +class AbstractType(object): + __metaclass__ = abc.ABCMeta + + @abc.abstractmethod + def encode(cls, value): # pylint: disable=no-self-argument + pass + + @abc.abstractmethod + def decode(cls, data): # pylint: disable=no-self-argument + pass + + @classmethod + def repr(cls, value): + return repr(value) diff -Nru python-kafka-python-0.9.2/kafka/protocol/admin.py python-kafka-python-1.0.1/kafka/protocol/admin.py --- python-kafka-python-0.9.2/kafka/protocol/admin.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/protocol/admin.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,44 @@ +from .struct import Struct +from .types import Array, Bytes, Int16, Schema, String + + +class ListGroupsResponse(Struct): + SCHEMA = Schema( + ('error_code', Int16), + ('groups', Array( + ('group', String('utf-8')), + ('protocol_type', String('utf-8')))) + ) + + +class ListGroupsRequest(Struct): + API_KEY = 16 + API_VERSION = 0 + RESPONSE_TYPE = ListGroupsResponse + SCHEMA = Schema() + + +class DescribeGroupsResponse(Struct): + SCHEMA = Schema( + ('groups', Array( + ('error_code', Int16), + ('group', String('utf-8')), + ('state', String('utf-8')), + ('protocol_type', String('utf-8')), + ('protocol', String('utf-8')), + ('members', Array( + ('member_id', String('utf-8')), + ('client_id', String('utf-8')), + ('client_host', String('utf-8')), + ('member_metadata', Bytes), + ('member_assignment', Bytes))))) + ) + + +class DescribeGroupsRequest(Struct): + API_KEY = 15 + API_VERSION = 0 + RESPONSE_TYPE = DescribeGroupsResponse + SCHEMA = Schema( + ('groups', Array(String('utf-8'))) + ) diff -Nru python-kafka-python-0.9.2/kafka/protocol/api.py python-kafka-python-1.0.1/kafka/protocol/api.py --- python-kafka-python-0.9.2/kafka/protocol/api.py 1970-01-01 
00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/protocol/api.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,16 @@ +from .struct import Struct +from .types import Int16, Int32, String, Schema + + +class RequestHeader(Struct): + SCHEMA = Schema( + ('api_key', Int16), + ('api_version', Int16), + ('correlation_id', Int32), + ('client_id', String('utf-8')) + ) + + def __init__(self, request, correlation_id=0, client_id='kafka-python'): + super(RequestHeader, self).__init__( + request.API_KEY, request.API_VERSION, correlation_id, client_id + ) diff -Nru python-kafka-python-0.9.2/kafka/protocol/commit.py python-kafka-python-1.0.1/kafka/protocol/commit.py --- python-kafka-python-0.9.2/kafka/protocol/commit.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/protocol/commit.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,119 @@ +from .struct import Struct +from .types import Array, Int16, Int32, Int64, Schema, String + + +class OffsetCommitResponse(Struct): + SCHEMA = Schema( + ('topics', Array( + ('topic', String('utf-8')), + ('partitions', Array( + ('partition', Int32), + ('error_code', Int16))))) + ) + + +class OffsetCommitRequest_v2(Struct): + API_KEY = 8 + API_VERSION = 2 # added retention_time, dropped timestamp + RESPONSE_TYPE = OffsetCommitResponse + SCHEMA = Schema( + ('consumer_group', String('utf-8')), + ('consumer_group_generation_id', Int32), + ('consumer_id', String('utf-8')), + ('retention_time', Int64), + ('topics', Array( + ('topic', String('utf-8')), + ('partitions', Array( + ('partition', Int32), + ('offset', Int64), + ('metadata', String('utf-8')))))) + ) + DEFAULT_GENERATION_ID = -1 + DEFAULT_RETENTION_TIME = -1 + + +class OffsetCommitRequest_v1(Struct): + API_KEY = 8 + API_VERSION = 1 # Kafka-backed storage + RESPONSE_TYPE = OffsetCommitResponse + SCHEMA = Schema( + ('consumer_group', String('utf-8')), + ('consumer_group_generation_id', Int32), + ('consumer_id', String('utf-8')), + ('topics', Array( + ('topic', String('utf-8')), + ('partitions', Array( + ('partition', Int32), + ('offset', Int64), + ('timestamp', Int64), + ('metadata', String('utf-8')))))) + ) + + +class OffsetCommitRequest_v0(Struct): + API_KEY = 8 + API_VERSION = 0 # Zookeeper-backed storage + RESPONSE_TYPE = OffsetCommitResponse + SCHEMA = Schema( + ('consumer_group', String('utf-8')), + ('topics', Array( + ('topic', String('utf-8')), + ('partitions', Array( + ('partition', Int32), + ('offset', Int64), + ('metadata', String('utf-8')))))) + ) + + +class OffsetFetchResponse(Struct): + SCHEMA = Schema( + ('topics', Array( + ('topic', String('utf-8')), + ('partitions', Array( + ('partition', Int32), + ('offset', Int64), + ('metadata', String('utf-8')), + ('error_code', Int16))))) + ) + + +class OffsetFetchRequest_v1(Struct): + API_KEY = 9 + API_VERSION = 1 # kafka-backed storage + RESPONSE_TYPE = OffsetFetchResponse + SCHEMA = Schema( + ('consumer_group', String('utf-8')), + ('topics', Array( + ('topic', String('utf-8')), + ('partitions', Array(Int32)))) + ) + + +class OffsetFetchRequest_v0(Struct): + API_KEY = 9 + API_VERSION = 0 # zookeeper-backed storage + RESPONSE_TYPE = OffsetFetchResponse + SCHEMA = Schema( + ('consumer_group', String('utf-8')), + ('topics', Array( + ('topic', String('utf-8')), + ('partitions', Array(Int32)))) + ) + + +class GroupCoordinatorResponse(Struct): + SCHEMA = Schema( + ('error_code', Int16), + ('coordinator_id', Int32), + ('host', String('utf-8')), + ('port', Int32) + ) + + +class GroupCoordinatorRequest(Struct): + API_KEY = 10 + 
API_VERSION = 0 + RESPONSE_TYPE = GroupCoordinatorResponse + SCHEMA = Schema( + ('consumer_group', String('utf-8')) + ) diff -Nru python-kafka-python-0.9.2/kafka/protocol/fetch.py python-kafka-python-1.0.1/kafka/protocol/fetch.py --- python-kafka-python-0.9.2/kafka/protocol/fetch.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/protocol/fetch.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,32 @@ +from .message import MessageSet +from .struct import Struct +from .types import Array, Int16, Int32, Int64, Schema, String + + +class FetchResponse(Struct): + SCHEMA = Schema( + ('topics', Array( + ('topics', String('utf-8')), + ('partitions', Array( + ('partition', Int32), + ('error_code', Int16), + ('highwater_offset', Int64), + ('message_set', MessageSet))))) + ) + + +class FetchRequest(Struct): + API_KEY = 1 + API_VERSION = 0 + RESPONSE_TYPE = FetchResponse + SCHEMA = Schema( + ('replica_id', Int32), + ('max_wait_time', Int32), + ('min_bytes', Int32), + ('topics', Array( + ('topic', String('utf-8')), + ('partitions', Array( + ('partition', Int32), + ('offset', Int64), + ('max_bytes', Int32))))) + ) diff -Nru python-kafka-python-0.9.2/kafka/protocol/group.py python-kafka-python-1.0.1/kafka/protocol/group.py --- python-kafka-python-0.9.2/kafka/protocol/group.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/protocol/group.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,103 @@ +from .struct import Struct +from .types import Array, Bytes, Int16, Int32, Schema, String + + +class JoinGroupResponse(Struct): + SCHEMA = Schema( + ('error_code', Int16), + ('generation_id', Int32), + ('group_protocol', String('utf-8')), + ('leader_id', String('utf-8')), + ('member_id', String('utf-8')), + ('members', Array( + ('member_id', String('utf-8')), + ('member_metadata', Bytes))) + ) + + +class JoinGroupRequest(Struct): + API_KEY = 11 + API_VERSION = 0 + RESPONSE_TYPE = JoinGroupResponse + SCHEMA = Schema( + ('group', String('utf-8')), + ('session_timeout', Int32), + ('member_id', String('utf-8')), + ('protocol_type', String('utf-8')), + ('group_protocols', Array( + ('protocol_name', String('utf-8')), + ('protocol_metadata', Bytes))) + ) + UNKNOWN_MEMBER_ID = '' + + +class ProtocolMetadata(Struct): + SCHEMA = Schema( + ('version', Int16), + ('subscription', Array(String('utf-8'))), # topics list + ('user_data', Bytes) + ) + + +class SyncGroupResponse(Struct): + SCHEMA = Schema( + ('error_code', Int16), + ('member_assignment', Bytes) + ) + + +class SyncGroupRequest(Struct): + API_KEY = 14 + API_VERSION = 0 + RESPONSE_TYPE = SyncGroupResponse + SCHEMA = Schema( + ('group', String('utf-8')), + ('generation_id', Int32), + ('member_id', String('utf-8')), + ('group_assignment', Array( + ('member_id', String('utf-8')), + ('member_metadata', Bytes))) + ) + + +class MemberAssignment(Struct): + SCHEMA = Schema( + ('version', Int16), + ('partition_assignment', Array( + ('topic', String('utf-8')), + ('partitions', Array(Int32)))), + ('user_data', Bytes) + ) + + +class HeartbeatResponse(Struct): + SCHEMA = Schema( + ('error_code', Int16) + ) + + +class HeartbeatRequest(Struct): + API_KEY = 12 + API_VERSION = 0 + RESPONSE_TYPE = HeartbeatResponse + SCHEMA = Schema( + ('group', String('utf-8')), + ('generation_id', Int32), + ('member_id', String('utf-8')) + ) + + +class LeaveGroupResponse(Struct): + SCHEMA = Schema( + ('error_code', Int16) + ) + + +class LeaveGroupRequest(Struct): + API_KEY = 13 + API_VERSION = 0 + RESPONSE_TYPE = LeaveGroupResponse + SCHEMA = Schema( + 
('group', String('utf-8')), + ('member_id', String('utf-8')) + ) diff -Nru python-kafka-python-0.9.2/kafka/protocol/__init__.py python-kafka-python-1.0.1/kafka/protocol/__init__.py --- python-kafka-python-0.9.2/kafka/protocol/__init__.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/protocol/__init__.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,6 @@ +from .legacy import ( + create_message, create_gzip_message, + create_snappy_message, create_message_set, + CODEC_NONE, CODEC_GZIP, CODEC_SNAPPY, ALL_CODECS, + ATTRIBUTE_CODEC_MASK, KafkaProtocol, +) diff -Nru python-kafka-python-0.9.2/kafka/protocol/legacy.py python-kafka-python-1.0.1/kafka/protocol/legacy.py --- python-kafka-python-0.9.2/kafka/protocol/legacy.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/protocol/legacy.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,440 @@ +from __future__ import absolute_import + +import logging +import struct + +import six + +from six.moves import xrange + +import kafka.common +import kafka.protocol.commit +import kafka.protocol.fetch +import kafka.protocol.message +import kafka.protocol.metadata +import kafka.protocol.offset +import kafka.protocol.produce + +from kafka.codec import ( + gzip_encode, gzip_decode, snappy_encode, snappy_decode +) +from kafka.common import ( + ProtocolError, ChecksumError, + UnsupportedCodecError, + ConsumerMetadataResponse +) +from kafka.util import ( + crc32, read_short_string, read_int_string, relative_unpack, + write_short_string, write_int_string, group_by_topic_and_partition +) + + +log = logging.getLogger(__name__) + +ATTRIBUTE_CODEC_MASK = 0x03 +CODEC_NONE = 0x00 +CODEC_GZIP = 0x01 +CODEC_SNAPPY = 0x02 +ALL_CODECS = (CODEC_NONE, CODEC_GZIP, CODEC_SNAPPY) + + +class KafkaProtocol(object): + """ + Class to encapsulate all of the protocol encoding/decoding. + This class does not have any state associated with it, it is purely + for organization. + """ + PRODUCE_KEY = 0 + FETCH_KEY = 1 + OFFSET_KEY = 2 + METADATA_KEY = 3 + OFFSET_COMMIT_KEY = 8 + OFFSET_FETCH_KEY = 9 + CONSUMER_METADATA_KEY = 10 + + ################### + # Private API # + ################### + + @classmethod + def _encode_message_header(cls, client_id, correlation_id, request_key, + version=0): + """ + Encode the common request envelope + """ + return struct.pack('>hhih%ds' % len(client_id), + request_key, # ApiKey + version, # ApiVersion + correlation_id, # CorrelationId + len(client_id), # ClientId size + client_id) # ClientId + + @classmethod + def _encode_message_set(cls, messages): + """ + Encode a MessageSet. Unlike other arrays in the protocol, + MessageSets are not length-prefixed + + Format + ====== + MessageSet => [Offset MessageSize Message] + Offset => int64 + MessageSize => int32 + """ + message_set = [] + for message in messages: + encoded_message = KafkaProtocol._encode_message(message) + message_set.append(struct.pack('>qi%ds' % len(encoded_message), 0, + len(encoded_message), + encoded_message)) + return b''.join(message_set) + + @classmethod + def _encode_message(cls, message): + """ + Encode a single message. + + The magic number of a message is a format version number. 
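The Crc/MagicByte/Attributes/Key/Value layout described in this docstring can be exercised directly with the Message struct that kafka/protocol/message.py adds later in this diff; a small round-trip sketch, no broker required:

    from kafka.protocol.message import Message

    msg = Message(b'value bytes', key=b'routing-key')   # magic=0, no compression
    wire = msg.encode()                                  # crc(4) + magic(1) + attributes(1) + key + value
    decoded = Message.decode(wire)
    assert decoded.validate_crc()
    assert decoded.key == b'routing-key' and decoded.value == b'value bytes'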
+ The only supported magic number right now is zero + + Format + ====== + Message => Crc MagicByte Attributes Key Value + Crc => int32 + MagicByte => int8 + Attributes => int8 + Key => bytes + Value => bytes + """ + if message.magic == 0: + msg = b''.join([ + struct.pack('>BB', message.magic, message.attributes), + write_int_string(message.key), + write_int_string(message.value) + ]) + crc = crc32(msg) + msg = struct.pack('>i%ds' % len(msg), crc, msg) + else: + raise ProtocolError("Unexpected magic number: %d" % message.magic) + return msg + + ################## + # Public API # + ################## + + @classmethod + def encode_produce_request(cls, payloads=(), acks=1, timeout=1000): + """ + Encode a ProduceRequest struct + + Arguments: + payloads: list of ProduceRequestPayload + acks: How "acky" you want the request to be + 1: written to disk by the leader + 0: immediate response + -1: waits for all replicas to be in sync + timeout: Maximum time (in ms) the server will wait for replica acks. + This is _not_ a socket timeout + + Returns: ProduceRequest + """ + if acks not in (1, 0, -1): + raise ValueError('ProduceRequest acks (%s) must be 1, 0, -1' % acks) + + return kafka.protocol.produce.ProduceRequest( + required_acks=acks, + timeout=timeout, + topics=[( + topic, + [( + partition, + [(0, 0, kafka.protocol.message.Message(msg.value, key=msg.key, + magic=msg.magic, + attributes=msg.attributes)) + for msg in payload.messages]) + for partition, payload in topic_payloads.items()]) + for topic, topic_payloads in group_by_topic_and_partition(payloads).items()]) + + @classmethod + def decode_produce_response(cls, response): + """ + Decode ProduceResponse to ProduceResponsePayload + + Arguments: + response: ProduceResponse + + Return: list of ProduceResponsePayload + """ + return [ + kafka.common.ProduceResponsePayload(topic, partition, error, offset) + for topic, partitions in response.topics + for partition, error, offset in partitions + ] + + @classmethod + def encode_fetch_request(cls, payloads=(), max_wait_time=100, min_bytes=4096): + """ + Encodes a FetchRequest struct + + Arguments: + payloads: list of FetchRequestPayload + max_wait_time (int, optional): ms to block waiting for min_bytes + data. Defaults to 100. + min_bytes (int, optional): minimum bytes required to return before + max_wait_time. Defaults to 4096. 
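For reference, the request that encode_fetch_request builds can also be constructed directly from the new kafka.protocol.fetch.FetchRequest struct added in this diff; the topic, partition, offset and max_bytes values here are made up:

    from kafka.protocol.fetch import FetchRequest

    request = FetchRequest(
        replica_id=-1,           # ordinary clients always send -1
        max_wait_time=100,       # ms the broker may block waiting for min_bytes
        min_bytes=4096,
        topics=[('example-topic', [(0, 1234, 1048576)])],   # (partition, offset, max_bytes)
    )
    wire_bytes = request.encode()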
+ + Return: FetchRequest + """ + return kafka.protocol.fetch.FetchRequest( + replica_id=-1, + max_wait_time=max_wait_time, + min_bytes=min_bytes, + topics=[( + topic, + [( + partition, + payload.offset, + payload.max_bytes) + for partition, payload in topic_payloads.items()]) + for topic, topic_payloads in group_by_topic_and_partition(payloads).items()]) + + @classmethod + def decode_fetch_response(cls, response): + """ + Decode FetchResponse struct to FetchResponsePayloads + + Arguments: + response: FetchResponse + """ + return [ + kafka.common.FetchResponsePayload( + topic, partition, error, highwater_offset, [ + kafka.common.OffsetAndMessage(offset, message) + for offset, _, message in messages]) + for topic, partitions in response.topics + for partition, error, highwater_offset, messages in partitions + ] + + @classmethod + def encode_offset_request(cls, payloads=()): + return kafka.protocol.offset.OffsetRequest( + replica_id=-1, + topics=[( + topic, + [( + partition, + payload.time, + payload.max_offsets) + for partition, payload in six.iteritems(topic_payloads)]) + for topic, topic_payloads in six.iteritems(group_by_topic_and_partition(payloads))]) + + @classmethod + def decode_offset_response(cls, response): + """ + Decode OffsetResponse into OffsetResponsePayloads + + Arguments: + response: OffsetResponse + + Returns: list of OffsetResponsePayloads + """ + return [ + kafka.common.OffsetResponsePayload(topic, partition, error, tuple(offsets)) + for topic, partitions in response.topics + for partition, error, offsets in partitions + ] + + @classmethod + def encode_metadata_request(cls, topics=(), payloads=None): + """ + Encode a MetadataRequest + + Arguments: + topics: list of strings + """ + if payloads is not None: + topics = payloads + + return kafka.protocol.metadata.MetadataRequest(topics) + + @classmethod + def decode_metadata_response(cls, response): + return response + + @classmethod + def encode_consumer_metadata_request(cls, client_id, correlation_id, payloads): + """ + Encode a ConsumerMetadataRequest + + Arguments: + client_id: string + correlation_id: int + payloads: string (consumer group) + """ + message = [] + message.append(cls._encode_message_header(client_id, correlation_id, + KafkaProtocol.CONSUMER_METADATA_KEY)) + message.append(struct.pack('>h%ds' % len(payloads), len(payloads), payloads)) + + msg = b''.join(message) + return write_int_string(msg) + + @classmethod + def decode_consumer_metadata_response(cls, data): + """ + Decode bytes to a ConsumerMetadataResponse + + Arguments: + data: bytes to decode + """ + ((correlation_id, error, nodeId), cur) = relative_unpack('>ihi', data, 0) + (host, cur) = read_short_string(data, cur) + ((port,), cur) = relative_unpack('>i', data, cur) + + return ConsumerMetadataResponse(error, nodeId, host, port) + + @classmethod + def encode_offset_commit_request(cls, group, payloads): + """ + Encode an OffsetCommitRequest struct + + Arguments: + group: string, the consumer group you are committing offsets for + payloads: list of OffsetCommitRequestPayload + """ + return kafka.protocol.commit.OffsetCommitRequest_v0( + consumer_group=group, + topics=[( + topic, + [( + partition, + payload.offset, + payload.metadata) + for partition, payload in six.iteritems(topic_payloads)]) + for topic, topic_payloads in six.iteritems(group_by_topic_and_partition(payloads))]) + + + @classmethod + def decode_offset_commit_response(cls, response): + """ + Decode OffsetCommitResponse to an OffsetCommitResponsePayload + + Arguments: + response: 
OffsetCommitResponse + """ + return [ + kafka.common.OffsetCommitResponsePayload(topic, partition, error) + for topic, partitions in response.topics + for partition, error in partitions + ] + + @classmethod + def encode_offset_fetch_request(cls, group, payloads, from_kafka=False): + """ + Encode an OffsetFetchRequest struct. The request is encoded using + version 0 if from_kafka is false, indicating a request for Zookeeper + offsets. It is encoded using version 1 otherwise, indicating a request + for Kafka offsets. + + Arguments: + group: string, the consumer group you are fetching offsets for + payloads: list of OffsetFetchRequestPayload + from_kafka: bool, default False, set True for Kafka-committed offsets + """ + if from_kafka: + request_class = kafka.protocol.commit.OffsetFetchRequest_v1 + else: + request_class = kafka.protocol.commit.OffsetFetchRequest_v0 + + return request_class( + consumer_group=group, + topics=[( + topic, + list(topic_payloads.keys())) + for topic, topic_payloads in six.iteritems(group_by_topic_and_partition(payloads))]) + + @classmethod + def decode_offset_fetch_response(cls, response): + """ + Decode OffsetFetchResponse to OffsetFetchResponsePayloads + + Arguments: + response: OffsetFetchResponse + """ + return [ + kafka.common.OffsetFetchResponsePayload( + topic, partition, offset, metadata, error + ) + for topic, partitions in response.topics + for partition, offset, metadata, error in partitions + ] + + +def create_message(payload, key=None): + """ + Construct a Message + + Arguments: + payload: bytes, the payload to send to Kafka + key: bytes, a key used for partition routing (optional) + + """ + return kafka.common.Message(0, 0, key, payload) + + +def create_gzip_message(payloads, key=None, compresslevel=None): + """ + Construct a Gzipped Message containing multiple Messages + + The given payloads will be encoded, compressed, and sent as a single atomic + message to Kafka. + + Arguments: + payloads: list(bytes), a list of payload to send be sent to Kafka + key: bytes, a key used for partition routing (optional) + + """ + message_set = KafkaProtocol._encode_message_set( + [create_message(payload, pl_key) for payload, pl_key in payloads]) + + gzipped = gzip_encode(message_set, compresslevel=compresslevel) + codec = ATTRIBUTE_CODEC_MASK & CODEC_GZIP + + return kafka.common.Message(0, 0x00 | codec, key, gzipped) + + +def create_snappy_message(payloads, key=None): + """ + Construct a Snappy Message containing multiple Messages + + The given payloads will be encoded, compressed, and sent as a single atomic + message to Kafka. + + Arguments: + payloads: list(bytes), a list of payload to send be sent to Kafka + key: bytes, a key used for partition routing (optional) + + """ + message_set = KafkaProtocol._encode_message_set( + [create_message(payload, pl_key) for payload, pl_key in payloads]) + + snapped = snappy_encode(message_set) + codec = ATTRIBUTE_CODEC_MASK & CODEC_SNAPPY + + return kafka.common.Message(0, 0x00 | codec, key, snapped) + + +def create_message_set(messages, codec=CODEC_NONE, key=None, compresslevel=None): + """Create a message set using the given codec. + + If codec is CODEC_NONE, return a list of raw Kafka messages. Otherwise, + return a list containing a single codec-encoded message. 
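Usage sketch for the create_message_set helper documented here (its body follows just below); note that `messages` is an iterable of (payload, key) tuples, which is easy to miss from the signature alone:

    from kafka.protocol import CODEC_GZIP, CODEC_NONE, create_message_set

    plain = create_message_set([(b'payload-1', None), (b'payload-2', None)], CODEC_NONE)
    # -> two uncompressed Message objects, one per payload

    gzipped = create_message_set([(b'payload-1', None), (b'payload-2', None)], CODEC_GZIP)
    # -> a single Message whose value is the gzip-compressed inner message set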
+ """ + if codec == CODEC_NONE: + return [create_message(m, k) for m, k in messages] + elif codec == CODEC_GZIP: + return [create_gzip_message(messages, key, compresslevel)] + elif codec == CODEC_SNAPPY: + return [create_snappy_message(messages, key)] + else: + raise UnsupportedCodecError("Codec 0x%02x unsupported" % codec) diff -Nru python-kafka-python-0.9.2/kafka/protocol/message.py python-kafka-python-1.0.1/kafka/protocol/message.py --- python-kafka-python-0.9.2/kafka/protocol/message.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/protocol/message.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,167 @@ +import io + +from ..codec import (has_gzip, has_snappy, has_lz4, + gzip_decode, snappy_decode, lz4_decode) +from . import pickle +from .struct import Struct +from .types import ( + Int8, Int32, Int64, Bytes, Schema, AbstractType +) +from ..util import crc32 + + +class Message(Struct): + SCHEMA = Schema( + ('crc', Int32), + ('magic', Int8), + ('attributes', Int8), + ('key', Bytes), + ('value', Bytes) + ) + CODEC_MASK = 0x03 + CODEC_GZIP = 0x01 + CODEC_SNAPPY = 0x02 + CODEC_LZ4 = 0x03 + HEADER_SIZE = 14 # crc(4), magic(1), attributes(1), key+value size(4*2) + + def __init__(self, value, key=None, magic=0, attributes=0, crc=0): + assert value is None or isinstance(value, bytes), 'value must be bytes' + assert key is None or isinstance(key, bytes), 'key must be bytes' + self.crc = crc + self.magic = magic + self.attributes = attributes + self.key = key + self.value = value + self.encode = self._encode_self + + def _encode_self(self, recalc_crc=True): + message = Message.SCHEMA.encode( + (self.crc, self.magic, self.attributes, self.key, self.value) + ) + if not recalc_crc: + return message + self.crc = crc32(message[4:]) + return self.SCHEMA.fields[0].encode(self.crc) + message[4:] + + @classmethod + def decode(cls, data): + if isinstance(data, bytes): + data = io.BytesIO(data) + fields = [field.decode(data) for field in cls.SCHEMA.fields] + return cls(fields[4], key=fields[3], + magic=fields[1], attributes=fields[2], crc=fields[0]) + + def validate_crc(self): + raw_msg = self._encode_self(recalc_crc=False) + crc = crc32(raw_msg[4:]) + if crc == self.crc: + return True + return False + + def is_compressed(self): + return self.attributes & self.CODEC_MASK != 0 + + def decompress(self): + codec = self.attributes & self.CODEC_MASK + assert codec in (self.CODEC_GZIP, self.CODEC_SNAPPY, self.CODEC_LZ4) + if codec == self.CODEC_GZIP: + assert has_gzip(), 'Gzip decompression unsupported' + raw_bytes = gzip_decode(self.value) + elif codec == self.CODEC_SNAPPY: + assert has_snappy(), 'Snappy decompression unsupported' + raw_bytes = snappy_decode(self.value) + elif codec == self.CODEC_LZ4: + assert has_lz4(), 'LZ4 decompression unsupported' + raw_bytes = lz4_decode(self.value) + else: + raise Exception('This should be impossible') + + return MessageSet.decode(raw_bytes, bytes_to_read=len(raw_bytes)) + + def __hash__(self): + return hash(self._encode_self(recalc_crc=False)) + + +class PartialMessage(bytes): + def __repr__(self): + return 'PartialMessage(%s)' % self + + +class MessageSet(AbstractType): + ITEM = Schema( + ('offset', Int64), + ('message_size', Int32), + ('message', Message.SCHEMA) + ) + HEADER_SIZE = 12 # offset + message_size + + @classmethod + def encode(cls, items, size=True, recalc_message_size=True): + # RecordAccumulator encodes messagesets internally + if isinstance(items, io.BytesIO): + size = Int32.decode(items) + # rewind and return all the bytes + 
items.seek(-4, 1) + return items.read(size + 4) + + encoded_values = [] + for (offset, message_size, message) in items: + if isinstance(message, Message): + encoded_message = message.encode() + else: + encoded_message = cls.ITEM.fields[2].encode(message) + if recalc_message_size: + message_size = len(encoded_message) + encoded_values.append(cls.ITEM.fields[0].encode(offset)) + encoded_values.append(cls.ITEM.fields[1].encode(message_size)) + encoded_values.append(encoded_message) + encoded = b''.join(encoded_values) + if not size: + return encoded + return Int32.encode(len(encoded)) + encoded + + @classmethod + def decode(cls, data, bytes_to_read=None): + """Compressed messages should pass in bytes_to_read (via message size) + otherwise, we decode from data as Int32 + """ + if isinstance(data, bytes): + data = io.BytesIO(data) + if bytes_to_read is None: + bytes_to_read = Int32.decode(data) + items = [] + + # We need at least 8 + 4 + 14 bytes to read offset + message size + message + # (14 bytes is a message w/ null key and null value) + while bytes_to_read >= 26: + offset = Int64.decode(data) + bytes_to_read -= 8 + + message_size = Int32.decode(data) + bytes_to_read -= 4 + + # if FetchRequest max_bytes is smaller than the available message set + # the server returns partial data for the final message + if message_size > bytes_to_read: + break + + message = Message.decode(data) + bytes_to_read -= message_size + + items.append((offset, message_size, message)) + + # If any bytes are left over, clear them from the buffer + # and append a PartialMessage to signal that max_bytes may be too small + if bytes_to_read: + items.append((None, None, PartialMessage(data.read(bytes_to_read)))) + + return items + + @classmethod + def repr(cls, messages): + if isinstance(messages, io.BytesIO): + offset = messages.tell() + decoded = cls.decode(messages) + messages.seek(offset) + messages = decoded + return '[' + ', '.join([cls.ITEM.repr(m) for m in messages]) + ']' diff -Nru python-kafka-python-0.9.2/kafka/protocol/metadata.py python-kafka-python-1.0.1/kafka/protocol/metadata.py --- python-kafka-python-0.9.2/kafka/protocol/metadata.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/protocol/metadata.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,29 @@ +from .struct import Struct +from .types import Array, Int16, Int32, Schema, String + + +class MetadataResponse(Struct): + SCHEMA = Schema( + ('brokers', Array( + ('node_id', Int32), + ('host', String('utf-8')), + ('port', Int32))), + ('topics', Array( + ('error_code', Int16), + ('topic', String('utf-8')), + ('partitions', Array( + ('error_code', Int16), + ('partition', Int32), + ('leader', Int32), + ('replicas', Array(Int32)), + ('isr', Array(Int32)))))) + ) + + +class MetadataRequest(Struct): + API_KEY = 3 + API_VERSION = 0 + RESPONSE_TYPE = MetadataResponse + SCHEMA = Schema( + ('topics', Array(String('utf-8'))) + ) diff -Nru python-kafka-python-0.9.2/kafka/protocol/offset.py python-kafka-python-1.0.1/kafka/protocol/offset.py --- python-kafka-python-0.9.2/kafka/protocol/offset.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/protocol/offset.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,36 @@ +from .struct import Struct +from .types import Array, Int16, Int32, Int64, Schema, String + +class OffsetResetStrategy(object): + LATEST = -1 + EARLIEST = -2 + NONE = 0 + + +class OffsetResponse(Struct): + SCHEMA = Schema( + ('topics', Array( + ('topic', String('utf-8')), + ('partitions', Array( + ('partition', 
Int32), + ('error_code', Int16), + ('offsets', Array(Int64)))))) + ) + + +class OffsetRequest(Struct): + API_KEY = 2 + API_VERSION = 0 + RESPONSE_TYPE = OffsetResponse + SCHEMA = Schema( + ('replica_id', Int32), + ('topics', Array( + ('topic', String('utf-8')), + ('partitions', Array( + ('partition', Int32), + ('time', Int64), + ('max_offsets', Int32))))) + ) + DEFAULTS = { + 'replica_id': -1 + } diff -Nru python-kafka-python-0.9.2/kafka/protocol/pickle.py python-kafka-python-1.0.1/kafka/protocol/pickle.py --- python-kafka-python-0.9.2/kafka/protocol/pickle.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/protocol/pickle.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,29 @@ +from __future__ import absolute_import + +try: + import copyreg # pylint: disable=import-error +except ImportError: + import copy_reg as copyreg # pylint: disable=import-error + +import types + + +def _pickle_method(method): + func_name = method.im_func.__name__ + obj = method.im_self + cls = method.im_class + return _unpickle_method, (func_name, obj, cls) + + +def _unpickle_method(func_name, obj, cls): + for cls in cls.mro(): + try: + func = cls.__dict__[func_name] + except KeyError: + pass + else: + break + return func.__get__(obj, cls) + +# https://bytes.com/topic/python/answers/552476-why-cant-you-pickle-instancemethods +copyreg.pickle(types.MethodType, _pickle_method, _unpickle_method) diff -Nru python-kafka-python-0.9.2/kafka/protocol/produce.py python-kafka-python-1.0.1/kafka/protocol/produce.py --- python-kafka-python-0.9.2/kafka/protocol/produce.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/protocol/produce.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,29 @@ +from .message import MessageSet +from .struct import Struct +from .types import Int8, Int16, Int32, Int64, Bytes, String, Array, Schema + + +class ProduceResponse(Struct): + SCHEMA = Schema( + ('topics', Array( + ('topic', String('utf-8')), + ('partitions', Array( + ('partition', Int32), + ('error_code', Int16), + ('offset', Int64))))) + ) + + +class ProduceRequest(Struct): + API_KEY = 0 + API_VERSION = 0 + RESPONSE_TYPE = ProduceResponse + SCHEMA = Schema( + ('required_acks', Int16), + ('timeout', Int32), + ('topics', Array( + ('topic', String('utf-8')), + ('partitions', Array( + ('partition', Int32), + ('messages', MessageSet))))) + ) diff -Nru python-kafka-python-0.9.2/kafka/protocol/struct.py python-kafka-python-1.0.1/kafka/protocol/struct.py --- python-kafka-python-0.9.2/kafka/protocol/struct.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/protocol/struct.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,64 @@ +#from collections import namedtuple +from io import BytesIO + +from .abstract import AbstractType +from .types import Schema + + +class Struct(AbstractType): + SCHEMA = Schema() + + def __init__(self, *args, **kwargs): + if len(args) == len(self.SCHEMA.fields): + for i, name in enumerate(self.SCHEMA.names): + self.__dict__[name] = args[i] + elif len(args) > 0: + raise ValueError('Args must be empty or mirror schema') + else: + self.__dict__.update(kwargs) + + # overloading encode() to support both class and instance + self.encode = self._encode_self + + @classmethod + def encode(cls, item): # pylint: disable=E0202 + bits = [] + for i, field in enumerate(cls.SCHEMA.fields): + bits.append(field.encode(item[i])) + return b''.join(bits) + + def _encode_self(self): + return self.SCHEMA.encode( + [self.__dict__[name] for name in self.SCHEMA.names] + ) + + 
@classmethod + def decode(cls, data): + if isinstance(data, bytes): + data = BytesIO(data) + return cls(*[field.decode(data) for field in cls.SCHEMA.fields]) + + def __repr__(self): + key_vals = [] + for name, field in zip(self.SCHEMA.names, self.SCHEMA.fields): + key_vals.append('%s=%s' % (name, field.repr(self.__dict__[name]))) + return self.__class__.__name__ + '(' + ', '.join(key_vals) + ')' + + def __hash__(self): + return hash(self.encode()) + + def __eq__(self, other): + if self.SCHEMA != other.SCHEMA: + return False + for attr in self.SCHEMA.names: + if self.__dict__[attr] != other.__dict__[attr]: + return False + return True + +""" +class MetaStruct(type): + def __new__(cls, clsname, bases, dct): + nt = namedtuple(clsname, [name for (name, _) in dct['SCHEMA']]) + bases = tuple([Struct, nt] + list(bases)) + return super(MetaStruct, cls).__new__(cls, clsname, bases, dct) +""" diff -Nru python-kafka-python-0.9.2/kafka/protocol/types.py python-kafka-python-1.0.1/kafka/protocol/types.py --- python-kafka-python-0.9.2/kafka/protocol/types.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/protocol/types.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,141 @@ +from __future__ import absolute_import + +from struct import pack, unpack + +from .abstract import AbstractType + + +class Int8(AbstractType): + @classmethod + def encode(cls, value): + return pack('>b', value) + + @classmethod + def decode(cls, data): + (value,) = unpack('>b', data.read(1)) + return value + + +class Int16(AbstractType): + @classmethod + def encode(cls, value): + return pack('>h', value) + + @classmethod + def decode(cls, data): + (value,) = unpack('>h', data.read(2)) + return value + + +class Int32(AbstractType): + @classmethod + def encode(cls, value): + return pack('>i', value) + + @classmethod + def decode(cls, data): + (value,) = unpack('>i', data.read(4)) + return value + + +class Int64(AbstractType): + @classmethod + def encode(cls, value): + return pack('>q', value) + + @classmethod + def decode(cls, data): + (value,) = unpack('>q', data.read(8)) + return value + + +class String(AbstractType): + def __init__(self, encoding='utf-8'): + self.encoding = encoding + + def encode(self, value): + if value is None: + return Int16.encode(-1) + value = str(value).encode(self.encoding) + return Int16.encode(len(value)) + value + + def decode(self, data): + length = Int16.decode(data) + if length < 0: + return None + return data.read(length).decode(self.encoding) + + +class Bytes(AbstractType): + @classmethod + def encode(cls, value): + if value is None: + return Int32.encode(-1) + else: + return Int32.encode(len(value)) + value + + @classmethod + def decode(cls, data): + length = Int32.decode(data) + if length < 0: + return None + return data.read(length) + + +class Schema(AbstractType): + def __init__(self, *fields): + if fields: + self.names, self.fields = zip(*fields) + else: + self.names, self.fields = (), () + + def encode(self, item): + if len(item) != len(self.fields): + raise ValueError('Item field count does not match Schema') + return b''.join([ + field.encode(item[i]) + for i, field in enumerate(self.fields) + ]) + + def decode(self, data): + return tuple([field.decode(data) for field in self.fields]) + + def __len__(self): + return len(self.fields) + + def repr(self, value): + key_vals = [] + try: + for i in range(len(self)): + try: + field_val = getattr(value, self.names[i]) + except AttributeError: + field_val = value[i] + key_vals.append('%s=%s' % (self.names[i], 
self.fields[i].repr(field_val))) + return '(' + ', '.join(key_vals) + ')' + except: + return repr(value) + + +class Array(AbstractType): + def __init__(self, *array_of): + if len(array_of) > 1: + self.array_of = Schema(*array_of) + elif len(array_of) == 1 and (isinstance(array_of[0], AbstractType) or + issubclass(array_of[0], AbstractType)): + self.array_of = array_of[0] + else: + raise ValueError('Array instantiated with no array_of type') + + def encode(self, items): + return b''.join( + [Int32.encode(len(items))] + + [self.array_of.encode(item) for item in items] + ) + + def decode(self, data): + length = Int32.decode(data) + return [self.array_of.decode(data) for _ in range(length)] + + def repr(self, list_of_items): + return '[' + ', '.join([self.array_of.repr(item) for item in list_of_items]) + ']' diff -Nru python-kafka-python-0.9.2/kafka/protocol.py python-kafka-python-1.0.1/kafka/protocol.py --- python-kafka-python-0.9.2/kafka/protocol.py 2014-08-27 21:24:39.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/protocol.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,587 +0,0 @@ -import logging -import struct -import zlib - -from kafka.codec import ( - gzip_encode, gzip_decode, snappy_encode, snappy_decode -) -from kafka.common import ( - BrokerMetadata, PartitionMetadata, Message, OffsetAndMessage, - ProduceResponse, FetchResponse, OffsetResponse, - OffsetCommitResponse, OffsetFetchResponse, ProtocolError, - BufferUnderflowError, ChecksumError, ConsumerFetchSizeTooSmall, - UnsupportedCodecError -) -from kafka.util import ( - read_short_string, read_int_string, relative_unpack, - write_short_string, write_int_string, group_by_topic_and_partition -) - -log = logging.getLogger("kafka") - -ATTRIBUTE_CODEC_MASK = 0x03 -CODEC_NONE = 0x00 -CODEC_GZIP = 0x01 -CODEC_SNAPPY = 0x02 -ALL_CODECS = (CODEC_NONE, CODEC_GZIP, CODEC_SNAPPY) - - -class KafkaProtocol(object): - """ - Class to encapsulate all of the protocol encoding/decoding. - This class does not have any state associated with it, it is purely - for organization. - """ - PRODUCE_KEY = 0 - FETCH_KEY = 1 - OFFSET_KEY = 2 - METADATA_KEY = 3 - OFFSET_COMMIT_KEY = 8 - OFFSET_FETCH_KEY = 9 - - ################### - # Private API # - ################### - - @classmethod - def _encode_message_header(cls, client_id, correlation_id, request_key): - """ - Encode the common request envelope - """ - return struct.pack('>hhih%ds' % len(client_id), - request_key, # ApiKey - 0, # ApiVersion - correlation_id, # CorrelationId - len(client_id), # ClientId size - client_id) # ClientId - - @classmethod - def _encode_message_set(cls, messages): - """ - Encode a MessageSet. Unlike other arrays in the protocol, - MessageSets are not length-prefixed - - Format - ====== - MessageSet => [Offset MessageSize Message] - Offset => int64 - MessageSize => int32 - """ - message_set = "" - for message in messages: - encoded_message = KafkaProtocol._encode_message(message) - message_set += struct.pack('>qi%ds' % len(encoded_message), 0, len(encoded_message), encoded_message) - return message_set - - @classmethod - def _encode_message(cls, message): - """ - Encode a single message. - - The magic number of a message is a format version number. 
- The only supported magic number right now is zero - - Format - ====== - Message => Crc MagicByte Attributes Key Value - Crc => int32 - MagicByte => int8 - Attributes => int8 - Key => bytes - Value => bytes - """ - if message.magic == 0: - msg = struct.pack('>BB', message.magic, message.attributes) - msg += write_int_string(message.key) - msg += write_int_string(message.value) - crc = zlib.crc32(msg) - msg = struct.pack('>i%ds' % len(msg), crc, msg) - else: - raise ProtocolError("Unexpected magic number: %d" % message.magic) - return msg - - @classmethod - def _decode_message_set_iter(cls, data): - """ - Iteratively decode a MessageSet - - Reads repeated elements of (offset, message), calling decode_message - to decode a single message. Since compressed messages contain futher - MessageSets, these two methods have been decoupled so that they may - recurse easily. - """ - cur = 0 - read_message = False - while cur < len(data): - try: - ((offset, ), cur) = relative_unpack('>q', data, cur) - (msg, cur) = read_int_string(data, cur) - for (offset, message) in KafkaProtocol._decode_message(msg, offset): - read_message = True - yield OffsetAndMessage(offset, message) - except BufferUnderflowError: - # NOTE: Not sure this is correct error handling: - # Is it possible to get a BUE if the message set is somewhere - # in the middle of the fetch response? If so, we probably have - # an issue that's not fetch size too small. - # Aren't we ignoring errors if we fail to unpack data by - # raising StopIteration()? - # If _decode_message() raises a ChecksumError, couldn't that - # also be due to the fetch size being too small? - if read_message is False: - # If we get a partial read of a message, but haven't - # yielded anything there's a problem - raise ConsumerFetchSizeTooSmall() - else: - raise StopIteration() - - @classmethod - def _decode_message(cls, data, offset): - """ - Decode a single Message - - The only caller of this method is decode_message_set_iter. - They are decoupled to support nested messages (compressed MessageSets). - The offset is actually read from decode_message_set_iter (it is part - of the MessageSet payload). - """ - ((crc, magic, att), cur) = relative_unpack('>iBB', data, 0) - if crc != zlib.crc32(data[4:]): - raise ChecksumError("Message checksum failed") - - (key, cur) = read_int_string(data, cur) - (value, cur) = read_int_string(data, cur) - - codec = att & ATTRIBUTE_CODEC_MASK - - if codec == CODEC_NONE: - yield (offset, Message(magic, att, key, value)) - - elif codec == CODEC_GZIP: - gz = gzip_decode(value) - for (offset, msg) in KafkaProtocol._decode_message_set_iter(gz): - yield (offset, msg) - - elif codec == CODEC_SNAPPY: - snp = snappy_decode(value) - for (offset, msg) in KafkaProtocol._decode_message_set_iter(snp): - yield (offset, msg) - - ################## - # Public API # - ################## - - @classmethod - def encode_produce_request(cls, client_id, correlation_id, - payloads=None, acks=1, timeout=1000): - """ - Encode some ProduceRequest structs - - Params - ====== - client_id: string - correlation_id: int - payloads: list of ProduceRequest - acks: How "acky" you want the request to be - 0: immediate response - 1: written to disk by the leader - 2+: waits for this many number of replicas to sync - -1: waits for all replicas to be in sync - timeout: Maximum time the server will wait for acks from replicas. 
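A hypothetical Python 2 sketch of this (removed) 0.9.2 encoder, using made-up client and topic names; acks=1 means the leader must write the message to its log before responding:

    from kafka.common import ProduceRequest
    from kafka.protocol import KafkaProtocol, create_message

    payload = ProduceRequest('example-topic', 0, [create_message('hello kafka')])
    wire = KafkaProtocol.encode_produce_request(
        client_id='example-client',
        correlation_id=1,
        payloads=[payload],
        acks=1,        # written to the leader's log before the broker responds
        timeout=1000,  # ms the broker waits for replica acks
    )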
- This is _not_ a socket timeout - """ - payloads = [] if payloads is None else payloads - grouped_payloads = group_by_topic_and_partition(payloads) - - message = cls._encode_message_header(client_id, correlation_id, - KafkaProtocol.PRODUCE_KEY) - - message += struct.pack('>hii', acks, timeout, len(grouped_payloads)) - - for topic, topic_payloads in grouped_payloads.items(): - message += struct.pack('>h%dsi' % len(topic), - len(topic), topic, len(topic_payloads)) - - for partition, payload in topic_payloads.items(): - msg_set = KafkaProtocol._encode_message_set(payload.messages) - message += struct.pack('>ii%ds' % len(msg_set), partition, - len(msg_set), msg_set) - - return struct.pack('>i%ds' % len(message), len(message), message) - - @classmethod - def decode_produce_response(cls, data): - """ - Decode bytes to a ProduceResponse - - Params - ====== - data: bytes to decode - """ - ((correlation_id, num_topics), cur) = relative_unpack('>ii', data, 0) - - for i in range(num_topics): - ((strlen,), cur) = relative_unpack('>h', data, cur) - topic = data[cur:cur + strlen] - cur += strlen - ((num_partitions,), cur) = relative_unpack('>i', data, cur) - for i in range(num_partitions): - ((partition, error, offset), cur) = relative_unpack('>ihq', - data, cur) - - yield ProduceResponse(topic, partition, error, offset) - - @classmethod - def encode_fetch_request(cls, client_id, correlation_id, payloads=None, - max_wait_time=100, min_bytes=4096): - """ - Encodes some FetchRequest structs - - Params - ====== - client_id: string - correlation_id: int - payloads: list of FetchRequest - max_wait_time: int, how long to block waiting on min_bytes of data - min_bytes: int, the minimum number of bytes to accumulate before - returning the response - """ - - payloads = [] if payloads is None else payloads - grouped_payloads = group_by_topic_and_partition(payloads) - - message = cls._encode_message_header(client_id, correlation_id, - KafkaProtocol.FETCH_KEY) - - # -1 is the replica id - message += struct.pack('>iiii', -1, max_wait_time, min_bytes, - len(grouped_payloads)) - - for topic, topic_payloads in grouped_payloads.items(): - message += write_short_string(topic) - message += struct.pack('>i', len(topic_payloads)) - for partition, payload in topic_payloads.items(): - message += struct.pack('>iqi', partition, payload.offset, - payload.max_bytes) - - return struct.pack('>i%ds' % len(message), len(message), message) - - @classmethod - def decode_fetch_response(cls, data): - """ - Decode bytes to a FetchResponse - - Params - ====== - data: bytes to decode - """ - ((correlation_id, num_topics), cur) = relative_unpack('>ii', data, 0) - - for i in range(num_topics): - (topic, cur) = read_short_string(data, cur) - ((num_partitions,), cur) = relative_unpack('>i', data, cur) - - for i in range(num_partitions): - ((partition, error, highwater_mark_offset), cur) = \ - relative_unpack('>ihq', data, cur) - - (message_set, cur) = read_int_string(data, cur) - - yield FetchResponse( - topic, partition, error, - highwater_mark_offset, - KafkaProtocol._decode_message_set_iter(message_set)) - - @classmethod - def encode_offset_request(cls, client_id, correlation_id, payloads=None): - payloads = [] if payloads is None else payloads - grouped_payloads = group_by_topic_and_partition(payloads) - - message = cls._encode_message_header(client_id, correlation_id, - KafkaProtocol.OFFSET_KEY) - - # -1 is the replica id - message += struct.pack('>ii', -1, len(grouped_payloads)) - - for topic, topic_payloads in grouped_payloads.items(): 
- message += write_short_string(topic) - message += struct.pack('>i', len(topic_payloads)) - - for partition, payload in topic_payloads.items(): - message += struct.pack('>iqi', partition, payload.time, - payload.max_offsets) - - return struct.pack('>i%ds' % len(message), len(message), message) - - @classmethod - def decode_offset_response(cls, data): - """ - Decode bytes to an OffsetResponse - - Params - ====== - data: bytes to decode - """ - ((correlation_id, num_topics), cur) = relative_unpack('>ii', data, 0) - - for i in range(num_topics): - (topic, cur) = read_short_string(data, cur) - ((num_partitions,), cur) = relative_unpack('>i', data, cur) - - for i in range(num_partitions): - ((partition, error, num_offsets,), cur) = \ - relative_unpack('>ihi', data, cur) - - offsets = [] - for j in range(num_offsets): - ((offset,), cur) = relative_unpack('>q', data, cur) - offsets.append(offset) - - yield OffsetResponse(topic, partition, error, tuple(offsets)) - - @classmethod - def encode_metadata_request(cls, client_id, correlation_id, topics=None): - """ - Encode a MetadataRequest - - Params - ====== - client_id: string - correlation_id: int - topics: list of strings - """ - topics = [] if topics is None else topics - message = cls._encode_message_header(client_id, correlation_id, - KafkaProtocol.METADATA_KEY) - - message += struct.pack('>i', len(topics)) - - for topic in topics: - message += struct.pack('>h%ds' % len(topic), len(topic), topic) - - return write_int_string(message) - - @classmethod - def decode_metadata_response(cls, data): - """ - Decode bytes to a MetadataResponse - - Params - ====== - data: bytes to decode - """ - ((correlation_id, numbrokers), cur) = relative_unpack('>ii', data, 0) - - # Broker info - brokers = {} - for i in range(numbrokers): - ((nodeId, ), cur) = relative_unpack('>i', data, cur) - (host, cur) = read_short_string(data, cur) - ((port,), cur) = relative_unpack('>i', data, cur) - brokers[nodeId] = BrokerMetadata(nodeId, host, port) - - # Topic info - ((num_topics,), cur) = relative_unpack('>i', data, cur) - topic_metadata = {} - - for i in range(num_topics): - # NOTE: topic_error is discarded. Should probably be returned with - # the topic metadata. - ((topic_error,), cur) = relative_unpack('>h', data, cur) - (topic_name, cur) = read_short_string(data, cur) - ((num_partitions,), cur) = relative_unpack('>i', data, cur) - partition_metadata = {} - - for j in range(num_partitions): - # NOTE: partition_error_code is discarded. Should probably be - # returned with the partition metadata. 
- ((partition_error_code, partition, leader, numReplicas), cur) = \ - relative_unpack('>hiii', data, cur) - - (replicas, cur) = relative_unpack( - '>%di' % numReplicas, data, cur) - - ((num_isr,), cur) = relative_unpack('>i', data, cur) - (isr, cur) = relative_unpack('>%di' % num_isr, data, cur) - - partition_metadata[partition] = \ - PartitionMetadata( - topic_name, partition, leader, replicas, isr) - - topic_metadata[topic_name] = partition_metadata - - return brokers, topic_metadata - - @classmethod - def encode_offset_commit_request(cls, client_id, correlation_id, - group, payloads): - """ - Encode some OffsetCommitRequest structs - - Params - ====== - client_id: string - correlation_id: int - group: string, the consumer group you are committing offsets for - payloads: list of OffsetCommitRequest - """ - grouped_payloads = group_by_topic_and_partition(payloads) - - message = cls._encode_message_header(client_id, correlation_id, - KafkaProtocol.OFFSET_COMMIT_KEY) - message += write_short_string(group) - message += struct.pack('>i', len(grouped_payloads)) - - for topic, topic_payloads in grouped_payloads.items(): - message += write_short_string(topic) - message += struct.pack('>i', len(topic_payloads)) - - for partition, payload in topic_payloads.items(): - message += struct.pack('>iq', partition, payload.offset) - message += write_short_string(payload.metadata) - - return struct.pack('>i%ds' % len(message), len(message), message) - - @classmethod - def decode_offset_commit_response(cls, data): - """ - Decode bytes to an OffsetCommitResponse - - Params - ====== - data: bytes to decode - """ - ((correlation_id,), cur) = relative_unpack('>i', data, 0) - ((num_topics,), cur) = relative_unpack('>i', data, cur) - - for i in xrange(num_topics): - (topic, cur) = read_short_string(data, cur) - ((num_partitions,), cur) = relative_unpack('>i', data, cur) - - for i in xrange(num_partitions): - ((partition, error), cur) = relative_unpack('>ih', data, cur) - yield OffsetCommitResponse(topic, partition, error) - - @classmethod - def encode_offset_fetch_request(cls, client_id, correlation_id, - group, payloads): - """ - Encode some OffsetFetchRequest structs - - Params - ====== - client_id: string - correlation_id: int - group: string, the consumer group you are fetching offsets for - payloads: list of OffsetFetchRequest - """ - grouped_payloads = group_by_topic_and_partition(payloads) - message = cls._encode_message_header(client_id, correlation_id, - KafkaProtocol.OFFSET_FETCH_KEY) - - message += write_short_string(group) - message += struct.pack('>i', len(grouped_payloads)) - - for topic, topic_payloads in grouped_payloads.items(): - message += write_short_string(topic) - message += struct.pack('>i', len(topic_payloads)) - - for partition, payload in topic_payloads.items(): - message += struct.pack('>i', partition) - - return struct.pack('>i%ds' % len(message), len(message), message) - - @classmethod - def decode_offset_fetch_response(cls, data): - """ - Decode bytes to an OffsetFetchResponse - - Params - ====== - data: bytes to decode - """ - - ((correlation_id,), cur) = relative_unpack('>i', data, 0) - ((num_topics,), cur) = relative_unpack('>i', data, cur) - - for i in range(num_topics): - (topic, cur) = read_short_string(data, cur) - ((num_partitions,), cur) = relative_unpack('>i', data, cur) - - for i in range(num_partitions): - ((partition, offset), cur) = relative_unpack('>iq', data, cur) - (metadata, cur) = read_short_string(data, cur) - ((error,), cur) = relative_unpack('>h', data, cur) - - 
yield OffsetFetchResponse(topic, partition, offset, - metadata, error) - - -def create_message(payload, key=None): - """ - Construct a Message - - Params - ====== - payload: bytes, the payload to send to Kafka - key: bytes, a key used for partition routing (optional) - """ - return Message(0, 0, key, payload) - - -def create_gzip_message(payloads, key=None): - """ - Construct a Gzipped Message containing multiple Messages - - The given payloads will be encoded, compressed, and sent as a single atomic - message to Kafka. - - Params - ====== - payloads: list(bytes), a list of payload to send be sent to Kafka - key: bytes, a key used for partition routing (optional) - """ - message_set = KafkaProtocol._encode_message_set( - [create_message(payload) for payload in payloads]) - - gzipped = gzip_encode(message_set) - codec = ATTRIBUTE_CODEC_MASK & CODEC_GZIP - - return Message(0, 0x00 | codec, key, gzipped) - - -def create_snappy_message(payloads, key=None): - """ - Construct a Snappy Message containing multiple Messages - - The given payloads will be encoded, compressed, and sent as a single atomic - message to Kafka. - - Params - ====== - payloads: list(bytes), a list of payload to send be sent to Kafka - key: bytes, a key used for partition routing (optional) - """ - message_set = KafkaProtocol._encode_message_set( - [create_message(payload) for payload in payloads]) - - snapped = snappy_encode(message_set) - codec = ATTRIBUTE_CODEC_MASK & CODEC_SNAPPY - - return Message(0, 0x00 | codec, key, snapped) - - -def create_message_set(messages, codec=CODEC_NONE): - """Create a message set using the given codec. - - If codec is CODEC_NONE, return a list of raw Kafka messages. Otherwise, - return a list containing a single codec-encoded message. - """ - if codec == CODEC_NONE: - return [create_message(m) for m in messages] - elif codec == CODEC_GZIP: - return [create_gzip_message(messages)] - elif codec == CODEC_SNAPPY: - return [create_snappy_message(messages)] - else: - raise UnsupportedCodecError("Codec 0x%02x unsupported" % codec) diff -Nru python-kafka-python-0.9.2/kafka/queue.py python-kafka-python-1.0.1/kafka/queue.py --- python-kafka-python-0.9.2/kafka/queue.py 2014-08-27 21:24:39.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/queue.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,219 +0,0 @@ -from __future__ import absolute_import - -from copy import copy -import logging -from multiprocessing import Process, Queue, Event -from Queue import Empty -import time - -from kafka.client import KafkaClient, FetchRequest, ProduceRequest - -log = logging.getLogger("kafka") - -raise NotImplementedError("Still need to refactor this class") - - -class KafkaConsumerProcess(Process): - def __init__(self, client, topic, partition, out_queue, barrier, - consumer_fetch_size=1024, consumer_sleep=200): - self.client = copy(client) - self.topic = topic - self.partition = partition - self.out_queue = out_queue - self.barrier = barrier - self.consumer_fetch_size = consumer_fetch_size - self.consumer_sleep = consumer_sleep / 1000. 
- log.info("Initializing %s" % self) - Process.__init__(self) - - def __str__(self): - return "[KafkaConsumerProcess: topic=%s, \ - partition=%s, sleep=%s]" % \ - (self.topic, self.partition, self.consumer_sleep) - - def run(self): - self.barrier.wait() - log.info("Starting %s" % self) - fetchRequest = FetchRequest(self.topic, self.partition, - offset=0, size=self.consumer_fetch_size) - - while True: - if self.barrier.is_set() is False: - log.info("Shutdown %s" % self) - self.client.close() - break - - lastOffset = fetchRequest.offset - (messages, fetchRequest) = self.client.get_message_set(fetchRequest) - - if fetchRequest.offset == lastOffset: - log.debug("No more data for this partition, " - "sleeping a bit (200ms)") - time.sleep(self.consumer_sleep) - continue - - for message in messages: - self.out_queue.put(message) - - -class KafkaProducerProcess(Process): - def __init__(self, client, topic, in_queue, barrier, - producer_flush_buffer=500, - producer_flush_timeout=2000, - producer_timeout=100): - - self.client = copy(client) - self.topic = topic - self.in_queue = in_queue - self.barrier = barrier - self.producer_flush_buffer = producer_flush_buffer - self.producer_flush_timeout = producer_flush_timeout / 1000. - self.producer_timeout = producer_timeout / 1000. - log.info("Initializing %s" % self) - Process.__init__(self) - - def __str__(self): - return "[KafkaProducerProcess: topic=%s, \ - flush_buffer=%s, flush_timeout=%s, timeout=%s]" % \ - (self.topic, - self.producer_flush_buffer, - self.producer_flush_timeout, - self.producer_timeout) - - def run(self): - self.barrier.wait() - log.info("Starting %s" % self) - messages = [] - last_produce = time.time() - - def flush(messages): - self.client.send_message_set(ProduceRequest(self.topic, -1, - messages)) - del messages[:] - - while True: - if self.barrier.is_set() is False: - log.info("Shutdown %s, flushing messages" % self) - flush(messages) - self.client.close() - break - - if len(messages) > self.producer_flush_buffer: - log.debug("Message count threshold reached. Flushing messages") - flush(messages) - last_produce = time.time() - - elif (time.time() - last_produce) > self.producer_flush_timeout: - log.debug("Producer timeout reached. Flushing messages") - flush(messages) - last_produce = time.time() - - try: - msg = KafkaClient.create_message( - self.in_queue.get(True, self.producer_timeout)) - messages.append(msg) - - except Empty: - continue - - -class KafkaQueue(object): - def __init__(self, client, topic, partitions, - producer_config=None, consumer_config=None): - """ - KafkaQueue a Queue-like object backed by a Kafka producer and some - number of consumers - - Messages are eagerly loaded by the consumer in batches of size - consumer_fetch_size. - Messages are buffered in the producer thread until - producer_flush_timeout or producer_flush_buffer is reached. - - Params - ====== - client: KafkaClient object - topic: str, the topic name - partitions: list of ints, the partions to consume from - producer_config: dict, see below - consumer_config: dict, see below - - Consumer Config - =============== - consumer_fetch_size: int, number of bytes to fetch in one call - to Kafka. Default is 1024 - consumer_sleep: int, time in milliseconds a consumer should sleep - when it reaches the end of a partition. Default is 200 - - Producer Config - =============== - producer_timeout: int, time in milliseconds a producer should - wait for messages to enqueue for producing. 
- Default is 100 - producer_flush_timeout: int, time in milliseconds a producer - should allow messages to accumulate before - sending to Kafka. Default is 2000 - producer_flush_buffer: int, number of messages a producer should - allow to accumulate. Default is 500 - - """ - producer_config = {} if producer_config is None else producer_config - consumer_config = {} if consumer_config is None else consumer_config - - self.in_queue = Queue() - self.out_queue = Queue() - self.consumers = [] - self.barrier = Event() - - # Initialize and start consumer threads - for partition in partitions: - consumer = KafkaConsumerProcess(client, topic, partition, - self.in_queue, self.barrier, - **consumer_config) - consumer.start() - self.consumers.append(consumer) - - # Initialize and start producer thread - self.producer = KafkaProducerProcess(client, topic, self.out_queue, - self.barrier, **producer_config) - self.producer.start() - - # Trigger everything to start - self.barrier.set() - - def get(self, block=True, timeout=None): - """ - Consume a message from Kafka - - Params - ====== - block: boolean, default True - timeout: int, number of seconds to wait when blocking, default None - - Returns - ======= - msg: str, the payload from Kafka - """ - return self.in_queue.get(block, timeout).payload - - def put(self, msg, block=True, timeout=None): - """ - Send a message to Kafka - - Params - ====== - msg: std, the message to send - block: boolean, default True - timeout: int, number of seconds to wait when blocking, default None - """ - self.out_queue.put(msg, block, timeout) - - def close(self): - """ - Close the internal queues and Kafka consumers/producer - """ - self.in_queue.close() - self.out_queue.close() - self.barrier.clear() - self.producer.join() - for consumer in self.consumers: - consumer.join() diff -Nru python-kafka-python-0.9.2/kafka/util.py python-kafka-python-1.0.1/kafka/util.py --- python-kafka-python-0.9.2/kafka/util.py 2014-08-27 21:24:39.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/util.py 2016-02-18 16:38:17.000000000 +0000 @@ -1,14 +1,28 @@ +import binascii import collections import struct import sys from threading import Thread, Event +import weakref + +import six from kafka.common import BufferUnderflowError +def crc32(data): + crc = binascii.crc32(data) + # py2 and py3 behave a little differently + # CRC is encoded as a signed int in kafka protocol + # so we'll convert the py3 unsigned result to signed + if six.PY3 and crc >= 2**31: + crc -= 2**32 + return crc + + def write_int_string(s): - if s is not None and not isinstance(s, str): - raise TypeError('Expected "%s" to be str\n' + if s is not None and not isinstance(s, six.binary_type): + raise TypeError('Expected "%s" to be bytes\n' 'data=%s' % (type(s), repr(s))) if s is None: return struct.pack('>i', -1) @@ -17,12 +31,12 @@ def write_short_string(s): - if s is not None and not isinstance(s, str): - raise TypeError('Expected "%s" to be str\n' + if s is not None and not isinstance(s, six.binary_type): + raise TypeError('Expected "%s" to be bytes\n' 'data=%s' % (type(s), repr(s))) if s is None: return struct.pack('>h', -1) - elif len(s) > 32767 and sys.version < (2, 7): + elif len(s) > 32767 and sys.version_info < (2, 7): # Python 2.6 issues a deprecation warning instead of a struct error raise struct.error(len(s)) else: @@ -75,6 +89,9 @@ def group_by_topic_and_partition(tuples): out = collections.defaultdict(dict) for t in tuples: + assert t.topic not in out or t.partition not in out[t.topic], \ + 'Duplicate {0}s for {1} 
{2}'.format(t.__class__.__name__, + t.topic, t.partition) out[t.topic][t.partition] = t return out @@ -84,10 +101,12 @@ A timer that can be restarted, unlike threading.Timer (although this uses threading.Timer) - t: timer interval in milliseconds - fn: a callable to invoke - args: tuple of args to be passed to function - kwargs: keyword arguments to be passed to function + Arguments: + + t: timer interval in milliseconds + fn: a callable to invoke + args: tuple of args to be passed to function + kwargs: keyword arguments to be passed to function """ def __init__(self, t, fn, *args, **kwargs): @@ -105,7 +124,11 @@ self.active = None def _timer(self, active): - while not active.wait(self.t): + # python2.6 Event.wait() always returns None + # python2.7 and greater returns the flag value (true/false) + # we want the flag value, so add an 'or' here for python2.6 + # this is redundant for later python versions (FLAG OR FLAG == FLAG) + while not (active.wait(self.t) or active.is_set()): self.fn(*self.args, **self.kwargs) def start(self): @@ -125,3 +148,43 @@ self.thread.join(self.t + 1) # noinspection PyAttributeOutsideInit self.timer = None + self.fn = None + + def __del__(self): + self.stop() + + +class WeakMethod(object): + """ + Callable that weakly references a method and the object it is bound to. It + is based on http://stackoverflow.com/a/24287465. + + Arguments: + + object_dot_method: A bound instance method (i.e. 'object.method'). + """ + def __init__(self, object_dot_method): + try: + self.target = weakref.ref(object_dot_method.__self__) + except AttributeError: + self.target = weakref.ref(object_dot_method.im_self) + self._target_id = id(self.target()) + try: + self.method = weakref.ref(object_dot_method.__func__) + except AttributeError: + self.method = weakref.ref(object_dot_method.im_func) + self._method_id = id(self.method()) + + def __call__(self, *args, **kwargs): + """ + Calls the method on target with args and kwargs. + """ + return self.method()(self.target(), *args, **kwargs) + + def __hash__(self): + return hash(self.target) ^ hash(self.method) + + def __eq__(self, other): + if not isinstance(other, WeakMethod): + return False + return self._target_id == other._target_id and self._method_id == other._method_id diff -Nru python-kafka-python-0.9.2/kafka/version.py python-kafka-python-1.0.1/kafka/version.py --- python-kafka-python-0.9.2/kafka/version.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka/version.py 2016-02-19 17:02:08.000000000 +0000 @@ -0,0 +1 @@ +__version__ = '1.0.1' diff -Nru python-kafka-python-0.9.2/kafka_python.egg-info/PKG-INFO python-kafka-python-1.0.1/kafka_python.egg-info/PKG-INFO --- python-kafka-python-0.9.2/kafka_python.egg-info/PKG-INFO 2014-08-27 21:25:43.000000000 +0000 +++ python-kafka-python-1.0.1/kafka_python.egg-info/PKG-INFO 2016-02-19 18:18:12.000000000 +0000 @@ -1,16 +1,138 @@ Metadata-Version: 1.1 Name: kafka-python -Version: 0.9.2 +Version: 1.0.1 Summary: Pure Python client for Apache Kafka -Home-page: https://github.com/mumrah/kafka-python -Author: David Arthur -Author-email: mumrah@gmail.com +Home-page: https://github.com/dpkp/kafka-python +Author: Dana Powers +Author-email: dana.powers@gmail.com License: Apache License 2.0 -Description: - This module provides low-level protocol support for Apache Kafka as well as - high-level consumer and producer classes. Request batching is supported by the - protocol as well as broker-aware request routing. Gzip and Snappy compression - is also supported for message sets. 
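The WeakMethod helper added to kafka/util.py above wraps a bound method in weak references to both the instance (__self__/im_self) and the function (__func__/im_func), so handing it out as a callback does not keep the owning object alive. A minimal usage sketch -- the Worker class is hypothetical; only WeakMethod itself comes from the library:

    import gc
    from kafka.util import WeakMethod

    class Worker(object):
        def handle(self, msg):
            return 'handled %s' % msg

    w = Worker()
    cb = WeakMethod(w.handle)        # weakly references w and Worker.handle
    assert cb('ping') == 'handled ping'

    del w
    gc.collect()
    assert cb.target() is None       # the Worker instance was free to be collected
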
+Description: Kafka Python client + ------------------------ + + .. image:: https://img.shields.io/badge/kafka-0.9%2C%200.8.2%2C%200.8.1%2C%200.8-brightgreen.svg + :target: https://kafka-python.readthedocs.org/compatibility.html + .. image:: https://img.shields.io/pypi/pyversions/kafka-python.svg + :target: https://pypi.python.org/pypi/kafka-python + .. image:: https://coveralls.io/repos/dpkp/kafka-python/badge.svg?branch=master&service=github + :target: https://coveralls.io/github/dpkp/kafka-python?branch=master + .. image:: https://travis-ci.org/dpkp/kafka-python.svg?branch=master + :target: https://travis-ci.org/dpkp/kafka-python + .. image:: https://img.shields.io/badge/license-Apache%202-blue.svg + :target: https://github.com/dpkp/kafka-python/blob/master/LICENSE + + Python client for the Apache Kafka distributed stream processing system. + kafka-python is designed to function much like the official java client, with a + sprinkling of pythonic interfaces (e.g., consumer iterators). + + kafka-python is best used with 0.9 brokers, but is backwards-compatible with + older versions (to 0.8.0). Some features will only be enabled on newer brokers, + however; for example, fully coordinated consumer groups -- i.e., dynamic partition + assignment to multiple consumers in the same group -- requires use of 0.9 kafka + brokers. Supporting this feature for earlier broker releases would require + writing and maintaining custom leadership election and membership / health + check code (perhaps using zookeeper or consul). For older brokers, you can + achieve something similar by manually assigning different partitions to each + consumer instance with config management tools like chef, ansible, etc. This + approach will work fine, though it does not support rebalancing on failures. + See `Compatibility `_ + for more details. + + Please note that the master branch may contain unreleased features. For release + documentation, please see readthedocs and/or python's inline help. + + >>> pip install kafka-python + + KafkaConsumer + ************* + + KafkaConsumer is a high-level message consumer, intended to operate as similarly + as possible to the official 0.9 java client. Full support for coordinated + consumer groups requires use of kafka brokers that support the 0.9 Group APIs. + + See `ReadTheDocs `_ + for API and configuration details. + + The consumer iterator returns ConsumerRecords, which are simple namedtuples + that expose basic message attributes: topic, partition, offset, key, and value: + + >>> from kafka import KafkaConsumer + >>> consumer = KafkaConsumer('my_favorite_topic') + >>> for msg in consumer: + ... print (msg) + + >>> # manually assign the partition list for the consumer + >>> from kafka import TopicPartition + >>> consumer = KafkaConsumer(bootstrap_servers='localhost:1234') + >>> consumer.assign([TopicPartition('foobar', 2)]) + >>> msg = next(consumer) + + >>> # Deserialize msgpack-encoded values + >>> consumer = KafkaConsumer(value_deserializer=msgpack.dumps) + >>> consumer.subscribe(['msgpackfoo']) + >>> for msg in consumer: + ... msg = next(consumer) + ... assert isinstance(msg.value, dict) + + + KafkaProducer + ************* + + KafkaProducer is a high-level, asynchronous message producer. The class is + intended to operate as similarly as possible to the official java client. + See `ReadTheDocs `_ + for more details. 
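Since send() is asynchronous, it returns a future that the doctest examples below resolve with get(). A hedged sketch with explicit error handling -- the broker address is a placeholder, and the KafkaError import path and metadata attributes are assumed from the 1.0.x API rather than taken from this description:

    from kafka import KafkaProducer
    from kafka.common import KafkaError   # assumed location of the error base class

    producer = KafkaProducer(bootstrap_servers='localhost:9092')  # placeholder address
    future = producer.send('foobar', b'raw_bytes')                # returns immediately

    try:
        metadata = future.get(timeout=10)   # block until acknowledged or failed
        print(metadata.topic, metadata.partition, metadata.offset)
    except KafkaError:
        pass   # the future resolves with an exception if the send ultimately failed
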
+ + >>> from kafka import KafkaProducer + >>> producer = KafkaProducer(bootstrap_servers='localhost:1234') + >>> producer.send('foobar', b'some_message_bytes') + + >>> # Blocking send + >>> producer.send('foobar', b'another_message').get(timeout=60) + + >>> # Use a key for hashed-partitioning + >>> producer.send('foobar', key=b'foo', value=b'bar') + + >>> # Serialize json messages + >>> import json + >>> producer = KafkaProducer(value_serializer=json.loads) + >>> producer.send('fizzbuzz', {'foo': 'bar'}) + + >>> # Serialize string keys + >>> producer = KafkaProducer(key_serializer=str.encode) + >>> producer.send('flipflap', key='ping', value=b'1234') + + >>> # Compress messages + >>> producer = KafkaProducer(compression_type='gzip') + >>> for i in range(1000): + ... producer.send('foobar', b'msg %d' % i) + + Compression + *********** + + kafka-python supports gzip compression/decompression natively. To produce or + consume lz4 compressed messages, you must install lz4tools and xxhash (modules + may not work on python2.6). To enable snappy compression/decompression install + python-snappy (also requires snappy library). + See `Installation `_ + for more information. + + Protocol + ******** + + A secondary goal of kafka-python is to provide an easy-to-use protocol layer + for interacting with kafka brokers via the python repl. This is useful for + testing, probing, and general experimentation. The protocol support is + leveraged to enable a KafkaClient.check_version() method that + probes a kafka broker and attempts to identify which version it is running + (0.8.0 to 0.9). + + + Low-level + ********* + + Legacy support is maintained for low-level consumer and producer classes, + SimpleConsumer and SimpleProducer. See + `ReadTheDocs `_ for API details. 
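One caveat on the serializer snippets in the description above: value_deserializer=msgpack.dumps and value_serializer=json.loads have the dumps/loads roles reversed. A serializer turns an object into bytes and a deserializer turns bytes back into an object, so the conventional pairing looks more like the following sketch (broker address and topic name are placeholders):

    import json
    from kafka import KafkaConsumer, KafkaProducer

    producer = KafkaProducer(
        bootstrap_servers='localhost:9092',
        value_serializer=lambda v: json.dumps(v).encode('utf-8'))    # object -> bytes
    producer.send('fizzbuzz', {'foo': 'bar'})

    consumer = KafkaConsumer(
        'fizzbuzz',
        bootstrap_servers='localhost:9092',
        value_deserializer=lambda m: json.loads(m.decode('utf-8')))  # bytes -> object
    for msg in consumer:
        assert isinstance(msg.value, dict)
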
Keywords: apache kafka Platform: UNKNOWN @@ -18,4 +140,12 @@ Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: Apache Software License Classifier: Programming Language :: Python +Classifier: Programming Language :: Python :: 2 +Classifier: Programming Language :: Python :: 2.6 +Classifier: Programming Language :: Python :: 2.7 +Classifier: Programming Language :: Python :: 3 +Classifier: Programming Language :: Python :: 3.3 +Classifier: Programming Language :: Python :: 3.4 +Classifier: Programming Language :: Python :: 3.5 +Classifier: Programming Language :: Python :: Implementation :: PyPy Classifier: Topic :: Software Development :: Libraries :: Python Modules diff -Nru python-kafka-python-0.9.2/kafka_python.egg-info/requires.txt python-kafka-python-1.0.1/kafka_python.egg-info/requires.txt --- python-kafka-python-0.9.2/kafka_python.egg-info/requires.txt 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/kafka_python.egg-info/requires.txt 2016-02-19 18:18:12.000000000 +0000 @@ -0,0 +1 @@ +six \ No newline at end of file diff -Nru python-kafka-python-0.9.2/kafka_python.egg-info/SOURCES.txt python-kafka-python-1.0.1/kafka_python.egg-info/SOURCES.txt --- python-kafka-python-0.9.2/kafka_python.egg-info/SOURCES.txt 2014-08-27 21:25:44.000000000 +0000 +++ python-kafka-python-1.0.1/kafka_python.egg-info/SOURCES.txt 2016-02-19 18:18:13.000000000 +0000 @@ -1,31 +1,88 @@ +AUTHORS.md +CHANGES.md +LICENSE MANIFEST.in -VERSION +README.rst setup.py kafka/__init__.py kafka/client.py +kafka/client_async.py +kafka/cluster.py kafka/codec.py kafka/common.py kafka/conn.py -kafka/consumer.py -kafka/partitioner.py -kafka/producer.py -kafka/protocol.py -kafka/queue.py +kafka/context.py +kafka/future.py kafka/util.py +kafka/version.py +kafka/consumer/__init__.py +kafka/consumer/base.py +kafka/consumer/fetcher.py +kafka/consumer/group.py +kafka/consumer/multiprocess.py +kafka/consumer/simple.py +kafka/consumer/subscription_state.py +kafka/coordinator/__init__.py +kafka/coordinator/base.py +kafka/coordinator/consumer.py +kafka/coordinator/heartbeat.py +kafka/coordinator/protocol.py +kafka/coordinator/assignors/__init__.py +kafka/coordinator/assignors/abstract.py +kafka/coordinator/assignors/range.py +kafka/coordinator/assignors/roundrobin.py +kafka/partitioner/__init__.py +kafka/partitioner/base.py +kafka/partitioner/default.py +kafka/partitioner/hashed.py +kafka/partitioner/roundrobin.py +kafka/producer/__init__.py +kafka/producer/base.py +kafka/producer/buffer.py +kafka/producer/future.py +kafka/producer/kafka.py +kafka/producer/keyed.py +kafka/producer/record_accumulator.py +kafka/producer/sender.py +kafka/producer/simple.py +kafka/protocol/__init__.py +kafka/protocol/abstract.py +kafka/protocol/admin.py +kafka/protocol/api.py +kafka/protocol/commit.py +kafka/protocol/fetch.py +kafka/protocol/group.py +kafka/protocol/legacy.py +kafka/protocol/message.py +kafka/protocol/metadata.py +kafka/protocol/offset.py +kafka/protocol/pickle.py +kafka/protocol/produce.py +kafka/protocol/struct.py +kafka/protocol/types.py kafka_python.egg-info/PKG-INFO kafka_python.egg-info/SOURCES.txt kafka_python.egg-info/dependency_links.txt +kafka_python.egg-info/requires.txt kafka_python.egg-info/top_level.txt +test/test_assignors.py test/test_client.py +test/test_client_async.py test/test_client_integration.py test/test_codec.py test/test_conn.py test/test_consumer.py +test/test_consumer_group.py test/test_consumer_integration.py +test/test_context.py +test/test_coordinator.py 
test/test_failover_integration.py +test/test_fetcher.py test/test_package.py +test/test_partitioner.py test/test_producer.py test/test_producer_integration.py +test/test_producer_legacy.py test/test_protocol.py test/test_util.py test/testutil.py \ No newline at end of file diff -Nru python-kafka-python-0.9.2/LICENSE python-kafka-python-1.0.1/LICENSE --- python-kafka-python-0.9.2/LICENSE 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/LICENSE 2016-01-23 22:22:32.000000000 +0000 @@ -0,0 +1,202 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2015 David Arthur + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + diff -Nru python-kafka-python-0.9.2/MANIFEST.in python-kafka-python-1.0.1/MANIFEST.in --- python-kafka-python-0.9.2/MANIFEST.in 2014-08-27 21:24:39.000000000 +0000 +++ python-kafka-python-1.0.1/MANIFEST.in 2016-01-23 22:22:32.000000000 +0000 @@ -1,2 +1,5 @@ -include VERSION recursive-include kafka *.py +include README.rst +include LICENSE +include AUTHORS.md +include CHANGES.md diff -Nru python-kafka-python-0.9.2/PKG-INFO python-kafka-python-1.0.1/PKG-INFO --- python-kafka-python-0.9.2/PKG-INFO 2014-08-27 21:25:44.000000000 +0000 +++ python-kafka-python-1.0.1/PKG-INFO 2016-02-19 18:18:13.000000000 +0000 @@ -1,16 +1,138 @@ Metadata-Version: 1.1 Name: kafka-python -Version: 0.9.2 +Version: 1.0.1 Summary: Pure Python client for Apache Kafka -Home-page: https://github.com/mumrah/kafka-python -Author: David Arthur -Author-email: mumrah@gmail.com +Home-page: https://github.com/dpkp/kafka-python +Author: Dana Powers +Author-email: dana.powers@gmail.com License: Apache License 2.0 -Description: - This module provides low-level protocol support for Apache Kafka as well as - high-level consumer and producer classes. Request batching is supported by the - protocol as well as broker-aware request routing. Gzip and Snappy compression - is also supported for message sets. +Description: Kafka Python client + ------------------------ + + .. image:: https://img.shields.io/badge/kafka-0.9%2C%200.8.2%2C%200.8.1%2C%200.8-brightgreen.svg + :target: https://kafka-python.readthedocs.org/compatibility.html + .. image:: https://img.shields.io/pypi/pyversions/kafka-python.svg + :target: https://pypi.python.org/pypi/kafka-python + .. image:: https://coveralls.io/repos/dpkp/kafka-python/badge.svg?branch=master&service=github + :target: https://coveralls.io/github/dpkp/kafka-python?branch=master + .. image:: https://travis-ci.org/dpkp/kafka-python.svg?branch=master + :target: https://travis-ci.org/dpkp/kafka-python + .. image:: https://img.shields.io/badge/license-Apache%202-blue.svg + :target: https://github.com/dpkp/kafka-python/blob/master/LICENSE + + Python client for the Apache Kafka distributed stream processing system. + kafka-python is designed to function much like the official java client, with a + sprinkling of pythonic interfaces (e.g., consumer iterators). + + kafka-python is best used with 0.9 brokers, but is backwards-compatible with + older versions (to 0.8.0). 
Some features will only be enabled on newer brokers, + however; for example, fully coordinated consumer groups -- i.e., dynamic partition + assignment to multiple consumers in the same group -- requires use of 0.9 kafka + brokers. Supporting this feature for earlier broker releases would require + writing and maintaining custom leadership election and membership / health + check code (perhaps using zookeeper or consul). For older brokers, you can + achieve something similar by manually assigning different partitions to each + consumer instance with config management tools like chef, ansible, etc. This + approach will work fine, though it does not support rebalancing on failures. + See `Compatibility `_ + for more details. + + Please note that the master branch may contain unreleased features. For release + documentation, please see readthedocs and/or python's inline help. + + >>> pip install kafka-python + + KafkaConsumer + ************* + + KafkaConsumer is a high-level message consumer, intended to operate as similarly + as possible to the official 0.9 java client. Full support for coordinated + consumer groups requires use of kafka brokers that support the 0.9 Group APIs. + + See `ReadTheDocs `_ + for API and configuration details. + + The consumer iterator returns ConsumerRecords, which are simple namedtuples + that expose basic message attributes: topic, partition, offset, key, and value: + + >>> from kafka import KafkaConsumer + >>> consumer = KafkaConsumer('my_favorite_topic') + >>> for msg in consumer: + ... print (msg) + + >>> # manually assign the partition list for the consumer + >>> from kafka import TopicPartition + >>> consumer = KafkaConsumer(bootstrap_servers='localhost:1234') + >>> consumer.assign([TopicPartition('foobar', 2)]) + >>> msg = next(consumer) + + >>> # Deserialize msgpack-encoded values + >>> consumer = KafkaConsumer(value_deserializer=msgpack.dumps) + >>> consumer.subscribe(['msgpackfoo']) + >>> for msg in consumer: + ... msg = next(consumer) + ... assert isinstance(msg.value, dict) + + + KafkaProducer + ************* + + KafkaProducer is a high-level, asynchronous message producer. The class is + intended to operate as similarly as possible to the official java client. + See `ReadTheDocs `_ + for more details. + + >>> from kafka import KafkaProducer + >>> producer = KafkaProducer(bootstrap_servers='localhost:1234') + >>> producer.send('foobar', b'some_message_bytes') + + >>> # Blocking send + >>> producer.send('foobar', b'another_message').get(timeout=60) + + >>> # Use a key for hashed-partitioning + >>> producer.send('foobar', key=b'foo', value=b'bar') + + >>> # Serialize json messages + >>> import json + >>> producer = KafkaProducer(value_serializer=json.loads) + >>> producer.send('fizzbuzz', {'foo': 'bar'}) + + >>> # Serialize string keys + >>> producer = KafkaProducer(key_serializer=str.encode) + >>> producer.send('flipflap', key='ping', value=b'1234') + + >>> # Compress messages + >>> producer = KafkaProducer(compression_type='gzip') + >>> for i in range(1000): + ... producer.send('foobar', b'msg %d' % i) + + Compression + *********** + + kafka-python supports gzip compression/decompression natively. To produce or + consume lz4 compressed messages, you must install lz4tools and xxhash (modules + may not work on python2.6). To enable snappy compression/decompression install + python-snappy (also requires snappy library). + See `Installation `_ + for more information. 
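Because snappy and lz4 support depend on optional packages while gzip is handled natively, it can help to probe codec availability before configuring compression_type. A hedged sketch -- the has_gzip/has_snappy helper names are assumed from kafka.codec, and the broker address is a placeholder:

    from kafka import KafkaProducer
    from kafka.codec import has_gzip, has_snappy   # assumed helper names

    # Prefer snappy when python-snappy is installed, otherwise fall back to gzip.
    codec = 'snappy' if has_snappy() else ('gzip' if has_gzip() else None)
    producer = KafkaProducer(bootstrap_servers='localhost:9092',   # placeholder address
                             compression_type=codec)
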
+ + Protocol + ******** + + A secondary goal of kafka-python is to provide an easy-to-use protocol layer + for interacting with kafka brokers via the python repl. This is useful for + testing, probing, and general experimentation. The protocol support is + leveraged to enable a KafkaClient.check_version() method that + probes a kafka broker and attempts to identify which version it is running + (0.8.0 to 0.9). + + + Low-level + ********* + + Legacy support is maintained for low-level consumer and producer classes, + SimpleConsumer and SimpleProducer. See + `ReadTheDocs `_ for API details. Keywords: apache kafka Platform: UNKNOWN @@ -18,4 +140,12 @@ Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: Apache Software License Classifier: Programming Language :: Python +Classifier: Programming Language :: Python :: 2 +Classifier: Programming Language :: Python :: 2.6 +Classifier: Programming Language :: Python :: 2.7 +Classifier: Programming Language :: Python :: 3 +Classifier: Programming Language :: Python :: 3.3 +Classifier: Programming Language :: Python :: 3.4 +Classifier: Programming Language :: Python :: 3.5 +Classifier: Programming Language :: Python :: Implementation :: PyPy Classifier: Topic :: Software Development :: Libraries :: Python Modules diff -Nru python-kafka-python-0.9.2/README.rst python-kafka-python-1.0.1/README.rst --- python-kafka-python-0.9.2/README.rst 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/README.rst 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,127 @@ +Kafka Python client +------------------------ + +.. image:: https://img.shields.io/badge/kafka-0.9%2C%200.8.2%2C%200.8.1%2C%200.8-brightgreen.svg + :target: https://kafka-python.readthedocs.org/compatibility.html +.. image:: https://img.shields.io/pypi/pyversions/kafka-python.svg + :target: https://pypi.python.org/pypi/kafka-python +.. image:: https://coveralls.io/repos/dpkp/kafka-python/badge.svg?branch=master&service=github + :target: https://coveralls.io/github/dpkp/kafka-python?branch=master +.. image:: https://travis-ci.org/dpkp/kafka-python.svg?branch=master + :target: https://travis-ci.org/dpkp/kafka-python +.. image:: https://img.shields.io/badge/license-Apache%202-blue.svg + :target: https://github.com/dpkp/kafka-python/blob/master/LICENSE + +Python client for the Apache Kafka distributed stream processing system. +kafka-python is designed to function much like the official java client, with a +sprinkling of pythonic interfaces (e.g., consumer iterators). + +kafka-python is best used with 0.9 brokers, but is backwards-compatible with +older versions (to 0.8.0). Some features will only be enabled on newer brokers, +however; for example, fully coordinated consumer groups -- i.e., dynamic partition +assignment to multiple consumers in the same group -- requires use of 0.9 kafka +brokers. Supporting this feature for earlier broker releases would require +writing and maintaining custom leadership election and membership / health +check code (perhaps using zookeeper or consul). For older brokers, you can +achieve something similar by manually assigning different partitions to each +consumer instance with config management tools like chef, ansible, etc. This +approach will work fine, though it does not support rebalancing on failures. +See `Compatibility `_ +for more details. + +Please note that the master branch may contain unreleased features. For release +documentation, please see readthedocs and/or python's inline help. 
+ +>>> pip install kafka-python + +KafkaConsumer +************* + +KafkaConsumer is a high-level message consumer, intended to operate as similarly +as possible to the official 0.9 java client. Full support for coordinated +consumer groups requires use of kafka brokers that support the 0.9 Group APIs. + +See `ReadTheDocs `_ +for API and configuration details. + +The consumer iterator returns ConsumerRecords, which are simple namedtuples +that expose basic message attributes: topic, partition, offset, key, and value: + +>>> from kafka import KafkaConsumer +>>> consumer = KafkaConsumer('my_favorite_topic') +>>> for msg in consumer: +... print (msg) + +>>> # manually assign the partition list for the consumer +>>> from kafka import TopicPartition +>>> consumer = KafkaConsumer(bootstrap_servers='localhost:1234') +>>> consumer.assign([TopicPartition('foobar', 2)]) +>>> msg = next(consumer) + +>>> # Deserialize msgpack-encoded values +>>> consumer = KafkaConsumer(value_deserializer=msgpack.dumps) +>>> consumer.subscribe(['msgpackfoo']) +>>> for msg in consumer: +... msg = next(consumer) +... assert isinstance(msg.value, dict) + + +KafkaProducer +************* + +KafkaProducer is a high-level, asynchronous message producer. The class is +intended to operate as similarly as possible to the official java client. +See `ReadTheDocs `_ +for more details. + +>>> from kafka import KafkaProducer +>>> producer = KafkaProducer(bootstrap_servers='localhost:1234') +>>> producer.send('foobar', b'some_message_bytes') + +>>> # Blocking send +>>> producer.send('foobar', b'another_message').get(timeout=60) + +>>> # Use a key for hashed-partitioning +>>> producer.send('foobar', key=b'foo', value=b'bar') + +>>> # Serialize json messages +>>> import json +>>> producer = KafkaProducer(value_serializer=json.loads) +>>> producer.send('fizzbuzz', {'foo': 'bar'}) + +>>> # Serialize string keys +>>> producer = KafkaProducer(key_serializer=str.encode) +>>> producer.send('flipflap', key='ping', value=b'1234') + +>>> # Compress messages +>>> producer = KafkaProducer(compression_type='gzip') +>>> for i in range(1000): +... producer.send('foobar', b'msg %d' % i) + +Compression +*********** + +kafka-python supports gzip compression/decompression natively. To produce or +consume lz4 compressed messages, you must install lz4tools and xxhash (modules +may not work on python2.6). To enable snappy compression/decompression install +python-snappy (also requires snappy library). +See `Installation `_ +for more information. + +Protocol +******** + +A secondary goal of kafka-python is to provide an easy-to-use protocol layer +for interacting with kafka brokers via the python repl. This is useful for +testing, probing, and general experimentation. The protocol support is +leveraged to enable a KafkaClient.check_version() method that +probes a kafka broker and attempts to identify which version it is running +(0.8.0 to 0.9). + + +Low-level +********* + +Legacy support is maintained for low-level consumer and producer classes, +SimpleConsumer and SimpleProducer. See +`ReadTheDocs `_ for API details. 
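As a concrete illustration of the Protocol section above, the broker-version probe can be driven from the repl. A hedged sketch -- the broker address is a placeholder, the import path follows the test suite, and the exact return format of check_version() may differ:

    from kafka.client_async import KafkaClient

    client = KafkaClient(bootstrap_servers='localhost:9092')  # placeholder address
    print(client.check_version())   # probes the broker, e.g. something like (0, 9)
    client.close()
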
diff -Nru python-kafka-python-0.9.2/setup.py python-kafka-python-1.0.1/setup.py --- python-kafka-python-0.9.2/setup.py 2014-08-27 21:24:39.000000000 +0000 +++ python-kafka-python-1.0.1/setup.py 2016-02-17 18:37:58.000000000 +0000 @@ -1,10 +1,10 @@ import sys +import os +from setuptools import setup, Command, find_packages -from setuptools import setup, Command - -with open('VERSION', 'r') as v: - __version__ = v.read().rstrip() - +# Pull version from source without importing +# since we can't import something we haven't built yet :) +exec(open('kafka/version.py').read()) class Tox(Command): @@ -22,32 +22,43 @@ sys.exit(tox.cmdline([])) +test_require = ['tox', 'mock'] +if sys.version_info < (2, 7): + test_require.append('unittest2') + +here = os.path.abspath(os.path.dirname(__file__)) + +with open(os.path.join(here, 'README.rst')) as f: + README = f.read() + setup( name="kafka-python", version=__version__, - tests_require=["tox", "mock", "unittest2"], + tests_require=test_require, cmdclass={"test": Tox}, - - packages=["kafka"], - - author="David Arthur", - author_email="mumrah@gmail.com", - url="https://github.com/mumrah/kafka-python", + packages=find_packages(exclude=['test']), + author="Dana Powers", + author_email="dana.powers@gmail.com", + url="https://github.com/dpkp/kafka-python", license="Apache License 2.0", description="Pure Python client for Apache Kafka", - long_description=""" -This module provides low-level protocol support for Apache Kafka as well as -high-level consumer and producer classes. Request batching is supported by the -protocol as well as broker-aware request routing. Gzip and Snappy compression -is also supported for message sets. -""", + long_description=README, keywords="apache kafka", - classifiers = [ + install_requires=['six'], + classifiers=[ "Development Status :: 4 - Beta", "Intended Audience :: Developers", "License :: OSI Approved :: Apache Software License", "Programming Language :: Python", - "Topic :: Software Development :: Libraries :: Python Modules" + "Programming Language :: Python :: 2", + "Programming Language :: Python :: 2.6", + "Programming Language :: Python :: 2.7", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.3", + "Programming Language :: Python :: 3.4", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: Implementation :: PyPy", + "Topic :: Software Development :: Libraries :: Python Modules", ] ) diff -Nru python-kafka-python-0.9.2/test/test_assignors.py python-kafka-python-1.0.1/test/test_assignors.py --- python-kafka-python-0.9.2/test/test_assignors.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/test/test_assignors.py 2016-02-18 16:38:17.000000000 +0000 @@ -0,0 +1,58 @@ +# pylint: skip-file +from __future__ import absolute_import + +import pytest + +from kafka.coordinator.assignors.range import RangePartitionAssignor +from kafka.coordinator.assignors.roundrobin import RoundRobinPartitionAssignor +from kafka.coordinator.protocol import ( + ConsumerProtocolMemberMetadata, ConsumerProtocolMemberAssignment) + + +@pytest.fixture +def cluster(mocker): + cluster = mocker.MagicMock() + cluster.partitions_for_topic.return_value = set([0, 1, 2]) + return cluster + + +def test_assignor_roundrobin(cluster): + assignor = RoundRobinPartitionAssignor + + member_metadata = { + 'C0': assignor.metadata(set(['t0', 't1'])), + 'C1': assignor.metadata(set(['t0', 't1'])), + } + + ret = assignor.assign(cluster, member_metadata) + expected = { + 'C0': 
ConsumerProtocolMemberAssignment( + assignor.version, [('t0', [0, 2]), ('t1', [1])], b''), + 'C1': ConsumerProtocolMemberAssignment( + assignor.version, [('t0', [1]), ('t1', [0, 2])], b'') + } + assert ret == expected + assert set(ret) == set(expected) + for member in ret: + assert ret[member].encode() == expected[member].encode() + + +def test_assignor_range(cluster): + assignor = RangePartitionAssignor + + member_metadata = { + 'C0': assignor.metadata(set(['t0', 't1'])), + 'C1': assignor.metadata(set(['t0', 't1'])), + } + + ret = assignor.assign(cluster, member_metadata) + expected = { + 'C0': ConsumerProtocolMemberAssignment( + assignor.version, [('t0', [0, 1]), ('t1', [0, 1])], b''), + 'C1': ConsumerProtocolMemberAssignment( + assignor.version, [('t0', [2]), ('t1', [2])], b'') + } + assert ret == expected + assert set(ret) == set(expected) + for member in ret: + assert ret[member].encode() == expected[member].encode() diff -Nru python-kafka-python-0.9.2/test/test_client_async.py python-kafka-python-1.0.1/test/test_client_async.py --- python-kafka-python-0.9.2/test/test_client_async.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/test/test_client_async.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,301 @@ +import time + +import pytest + +from kafka.client_async import KafkaClient +from kafka.common import BrokerMetadata +import kafka.common as Errors +from kafka.conn import ConnectionStates +from kafka.future import Future +from kafka.protocol.metadata import MetadataResponse, MetadataRequest +from kafka.protocol.produce import ProduceRequest + + +@pytest.mark.parametrize("bootstrap,expected_hosts", [ + (None, [('localhost', 9092)]), + ('foobar:1234', [('foobar', 1234)]), + ('fizzbuzz', [('fizzbuzz', 9092)]), + ('foo:12,bar:34', [('foo', 12), ('bar', 34)]), + (['fizz:56', 'buzz'], [('fizz', 56), ('buzz', 9092)]), +]) +def test_bootstrap_servers(mocker, bootstrap, expected_hosts): + mocker.patch.object(KafkaClient, '_bootstrap') + if bootstrap is None: + KafkaClient() + else: + KafkaClient(bootstrap_servers=bootstrap) + + # host order is randomized internally, so resort before testing + (hosts,), _ = KafkaClient._bootstrap.call_args # pylint: disable=no-member + assert sorted(hosts) == sorted(expected_hosts) + + +@pytest.fixture +def conn(mocker): + conn = mocker.patch('kafka.client_async.BrokerConnection') + conn.return_value = conn + conn.state = ConnectionStates.CONNECTED + conn.send.return_value = Future().success( + MetadataResponse( + [(0, 'foo', 12), (1, 'bar', 34)], # brokers + [])) # topics + conn.blacked_out.return_value = False + conn.connect.return_value = conn.state + return conn + + +def test_bootstrap_success(conn): + conn.state = ConnectionStates.CONNECTED + cli = KafkaClient() + conn.assert_called_once_with('localhost', 9092, **cli.config) + conn.connect.assert_called_with() + conn.send.assert_called_once_with(MetadataRequest([])) + assert cli._bootstrap_fails == 0 + assert cli.cluster.brokers() == set([BrokerMetadata(0, 'foo', 12), + BrokerMetadata(1, 'bar', 34)]) + +def test_bootstrap_failure(conn): + conn.state = ConnectionStates.DISCONNECTED + cli = KafkaClient() + conn.assert_called_once_with('localhost', 9092, **cli.config) + conn.connect.assert_called_with() + conn.close.assert_called_with() + assert cli._bootstrap_fails == 1 + assert cli.cluster.brokers() == set() + + +def test_can_connect(conn): + cli = KafkaClient() + + # Node is not in broker metadata - cant connect + assert not cli._can_connect(2) + + # Node is in broker metadata but 
not in _conns + assert 0 not in cli._conns + assert cli._can_connect(0) + + # Node is connected, can't reconnect + cli._initiate_connect(0) + assert not cli._can_connect(0) + + # Node is disconnected, can connect + cli._conns[0].state = ConnectionStates.DISCONNECTED + assert cli._can_connect(0) + + # Node is disconnected, but blacked out + conn.blacked_out.return_value = True + assert not cli._can_connect(0) + +def test_initiate_connect(conn): + cli = KafkaClient() + try: + # Node not in metadata, raises AssertionError + cli._initiate_connect(2) + except AssertionError: + pass + else: + assert False, 'Exception not raised' + + assert 0 not in cli._conns + state = cli._initiate_connect(0) + assert cli._conns[0] is conn + assert state is conn.state + + +def test_finish_connect(conn): + cli = KafkaClient() + try: + # Node not in metadata, raises AssertionError + cli._initiate_connect(2) + except AssertionError: + pass + else: + assert False, 'Exception not raised' + + assert 0 not in cli._conns + cli._initiate_connect(0) + + conn.connect.return_value = ConnectionStates.CONNECTING + state = cli._finish_connect(0) + assert 0 in cli._connecting + assert state is ConnectionStates.CONNECTING + + conn.connect.return_value = ConnectionStates.CONNECTED + state = cli._finish_connect(0) + assert 0 not in cli._connecting + assert state is ConnectionStates.CONNECTED + + # Failure to connect should trigger metadata update + assert not cli.cluster._need_update + cli._connecting.add(0) + conn.connect.return_value = ConnectionStates.DISCONNECTED + state = cli._finish_connect(0) + assert 0 not in cli._connecting + assert state is ConnectionStates.DISCONNECTED + assert cli.cluster._need_update + + +def test_ready(conn): + cli = KafkaClient() + + # Node not in metadata + assert not cli.ready(2) + + # Node in metadata will connect + assert 0 not in cli._conns + assert cli.ready(0) + assert 0 in cli._conns + assert cli._conns[0].state is ConnectionStates.CONNECTED + + # metadata refresh blocks ready nodes + assert cli.ready(0) + assert cli.ready(1) + cli._metadata_refresh_in_progress = True + assert not cli.ready(0) + assert not cli.ready(1) + + # requesting metadata update also blocks ready nodes + cli._metadata_refresh_in_progress = False + assert cli.ready(0) + assert cli.ready(1) + cli.cluster.request_update() + cli.cluster.config['retry_backoff_ms'] = 0 + assert not cli._metadata_refresh_in_progress + assert not cli.ready(0) + assert not cli.ready(1) + cli.cluster._need_update = False + + # if connection can't send more, not ready + assert cli.ready(0) + assert cli.ready(1) + conn.can_send_more.return_value = False + assert not cli.ready(0) + conn.can_send_more.return_value = True + + # disconnected nodes, not ready + assert cli.ready(0) + assert cli.ready(1) + conn.connected.return_value = False + assert not cli.ready(0) + conn.connected.return_value = True + + # connecting node connects + cli._connecting.add(0) + conn.connected.return_value = False + cli.ready(0) + assert 0 not in cli._connecting + assert cli._conns[0].connect.called_with() + + +def test_close(conn): + cli = KafkaClient() + + # Unknown node - silent + cli.close(2) + + # Single node close + cli._initiate_connect(0) + assert not conn.close.call_count + cli.close(0) + assert conn.close.call_count == 1 + + # All node close + cli._initiate_connect(1) + cli.close() + assert conn.close.call_count == 3 + + +def test_is_disconnected(conn): + cli = KafkaClient() + + # False if not connected yet + conn.state = ConnectionStates.DISCONNECTED + assert 
not cli.is_disconnected(0) + + cli._initiate_connect(0) + assert cli.is_disconnected(0) + + conn.state = ConnectionStates.CONNECTING + assert not cli.is_disconnected(0) + + conn.state = ConnectionStates.CONNECTED + assert not cli.is_disconnected(0) + + +def test_send(conn): + cli = KafkaClient() + try: + cli.send(2, None) + except Errors.NodeNotReadyError: + pass + else: + assert False, 'NodeNotReadyError not raised' + + cli._initiate_connect(0) + # ProduceRequest w/ 0 required_acks -> no response + request = ProduceRequest(0, 0, []) + ret = cli.send(0, request) + assert conn.send.called_with(request, expect_response=False) + assert isinstance(ret, Future) + + request = MetadataRequest([]) + cli.send(0, request) + assert conn.send.called_with(request, expect_response=True) + + +def test_poll(mocker): + mocker.patch.object(KafkaClient, '_bootstrap') + metadata = mocker.patch.object(KafkaClient, '_maybe_refresh_metadata') + _poll = mocker.patch.object(KafkaClient, '_poll') + cli = KafkaClient() + tasks = mocker.patch.object(cli._delayed_tasks, 'next_at') + + # metadata timeout wins + metadata.return_value = 1000 + tasks.return_value = 2 + cli.poll() + _poll.assert_called_with(1.0, sleep=False) + + # user timeout wins + cli.poll(250) + _poll.assert_called_with(0.25, sleep=False) + + # tasks timeout wins + tasks.return_value = 0 + cli.poll(250) + _poll.assert_called_with(0, sleep=False) + + # default is request_timeout_ms + metadata.return_value = 1000000 + tasks.return_value = 10000 + cli.poll() + _poll.assert_called_with(cli.config['request_timeout_ms'] / 1000.0, + sleep=False) + + +def test__poll(): + pass + + +def test_in_flight_request_count(): + pass + + +def test_least_loaded_node(): + pass + + +def test_set_topics(): + pass + + +def test_maybe_refresh_metadata(): + pass + + +def test_schedule(): + pass + + +def test_unschedule(): + pass diff -Nru python-kafka-python-0.9.2/test/test_client_integration.py python-kafka-python-1.0.1/test/test_client_integration.py --- python-kafka-python-0.9.2/test/test_client_integration.py 2014-08-27 21:24:39.000000000 +0000 +++ python-kafka-python-1.0.1/test/test_client_integration.py 2016-02-17 18:37:58.000000000 +0000 @@ -1,11 +1,14 @@ import os -import socket -import unittest2 -import kafka -from kafka.common import * +from kafka.common import ( + FetchRequestPayload, OffsetCommitRequestPayload, OffsetFetchRequestPayload, + KafkaTimeoutError, ProduceRequestPayload +) +from kafka.protocol import create_message + from test.fixtures import ZookeeperFixture, KafkaFixture -from test.testutil import * +from test.testutil import KafkaIntegrationTestCase, kafka_versions + class TestKafkaClientIntegration(KafkaIntegrationTestCase): @classmethod @@ -24,30 +27,17 @@ cls.server.close() cls.zk.close() - @unittest2.skip("This doesn't appear to work on Linux?") - def test_timeout(self): - server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - server_port = get_open_port() - server_socket.bind(('localhost', server_port)) - - with Timer() as t: - with self.assertRaises((socket.timeout, socket.error)): - kafka.conn.KafkaConnection("localhost", server_port, 1.0) - self.assertGreaterEqual(t.interval, 1.0) - - @kafka_versions("all") def test_consume_none(self): - fetch = FetchRequest(self.topic, 0, 0, 1024) + fetch = FetchRequestPayload(self.topic, 0, 0, 1024) fetch_resp, = self.client.send_fetch_request([fetch]) - self.assertEquals(fetch_resp.error, 0) - self.assertEquals(fetch_resp.topic, self.topic) - self.assertEquals(fetch_resp.partition, 0) + 
self.assertEqual(fetch_resp.error, 0) + self.assertEqual(fetch_resp.topic, self.topic) + self.assertEqual(fetch_resp.partition, 0) messages = list(fetch_resp.messages) - self.assertEquals(len(messages), 0) + self.assertEqual(len(messages), 0) - @kafka_versions("all") def test_ensure_topic_exists(self): # assume that self.topic was created by setUp @@ -56,20 +46,48 @@ # ensure_topic_exists should fail with KafkaTimeoutError with self.assertRaises(KafkaTimeoutError): - self.client.ensure_topic_exists("this_topic_doesnt_exist", timeout=0) + self.client.ensure_topic_exists('this_topic_doesnt_exist', timeout=0) + + def test_send_produce_request_maintains_request_response_order(self): + + self.client.ensure_topic_exists('foo') + self.client.ensure_topic_exists('bar') + + requests = [ + ProduceRequestPayload( + 'foo', 0, + [create_message(b'a'), create_message(b'b')]), + ProduceRequestPayload( + 'bar', 1, + [create_message(b'a'), create_message(b'b')]), + ProduceRequestPayload( + 'foo', 1, + [create_message(b'a'), create_message(b'b')]), + ProduceRequestPayload( + 'bar', 0, + [create_message(b'a'), create_message(b'b')]), + ] + + responses = self.client.send_produce_request(requests) + while len(responses): + request = requests.pop() + response = responses.pop() + self.assertEqual(request.topic, response.topic) + self.assertEqual(request.partition, response.partition) + #################### # Offset Tests # #################### - @kafka_versions("0.8.1", "0.8.1.1") + @kafka_versions('>=0.8.1') def test_commit_fetch_offsets(self): - req = OffsetCommitRequest(self.topic, 0, 42, "metadata") - (resp,) = self.client.send_offset_commit_request("group", [req]) - self.assertEquals(resp.error, 0) - - req = OffsetFetchRequest(self.topic, 0) - (resp,) = self.client.send_offset_fetch_request("group", [req]) - self.assertEquals(resp.error, 0) - self.assertEquals(resp.offset, 42) - self.assertEquals(resp.metadata, "") # Metadata isn't stored for now + req = OffsetCommitRequestPayload(self.topic, 0, 42, 'metadata') + (resp,) = self.client.send_offset_commit_request('group', [req]) + self.assertEqual(resp.error, 0) + + req = OffsetFetchRequestPayload(self.topic, 0) + (resp,) = self.client.send_offset_fetch_request('group', [req]) + self.assertEqual(resp.error, 0) + self.assertEqual(resp.offset, 42) + self.assertEqual(resp.metadata, '') # Metadata isn't stored for now diff -Nru python-kafka-python-0.9.2/test/test_client.py python-kafka-python-1.0.1/test/test_client.py --- python-kafka-python-0.9.2/test/test_client.py 2014-08-27 21:24:39.000000000 +0000 +++ python-kafka-python-1.0.1/test/test_client.py 2016-02-17 18:37:58.000000000 +0000 @@ -1,244 +1,411 @@ -import unittest2 +import socket +from time import sleep -from mock import MagicMock, patch +from mock import ANY, MagicMock, patch +import six +from . 
import unittest -from kafka import KafkaClient +from kafka import SimpleClient from kafka.common import ( - ProduceRequest, BrokerMetadata, PartitionMetadata, - TopicAndPartition, KafkaUnavailableError, - LeaderUnavailableError, PartitionUnavailableError + ProduceRequestPayload, + BrokerMetadata, + TopicPartition, KafkaUnavailableError, + LeaderNotAvailableError, UnknownTopicOrPartitionError, + KafkaTimeoutError, ConnectionError ) -from kafka.protocol import create_message +from kafka.conn import KafkaConnection +from kafka.future import Future +from kafka.protocol import KafkaProtocol, create_message +from kafka.protocol.metadata import MetadataResponse -class TestKafkaClient(unittest2.TestCase): +from test.testutil import Timer + +NO_ERROR = 0 +UNKNOWN_TOPIC_OR_PARTITION = 3 +NO_LEADER = 5 + + +def mock_conn(conn, success=True): + mocked = MagicMock() + mocked.connected.return_value = True + if success: + mocked.send.return_value = Future().success(True) + else: + mocked.send.return_value = Future().failure(Exception()) + conn.return_value = mocked + + +class TestSimpleClient(unittest.TestCase): def test_init_with_list(self): - with patch.object(KafkaClient, 'load_metadata_for_topics'): - client = KafkaClient(hosts=['kafka01:9092', 'kafka02:9092', 'kafka03:9092']) + with patch.object(SimpleClient, 'load_metadata_for_topics'): + client = SimpleClient(hosts=['kafka01:9092', 'kafka02:9092', 'kafka03:9092']) - self.assertItemsEqual( - [('kafka01', 9092), ('kafka02', 9092), ('kafka03', 9092)], - client.hosts) + self.assertEqual( + sorted([('kafka01', 9092), ('kafka02', 9092), ('kafka03', 9092)]), + sorted(client.hosts)) def test_init_with_csv(self): - with patch.object(KafkaClient, 'load_metadata_for_topics'): - client = KafkaClient(hosts='kafka01:9092,kafka02:9092,kafka03:9092') + with patch.object(SimpleClient, 'load_metadata_for_topics'): + client = SimpleClient(hosts='kafka01:9092,kafka02:9092,kafka03:9092') - self.assertItemsEqual( - [('kafka01', 9092), ('kafka02', 9092), ('kafka03', 9092)], - client.hosts) + self.assertEqual( + sorted([('kafka01', 9092), ('kafka02', 9092), ('kafka03', 9092)]), + sorted(client.hosts)) def test_init_with_unicode_csv(self): - with patch.object(KafkaClient, 'load_metadata_for_topics'): - client = KafkaClient(hosts=u'kafka01:9092,kafka02:9092,kafka03:9092') - - self.assertItemsEqual( - [('kafka01', 9092), ('kafka02', 9092), ('kafka03', 9092)], - client.hosts) - - def test_send_broker_unaware_request_fail(self): - 'Tests that call fails when all hosts are unavailable' + with patch.object(SimpleClient, 'load_metadata_for_topics'): + client = SimpleClient(hosts=u'kafka01:9092,kafka02:9092,kafka03:9092') + self.assertEqual( + sorted([('kafka01', 9092), ('kafka02', 9092), ('kafka03', 9092)]), + sorted(client.hosts)) + + @patch.object(SimpleClient, '_get_conn') + @patch.object(SimpleClient, 'load_metadata_for_topics') + def test_send_broker_unaware_request_fail(self, load_metadata, conn): mocked_conns = { ('kafka01', 9092): MagicMock(), ('kafka02', 9092): MagicMock() } - - # inject KafkaConnection side effects - mocked_conns[('kafka01', 9092)].send.side_effect = RuntimeError("kafka01 went away (unittest)") - mocked_conns[('kafka02', 9092)].send.side_effect = RuntimeError("Kafka02 went away (unittest)") + for val in mocked_conns.values(): + mock_conn(val, success=False) def mock_get_conn(host, port): return mocked_conns[(host, port)] + conn.side_effect = mock_get_conn - # patch to avoid making requests before we want it - with patch.object(KafkaClient, 
'load_metadata_for_topics'): - with patch.object(KafkaClient, '_get_conn', side_effect=mock_get_conn): - client = KafkaClient(hosts=['kafka01:9092', 'kafka02:9092']) + client = SimpleClient(hosts=['kafka01:9092', 'kafka02:9092']) - with self.assertRaises(KafkaUnavailableError): - client._send_broker_unaware_request(1, 'fake request') + req = KafkaProtocol.encode_metadata_request() + with self.assertRaises(KafkaUnavailableError): + client._send_broker_unaware_request(payloads=['fake request'], + encoder_fn=MagicMock(return_value='fake encoded message'), + decoder_fn=lambda x: x) - for key, conn in mocked_conns.iteritems(): - conn.send.assert_called_with(1, 'fake request') + for key, conn in six.iteritems(mocked_conns): + conn.send.assert_called_with('fake encoded message') def test_send_broker_unaware_request(self): - 'Tests that call works when at least one of the host is available' - mocked_conns = { ('kafka01', 9092): MagicMock(), ('kafka02', 9092): MagicMock(), ('kafka03', 9092): MagicMock() } # inject KafkaConnection side effects - mocked_conns[('kafka01', 9092)].send.side_effect = RuntimeError("kafka01 went away (unittest)") - mocked_conns[('kafka02', 9092)].recv.return_value = 'valid response' - mocked_conns[('kafka03', 9092)].send.side_effect = RuntimeError("kafka03 went away (unittest)") + mock_conn(mocked_conns[('kafka01', 9092)], success=False) + mock_conn(mocked_conns[('kafka03', 9092)], success=False) + future = Future() + mocked_conns[('kafka02', 9092)].send.return_value = future + mocked_conns[('kafka02', 9092)].recv.side_effect = lambda: future.success('valid response') def mock_get_conn(host, port): return mocked_conns[(host, port)] # patch to avoid making requests before we want it - with patch.object(KafkaClient, 'load_metadata_for_topics'): - with patch.object(KafkaClient, '_get_conn', side_effect=mock_get_conn): - client = KafkaClient(hosts='kafka01:9092,kafka02:9092') + with patch.object(SimpleClient, 'load_metadata_for_topics'): + with patch.object(SimpleClient, '_get_conn', side_effect=mock_get_conn): - resp = client._send_broker_unaware_request(1, 'fake request') + client = SimpleClient(hosts='kafka01:9092,kafka02:9092') + resp = client._send_broker_unaware_request(payloads=['fake request'], + encoder_fn=MagicMock(), + decoder_fn=lambda x: x) self.assertEqual('valid response', resp) - mocked_conns[('kafka02', 9092)].recv.assert_called_with(1) + mocked_conns[('kafka02', 9092)].recv.assert_called_once_with() - @patch('kafka.client.KafkaConnection') + @patch('kafka.SimpleClient._get_conn') @patch('kafka.client.KafkaProtocol') def test_load_metadata(self, protocol, conn): - "Load metadata for all topics" - conn.recv.return_value = 'response' # anything but None + mock_conn(conn) - brokers = {} - brokers[0] = BrokerMetadata(1, 'broker_1', 4567) - brokers[1] = BrokerMetadata(2, 'broker_2', 5678) - - topics = {} - topics['topic_1'] = { - 0: PartitionMetadata('topic_1', 0, 1, [1, 2], [1, 2]) - } - topics['topic_noleader'] = { - 0: PartitionMetadata('topic_noleader', 0, -1, [], []), - 1: PartitionMetadata('topic_noleader', 1, -1, [], []) - } - topics['topic_no_partitions'] = {} - topics['topic_3'] = { - 0: PartitionMetadata('topic_3', 0, 0, [0, 1], [0, 1]), - 1: PartitionMetadata('topic_3', 1, 1, [1, 0], [1, 0]), - 2: PartitionMetadata('topic_3', 2, 0, [0, 1], [0, 1]) - } - protocol.decode_metadata_response.return_value = (brokers, topics) + brokers = [ + BrokerMetadata(0, 'broker_1', 4567), + BrokerMetadata(1, 'broker_2', 5678) + ] + + topics = [ + (NO_ERROR, 'topic_1', [ + 
(NO_ERROR, 0, 1, [1, 2], [1, 2]) + ]), + (NO_ERROR, 'topic_noleader', [ + (NO_LEADER, 0, -1, [], []), + (NO_LEADER, 1, -1, [], []), + ]), + (NO_LEADER, 'topic_no_partitions', []), + (UNKNOWN_TOPIC_OR_PARTITION, 'topic_unknown', []), + (NO_ERROR, 'topic_3', [ + (NO_ERROR, 0, 0, [0, 1], [0, 1]), + (NO_ERROR, 1, 1, [1, 0], [1, 0]), + (NO_ERROR, 2, 0, [0, 1], [0, 1]) + ]) + ] + protocol.decode_metadata_response.return_value = MetadataResponse(brokers, topics) # client loads metadata at init - client = KafkaClient(hosts=['broker_1:4567']) + client = SimpleClient(hosts=['broker_1:4567']) self.assertDictEqual({ - TopicAndPartition('topic_1', 0): brokers[1], - TopicAndPartition('topic_noleader', 0): None, - TopicAndPartition('topic_noleader', 1): None, - TopicAndPartition('topic_3', 0): brokers[0], - TopicAndPartition('topic_3', 1): brokers[1], - TopicAndPartition('topic_3', 2): brokers[0]}, + TopicPartition('topic_1', 0): brokers[1], + TopicPartition('topic_noleader', 0): None, + TopicPartition('topic_noleader', 1): None, + TopicPartition('topic_3', 0): brokers[0], + TopicPartition('topic_3', 1): brokers[1], + TopicPartition('topic_3', 2): brokers[0]}, client.topics_to_brokers) - @patch('kafka.client.KafkaConnection') + # if we ask for metadata explicitly, it should raise errors + with self.assertRaises(LeaderNotAvailableError): + client.load_metadata_for_topics('topic_no_partitions') + + with self.assertRaises(UnknownTopicOrPartitionError): + client.load_metadata_for_topics('topic_unknown') + + # This should not raise + client.load_metadata_for_topics('topic_no_leader') + + @patch('kafka.SimpleClient._get_conn') + @patch('kafka.client.KafkaProtocol') + def test_has_metadata_for_topic(self, protocol, conn): + + mock_conn(conn) + + brokers = [ + BrokerMetadata(0, 'broker_1', 4567), + BrokerMetadata(1, 'broker_2', 5678) + ] + + topics = [ + (NO_LEADER, 'topic_still_creating', []), + (UNKNOWN_TOPIC_OR_PARTITION, 'topic_doesnt_exist', []), + (NO_ERROR, 'topic_noleaders', [ + (NO_LEADER, 0, -1, [], []), + (NO_LEADER, 1, -1, [], []), + ]), + ] + protocol.decode_metadata_response.return_value = MetadataResponse(brokers, topics) + + client = SimpleClient(hosts=['broker_1:4567']) + + # Topics with no partitions return False + self.assertFalse(client.has_metadata_for_topic('topic_still_creating')) + self.assertFalse(client.has_metadata_for_topic('topic_doesnt_exist')) + + # Topic with partition metadata, but no leaders return True + self.assertTrue(client.has_metadata_for_topic('topic_noleaders')) + + @patch('kafka.SimpleClient._get_conn') + @patch('kafka.client.KafkaProtocol.decode_metadata_response') + def test_ensure_topic_exists(self, decode_metadata_response, conn): + + mock_conn(conn) + + brokers = [ + BrokerMetadata(0, 'broker_1', 4567), + BrokerMetadata(1, 'broker_2', 5678) + ] + + topics = [ + (NO_LEADER, 'topic_still_creating', []), + (UNKNOWN_TOPIC_OR_PARTITION, 'topic_doesnt_exist', []), + (NO_ERROR, 'topic_noleaders', [ + (NO_LEADER, 0, -1, [], []), + (NO_LEADER, 1, -1, [], []), + ]), + ] + decode_metadata_response.return_value = MetadataResponse(brokers, topics) + + client = SimpleClient(hosts=['broker_1:4567']) + + with self.assertRaises(UnknownTopicOrPartitionError): + client.ensure_topic_exists('topic_doesnt_exist', timeout=1) + + with self.assertRaises(KafkaTimeoutError): + client.ensure_topic_exists('topic_still_creating', timeout=1) + + # This should not raise + client.ensure_topic_exists('topic_noleaders', timeout=1) + + @patch('kafka.SimpleClient._get_conn') 
@patch('kafka.client.KafkaProtocol') def test_get_leader_for_partitions_reloads_metadata(self, protocol, conn): "Get leader for partitions reload metadata if it is not available" - conn.recv.return_value = 'response' # anything but None - - brokers = {} - brokers[0] = BrokerMetadata(0, 'broker_1', 4567) - brokers[1] = BrokerMetadata(1, 'broker_2', 5678) + mock_conn(conn) - topics = {'topic_no_partitions': {}} - protocol.decode_metadata_response.return_value = (brokers, topics) + brokers = [ + BrokerMetadata(0, 'broker_1', 4567), + BrokerMetadata(1, 'broker_2', 5678) + ] + + topics = [ + (NO_LEADER, 'topic_no_partitions', []) + ] + protocol.decode_metadata_response.return_value = MetadataResponse(brokers, topics) - client = KafkaClient(hosts=['broker_1:4567']) + client = SimpleClient(hosts=['broker_1:4567']) # topic metadata is loaded but empty self.assertDictEqual({}, client.topics_to_brokers) - topics['topic_no_partitions'] = { - 0: PartitionMetadata('topic_no_partitions', 0, 0, [0, 1], [0, 1]) - } - protocol.decode_metadata_response.return_value = (brokers, topics) + topics = [ + (NO_ERROR, 'topic_one_partition', [ + (NO_ERROR, 0, 0, [0, 1], [0, 1]) + ]) + ] + protocol.decode_metadata_response.return_value = MetadataResponse(brokers, topics) # calling _get_leader_for_partition (from any broker aware request) # will try loading metadata again for the same topic - leader = client._get_leader_for_partition('topic_no_partitions', 0) + leader = client._get_leader_for_partition('topic_one_partition', 0) self.assertEqual(brokers[0], leader) self.assertDictEqual({ - TopicAndPartition('topic_no_partitions', 0): brokers[0]}, + TopicPartition('topic_one_partition', 0): brokers[0]}, client.topics_to_brokers) - @patch('kafka.client.KafkaConnection') + @patch('kafka.SimpleClient._get_conn') @patch('kafka.client.KafkaProtocol') def test_get_leader_for_unassigned_partitions(self, protocol, conn): - "Get leader raises if no partitions is defined for a topic" - - conn.recv.return_value = 'response' # anything but None - brokers = {} - brokers[0] = BrokerMetadata(0, 'broker_1', 4567) - brokers[1] = BrokerMetadata(1, 'broker_2', 5678) + mock_conn(conn) - topics = {'topic_no_partitions': {}} - protocol.decode_metadata_response.return_value = (brokers, topics) + brokers = [ + BrokerMetadata(0, 'broker_1', 4567), + BrokerMetadata(1, 'broker_2', 5678) + ] + + topics = [ + (NO_LEADER, 'topic_no_partitions', []), + (UNKNOWN_TOPIC_OR_PARTITION, 'topic_unknown', []), + ] + protocol.decode_metadata_response.return_value = MetadataResponse(brokers, topics) - client = KafkaClient(hosts=['broker_1:4567']) + client = SimpleClient(hosts=['broker_1:4567']) self.assertDictEqual({}, client.topics_to_brokers) - with self.assertRaises(PartitionUnavailableError): + with self.assertRaises(LeaderNotAvailableError): client._get_leader_for_partition('topic_no_partitions', 0) - @patch('kafka.client.KafkaConnection') - @patch('kafka.client.KafkaProtocol') - def test_get_leader_returns_none_when_noleader(self, protocol, conn): - "Getting leader for partitions returns None when the partiion has no leader" + with self.assertRaises(UnknownTopicOrPartitionError): + client._get_leader_for_partition('topic_unknown', 0) - conn.recv.return_value = 'response' # anything but None + @patch('kafka.SimpleClient._get_conn') + @patch('kafka.client.KafkaProtocol') + def test_get_leader_exceptions_when_noleader(self, protocol, conn): - brokers = {} - brokers[0] = BrokerMetadata(0, 'broker_1', 4567) - brokers[1] = BrokerMetadata(1, 'broker_2', 5678) + 
mock_conn(conn) - topics = {} - topics['topic_noleader'] = { - 0: PartitionMetadata('topic_noleader', 0, -1, [], []), - 1: PartitionMetadata('topic_noleader', 1, -1, [], []) - } - protocol.decode_metadata_response.return_value = (brokers, topics) + brokers = [ + BrokerMetadata(0, 'broker_1', 4567), + BrokerMetadata(1, 'broker_2', 5678) + ] + + topics = [ + (NO_ERROR, 'topic_noleader', [ + (NO_LEADER, 0, -1, [], []), + (NO_LEADER, 1, -1, [], []), + ]), + ] + protocol.decode_metadata_response.return_value = MetadataResponse(brokers, topics) - client = KafkaClient(hosts=['broker_1:4567']) + client = SimpleClient(hosts=['broker_1:4567']) self.assertDictEqual( { - TopicAndPartition('topic_noleader', 0): None, - TopicAndPartition('topic_noleader', 1): None + TopicPartition('topic_noleader', 0): None, + TopicPartition('topic_noleader', 1): None }, client.topics_to_brokers) - self.assertIsNone(client._get_leader_for_partition('topic_noleader', 0)) - self.assertIsNone(client._get_leader_for_partition('topic_noleader', 1)) - topics['topic_noleader'] = { - 0: PartitionMetadata('topic_noleader', 0, 0, [0, 1], [0, 1]), - 1: PartitionMetadata('topic_noleader', 1, 1, [1, 0], [1, 0]) - } - protocol.decode_metadata_response.return_value = (brokers, topics) + # No leader partitions -- raise LeaderNotAvailableError + with self.assertRaises(LeaderNotAvailableError): + self.assertIsNone(client._get_leader_for_partition('topic_noleader', 0)) + with self.assertRaises(LeaderNotAvailableError): + self.assertIsNone(client._get_leader_for_partition('topic_noleader', 1)) + + # Unknown partitions -- raise UnknownTopicOrPartitionError + with self.assertRaises(UnknownTopicOrPartitionError): + self.assertIsNone(client._get_leader_for_partition('topic_noleader', 2)) + + topics = [ + (NO_ERROR, 'topic_noleader', [ + (NO_ERROR, 0, 0, [0, 1], [0, 1]), + (NO_ERROR, 1, 1, [1, 0], [1, 0]) + ]), + ] + protocol.decode_metadata_response.return_value = MetadataResponse(brokers, topics) self.assertEqual(brokers[0], client._get_leader_for_partition('topic_noleader', 0)) self.assertEqual(brokers[1], client._get_leader_for_partition('topic_noleader', 1)) - @patch('kafka.client.KafkaConnection') + @patch.object(SimpleClient, '_get_conn') @patch('kafka.client.KafkaProtocol') def test_send_produce_request_raises_when_noleader(self, protocol, conn): - "Send producer request raises LeaderUnavailableError if leader is not available" + mock_conn(conn) - conn.recv.return_value = 'response' # anything but None - - brokers = {} - brokers[0] = BrokerMetadata(0, 'broker_1', 4567) - brokers[1] = BrokerMetadata(1, 'broker_2', 5678) - - topics = {} - topics['topic_noleader'] = { - 0: PartitionMetadata('topic_noleader', 0, -1, [], []), - 1: PartitionMetadata('topic_noleader', 1, -1, [], []) - } - protocol.decode_metadata_response.return_value = (brokers, topics) + brokers = [ + BrokerMetadata(0, 'broker_1', 4567), + BrokerMetadata(1, 'broker_2', 5678) + ] + + topics = [ + (NO_ERROR, 'topic_noleader', [ + (NO_LEADER, 0, -1, [], []), + (NO_LEADER, 1, -1, [], []), + ]), + ] + protocol.decode_metadata_response.return_value = MetadataResponse(brokers, topics) - client = KafkaClient(hosts=['broker_1:4567']) + client = SimpleClient(hosts=['broker_1:4567']) - requests = [ProduceRequest( + requests = [ProduceRequestPayload( "topic_noleader", 0, [create_message("a"), create_message("b")])] - with self.assertRaises(LeaderUnavailableError): + with self.assertRaises(LeaderNotAvailableError): + client.send_produce_request(requests) + + 
@patch('kafka.SimpleClient._get_conn') + @patch('kafka.client.KafkaProtocol') + def test_send_produce_request_raises_when_topic_unknown(self, protocol, conn): + + mock_conn(conn) + + brokers = [ + BrokerMetadata(0, 'broker_1', 4567), + BrokerMetadata(1, 'broker_2', 5678) + ] + + topics = [ + (UNKNOWN_TOPIC_OR_PARTITION, 'topic_doesnt_exist', []), + ] + protocol.decode_metadata_response.return_value = MetadataResponse(brokers, topics) + + client = SimpleClient(hosts=['broker_1:4567']) + + requests = [ProduceRequestPayload( + "topic_doesnt_exist", 0, + [create_message("a"), create_message("b")])] + + with self.assertRaises(UnknownTopicOrPartitionError): client.send_produce_request(requests) + def test_timeout(self): + def _timeout(*args, **kwargs): + timeout = args[1] + sleep(timeout) + raise socket.timeout + + with patch.object(socket, "create_connection", side_effect=_timeout): + + with Timer() as t: + with self.assertRaises(ConnectionError): + KafkaConnection("nowhere", 1234, 1.0) + self.assertGreaterEqual(t.interval, 1.0) + + def test_correlation_rollover(self): + with patch.object(SimpleClient, 'load_metadata_for_topics'): + big_num = 2**31 - 3 + client = SimpleClient(hosts=[], correlation_id=big_num) + self.assertEqual(big_num + 1, client._next_id()) + self.assertEqual(big_num + 2, client._next_id()) + self.assertEqual(0, client._next_id()) diff -Nru python-kafka-python-0.9.2/test/test_codec.py python-kafka-python-1.0.1/test/test_codec.py --- python-kafka-python-0.9.2/test/test_codec.py 2014-08-27 21:24:39.000000000 +0000 +++ python-kafka-python-1.0.1/test/test_codec.py 2016-02-17 18:37:58.000000000 +0000 @@ -1,70 +1,87 @@ import struct -import unittest2 + +import pytest +from six.moves import xrange from kafka.codec import ( - has_snappy, gzip_encode, gzip_decode, - snappy_encode, snappy_decode -) -from kafka.protocol import ( - create_gzip_message, create_message, create_snappy_message, KafkaProtocol + has_snappy, has_gzip, has_lz4, + gzip_encode, gzip_decode, + snappy_encode, snappy_decode, + lz4_encode, lz4_decode, ) -from testutil import * - -class TestCodec(unittest2.TestCase): - def test_gzip(self): - for i in xrange(1000): - s1 = random_string(100) - s2 = gzip_decode(gzip_encode(s1)) - self.assertEquals(s1, s2) - - @unittest2.skipUnless(has_snappy(), "Snappy not available") - def test_snappy(self): - for i in xrange(1000): - s1 = random_string(100) - s2 = snappy_decode(snappy_encode(s1)) - self.assertEquals(s1, s2) - - @unittest2.skipUnless(has_snappy(), "Snappy not available") - def test_snappy_detect_xerial(self): - import kafka as kafka1 - _detect_xerial_stream = kafka1.codec._detect_xerial_stream - - header = b'\x82SNAPPY\x00\x00\x00\x00\x01\x00\x00\x00\x01Some extra bytes' - false_header = b'\x01SNAPPY\x00\x00\x00\x01\x00\x00\x00\x01' - random_snappy = snappy_encode('SNAPPY' * 50) - short_data = b'\x01\x02\x03\x04' - - self.assertTrue(_detect_xerial_stream(header)) - self.assertFalse(_detect_xerial_stream(b'')) - self.assertFalse(_detect_xerial_stream(b'\x00')) - self.assertFalse(_detect_xerial_stream(false_header)) - self.assertFalse(_detect_xerial_stream(random_snappy)) - self.assertFalse(_detect_xerial_stream(short_data)) - - @unittest2.skipUnless(has_snappy(), "Snappy not available") - def test_snappy_decode_xerial(self): - header = b'\x82SNAPPY\x00\x00\x00\x00\x01\x00\x00\x00\x01' - random_snappy = snappy_encode('SNAPPY' * 50) - block_len = len(random_snappy) - random_snappy2 = snappy_encode('XERIAL' * 50) - block_len2 = len(random_snappy2) - - to_test = header \ - + 
struct.pack('!i', block_len) + random_snappy \ - + struct.pack('!i', block_len2) + random_snappy2 \ - - self.assertEquals(snappy_decode(to_test), ('SNAPPY' * 50) + ('XERIAL' * 50)) - - @unittest2.skipUnless(has_snappy(), "Snappy not available") - def test_snappy_encode_xerial(self): - to_ensure = b'\x82SNAPPY\x00\x00\x00\x00\x01\x00\x00\x00\x01' + \ - '\x00\x00\x00\x18' + \ - '\xac\x02\x14SNAPPY\xfe\x06\x00\xfe\x06\x00\xfe\x06\x00\xfe\x06\x00\x96\x06\x00' + \ - '\x00\x00\x00\x18' + \ - '\xac\x02\x14XERIAL\xfe\x06\x00\xfe\x06\x00\xfe\x06\x00\xfe\x06\x00\x96\x06\x00' - to_test = ('SNAPPY' * 50) + ('XERIAL' * 50) +from test.testutil import random_string - compressed = snappy_encode(to_test, xerial_compatible=True, xerial_blocksize=300) - self.assertEquals(compressed, to_ensure) +def test_gzip(): + for i in xrange(1000): + b1 = random_string(100).encode('utf-8') + b2 = gzip_decode(gzip_encode(b1)) + assert b1 == b2 + + +@pytest.mark.skipif(not has_snappy(), reason="Snappy not available") +def test_snappy(): + for i in xrange(1000): + b1 = random_string(100).encode('utf-8') + b2 = snappy_decode(snappy_encode(b1)) + assert b1 == b2 + + +@pytest.mark.skipif(not has_snappy(), reason="Snappy not available") +def test_snappy_detect_xerial(): + import kafka as kafka1 + _detect_xerial_stream = kafka1.codec._detect_xerial_stream + + header = b'\x82SNAPPY\x00\x00\x00\x00\x01\x00\x00\x00\x01Some extra bytes' + false_header = b'\x01SNAPPY\x00\x00\x00\x01\x00\x00\x00\x01' + default_snappy = snappy_encode(b'foobar' * 50) + random_snappy = snappy_encode(b'SNAPPY' * 50, xerial_compatible=False) + short_data = b'\x01\x02\x03\x04' + + assert _detect_xerial_stream(header) is True + assert _detect_xerial_stream(b'') is False + assert _detect_xerial_stream(b'\x00') is False + assert _detect_xerial_stream(false_header) is False + assert _detect_xerial_stream(default_snappy) is True + assert _detect_xerial_stream(random_snappy) is False + assert _detect_xerial_stream(short_data) is False + + +@pytest.mark.skipif(not has_snappy(), reason="Snappy not available") +def test_snappy_decode_xerial(): + header = b'\x82SNAPPY\x00\x00\x00\x00\x01\x00\x00\x00\x01' + random_snappy = snappy_encode(b'SNAPPY' * 50, xerial_compatible=False) + block_len = len(random_snappy) + random_snappy2 = snappy_encode(b'XERIAL' * 50, xerial_compatible=False) + block_len2 = len(random_snappy2) + + to_test = header \ + + struct.pack('!i', block_len) + random_snappy \ + + struct.pack('!i', block_len2) + random_snappy2 \ + + assert snappy_decode(to_test) == (b'SNAPPY' * 50) + (b'XERIAL' * 50) + + +@pytest.mark.skipif(not has_snappy(), reason="Snappy not available") +def test_snappy_encode_xerial(): + to_ensure = ( + b'\x82SNAPPY\x00\x00\x00\x00\x01\x00\x00\x00\x01' + b'\x00\x00\x00\x18' + b'\xac\x02\x14SNAPPY\xfe\x06\x00\xfe\x06\x00\xfe\x06\x00\xfe\x06\x00\x96\x06\x00' + b'\x00\x00\x00\x18' + b'\xac\x02\x14XERIAL\xfe\x06\x00\xfe\x06\x00\xfe\x06\x00\xfe\x06\x00\x96\x06\x00' + ) + + to_test = (b'SNAPPY' * 50) + (b'XERIAL' * 50) + + compressed = snappy_encode(to_test, xerial_compatible=True, xerial_blocksize=300) + assert compressed == to_ensure + + +@pytest.mark.skipif(not has_lz4(), reason="LZ4 not available") +def test_lz4(): + for i in xrange(1000): + b1 = random_string(100).encode('utf-8') + b2 = lz4_decode(lz4_encode(b1)) + assert b1 == b2 diff -Nru python-kafka-python-0.9.2/test/test_conn.py python-kafka-python-1.0.1/test/test_conn.py --- python-kafka-python-0.9.2/test/test_conn.py 2014-08-27 21:24:39.000000000 +0000 +++ 
python-kafka-python-1.0.1/test/test_conn.py 2016-02-17 18:37:58.000000000 +0000 @@ -1,20 +1,22 @@ import socket import struct +from threading import Thread import mock -import unittest2 +from . import unittest -from kafka.common import * -from kafka.conn import * +from kafka.common import ConnectionError +from kafka.conn import KafkaConnection, collect_hosts, DEFAULT_SOCKET_TIMEOUT_SECONDS -class ConnTest(unittest2.TestCase): +class ConnTest(unittest.TestCase): def setUp(self): + self.config = { 'host': 'localhost', 'port': 9090, 'request_id': 0, - 'payload': 'test data', - 'payload2': 'another packet' + 'payload': b'test data', + 'payload2': b'another packet' } # Mocking socket.create_connection will cause _sock to always be a @@ -35,12 +37,12 @@ struct.pack('>%ds' % payload_size, self.config['payload']), struct.pack('>i', payload2_size), struct.pack('>%ds' % payload2_size, self.config['payload2']), - '' + b'' ] # Create a connection object self.conn = KafkaConnection(self.config['host'], self.config['port']) - + # Reset any mock counts caused by __init__ self.MockCreateConn.reset_mock() @@ -120,7 +122,7 @@ def test_recv(self): - self.assertEquals(self.conn.recv(self.config['request_id']), self.config['payload']) + self.assertEqual(self.conn.recv(self.config['request_id']), self.config['payload']) def test_recv__reconnects_on_dirty_conn(self): @@ -151,8 +153,25 @@ def test_recv__doesnt_consume_extra_data_in_stream(self): # Here just test that each call to recv will return a single payload - self.assertEquals(self.conn.recv(self.config['request_id']), self.config['payload']) - self.assertEquals(self.conn.recv(self.config['request_id']), self.config['payload2']) + self.assertEqual(self.conn.recv(self.config['request_id']), self.config['payload']) + self.assertEqual(self.conn.recv(self.config['request_id']), self.config['payload2']) + + def test_get_connected_socket(self): + s = self.conn.get_connected_socket() + + self.assertEqual(s, self.MockCreateConn()) + + def test_get_connected_socket_on_dirty_conn(self): + # Dirty the connection + try: + self.conn._raise_connection_error() + except ConnectionError: + pass + + # Test that get_connected_socket tries to connect + self.assertEqual(self.MockCreateConn.call_count, 0) + self.conn.get_connected_socket() + self.assertEqual(self.MockCreateConn.call_count, 1) def test_close__object_is_reusable(self): @@ -162,3 +181,45 @@ self.conn.send(self.config['request_id'], self.config['payload']) self.assertEqual(self.MockCreateConn.call_count, 1) self.conn._sock.sendall.assert_called_with(self.config['payload']) + + +class TestKafkaConnection(unittest.TestCase): + @mock.patch('socket.create_connection') + def test_copy(self, socket): + """KafkaConnection copies work as expected""" + + conn = KafkaConnection('kafka', 9092) + self.assertEqual(socket.call_count, 1) + + copy = conn.copy() + self.assertEqual(socket.call_count, 1) + self.assertEqual(copy.host, 'kafka') + self.assertEqual(copy.port, 9092) + self.assertEqual(copy._sock, None) + + copy.reinit() + self.assertEqual(socket.call_count, 2) + self.assertNotEqual(copy._sock, None) + + @mock.patch('socket.create_connection') + def test_copy_thread(self, socket): + """KafkaConnection copies work in other threads""" + + err = [] + copy = KafkaConnection('kafka', 9092).copy() + + def thread_func(err, copy): + try: + self.assertEqual(copy.host, 'kafka') + self.assertEqual(copy.port, 9092) + self.assertNotEqual(copy._sock, None) + except Exception as e: + err.append(e) + else: + err.append(None) + thread = 
Thread(target=thread_func, args=(err, copy)) + thread.start() + thread.join() + + self.assertEqual(err, [None]) + self.assertEqual(socket.call_count, 2) diff -Nru python-kafka-python-0.9.2/test/test_consumer_group.py python-kafka-python-1.0.1/test/test_consumer_group.py --- python-kafka-python-0.9.2/test/test_consumer_group.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/test/test_consumer_group.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,117 @@ +import collections +import logging +import threading +import os +import time + +import pytest +import six + +from kafka import SimpleClient, SimpleProducer +from kafka.common import TopicPartition +from kafka.conn import BrokerConnection, ConnectionStates +from kafka.consumer.group import KafkaConsumer + +from test.conftest import version +from test.testutil import random_string + + +@pytest.fixture +def simple_client(kafka_broker): + connect_str = 'localhost:' + str(kafka_broker.port) + return SimpleClient(connect_str) + + +@pytest.fixture +def topic(simple_client): + topic = random_string(5) + simple_client.ensure_topic_exists(topic) + return topic + + +@pytest.mark.skipif(not version(), reason="No KAFKA_VERSION set") +def test_consumer(kafka_broker, version): + + # 0.8.2 brokers need a topic to function well + if version >= (0, 8, 2) and version < (0, 9): + topic(simple_client(kafka_broker)) + + connect_str = 'localhost:' + str(kafka_broker.port) + consumer = KafkaConsumer(bootstrap_servers=connect_str) + consumer.poll(500) + assert len(consumer._client._conns) > 0 + node_id = list(consumer._client._conns.keys())[0] + assert consumer._client._conns[node_id].state is ConnectionStates.CONNECTED + + +@pytest.mark.skipif(version() < (0, 9), reason='Unsupported Kafka Version') +@pytest.mark.skipif(not version(), reason="No KAFKA_VERSION set") +def test_group(kafka_broker, topic): + num_partitions = 4 + connect_str = 'localhost:' + str(kafka_broker.port) + consumers = {} + stop = {} + messages = collections.defaultdict(list) + def consumer_thread(i): + assert i not in consumers + assert i not in stop + stop[i] = threading.Event() + consumers[i] = KafkaConsumer(topic, + bootstrap_servers=connect_str, + heartbeat_interval_ms=500) + while not stop[i].is_set(): + for tp, records in six.itervalues(consumers[i].poll()): + messages[i][tp].extend(records) + consumers[i].close() + del consumers[i] + del stop[i] + + num_consumers = 4 + for i in range(num_consumers): + t = threading.Thread(target=consumer_thread, args=(i,)) + t.daemon = True + t.start() + + try: + timeout = time.time() + 35 + while True: + for c in range(num_consumers): + + # Verify all consumers have been created + if c not in consumers: + break + + # Verify all consumers have an assignment + elif not consumers[c].assignment(): + break + + # Verify all consumers are in the same generation + generations = set() + for consumer in six.itervalues(consumers): + generations.add(consumer._coordinator.generation) + if len(generations) != 1: + break + + # If all checks passed, log state and break while loop + else: + for c in range(num_consumers): + logging.info("[%s] %s %s: %s", c, + consumers[c]._coordinator.generation, + consumers[c]._coordinator.member_id, + consumers[c].assignment()) + break + assert time.time() < timeout, "timeout waiting for assignments" + + group_assignment = set() + for c in range(num_consumers): + assert len(consumers[c].assignment()) != 0 + assert set.isdisjoint(consumers[c].assignment(), group_assignment) + 
group_assignment.update(consumers[c].assignment()) + + assert group_assignment == set([ + TopicPartition(topic, partition) + for partition in range(num_partitions)]) + + finally: + for c in range(num_consumers): + stop[c].set() diff -Nru python-kafka-python-0.9.2/test/test_consumer_integration.py python-kafka-python-1.0.1/test/test_consumer_integration.py --- python-kafka-python-0.9.2/test/test_consumer_integration.py 2014-08-27 21:24:39.000000000 +0000 +++ python-kafka-python-1.0.1/test/test_consumer_integration.py 2016-02-17 18:37:58.000000000 +0000 @@ -1,11 +1,23 @@ +import logging import os -from datetime import datetime -from kafka import * # noqa -from kafka.common import * # noqa -from kafka.consumer import MAX_FETCH_BUFFER_SIZE_BYTES -from fixtures import ZookeeperFixture, KafkaFixture -from testutil import * +from six.moves import xrange + +from . import unittest +from kafka import ( + KafkaConsumer, MultiProcessConsumer, SimpleConsumer, create_message +) +from kafka.common import ( + ProduceRequestPayload, ConsumerFetchSizeTooSmall, + OffsetOutOfRangeError, TopicPartition +) +from kafka.consumer.base import MAX_FETCH_BUFFER_SIZE_BYTES + +from test.fixtures import ZookeeperFixture, KafkaFixture +from test.testutil import ( + KafkaIntegrationTestCase, kafka_versions, random_string, Timer +) + class TestConsumerIntegration(KafkaIntegrationTestCase): @classmethod @@ -14,8 +26,11 @@ return cls.zk = ZookeeperFixture.instance() - cls.server1 = KafkaFixture.instance(0, cls.zk.host, cls.zk.port) - cls.server2 = KafkaFixture.instance(1, cls.zk.host, cls.zk.port) + chroot = random_string(10) + cls.server1 = KafkaFixture.instance(0, cls.zk.host, cls.zk.port, + zk_chroot=chroot) + cls.server2 = KafkaFixture.instance(1, cls.zk.host, cls.zk.port, + zk_chroot=chroot) cls.server = cls.server1 # Bootstrapping server @@ -30,20 +45,44 @@ def send_messages(self, partition, messages): messages = [ create_message(self.msg(str(msg))) for msg in messages ] - produce = ProduceRequest(self.topic, partition, messages = messages) + produce = ProduceRequestPayload(self.topic, partition, messages = messages) resp, = self.client.send_produce_request([produce]) - self.assertEquals(resp.error, 0) + self.assertEqual(resp.error, 0) return [ x.value for x in messages ] def assert_message_count(self, messages, num_messages): # Make sure we got them all - self.assertEquals(len(messages), num_messages) + self.assertEqual(len(messages), num_messages) # Make sure there are no duplicates - self.assertEquals(len(set(messages)), num_messages) + self.assertEqual(len(set(messages)), num_messages) + + def consumer(self, **kwargs): + if os.environ['KAFKA_VERSION'] == "0.8.0": + # Kafka 0.8.0 simply doesn't support offset requests, so hard code it being off + kwargs['group'] = None + kwargs['auto_commit'] = False + else: + kwargs.setdefault('group', None) + kwargs.setdefault('auto_commit', False) + + consumer_class = kwargs.pop('consumer', SimpleConsumer) + group = kwargs.pop('group', None) + topic = kwargs.pop('topic', self.topic) + + if consumer_class in [SimpleConsumer, MultiProcessConsumer]: + kwargs.setdefault('iter_timeout', 0) + + return consumer_class(self.client, group, topic, **kwargs) + + def kafka_consumer(self, **configs): + brokers = '%s:%d' % (self.server.host, self.server.port) + consumer = KafkaConsumer(self.topic, + bootstrap_servers=brokers, + **configs) + return consumer - @kafka_versions("all") def test_simple_consumer(self): self.send_messages(0, range(0, 100)) self.send_messages(1, range(100, 200)) @@ 
-55,7 +94,63 @@ consumer.stop() - @kafka_versions("all") + def test_simple_consumer_smallest_offset_reset(self): + self.send_messages(0, range(0, 100)) + self.send_messages(1, range(100, 200)) + + consumer = self.consumer(auto_offset_reset='smallest') + # Move fetch offset ahead of 300 message (out of range) + consumer.seek(300, 2) + # Since auto_offset_reset is set to smallest we should read all 200 + # messages from beginning. + self.assert_message_count([message for message in consumer], 200) + + def test_simple_consumer_largest_offset_reset(self): + self.send_messages(0, range(0, 100)) + self.send_messages(1, range(100, 200)) + + # Default largest + consumer = self.consumer() + # Move fetch offset ahead of 300 message (out of range) + consumer.seek(300, 2) + # Since auto_offset_reset is set to largest we should not read any + # messages. + self.assert_message_count([message for message in consumer], 0) + # Send 200 new messages to the queue + self.send_messages(0, range(200, 300)) + self.send_messages(1, range(300, 400)) + # Since the offset is set to largest we should read all the new messages. + self.assert_message_count([message for message in consumer], 200) + + def test_simple_consumer_no_reset(self): + self.send_messages(0, range(0, 100)) + self.send_messages(1, range(100, 200)) + + # Default largest + consumer = self.consumer(auto_offset_reset=None) + # Move fetch offset ahead of 300 message (out of range) + consumer.seek(300, 2) + with self.assertRaises(OffsetOutOfRangeError): + consumer.get_message() + + @kafka_versions('>=0.8.1') + def test_simple_consumer_load_initial_offsets(self): + self.send_messages(0, range(0, 100)) + self.send_messages(1, range(100, 200)) + + # Create 1st consumer and change offsets + consumer = self.consumer(group='test_simple_consumer_load_initial_offsets') + self.assertEqual(consumer.offsets, {0: 0, 1: 0}) + consumer.offsets.update({0:51, 1:101}) + # Update counter after manual offsets update + consumer.count_since_commit += 1 + consumer.commit() + + # Create 2nd consumer and check initial offsets + consumer = self.consumer(group='test_simple_consumer_load_initial_offsets', + auto_commit=False) + self.assertEqual(consumer.offsets, {0: 51, 1: 101}) + def test_simple_consumer__seek(self): self.send_messages(0, range(0, 100)) self.send_messages(1, range(100, 200)) @@ -70,49 +165,85 @@ consumer.seek(-13, 2) self.assert_message_count([ message for message in consumer ], 13) + # Set absolute offset + consumer.seek(100) + self.assert_message_count([ message for message in consumer ], 0) + consumer.seek(100, partition=0) + self.assert_message_count([ message for message in consumer ], 0) + consumer.seek(101, partition=1) + self.assert_message_count([ message for message in consumer ], 0) + consumer.seek(90, partition=0) + self.assert_message_count([ message for message in consumer ], 10) + consumer.seek(20, partition=1) + self.assert_message_count([ message for message in consumer ], 80) + consumer.seek(0, partition=1) + self.assert_message_count([ message for message in consumer ], 100) + consumer.stop() - @kafka_versions("all") def test_simple_consumer_blocking(self): consumer = self.consumer() - # Ask for 5 messages, nothing in queue, block 5 seconds + # Ask for 5 messages, nothing in queue, block 1 second with Timer() as t: - messages = consumer.get_messages(block=True, timeout=5) + messages = consumer.get_messages(block=True, timeout=1) self.assert_message_count(messages, 0) - self.assertGreaterEqual(t.interval, 5) + self.assertGreaterEqual(t.interval, 
1) - self.send_messages(0, range(0, 10)) + self.send_messages(0, range(0, 5)) + self.send_messages(1, range(5, 10)) # Ask for 5 messages, 10 in queue. Get 5 back, no blocking with Timer() as t: - messages = consumer.get_messages(count=5, block=True, timeout=5) + messages = consumer.get_messages(count=5, block=True, timeout=3) self.assert_message_count(messages, 5) - self.assertLessEqual(t.interval, 1) + self.assertLess(t.interval, 3) - # Ask for 10 messages, get 5 back, block 5 seconds + # Ask for 10 messages, get 5 back, block 1 second with Timer() as t: - messages = consumer.get_messages(count=10, block=True, timeout=5) + messages = consumer.get_messages(count=10, block=True, timeout=1) self.assert_message_count(messages, 5) - self.assertGreaterEqual(t.interval, 5) + self.assertGreaterEqual(t.interval, 1) + + # Ask for 10 messages, 5 in queue, ask to block for 1 message or 1 + # second, get 5 back, no blocking + self.send_messages(0, range(0, 3)) + self.send_messages(1, range(3, 5)) + with Timer() as t: + messages = consumer.get_messages(count=10, block=1, timeout=1) + self.assert_message_count(messages, 5) + self.assertLessEqual(t.interval, 1) consumer.stop() - @kafka_versions("all") def test_simple_consumer_pending(self): + # make sure that we start with no pending messages + consumer = self.consumer() + self.assertEquals(consumer.pending(), 0) + self.assertEquals(consumer.pending(partitions=[0]), 0) + self.assertEquals(consumer.pending(partitions=[1]), 0) + # Produce 10 messages to partitions 0 and 1 self.send_messages(0, range(0, 10)) self.send_messages(1, range(10, 20)) consumer = self.consumer() - self.assertEquals(consumer.pending(), 20) - self.assertEquals(consumer.pending(partitions=[0]), 10) - self.assertEquals(consumer.pending(partitions=[1]), 10) - + self.assertEqual(consumer.pending(), 20) + self.assertEqual(consumer.pending(partitions=[0]), 10) + self.assertEqual(consumer.pending(partitions=[1]), 10) + + # move to last message, so one partition should have 1 pending + # message and other 0 + consumer.seek(-1, 2) + self.assertEqual(consumer.pending(), 1) + + pending_part1 = consumer.pending(partitions=[0]) + pending_part2 = consumer.pending(partitions=[1]) + self.assertEquals(set([0, 1]), set([pending_part1, pending_part2])) consumer.stop() - @kafka_versions("all") + @unittest.skip('MultiProcessConsumer deprecated and these tests are flaky') def test_multi_process_consumer(self): # Produce 100 messages to partitions 0 and 1 self.send_messages(0, range(0, 100)) @@ -124,16 +255,16 @@ consumer.stop() - @kafka_versions("all") + @unittest.skip('MultiProcessConsumer deprecated and these tests are flaky') def test_multi_process_consumer_blocking(self): consumer = self.consumer(consumer = MultiProcessConsumer) - # Ask for 5 messages, No messages in queue, block 5 seconds + # Ask for 5 messages, No messages in queue, block 1 second with Timer() as t: - messages = consumer.get_messages(block=True, timeout=5) + messages = consumer.get_messages(block=True, timeout=1) self.assert_message_count(messages, 0) - self.assertGreaterEqual(t.interval, 5) + self.assertGreaterEqual(t.interval, 1) # Send 10 messages self.send_messages(0, range(0, 10)) @@ -144,28 +275,60 @@ self.assert_message_count(messages, 5) self.assertLessEqual(t.interval, 1) - # Ask for 10 messages, 5 in queue, block 5 seconds + # Ask for 10 messages, 5 in queue, block 1 second with Timer() as t: - messages = consumer.get_messages(count=10, block=True, timeout=5) + messages = consumer.get_messages(count=10, block=True, 
timeout=1) self.assert_message_count(messages, 5) - self.assertGreaterEqual(t.interval, 5) + self.assertGreaterEqual(t.interval, 1) + + # Ask for 10 messages, 5 in queue, ask to block for 1 message or 1 + # second, get at least one back, no blocking + self.send_messages(0, range(0, 5)) + with Timer() as t: + messages = consumer.get_messages(count=10, block=1, timeout=1) + received_message_count = len(messages) + self.assertGreaterEqual(received_message_count, 1) + self.assert_message_count(messages, received_message_count) + self.assertLessEqual(t.interval, 1) consumer.stop() - @kafka_versions("all") + @unittest.skip('MultiProcessConsumer deprecated and these tests are flaky') def test_multi_proc_pending(self): self.send_messages(0, range(0, 10)) self.send_messages(1, range(10, 20)) - consumer = MultiProcessConsumer(self.client, "group1", self.topic, auto_commit=False) - - self.assertEquals(consumer.pending(), 20) - self.assertEquals(consumer.pending(partitions=[0]), 10) - self.assertEquals(consumer.pending(partitions=[1]), 10) + # set group to None and auto_commit to False to avoid interactions w/ + # offset commit/fetch apis + consumer = MultiProcessConsumer(self.client, None, self.topic, + auto_commit=False, iter_timeout=0) + + self.assertEqual(consumer.pending(), 20) + self.assertEqual(consumer.pending(partitions=[0]), 10) + self.assertEqual(consumer.pending(partitions=[1]), 10) consumer.stop() - @kafka_versions("all") + @unittest.skip('MultiProcessConsumer deprecated and these tests are flaky') + @kafka_versions('>=0.8.1') + def test_multi_process_consumer_load_initial_offsets(self): + self.send_messages(0, range(0, 10)) + self.send_messages(1, range(10, 20)) + + # Create 1st consumer and change offsets + consumer = self.consumer(group='test_multi_process_consumer_load_initial_offsets') + self.assertEqual(consumer.offsets, {0: 0, 1: 0}) + consumer.offsets.update({0:5, 1:15}) + # Update counter after manual offsets update + consumer.count_since_commit += 1 + consumer.commit() + + # Create 2nd consumer and check initial offsets + consumer = self.consumer(consumer = MultiProcessConsumer, + group='test_multi_process_consumer_load_initial_offsets', + auto_commit=False) + self.assertEqual(consumer.offsets, {0: 5, 1: 15}) + def test_large_messages(self): # Produce 10 "normal" size messages small_messages = self.send_messages(0, [ str(x) for x in range(10) ]) @@ -182,7 +345,6 @@ consumer.stop() - @kafka_versions("all") def test_huge_messages(self): huge_message, = self.send_messages(0, [ create_message(random_string(MAX_FETCH_BUFFER_SIZE_BYTES + 10)), @@ -209,17 +371,19 @@ # Consume giant message successfully message = big_consumer.get_message(block=False, timeout=10) self.assertIsNotNone(message) - self.assertEquals(message.message.value, huge_message) + self.assertEqual(message.message.value, huge_message) big_consumer.stop() - @kafka_versions("0.8.1", "0.8.1.1") + @kafka_versions('>=0.8.1') def test_offset_behavior__resuming_behavior(self): - msgs1 = self.send_messages(0, range(0, 100)) - msgs2 = self.send_messages(1, range(100, 200)) + self.send_messages(0, range(0, 100)) + self.send_messages(1, range(100, 200)) # Start a consumer consumer1 = self.consumer( + group='test_offset_behavior__resuming_behavior', + auto_commit=True, auto_commit_every_t = None, auto_commit_every_n = 20, ) @@ -230,6 +394,8 @@ # The total offset across both partitions should be at 180 consumer2 = self.consumer( + group='test_offset_behavior__resuming_behavior', + auto_commit=True, auto_commit_every_t = None, 
auto_commit_every_n = 20, ) @@ -240,8 +406,47 @@ consumer1.stop() consumer2.stop() + @unittest.skip('MultiProcessConsumer deprecated and these tests are flaky') + @kafka_versions('>=0.8.1') + def test_multi_process_offset_behavior__resuming_behavior(self): + self.send_messages(0, range(0, 100)) + self.send_messages(1, range(100, 200)) + + # Start a consumer + consumer1 = self.consumer( + consumer=MultiProcessConsumer, + group='test_multi_process_offset_behavior__resuming_behavior', + auto_commit=True, + auto_commit_every_t = None, + auto_commit_every_n = 20, + ) + + # Grab the first 195 messages + output_msgs1 = [] + idx = 0 + for message in consumer1: + output_msgs1.append(message.message.value) + idx += 1 + if idx >= 195: + break + self.assert_message_count(output_msgs1, 195) + + # The total offset across both partitions should be at 180 + consumer2 = self.consumer( + consumer=MultiProcessConsumer, + group='test_multi_process_offset_behavior__resuming_behavior', + auto_commit=True, + auto_commit_every_t = None, + auto_commit_every_n = 20, + ) + + # 181-200 + self.assert_message_count([ message for message in consumer2 ], 20) + + consumer1.stop() + consumer2.stop() + # TODO: Make this a unit test -- should not require integration - @kafka_versions("all") def test_fetch_buffer_size(self): # Test parameters (see issue 135 / PR 136) @@ -257,20 +462,97 @@ consumer = self.consumer(buffer_size=1024, max_buffer_size=2048) messages = [ message for message in consumer ] - self.assertEquals(len(messages), 2) + self.assertEqual(len(messages), 2) - def consumer(self, **kwargs): - if os.environ['KAFKA_VERSION'] == "0.8.0": - # Kafka 0.8.0 simply doesn't support offset requests, so hard code it being off - kwargs['auto_commit'] = False - else: - kwargs.setdefault('auto_commit', True) + def test_kafka_consumer(self): + self.send_messages(0, range(0, 100)) + self.send_messages(1, range(100, 200)) - consumer_class = kwargs.pop('consumer', SimpleConsumer) - group = kwargs.pop('group', self.id()) - topic = kwargs.pop('topic', self.topic) + # Start a consumer + consumer = self.kafka_consumer(auto_offset_reset='earliest') + n = 0 + messages = {0: set(), 1: set()} + for m in consumer: + logging.debug("Consumed message %s" % repr(m)) + n += 1 + messages[m.partition].add(m.offset) + if n >= 200: + break + + self.assertEqual(len(messages[0]), 100) + self.assertEqual(len(messages[1]), 100) + + def test_kafka_consumer__blocking(self): + TIMEOUT_MS = 500 + consumer = self.kafka_consumer(auto_offset_reset='earliest', + consumer_timeout_ms=TIMEOUT_MS) + + # Manual assignment avoids overhead of consumer group mgmt + consumer.unsubscribe() + consumer.assign([TopicPartition(self.topic, 0)]) - if consumer_class == SimpleConsumer: - kwargs.setdefault('iter_timeout', 0) + # Ask for 5 messages, nothing in queue, block 500ms + with Timer() as t: + with self.assertRaises(StopIteration): + msg = next(consumer) + self.assertGreaterEqual(t.interval, TIMEOUT_MS / 1000.0 ) - return consumer_class(self.client, group, topic, **kwargs) + self.send_messages(0, range(0, 10)) + + # Ask for 5 messages, 10 in queue. 
Get 5 back, no blocking + messages = set() + with Timer() as t: + for i in range(5): + msg = next(consumer) + messages.add((msg.partition, msg.offset)) + self.assertEqual(len(messages), 5) + self.assertLess(t.interval, TIMEOUT_MS / 1000.0 ) + + # Ask for 10 messages, get 5 back, block 500ms + messages = set() + with Timer() as t: + with self.assertRaises(StopIteration): + for i in range(10): + msg = next(consumer) + messages.add((msg.partition, msg.offset)) + self.assertEqual(len(messages), 5) + self.assertGreaterEqual(t.interval, TIMEOUT_MS / 1000.0 ) + + @kafka_versions('>=0.8.1') + def test_kafka_consumer__offset_commit_resume(self): + GROUP_ID = random_string(10) + + self.send_messages(0, range(0, 100)) + self.send_messages(1, range(100, 200)) + + # Start a consumer + consumer1 = self.kafka_consumer( + group_id=GROUP_ID, + enable_auto_commit=True, + auto_commit_interval_ms=100, + auto_offset_reset='earliest', + ) + + # Grab the first 180 messages + output_msgs1 = [] + for _ in xrange(180): + m = next(consumer1) + output_msgs1.append(m) + self.assert_message_count(output_msgs1, 180) + consumer1.close() + + # The total offset across both partitions should be at 180 + consumer2 = self.kafka_consumer( + group_id=GROUP_ID, + enable_auto_commit=True, + auto_commit_interval_ms=100, + auto_offset_reset='earliest', + ) + + # 181-200 + output_msgs2 = [] + for _ in xrange(20): + m = next(consumer2) + output_msgs2.append(m) + self.assert_message_count(output_msgs2, 20) + self.assertEqual(len(set(output_msgs1) | set(output_msgs2)), 200) diff -Nru python-kafka-python-0.9.2/test/test_consumer.py python-kafka-python-1.0.1/test/test_consumer.py --- python-kafka-python-0.9.2/test/test_consumer.py 2014-08-27 21:24:39.000000000 +0000 +++ python-kafka-python-1.0.1/test/test_consumer.py 2016-02-17 18:37:58.000000000 +0000 @@ -1,22 +1,133 @@ -import os -import random -import struct -import unittest2 from mock import MagicMock, patch +from . 
import unittest -from kafka import KafkaClient -from kafka.consumer import SimpleConsumer +from kafka import SimpleConsumer, KafkaConsumer, MultiProcessConsumer from kafka.common import ( - ProduceRequest, BrokerMetadata, PartitionMetadata, - TopicAndPartition, KafkaUnavailableError, - LeaderUnavailableError, PartitionUnavailableError -) -from kafka.protocol import ( - create_message, KafkaProtocol + KafkaConfigurationError, FetchResponsePayload, OffsetFetchResponsePayload, + FailedPayloadsError, OffsetAndMessage, + NotLeaderForPartitionError, UnknownTopicOrPartitionError ) -class TestKafkaConsumer(unittest2.TestCase): + +class TestKafkaConsumer(unittest.TestCase): def test_non_integer_partitions(self): with self.assertRaises(AssertionError): - consumer = SimpleConsumer(MagicMock(), 'group', 'topic', partitions = [ '0' ]) + SimpleConsumer(MagicMock(), 'group', 'topic', partitions = [ '0' ]) + + +class TestMultiProcessConsumer(unittest.TestCase): + def test_partition_list(self): + client = MagicMock() + partitions = (0,) + with patch.object(MultiProcessConsumer, 'fetch_last_known_offsets') as fetch_last_known_offsets: + MultiProcessConsumer(client, 'testing-group', 'testing-topic', partitions=partitions) + self.assertEqual(fetch_last_known_offsets.call_args[0], (partitions,) ) + self.assertEqual(client.get_partition_ids_for_topic.call_count, 0) # pylint: disable=no-member + +class TestSimpleConsumer(unittest.TestCase): + def test_simple_consumer_failed_payloads(self): + client = MagicMock() + consumer = SimpleConsumer(client, group=None, + topic='topic', partitions=[0, 1], + auto_commit=False) + + def failed_payloads(payload): + return FailedPayloadsError(payload) + + client.send_fetch_request.side_effect = self.fail_requests_factory(failed_payloads) + + # This should not raise an exception + consumer.get_messages(5) + + def test_simple_consumer_leader_change(self): + client = MagicMock() + consumer = SimpleConsumer(client, group=None, + topic='topic', partitions=[0, 1], + auto_commit=False) + + # Mock so that only the first request gets a valid response + def not_leader(request): + return FetchResponsePayload(request.topic, request.partition, + NotLeaderForPartitionError.errno, -1, ()) + + client.send_fetch_request.side_effect = self.fail_requests_factory(not_leader) + + # This should not raise an exception + consumer.get_messages(20) + + # client should have updated metadata + self.assertGreaterEqual(client.reset_topic_metadata.call_count, 1) + self.assertGreaterEqual(client.load_metadata_for_topics.call_count, 1) + + def test_simple_consumer_unknown_topic_partition(self): + client = MagicMock() + consumer = SimpleConsumer(client, group=None, + topic='topic', partitions=[0, 1], + auto_commit=False) + + # Mock so that only the first request gets a valid response + def unknown_topic_partition(request): + return FetchResponsePayload(request.topic, request.partition, + UnknownTopicOrPartitionError.errno, -1, ()) + + client.send_fetch_request.side_effect = self.fail_requests_factory(unknown_topic_partition) + + # This should not raise an exception + with self.assertRaises(UnknownTopicOrPartitionError): + consumer.get_messages(20) + + def test_simple_consumer_commit_does_not_raise(self): + client = MagicMock() + client.get_partition_ids_for_topic.return_value = [0, 1] + + def mock_offset_fetch_request(group, payloads, **kwargs): + return [OffsetFetchResponsePayload(p.topic, p.partition, 0, b'', 0) for p in payloads] + + client.send_offset_fetch_request.side_effect = mock_offset_fetch_request + 
+ def mock_offset_commit_request(group, payloads, **kwargs): + raise FailedPayloadsError(payloads[0]) + + client.send_offset_commit_request.side_effect = mock_offset_commit_request + + consumer = SimpleConsumer(client, group='foobar', + topic='topic', partitions=[0, 1], + auto_commit=False) + + # Mock internal commit check + consumer.count_since_commit = 10 + + # This should not raise an exception + self.assertFalse(consumer.commit(partitions=[0, 1])) + + def test_simple_consumer_reset_partition_offset(self): + client = MagicMock() + + def mock_offset_request(payloads, **kwargs): + raise FailedPayloadsError(payloads[0]) + + client.send_offset_request.side_effect = mock_offset_request + + consumer = SimpleConsumer(client, group='foobar', + topic='topic', partitions=[0, 1], + auto_commit=False) + + # This should not raise an exception + self.assertEqual(consumer.reset_partition_offset(0), None) + + @staticmethod + def fail_requests_factory(error_factory): + # Mock so that only the first request gets a valid response + def fail_requests(payloads, **kwargs): + responses = [ + FetchResponsePayload(payloads[0].topic, payloads[0].partition, 0, 0, + [OffsetAndMessage( + payloads[0].offset + i, + "msg %d" % (payloads[0].offset + i)) + for i in range(10)]), + ] + for failure in payloads[1:]: + responses.append(error_factory(failure)) + return responses + return fail_requests diff -Nru python-kafka-python-0.9.2/test/test_context.py python-kafka-python-1.0.1/test/test_context.py --- python-kafka-python-0.9.2/test/test_context.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/test/test_context.py 2016-01-23 22:22:32.000000000 +0000 @@ -0,0 +1,117 @@ +""" +OffsetCommitContext tests. +""" +from . import unittest + +from mock import MagicMock, patch + +from kafka.common import OffsetOutOfRangeError +from kafka.context import OffsetCommitContext + + +class TestOffsetCommitContext(unittest.TestCase): + """ + OffsetCommitContext tests. + """ + + def setUp(self): + self.client = MagicMock() + self.consumer = MagicMock() + self.topic = "topic" + self.group = "group" + self.partition = 0 + self.consumer.topic = self.topic + self.consumer.group = self.group + self.consumer.client = self.client + self.consumer.offsets = {self.partition: 0} + self.context = OffsetCommitContext(self.consumer) + + def test_noop(self): + """ + Should revert consumer after context exit with no mark() call. + """ + with self.context: + # advance offset + self.consumer.offsets = {self.partition: 1} + + # offset restored + self.assertEqual(self.consumer.offsets, {self.partition: 0}) + # and seek called with relative zero delta + self.assertEqual(self.consumer.seek.call_count, 1) + self.assertEqual(self.consumer.seek.call_args[0], (0, 1)) + + def test_mark(self): + """ + Should remain at marked location ater context exit. + """ + with self.context as context: + context.mark(self.partition, 0) + # advance offset + self.consumer.offsets = {self.partition: 1} + + # offset sent to client + self.assertEqual(self.client.send_offset_commit_request.call_count, 1) + + # offset remains advanced + self.assertEqual(self.consumer.offsets, {self.partition: 1}) + + # and seek called with relative zero delta + self.assertEqual(self.consumer.seek.call_count, 1) + self.assertEqual(self.consumer.seek.call_args[0], (0, 1)) + + def test_mark_multiple(self): + """ + Should remain at highest marked location after context exit. 
+ """ + with self.context as context: + context.mark(self.partition, 0) + context.mark(self.partition, 1) + context.mark(self.partition, 2) + # advance offset + self.consumer.offsets = {self.partition: 3} + + # offset sent to client + self.assertEqual(self.client.send_offset_commit_request.call_count, 1) + + # offset remains advanced + self.assertEqual(self.consumer.offsets, {self.partition: 3}) + + # and seek called with relative zero delta + self.assertEqual(self.consumer.seek.call_count, 1) + self.assertEqual(self.consumer.seek.call_args[0], (0, 1)) + + def test_rollback(self): + """ + Should rollback to initial offsets on context exit with exception. + """ + with self.assertRaises(Exception): + with self.context as context: + context.mark(self.partition, 0) + # advance offset + self.consumer.offsets = {self.partition: 1} + + raise Exception("Intentional failure") + + # offset rolled back (ignoring mark) + self.assertEqual(self.consumer.offsets, {self.partition: 0}) + + # and seek called with relative zero delta + self.assertEqual(self.consumer.seek.call_count, 1) + self.assertEqual(self.consumer.seek.call_args[0], (0, 1)) + + def test_out_of_range(self): + """ + Should reset to beginning of valid offsets on `OffsetOutOfRangeError` + """ + def _seek(offset, whence): + # seek must be called with 0, 0 to find the beginning of the range + self.assertEqual(offset, 0) + self.assertEqual(whence, 0) + # set offsets to something different + self.consumer.offsets = {self.partition: 100} + + with patch.object(self.consumer, "seek", _seek): + with self.context: + raise OffsetOutOfRangeError() + + self.assertEqual(self.consumer.offsets, {self.partition: 100}) diff -Nru python-kafka-python-0.9.2/test/test_coordinator.py python-kafka-python-1.0.1/test/test_coordinator.py --- python-kafka-python-0.9.2/test/test_coordinator.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/test/test_coordinator.py 2016-02-19 15:56:44.000000000 +0000 @@ -0,0 +1,575 @@ +# pylint: skip-file +from __future__ import absolute_import + +import pytest + +from kafka.client_async import KafkaClient +from kafka.common import TopicPartition, OffsetAndMetadata +from kafka.consumer.subscription_state import ( + SubscriptionState, ConsumerRebalanceListener) +from kafka.coordinator.assignors.range import RangePartitionAssignor +from kafka.coordinator.assignors.roundrobin import RoundRobinPartitionAssignor +from kafka.coordinator.consumer import ConsumerCoordinator +from kafka.coordinator.protocol import ( + ConsumerProtocolMemberMetadata, ConsumerProtocolMemberAssignment) +from kafka.conn import ConnectionStates +from kafka.future import Future +from kafka.protocol.commit import ( + OffsetCommitRequest_v0, OffsetCommitRequest_v1, OffsetCommitRequest_v2, + OffsetCommitResponse, OffsetFetchRequest_v0, OffsetFetchRequest_v1, + OffsetFetchResponse) +from kafka.protocol.metadata import MetadataResponse +from kafka.util import WeakMethod + +import kafka.common as Errors + + +@pytest.fixture +def conn(mocker): + conn = mocker.patch('kafka.client_async.BrokerConnection') + conn.return_value = conn + conn.state = ConnectionStates.CONNECTED + conn.send.return_value = Future().success( + MetadataResponse( + [(0, 'foo', 12), (1, 'bar', 34)], # brokers + [])) # topics + return conn + + +@pytest.fixture +def coordinator(conn): + return ConsumerCoordinator(KafkaClient(), SubscriptionState()) + + +def test_init(conn): + cli = KafkaClient() + coordinator = ConsumerCoordinator(cli, SubscriptionState()) + + # metadata update on init + 
assert cli.cluster._need_update is True + assert WeakMethod(coordinator._handle_metadata_update) in cli.cluster._listeners + + +@pytest.mark.parametrize("api_version", [(0, 8, 0), (0, 8, 1), (0, 8, 2), (0, 9)]) +def test_autocommit_enable_api_version(conn, api_version): + coordinator = ConsumerCoordinator( + KafkaClient(), SubscriptionState(), api_version=api_version) + if api_version < (0, 8, 1): + assert coordinator._auto_commit_task is None + else: + assert coordinator._auto_commit_task is not None + + +def test_protocol_type(coordinator): + assert coordinator.protocol_type() is 'consumer' + + +def test_group_protocols(coordinator): + # Requires a subscription + try: + coordinator.group_protocols() + except AssertionError: + pass + else: + assert False, 'Exception not raised when expected' + + coordinator._subscription.subscribe(topics=['foobar']) + assert coordinator.group_protocols() == [ + ('range', ConsumerProtocolMemberMetadata( + RangePartitionAssignor.version, + ['foobar'], + b'')), + ('roundrobin', ConsumerProtocolMemberMetadata( + RoundRobinPartitionAssignor.version, + ['foobar'], + b'')), + ] + + +@pytest.mark.parametrize('api_version', [(0, 8), (0, 8, 1), (0, 8, 2), (0, 9)]) +def test_pattern_subscription(coordinator, api_version): + coordinator.config['api_version'] = api_version + coordinator._subscription.subscribe(pattern='foo') + assert coordinator._subscription.subscription == set([]) + assert coordinator._subscription_metadata_changed() is False + assert coordinator._subscription.needs_partition_assignment is False + + cluster = coordinator._client.cluster + cluster.update_metadata(MetadataResponse( + # brokers + [(0, 'foo', 12), (1, 'bar', 34)], + # topics + [(0, 'fizz', []), + (0, 'foo1', [(0, 0, 0, [], [])]), + (0, 'foo2', [(0, 0, 1, [], [])])])) + assert coordinator._subscription.subscription == set(['foo1', 'foo2']) + + # 0.9 consumers should trigger dynamic partition assignment + if api_version >= (0, 9): + assert coordinator._subscription.needs_partition_assignment is True + assert coordinator._subscription.assignment == {} + + # earlier consumers get all partitions assigned locally + else: + assert coordinator._subscription.needs_partition_assignment is False + assert set(coordinator._subscription.assignment.keys()) == set([ + TopicPartition('foo1', 0), + TopicPartition('foo2', 0)]) + + +def test_lookup_assignor(coordinator): + assert coordinator._lookup_assignor('roundrobin') is RoundRobinPartitionAssignor + assert coordinator._lookup_assignor('range') is RangePartitionAssignor + assert coordinator._lookup_assignor('foobar') is None + + +def test_join_complete(mocker, coordinator): + coordinator._subscription.subscribe(topics=['foobar']) + assignor = RoundRobinPartitionAssignor() + coordinator.config['assignors'] = (assignor,) + mocker.spy(assignor, 'on_assignment') + assert assignor.on_assignment.call_count == 0 + assignment = ConsumerProtocolMemberAssignment(0, [('foobar', [0, 1])], b'') + coordinator._on_join_complete( + 0, 'member-foo', 'roundrobin', assignment.encode()) + assert assignor.on_assignment.call_count == 1 + assignor.on_assignment.assert_called_with(assignment) + + +def test_subscription_listener(mocker, coordinator): + listener = mocker.MagicMock(spec=ConsumerRebalanceListener) + coordinator._subscription.subscribe( + topics=['foobar'], + listener=listener) + + coordinator._on_join_prepare(0, 'member-foo') + assert listener.on_partitions_revoked.call_count == 1 + listener.on_partitions_revoked.assert_called_with(set([])) + + assignment = 
ConsumerProtocolMemberAssignment(0, [('foobar', [0, 1])], b'') + coordinator._on_join_complete( + 0, 'member-foo', 'roundrobin', assignment.encode()) + assert listener.on_partitions_assigned.call_count == 1 + listener.on_partitions_assigned.assert_called_with(set([ + TopicPartition('foobar', 0), + TopicPartition('foobar', 1)])) + + +def test_subscription_listener_failure(mocker, coordinator): + listener = mocker.MagicMock(spec=ConsumerRebalanceListener) + coordinator._subscription.subscribe( + topics=['foobar'], + listener=listener) + + # exception raised in listener should not be re-raised by coordinator + listener.on_partitions_revoked.side_effect = Exception('crash') + coordinator._on_join_prepare(0, 'member-foo') + assert listener.on_partitions_revoked.call_count == 1 + + assignment = ConsumerProtocolMemberAssignment(0, [('foobar', [0, 1])], b'') + coordinator._on_join_complete( + 0, 'member-foo', 'roundrobin', assignment.encode()) + assert listener.on_partitions_assigned.call_count == 1 + + +def test_perform_assignment(mocker, coordinator): + member_metadata = { + 'member-foo': ConsumerProtocolMemberMetadata(0, ['foo1'], b''), + 'member-bar': ConsumerProtocolMemberMetadata(0, ['foo1'], b'') + } + assignments = { + 'member-foo': ConsumerProtocolMemberAssignment( + 0, [('foo1', [0])], b''), + 'member-bar': ConsumerProtocolMemberAssignment( + 0, [('foo1', [1])], b'') + } + + mocker.patch.object(RoundRobinPartitionAssignor, 'assign') + RoundRobinPartitionAssignor.assign.return_value = assignments + + ret = coordinator._perform_assignment( + 'member-foo', 'roundrobin', + [(member, metadata.encode()) + for member, metadata in member_metadata.items()]) + + assert RoundRobinPartitionAssignor.assign.call_count == 1 + RoundRobinPartitionAssignor.assign.assert_called_with( + coordinator._client.cluster, member_metadata) + assert ret == assignments + + +def test_on_join_prepare(coordinator): + coordinator._subscription.subscribe(topics=['foobar']) + coordinator._on_join_prepare(0, 'member-foo') + assert coordinator._subscription.needs_partition_assignment is True + + +def test_need_rejoin(coordinator): + # No subscription - no rejoin + assert coordinator.need_rejoin() is False + + coordinator._subscription.subscribe(topics=['foobar']) + assert coordinator.need_rejoin() is True + + coordinator._subscription.needs_partition_assignment = False + coordinator.rejoin_needed = False + assert coordinator.need_rejoin() is False + + coordinator._subscription.needs_partition_assignment = True + assert coordinator.need_rejoin() is True + + +def test_refresh_committed_offsets_if_needed(mocker, coordinator): + mocker.patch.object(ConsumerCoordinator, 'fetch_committed_offsets', + return_value = { + TopicPartition('foobar', 0): OffsetAndMetadata(123, b''), + TopicPartition('foobar', 1): OffsetAndMetadata(234, b'')}) + coordinator._subscription.assign_from_user([TopicPartition('foobar', 0)]) + assert coordinator._subscription.needs_fetch_committed_offsets is True + coordinator.refresh_committed_offsets_if_needed() + assignment = coordinator._subscription.assignment + assert assignment[TopicPartition('foobar', 0)].committed == 123 + assert TopicPartition('foobar', 1) not in assignment + assert coordinator._subscription.needs_fetch_committed_offsets is False + + +def test_fetch_committed_offsets(mocker, coordinator): + + # No partitions, no IO polling + mocker.patch.object(coordinator._client, 'poll') + assert coordinator.fetch_committed_offsets([]) == {} + assert coordinator._client.poll.call_count == 0 + + # 
general case -- send offset fetch request, get successful future + mocker.patch.object(coordinator, 'ensure_coordinator_known') + mocker.patch.object(coordinator, '_send_offset_fetch_request', + return_value=Future().success('foobar')) + partitions = [TopicPartition('foobar', 0)] + ret = coordinator.fetch_committed_offsets(partitions) + assert ret == 'foobar' + coordinator._send_offset_fetch_request.assert_called_with(partitions) + assert coordinator._client.poll.call_count == 1 + + # Failed future is raised if not retriable + coordinator._send_offset_fetch_request.return_value = Future().failure(AssertionError) + coordinator._client.poll.reset_mock() + try: + coordinator.fetch_committed_offsets(partitions) + except AssertionError: + pass + else: + assert False, 'Exception not raised when expected' + assert coordinator._client.poll.call_count == 1 + + coordinator._client.poll.reset_mock() + coordinator._send_offset_fetch_request.side_effect = [ + Future().failure(Errors.RequestTimedOutError), + Future().success('fizzbuzz')] + + ret = coordinator.fetch_committed_offsets(partitions) + assert ret == 'fizzbuzz' + assert coordinator._client.poll.call_count == 2 # call + retry + + +def test_close(mocker, coordinator): + mocker.patch.object(coordinator, '_maybe_auto_commit_offsets_sync') + mocker.patch.object(coordinator, '_handle_leave_group_response') + mocker.patch.object(coordinator, 'coordinator_unknown', return_value=False) + coordinator.coordinator_id = 0 + coordinator.generation = 1 + cli = coordinator._client + mocker.patch.object(cli, 'unschedule') + mocker.patch.object(cli, 'send', return_value=Future().success('foobar')) + mocker.patch.object(cli, 'poll') + + coordinator.close() + assert coordinator._maybe_auto_commit_offsets_sync.call_count == 1 + cli.unschedule.assert_called_with(coordinator.heartbeat_task) + coordinator._handle_leave_group_response.assert_called_with('foobar') + + assert coordinator.generation == -1 + assert coordinator.member_id == '' + assert coordinator.rejoin_needed is True + + +@pytest.fixture +def offsets(): + return { + TopicPartition('foobar', 0): OffsetAndMetadata(123, b''), + TopicPartition('foobar', 1): OffsetAndMetadata(234, b''), + } + + +def test_commit_offsets_async(mocker, coordinator, offsets): + mocker.patch.object(coordinator._client, 'poll') + mocker.patch.object(coordinator, 'ensure_coordinator_known') + mocker.patch.object(coordinator, '_send_offset_commit_request', + return_value=Future().success('fizzbuzz')) + ret = coordinator.commit_offsets_async(offsets) + assert isinstance(ret, Future) + assert coordinator._send_offset_commit_request.call_count == 1 + + +def test_commit_offsets_sync(mocker, coordinator, offsets): + mocker.patch.object(coordinator, 'ensure_coordinator_known') + mocker.patch.object(coordinator, '_send_offset_commit_request', + return_value=Future().success('fizzbuzz')) + cli = coordinator._client + mocker.patch.object(cli, 'poll') + + # No offsets, no calls + assert coordinator.commit_offsets_sync({}) is None + assert coordinator._send_offset_commit_request.call_count == 0 + assert cli.poll.call_count == 0 + + ret = coordinator.commit_offsets_sync(offsets) + assert coordinator._send_offset_commit_request.call_count == 1 + assert cli.poll.call_count == 1 + assert ret == 'fizzbuzz' + + # Failed future is raised if not retriable + coordinator._send_offset_commit_request.return_value = Future().failure(AssertionError) + coordinator._client.poll.reset_mock() + try: + coordinator.commit_offsets_sync(offsets) + except 
AssertionError: + pass + else: + assert False, 'Exception not raised when expected' + assert coordinator._client.poll.call_count == 1 + + coordinator._client.poll.reset_mock() + coordinator._send_offset_commit_request.side_effect = [ + Future().failure(Errors.RequestTimedOutError), + Future().success('fizzbuzz')] + + ret = coordinator.commit_offsets_sync(offsets) + assert ret == 'fizzbuzz' + assert coordinator._client.poll.call_count == 2 # call + retry + + +@pytest.mark.parametrize( + 'api_version,enable,error,task_disable,commit_offsets,warn,exc', [ + ((0, 8), True, None, False, False, False, False), + ((0, 9), False, None, False, False, False, False), + ((0, 9), True, Errors.UnknownMemberIdError(), True, True, True, False), + ((0, 9), True, Errors.IllegalGenerationError(), True, True, True, False), + ((0, 9), True, Errors.RebalanceInProgressError(), True, True, True, False), + ((0, 9), True, Exception(), True, True, False, True), + ((0, 9), True, None, True, True, False, False), + ]) +def test_maybe_auto_commit_offsets_sync(mocker, coordinator, + api_version, enable, error, task_disable, + commit_offsets, warn, exc): + auto_commit_task = mocker.patch.object(coordinator, '_auto_commit_task') + commit_sync = mocker.patch.object(coordinator, 'commit_offsets_sync', + side_effect=error) + mock_warn = mocker.patch('kafka.coordinator.consumer.log.warning') + mock_exc = mocker.patch('kafka.coordinator.consumer.log.exception') + + coordinator.config['api_version'] = api_version + coordinator.config['enable_auto_commit'] = enable + assert coordinator._maybe_auto_commit_offsets_sync() is None + assert auto_commit_task.disable.call_count == (1 if task_disable else 0) + assert commit_sync.call_count == (1 if commit_offsets else 0) + assert mock_warn.call_count == (1 if warn else 0) + assert mock_exc.call_count == (1 if exc else 0) + + +@pytest.fixture +def patched_coord(mocker, coordinator): + coordinator._subscription.subscribe(topics=['foobar']) + coordinator._subscription.needs_partition_assignment = False + mocker.patch.object(coordinator, 'coordinator_unknown') + coordinator.coordinator_unknown.return_value = False + coordinator.coordinator_id = 0 + mocker.patch.object(coordinator._client, 'least_loaded_node', + return_value=1) + mocker.patch.object(coordinator._client, 'ready', return_value=True) + mocker.patch.object(coordinator._client, 'send') + mocker.spy(coordinator, '_failed_request') + mocker.spy(coordinator, '_handle_offset_commit_response') + mocker.spy(coordinator, '_handle_offset_fetch_response') + return coordinator + + +def test_send_offset_commit_request_fail(patched_coord, offsets): + patched_coord.coordinator_unknown.return_value = True + patched_coord.coordinator_id = None + + # No offsets + ret = patched_coord._send_offset_commit_request({}) + assert isinstance(ret, Future) + assert ret.succeeded() + + # No coordinator + ret = patched_coord._send_offset_commit_request(offsets) + assert ret.failed() + assert isinstance(ret.exception, Errors.GroupCoordinatorNotAvailableError) + + +@pytest.mark.parametrize('api_version,req_type', [ + ((0, 8, 1), OffsetCommitRequest_v0), + ((0, 8, 2), OffsetCommitRequest_v1), + ((0, 9), OffsetCommitRequest_v2)]) +def test_send_offset_commit_request_versions(patched_coord, offsets, + api_version, req_type): + # assuming fixture sets coordinator=0, least_loaded_node=1 + expect_node = 0 if api_version >= (0, 8, 2) else 1 + patched_coord.config['api_version'] = api_version + + patched_coord._send_offset_commit_request(offsets) + (node, request), _ = 
patched_coord._client.send.call_args + assert node == expect_node, 'Unexpected coordinator node' + assert isinstance(request, req_type) + + +def test_send_offset_commit_request_failure(patched_coord, offsets): + _f = Future() + patched_coord._client.send.return_value = _f + future = patched_coord._send_offset_commit_request(offsets) + (node, request), _ = patched_coord._client.send.call_args + error = Exception() + _f.failure(error) + patched_coord._failed_request.assert_called_with(0, request, future, error) + assert future.failed() + assert future.exception is error + + +def test_send_offset_commit_request_success(patched_coord, offsets): + _f = Future() + patched_coord._client.send.return_value = _f + future = patched_coord._send_offset_commit_request(offsets) + (node, request), _ = patched_coord._client.send.call_args + response = OffsetCommitResponse([('foobar', [(0, 0), (1, 0)])]) + _f.success(response) + patched_coord._handle_offset_commit_response.assert_called_with( + offsets, future, response) + + +@pytest.mark.parametrize('response,error,dead,reassign', [ + (OffsetCommitResponse([('foobar', [(0, 30), (1, 30)])]), + Errors.GroupAuthorizationFailedError, False, False), + (OffsetCommitResponse([('foobar', [(0, 12), (1, 12)])]), + Errors.OffsetMetadataTooLargeError, False, False), + (OffsetCommitResponse([('foobar', [(0, 28), (1, 28)])]), + Errors.InvalidCommitOffsetSizeError, False, False), + (OffsetCommitResponse([('foobar', [(0, 14), (1, 14)])]), + Errors.GroupLoadInProgressError, False, False), + (OffsetCommitResponse([('foobar', [(0, 15), (1, 15)])]), + Errors.GroupCoordinatorNotAvailableError, True, False), + (OffsetCommitResponse([('foobar', [(0, 16), (1, 16)])]), + Errors.NotCoordinatorForGroupError, True, False), + (OffsetCommitResponse([('foobar', [(0, 7), (1, 7)])]), + Errors.RequestTimedOutError, True, False), + (OffsetCommitResponse([('foobar', [(0, 25), (1, 25)])]), + Errors.UnknownMemberIdError, False, True), + (OffsetCommitResponse([('foobar', [(0, 22), (1, 22)])]), + Errors.IllegalGenerationError, False, True), + (OffsetCommitResponse([('foobar', [(0, 27), (1, 27)])]), + Errors.RebalanceInProgressError, False, True), + (OffsetCommitResponse([('foobar', [(0, 17), (1, 17)])]), + Errors.InvalidTopicError, False, False), + (OffsetCommitResponse([('foobar', [(0, 29), (1, 29)])]), + Errors.TopicAuthorizationFailedError, False, False), +]) +def test_handle_offset_commit_response(patched_coord, offsets, + response, error, dead, reassign): + future = Future() + patched_coord._handle_offset_commit_response(offsets, future, response) + assert isinstance(future.exception, error) + assert patched_coord.coordinator_id is (None if dead else 0) + assert patched_coord._subscription.needs_partition_assignment is reassign + + +@pytest.fixture +def partitions(): + return [TopicPartition('foobar', 0), TopicPartition('foobar', 1)] + + +def test_send_offset_fetch_request_fail(patched_coord, partitions): + patched_coord.coordinator_unknown.return_value = True + patched_coord.coordinator_id = None + + # No partitions + ret = patched_coord._send_offset_fetch_request([]) + assert isinstance(ret, Future) + assert ret.succeeded() + assert ret.value == {} + + # No coordinator + ret = patched_coord._send_offset_fetch_request(partitions) + assert ret.failed() + assert isinstance(ret.exception, Errors.GroupCoordinatorNotAvailableError) + + +@pytest.mark.parametrize('api_version,req_type', [ + ((0, 8, 1), OffsetFetchRequest_v0), + ((0, 8, 2), OffsetFetchRequest_v1), + ((0, 9), 
OffsetFetchRequest_v1)]) +def test_send_offset_fetch_request_versions(patched_coord, partitions, + api_version, req_type): + # assuming fixture sets coordinator=0, least_loaded_node=1 + expect_node = 0 if api_version >= (0, 8, 2) else 1 + patched_coord.config['api_version'] = api_version + + patched_coord._send_offset_fetch_request(partitions) + (node, request), _ = patched_coord._client.send.call_args + assert node == expect_node, 'Unexpected coordinator node' + assert isinstance(request, req_type) + + +def test_send_offset_fetch_request_failure(patched_coord, partitions): + _f = Future() + patched_coord._client.send.return_value = _f + future = patched_coord._send_offset_fetch_request(partitions) + (node, request), _ = patched_coord._client.send.call_args + error = Exception() + _f.failure(error) + patched_coord._failed_request.assert_called_with(0, request, future, error) + assert future.failed() + assert future.exception is error + + +def test_send_offset_fetch_request_success(patched_coord, partitions): + _f = Future() + patched_coord._client.send.return_value = _f + future = patched_coord._send_offset_fetch_request(partitions) + (node, request), _ = patched_coord._client.send.call_args + response = OffsetFetchResponse([('foobar', [(0, 0), (1, 0)])]) + _f.success(response) + patched_coord._handle_offset_fetch_response.assert_called_with( + future, response) + + +@pytest.mark.parametrize('response,error,dead,reassign', [ + #(OffsetFetchResponse([('foobar', [(0, 123, b'', 30), (1, 234, b'', 30)])]), + # Errors.GroupAuthorizationFailedError, False, False), + #(OffsetFetchResponse([('foobar', [(0, 123, b'', 7), (1, 234, b'', 7)])]), + # Errors.RequestTimedOutError, True, False), + #(OffsetFetchResponse([('foobar', [(0, 123, b'', 27), (1, 234, b'', 27)])]), + # Errors.RebalanceInProgressError, False, True), + (OffsetFetchResponse([('foobar', [(0, 123, b'', 14), (1, 234, b'', 14)])]), + Errors.GroupLoadInProgressError, False, False), + (OffsetFetchResponse([('foobar', [(0, 123, b'', 16), (1, 234, b'', 16)])]), + Errors.NotCoordinatorForGroupError, True, False), + (OffsetFetchResponse([('foobar', [(0, 123, b'', 25), (1, 234, b'', 25)])]), + Errors.UnknownMemberIdError, False, True), + (OffsetFetchResponse([('foobar', [(0, 123, b'', 22), (1, 234, b'', 22)])]), + Errors.IllegalGenerationError, False, True), + (OffsetFetchResponse([('foobar', [(0, 123, b'', 29), (1, 234, b'', 29)])]), + Errors.TopicAuthorizationFailedError, False, False), + (OffsetFetchResponse([('foobar', [(0, 123, b'', 0), (1, 234, b'', 0)])]), + None, False, False), +]) +def test_handle_offset_fetch_response(patched_coord, offsets, + response, error, dead, reassign): + future = Future() + patched_coord._handle_offset_fetch_response(future, response) + if error is not None: + assert isinstance(future.exception, error) + else: + assert future.succeeded() + assert future.value == offsets + assert patched_coord.coordinator_id is (None if dead else 0) + assert patched_coord._subscription.needs_partition_assignment is reassign diff -Nru python-kafka-python-0.9.2/test/test_failover_integration.py python-kafka-python-1.0.1/test/test_failover_integration.py --- python-kafka-python-0.9.2/test/test_failover_integration.py 2014-08-27 21:24:39.000000000 +0000 +++ python-kafka-python-1.0.1/test/test_failover_integration.py 2016-02-17 18:37:58.000000000 +0000 @@ -1,57 +1,73 @@ import logging import os import time -import unittest2 -from kafka import * # noqa -from kafka.common import * # noqa -from kafka.producer import Producer -from 
fixtures import ZookeeperFixture, KafkaFixture -from testutil import * +from kafka import SimpleClient, SimpleConsumer, KeyedProducer +from kafka.common import ( + TopicPartition, FailedPayloadsError, ConnectionError, RequestTimedOutError +) +from kafka.producer.base import Producer + +from test.fixtures import ZookeeperFixture, KafkaFixture +from test.testutil import KafkaIntegrationTestCase, random_string + + +log = logging.getLogger(__name__) class TestFailover(KafkaIntegrationTestCase): create_client = False - @classmethod - def setUpClass(cls): # noqa + def setUp(self): if not os.environ.get('KAFKA_VERSION'): - return + self.skipTest('integration test requires KAFKA_VERSION') zk_chroot = random_string(10) - replicas = 2 - partitions = 2 - - # mini zookeeper, 2 kafka brokers - cls.zk = ZookeeperFixture.instance() - kk_args = [cls.zk.host, cls.zk.port, zk_chroot, replicas, partitions] - cls.brokers = [KafkaFixture.instance(i, *kk_args) for i in range(replicas)] + replicas = 3 + partitions = 3 - hosts = ['%s:%d' % (b.host, b.port) for b in cls.brokers] - cls.client = KafkaClient(hosts) + # mini zookeeper, 3 kafka brokers + self.zk = ZookeeperFixture.instance() + kk_args = [self.zk.host, self.zk.port] + kk_kwargs = {'zk_chroot': zk_chroot, 'replicas': replicas, + 'partitions': partitions} + self.brokers = [KafkaFixture.instance(i, *kk_args, **kk_kwargs) + for i in range(replicas)] + + hosts = ['%s:%d' % (b.host, b.port) for b in self.brokers] + self.client = SimpleClient(hosts, timeout=2) + super(TestFailover, self).setUp() - @classmethod - def tearDownClass(cls): + def tearDown(self): + super(TestFailover, self).tearDown() if not os.environ.get('KAFKA_VERSION'): return - cls.client.close() - for broker in cls.brokers: + self.client.close() + for broker in self.brokers: broker.close() - cls.zk.close() + self.zk.close() - @kafka_versions("all") def test_switch_leader(self): - key, topic, partition = random_string(5), self.topic, 0 + topic = self.topic + partition = 0 - # Test the base class Producer -- send_messages to a specific partition - producer = Producer(self.client, async=False) + # Testing the base Producer class here so that we can easily send + # messages to a specific partition, kill the leader for that partition + # and check that after another broker takes leadership the producer + # is able to resume sending messages + + # require that the server commit messages to all in-sync replicas + # so that failover doesn't lose any messages on server-side + # and we can assert that server-side message count equals client-side + producer = Producer(self.client, async=False, + req_acks=Producer.ACK_AFTER_CLUSTER_COMMIT) - # Send 10 random messages - self._send_random_messages(producer, topic, partition, 10) + # Send 100 random messages to a specific partition + self._send_random_messages(producer, topic, partition, 100) # kill leader for partition - broker = self._kill_leader(topic, partition) + self._kill_leader(topic, partition) # expect failure, but dont wait more than 60 secs to recover recovered = False @@ -59,88 +75,164 @@ timeout = 60 while not recovered and (time.time() - started) < timeout: try: - logging.debug("attempting to send 'success' message after leader killed") - producer.send_messages(topic, partition, 'success') - logging.debug("success!") + log.debug("attempting to send 'success' message after leader killed") + producer.send_messages(topic, partition, b'success') + log.debug("success!") recovered = True - except FailedPayloadsError, ConnectionError: - 
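
The rewritten failover test above encodes the client-side recipe for surviving a leader change with the old Producer API: require acknowledgement from all in-sync replicas and keep retrying the send until metadata points at the new leader. A condensed sketch of that retry loop, assuming a SimpleClient connected to a multi-broker cluster; the host list, topic, and timeout are illustrative:

import time

from kafka import SimpleClient
from kafka.common import FailedPayloadsError, ConnectionError, RequestTimedOutError
from kafka.producer.base import Producer

client = SimpleClient(['broker1:9092', 'broker2:9092'])    # illustrative hosts
producer = Producer(client, req_acks=Producer.ACK_AFTER_CLUSTER_COMMIT)

deadline = time.time() + 60                                # bounded retry window
while time.time() < deadline:
    try:
        producer.send_messages('my-topic', 0, b'payload')  # partition 0
        break                                              # the new leader acked the write
    except (FailedPayloadsError, ConnectionError, RequestTimedOutError):
        continue                                           # metadata refreshes, then retry

Requiring cluster-commit acks is what lets the test assert that the server-side message count matches what the client believes it sent.
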
logging.debug("caught exception sending message -- will retry") + except (FailedPayloadsError, ConnectionError, RequestTimedOutError): + log.debug("caught exception sending message -- will retry") continue # Verify we successfully sent the message self.assertTrue(recovered) # send some more messages to new leader - self._send_random_messages(producer, topic, partition, 10) + self._send_random_messages(producer, topic, partition, 100) # count number of messages - count = self._count_messages('test_switch_leader group', topic, - partitions=(partition,)) + # Should be equal to 100 before + 1 recovery + 100 after + # at_least=True because exactly once delivery isn't really a thing + self.assert_message_count(topic, 201, partitions=(partition,), + at_least=True) - # Should be equal to 10 before + 1 recovery + 10 after - self.assertEquals(count, 21) - - - #@kafka_versions("all") - @unittest2.skip("async producer does not support reliable failover yet") def test_switch_leader_async(self): - key, topic, partition = random_string(5), self.topic, 0 + topic = self.topic + partition = 0 # Test the base class Producer -- send_messages to a specific partition - producer = Producer(self.client, async=True) + producer = Producer(self.client, async=True, + batch_send_every_n=15, + batch_send_every_t=3, + req_acks=Producer.ACK_AFTER_CLUSTER_COMMIT, + async_log_messages_on_error=False) # Send 10 random messages self._send_random_messages(producer, topic, partition, 10) + self._send_random_messages(producer, topic, partition + 1, 10) # kill leader for partition - broker = self._kill_leader(topic, partition) + self._kill_leader(topic, partition) - logging.debug("attempting to send 'success' message after leader killed") + log.debug("attempting to send 'success' message after leader killed") # in async mode, this should return immediately - producer.send_messages(topic, partition, 'success') + producer.send_messages(topic, partition, b'success') + producer.send_messages(topic, partition + 1, b'success') # send to new leader self._send_random_messages(producer, topic, partition, 10) + self._send_random_messages(producer, topic, partition + 1, 10) - # wait until producer queue is empty - while not producer.queue.empty(): - time.sleep(0.1) + # Stop the producer and wait for it to shutdown producer.stop() + started = time.time() + timeout = 60 + while (time.time() - started) < timeout: + if not producer.thread.is_alive(): + break + time.sleep(0.1) + else: + self.fail('timeout waiting for producer queue to empty') # count number of messages - count = self._count_messages('test_switch_leader_async group', topic, - partitions=(partition,)) - # Should be equal to 10 before + 1 recovery + 10 after - self.assertEquals(count, 21) + # at_least=True because exactly once delivery isn't really a thing + self.assert_message_count(topic, 21, partitions=(partition,), + at_least=True) + self.assert_message_count(topic, 21, partitions=(partition + 1,), + at_least=True) + + def test_switch_leader_keyed_producer(self): + topic = self.topic + producer = KeyedProducer(self.client, async=False) + + # Send 10 random messages + for _ in range(10): + key = random_string(3).encode('utf-8') + msg = random_string(10).encode('utf-8') + producer.send_messages(topic, key, msg) + + # kill leader for partition 0 + self._kill_leader(topic, 0) + + recovered = False + started = time.time() + timeout = 60 + while not recovered and (time.time() - started) < timeout: + try: + key = random_string(3).encode('utf-8') + msg = 
random_string(10).encode('utf-8') + producer.send_messages(topic, key, msg) + if producer.partitioners[topic].partition(key) == 0: + recovered = True + except (FailedPayloadsError, ConnectionError): + log.debug("caught exception sending message -- will retry") + continue + + # Verify we successfully sent the message + self.assertTrue(recovered) + + # send some more messages just to make sure no more exceptions + for _ in range(10): + key = random_string(3).encode('utf-8') + msg = random_string(10).encode('utf-8') + producer.send_messages(topic, key, msg) + + def test_switch_leader_simple_consumer(self): + producer = Producer(self.client, async=False) + consumer = SimpleConsumer(self.client, None, self.topic, partitions=None, auto_commit=False, iter_timeout=10) + self._send_random_messages(producer, self.topic, 0, 2) + consumer.get_messages() + self._kill_leader(self.topic, 0) + consumer.get_messages() def _send_random_messages(self, producer, topic, partition, n): for j in range(n): - logging.debug('_send_random_message to %s:%d -- try %d', topic, partition, j) - resp = producer.send_messages(topic, partition, random_string(10)) - if len(resp) > 0: - self.assertEquals(resp[0].error, 0) - logging.debug('_send_random_message to %s:%d -- try %d success', topic, partition, j) + msg = 'msg {0}: {1}'.format(j, random_string(10)) + log.debug('_send_random_message %s to %s:%d', msg, topic, partition) + while True: + try: + producer.send_messages(topic, partition, msg.encode('utf-8')) + except: + log.exception('failure in _send_random_messages - retrying') + continue + else: + break def _kill_leader(self, topic, partition): - leader = self.client.topics_to_brokers[TopicAndPartition(topic, partition)] + leader = self.client.topics_to_brokers[TopicPartition(topic, partition)] broker = self.brokers[leader.nodeId] broker.close() return broker - def _count_messages(self, group, topic, timeout=1, partitions=None): + def assert_message_count(self, topic, check_count, timeout=10, + partitions=None, at_least=False): hosts = ','.join(['%s:%d' % (broker.host, broker.port) for broker in self.brokers]) - client = KafkaClient(hosts) - consumer = SimpleConsumer(client, group, topic, + client = SimpleClient(hosts) + consumer = SimpleConsumer(client, None, topic, partitions=partitions, auto_commit=False, iter_timeout=timeout) - count = consumer.pending(partitions) + started_at = time.time() + pending = -1 + while pending < check_count and (time.time() - started_at < timeout): + try: + pending = consumer.pending(partitions) + except FailedPayloadsError: + pass + time.sleep(0.5) + consumer.stop() client.close() - return count + + if pending < check_count: + self.fail('Too few pending messages: found %d, expected %d' % + (pending, check_count)) + elif pending > check_count and not at_least: + self.fail('Too many pending messages: found %d, expected %d' % + (pending, check_count)) + return True diff -Nru python-kafka-python-0.9.2/test/test_fetcher.py python-kafka-python-1.0.1/test/test_fetcher.py --- python-kafka-python-0.9.2/test/test_fetcher.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/test/test_fetcher.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,101 @@ +# pylint: skip-file +from __future__ import absolute_import + +import pytest + +from kafka.client_async import KafkaClient +from kafka.common import TopicPartition, OffsetAndMetadata +from kafka.consumer.fetcher import Fetcher +from kafka.consumer.subscription_state import SubscriptionState +from kafka.future import Future +from 
kafka.protocol.fetch import FetchRequest + +import kafka.common as Errors + + +@pytest.fixture +def client(mocker): + return mocker.Mock(spec=KafkaClient) + + +@pytest.fixture +def subscription_state(): + return SubscriptionState() + + +@pytest.fixture +def fetcher(client, subscription_state): + subscription_state.subscribe(topics=['foobar']) + assignment = [TopicPartition('foobar', i) for i in range(3)] + subscription_state.assign_from_subscribed(assignment) + for tp in assignment: + subscription_state.seek(tp, 0) + return Fetcher(client, subscription_state) + + +def test_init_fetches(fetcher, mocker): + fetch_requests = [ + FetchRequest(-1, fetcher.config['fetch_max_wait_ms'], + fetcher.config['fetch_min_bytes'], + [('foobar', [ + (0, 0, fetcher.config['max_partition_fetch_bytes']), + (1, 0, fetcher.config['max_partition_fetch_bytes']), + ])]), + FetchRequest(-1, fetcher.config['fetch_max_wait_ms'], + fetcher.config['fetch_min_bytes'], + [('foobar', [ + (2, 0, fetcher.config['max_partition_fetch_bytes']), + ])]) + ] + + mocker.patch.object(fetcher, '_create_fetch_requests', + return_value = dict(enumerate(fetch_requests))) + + fetcher._records.append('foobar') + ret = fetcher.init_fetches() + assert fetcher._create_fetch_requests.call_count == 0 + assert ret == [] + fetcher._records.clear() + + fetcher._iterator = 'foo' + ret = fetcher.init_fetches() + assert fetcher._create_fetch_requests.call_count == 0 + assert ret == [] + fetcher._iterator = None + + ret = fetcher.init_fetches() + for node, request in enumerate(fetch_requests): + fetcher._client.send.assert_any_call(node, request) + assert len(ret) == len(fetch_requests) + + +def test_update_fetch_positions(fetcher, mocker): + mocker.patch.object(fetcher, '_reset_offset') + partition = TopicPartition('foobar', 0) + + # unassigned partition + fetcher.update_fetch_positions([TopicPartition('fizzbuzz', 0)]) + assert fetcher._reset_offset.call_count == 0 + + # fetchable partition (has offset, not paused) + fetcher.update_fetch_positions([partition]) + assert fetcher._reset_offset.call_count == 0 + + # partition needs reset, no committed offset + fetcher._subscriptions.need_offset_reset(partition) + fetcher._subscriptions.assignment[partition].awaiting_reset = False + fetcher.update_fetch_positions([partition]) + fetcher._reset_offset.assert_called_with(partition) + assert fetcher._subscriptions.assignment[partition].awaiting_reset is True + fetcher.update_fetch_positions([partition]) + fetcher._reset_offset.assert_called_with(partition) + + # partition needs reset, has committed offset + fetcher._reset_offset.reset_mock() + fetcher._subscriptions.need_offset_reset(partition) + fetcher._subscriptions.assignment[partition].awaiting_reset = False + fetcher._subscriptions.assignment[partition].committed = 123 + mocker.patch.object(fetcher._subscriptions, 'seek') + fetcher.update_fetch_positions([partition]) + assert fetcher._reset_offset.call_count == 0 + fetcher._subscriptions.seek.assert_called_with(partition, 123) diff -Nru python-kafka-python-0.9.2/test/test_package.py python-kafka-python-1.0.1/test/test_package.py --- python-kafka-python-0.9.2/test/test_package.py 2014-08-27 21:24:39.000000000 +0000 +++ python-kafka-python-1.0.1/test/test_package.py 2016-02-17 18:37:58.000000000 +0000 @@ -1,29 +1,28 @@ -import unittest2 +from . 
import unittest -class TestPackage(unittest2.TestCase): + +class TestPackage(unittest.TestCase): def test_top_level_namespace(self): import kafka as kafka1 - self.assertEquals(kafka1.KafkaClient.__name__, "KafkaClient") - self.assertEquals(kafka1.client.__name__, "kafka.client") - self.assertEquals(kafka1.codec.__name__, "kafka.codec") + self.assertEqual(kafka1.KafkaConsumer.__name__, "KafkaConsumer") + self.assertEqual(kafka1.consumer.__name__, "kafka.consumer") + self.assertEqual(kafka1.codec.__name__, "kafka.codec") def test_submodule_namespace(self): import kafka.client as client1 - self.assertEquals(client1.__name__, "kafka.client") - self.assertEquals(client1.KafkaClient.__name__, "KafkaClient") + self.assertEqual(client1.__name__, "kafka.client") from kafka import client as client2 - self.assertEquals(client2.__name__, "kafka.client") - self.assertEquals(client2.KafkaClient.__name__, "KafkaClient") + self.assertEqual(client2.__name__, "kafka.client") - from kafka.client import KafkaClient as KafkaClient1 - self.assertEquals(KafkaClient1.__name__, "KafkaClient") + from kafka.client import SimpleClient as SimpleClient1 + self.assertEqual(SimpleClient1.__name__, "SimpleClient") from kafka.codec import gzip_encode as gzip_encode1 - self.assertEquals(gzip_encode1.__name__, "gzip_encode") + self.assertEqual(gzip_encode1.__name__, "gzip_encode") - from kafka import KafkaClient as KafkaClient2 - self.assertEquals(KafkaClient2.__name__, "KafkaClient") + from kafka import SimpleClient as SimpleClient2 + self.assertEqual(SimpleClient2.__name__, "SimpleClient") from kafka.codec import snappy_encode - self.assertEquals(snappy_encode.__name__, "snappy_encode") + self.assertEqual(snappy_encode.__name__, "snappy_encode") diff -Nru python-kafka-python-0.9.2/test/test_partitioner.py python-kafka-python-1.0.1/test/test_partitioner.py --- python-kafka-python-0.9.2/test/test_partitioner.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/test/test_partitioner.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,43 @@ +import pytest +import six + +from kafka.partitioner import Murmur2Partitioner +from kafka.partitioner.default import DefaultPartitioner + + +def test_default_partitioner(): + partitioner = DefaultPartitioner() + all_partitions = list(range(100)) + available = all_partitions + # partitioner should return the same partition for the same key + p1 = partitioner(b'foo', all_partitions, available) + p2 = partitioner(b'foo', all_partitions, available) + assert p1 == p2 + assert p1 in all_partitions + + # when key is None, choose one of available partitions + assert partitioner(None, all_partitions, [123]) == 123 + + # with fallback to all_partitions + assert partitioner(None, all_partitions, []) in all_partitions + + +def test_hash_bytes(): + p = Murmur2Partitioner(range(1000)) + assert p.partition(bytearray(b'test')) == p.partition(b'test') + + +def test_hash_encoding(): + p = Murmur2Partitioner(range(1000)) + assert p.partition('test') == p.partition(u'test') + + +def test_murmur2_java_compatibility(): + p = Murmur2Partitioner(range(1000)) + # compare with output from Kafka's org.apache.kafka.clients.producer.Partitioner + assert p.partition(b'') == 681 + assert p.partition(b'a') == 524 + assert p.partition(b'ab') == 434 + assert p.partition(b'abc') == 107 + assert p.partition(b'123456789') == 566 + assert p.partition(b'\x00 ') == 742 diff -Nru python-kafka-python-0.9.2/test/test_producer_integration.py python-kafka-python-1.0.1/test/test_producer_integration.py --- 
python-kafka-python-0.9.2/test/test_producer_integration.py 2014-08-27 21:24:39.000000000 +0000 +++ python-kafka-python-1.0.1/test/test_producer_integration.py 2016-02-17 18:37:58.000000000 +0000 @@ -2,14 +2,25 @@ import time import uuid -from kafka import * # noqa -from kafka.common import * # noqa -from kafka.codec import has_gzip, has_snappy -from fixtures import ZookeeperFixture, KafkaFixture -from testutil import * +from six.moves import range + +from kafka import ( + SimpleProducer, KeyedProducer, + create_message, create_gzip_message, create_snappy_message, + RoundRobinPartitioner, HashedPartitioner +) +from kafka.codec import has_snappy +from kafka.common import ( + FetchRequestPayload, ProduceRequestPayload, + UnknownTopicOrPartitionError, LeaderNotAvailableError +) +from kafka.producer.base import Producer + +from test.fixtures import ZookeeperFixture, KafkaFixture +from test.testutil import KafkaIntegrationTestCase, kafka_versions + class TestKafkaProducerIntegration(KafkaIntegrationTestCase): - topic = 'produce_topic' @classmethod def setUpClass(cls): # noqa @@ -27,38 +38,40 @@ cls.server.close() cls.zk.close() - @kafka_versions("all") def test_produce_many_simple(self): start_offset = self.current_offset(self.topic, 0) self.assert_produce_request( - [ create_message("Test message %d" % i) for i in range(100) ], + [create_message(("Test message %d" % i).encode('utf-8')) + for i in range(100)], start_offset, 100, ) self.assert_produce_request( - [ create_message("Test message %d" % i) for i in range(100) ], + [create_message(("Test message %d" % i).encode('utf-8')) + for i in range(100)], start_offset+100, 100, ) - @kafka_versions("all") def test_produce_10k_simple(self): start_offset = self.current_offset(self.topic, 0) self.assert_produce_request( - [ create_message("Test message %d" % i) for i in range(10000) ], + [create_message(("Test message %d" % i).encode('utf-8')) + for i in range(10000)], start_offset, 10000, ) - @kafka_versions("all") def test_produce_many_gzip(self): start_offset = self.current_offset(self.topic, 0) - message1 = create_gzip_message(["Gzipped 1 %d" % i for i in range(100)]) - message2 = create_gzip_message(["Gzipped 2 %d" % i for i in range(100)]) + message1 = create_gzip_message([ + (("Gzipped 1 %d" % i).encode('utf-8'), None) for i in range(100)]) + message2 = create_gzip_message([ + (("Gzipped 2 %d" % i).encode('utf-8'), None) for i in range(100)]) self.assert_produce_request( [ message1, message2 ], @@ -66,49 +79,51 @@ 200, ) - @kafka_versions("all") def test_produce_many_snappy(self): self.skipTest("All snappy integration tests fail with nosnappyjava") start_offset = self.current_offset(self.topic, 0) self.assert_produce_request([ - create_snappy_message(["Snappy 1 %d" % i for i in range(100)]), - create_snappy_message(["Snappy 2 %d" % i for i in range(100)]), + create_snappy_message([("Snappy 1 %d" % i, None) for i in range(100)]), + create_snappy_message([("Snappy 2 %d" % i, None) for i in range(100)]), ], start_offset, 200, ) - @kafka_versions("all") def test_produce_mixed(self): start_offset = self.current_offset(self.topic, 0) msg_count = 1+100 messages = [ - create_message("Just a plain message"), - create_gzip_message(["Gzipped %d" % i for i in range(100)]), + create_message(b"Just a plain message"), + create_gzip_message([ + (("Gzipped %d" % i).encode('utf-8'), None) for i in range(100)]), ] # All snappy integration tests fail with nosnappyjava if False and has_snappy(): msg_count += 100 - messages.append(create_snappy_message(["Snappy 
%d" % i for i in range(100)])) + messages.append(create_snappy_message([("Snappy %d" % i, None) for i in range(100)])) self.assert_produce_request(messages, start_offset, msg_count) - @kafka_versions("all") def test_produce_100k_gzipped(self): start_offset = self.current_offset(self.topic, 0) self.assert_produce_request([ - create_gzip_message(["Gzipped batch 1, message %d" % i for i in range(50000)]) + create_gzip_message([ + (("Gzipped batch 1, message %d" % i).encode('utf-8'), None) + for i in range(50000)]) ], start_offset, 50000, ) self.assert_produce_request([ - create_gzip_message(["Gzipped batch 1, message %d" % i for i in range(50000)]) + create_gzip_message([ + (("Gzipped batch 1, message %d" % i).encode('utf-8'), None) + for i in range(50000)]) ], start_offset+50000, 50000, @@ -118,42 +133,38 @@ # SimpleProducer Tests # ############################ - @kafka_versions("all") - def test_simple_producer(self): - start_offset0 = self.current_offset(self.topic, 0) - start_offset1 = self.current_offset(self.topic, 1) + def test_simple_producer_new_topic(self): producer = SimpleProducer(self.client) + resp = producer.send_messages('new_topic', self.msg('foobar')) + self.assert_produce_response(resp, 0) + producer.stop() + + def test_simple_producer(self): + partitions = self.client.get_partition_ids_for_topic(self.topic) + start_offsets = [self.current_offset(self.topic, p) for p in partitions] + + producer = SimpleProducer(self.client, random_start=False) # Goes to first partition, randomly. resp = producer.send_messages(self.topic, self.msg("one"), self.msg("two")) - self.assert_produce_response(resp, start_offset0) + self.assert_produce_response(resp, start_offsets[0]) # Goes to the next partition, randomly. resp = producer.send_messages(self.topic, self.msg("three")) - self.assert_produce_response(resp, start_offset1) + self.assert_produce_response(resp, start_offsets[1]) - self.assert_fetch_offset(0, start_offset0, [ self.msg("one"), self.msg("two") ]) - self.assert_fetch_offset(1, start_offset1, [ self.msg("three") ]) + self.assert_fetch_offset(partitions[0], start_offsets[0], [ self.msg("one"), self.msg("two") ]) + self.assert_fetch_offset(partitions[1], start_offsets[1], [ self.msg("three") ]) # Goes back to the first partition because there's only two partitions resp = producer.send_messages(self.topic, self.msg("four"), self.msg("five")) - self.assert_produce_response(resp, start_offset0+2) - self.assert_fetch_offset(0, start_offset0, [ self.msg("one"), self.msg("two"), self.msg("four"), self.msg("five") ]) + self.assert_produce_response(resp, start_offsets[0]+2) + self.assert_fetch_offset(partitions[0], start_offsets[0], [ self.msg("one"), self.msg("two"), self.msg("four"), self.msg("five") ]) producer.stop() - @kafka_versions("all") - def test_produce__new_topic_fails_with_reasonable_error(self): - new_topic = 'new_topic_{guid}'.format(guid = str(uuid.uuid4())) - producer = SimpleProducer(self.client) - - # At first it doesn't exist - with self.assertRaises(UnknownTopicOrPartitionError): - resp = producer.send_messages(new_topic, self.msg("one")) - - @kafka_versions("all") def test_producer_random_order(self): - producer = SimpleProducer(self.client, random_start = True) + producer = SimpleProducer(self.client, random_start=True) resp1 = producer.send_messages(self.topic, self.msg("one"), self.msg("two")) resp2 = producer.send_messages(self.topic, self.msg("three")) resp3 = producer.send_messages(self.topic, self.msg("four"), self.msg("five")) @@ -161,9 +172,8 @@ 
self.assertEqual(resp1[0].partition, resp3[0].partition) self.assertNotEqual(resp1[0].partition, resp2[0].partition) - @kafka_versions("all") def test_producer_ordered_start(self): - producer = SimpleProducer(self.client, random_start = False) + producer = SimpleProducer(self.client, random_start=False) resp1 = producer.send_messages(self.topic, self.msg("one"), self.msg("two")) resp2 = producer.send_messages(self.topic, self.msg("three")) resp3 = producer.send_messages(self.topic, self.msg("four"), self.msg("five")) @@ -172,102 +182,37 @@ self.assertEqual(resp2[0].partition, 1) self.assertEqual(resp3[0].partition, 0) - @kafka_versions("all") - def test_round_robin_partitioner(self): - start_offset0 = self.current_offset(self.topic, 0) - start_offset1 = self.current_offset(self.topic, 1) - - producer = KeyedProducer(self.client, partitioner=RoundRobinPartitioner) - resp1 = producer.send(self.topic, "key1", self.msg("one")) - resp2 = producer.send(self.topic, "key2", self.msg("two")) - resp3 = producer.send(self.topic, "key3", self.msg("three")) - resp4 = producer.send(self.topic, "key4", self.msg("four")) - - self.assert_produce_response(resp1, start_offset0+0) - self.assert_produce_response(resp2, start_offset1+0) - self.assert_produce_response(resp3, start_offset0+1) - self.assert_produce_response(resp4, start_offset1+1) - - self.assert_fetch_offset(0, start_offset0, [ self.msg("one"), self.msg("three") ]) - self.assert_fetch_offset(1, start_offset1, [ self.msg("two"), self.msg("four") ]) - - producer.stop() - - @kafka_versions("all") - def test_hashed_partitioner(self): - start_offset0 = self.current_offset(self.topic, 0) - start_offset1 = self.current_offset(self.topic, 1) - - producer = KeyedProducer(self.client, partitioner=HashedPartitioner) - resp1 = producer.send(self.topic, 1, self.msg("one")) - resp2 = producer.send(self.topic, 2, self.msg("two")) - resp3 = producer.send(self.topic, 3, self.msg("three")) - resp4 = producer.send(self.topic, 3, self.msg("four")) - resp5 = producer.send(self.topic, 4, self.msg("five")) - - self.assert_produce_response(resp1, start_offset1+0) - self.assert_produce_response(resp2, start_offset0+0) - self.assert_produce_response(resp3, start_offset1+1) - self.assert_produce_response(resp4, start_offset1+2) - self.assert_produce_response(resp5, start_offset0+1) - - self.assert_fetch_offset(0, start_offset0, [ self.msg("two"), self.msg("five") ]) - self.assert_fetch_offset(1, start_offset1, [ self.msg("one"), self.msg("three"), self.msg("four") ]) - - producer.stop() - - @kafka_versions("all") - def test_acks_none(self): - start_offset0 = self.current_offset(self.topic, 0) - start_offset1 = self.current_offset(self.topic, 1) + def test_async_simple_producer(self): + partition = self.client.get_partition_ids_for_topic(self.topic)[0] + start_offset = self.current_offset(self.topic, partition) - producer = SimpleProducer(self.client, req_acks=SimpleProducer.ACK_NOT_REQUIRED) + producer = SimpleProducer(self.client, async=True, random_start=False) resp = producer.send_messages(self.topic, self.msg("one")) - self.assertEquals(len(resp), 0) + self.assertEqual(len(resp), 0) - self.assert_fetch_offset(0, start_offset0, [ self.msg("one") ]) + # flush messages producer.stop() - @kafka_versions("all") - def test_acks_local_write(self): - start_offset0 = self.current_offset(self.topic, 0) - start_offset1 = self.current_offset(self.topic, 1) - - producer = SimpleProducer(self.client, req_acks=SimpleProducer.ACK_AFTER_LOCAL_WRITE) - resp = 
producer.send_messages(self.topic, self.msg("one")) + self.assert_fetch_offset(partition, start_offset, [ self.msg("one") ]) - self.assert_produce_response(resp, start_offset0) - self.assert_fetch_offset(0, start_offset0, [ self.msg("one") ]) - producer.stop() - - @kafka_versions("all") - def test_acks_cluster_commit(self): - start_offset0 = self.current_offset(self.topic, 0) - start_offset1 = self.current_offset(self.topic, 1) + def test_batched_simple_producer__triggers_by_message(self): + partitions = self.client.get_partition_ids_for_topic(self.topic) + start_offsets = [self.current_offset(self.topic, p) for p in partitions] + # Configure batch producer + batch_messages = 5 + batch_interval = 5 producer = SimpleProducer( self.client, - req_acks=SimpleProducer.ACK_AFTER_CLUSTER_COMMIT) - - resp = producer.send_messages(self.topic, self.msg("one")) - self.assert_produce_response(resp, start_offset0) - self.assert_fetch_offset(0, start_offset0, [ self.msg("one") ]) - - producer.stop() - - @kafka_versions("all") - def test_batched_simple_producer__triggers_by_message(self): - start_offset0 = self.current_offset(self.topic, 0) - start_offset1 = self.current_offset(self.topic, 1) - - producer = SimpleProducer(self.client, - batch_send=True, - batch_send_every_n=5, - batch_send_every_t=20) - - # Send 5 messages and do a fetch - resp = producer.send_messages(self.topic, + async=True, + batch_send_every_n=batch_messages, + batch_send_every_t=batch_interval, + random_start=False) + + # Send 4 messages -- should not trigger a batch + resp = producer.send_messages( + self.topic, self.msg("one"), self.msg("two"), self.msg("three"), @@ -275,48 +220,64 @@ ) # Batch mode is async. No ack - self.assertEquals(len(resp), 0) + self.assertEqual(len(resp), 0) # It hasn't sent yet - self.assert_fetch_offset(0, start_offset0, []) - self.assert_fetch_offset(1, start_offset1, []) + self.assert_fetch_offset(partitions[0], start_offsets[0], []) + self.assert_fetch_offset(partitions[1], start_offsets[1], []) - resp = producer.send_messages(self.topic, + # send 3 more messages -- should trigger batch on first 5 + resp = producer.send_messages( + self.topic, self.msg("five"), self.msg("six"), self.msg("seven"), ) # Batch mode is async. 
No ack - self.assertEquals(len(resp), 0) + self.assertEqual(len(resp), 0) - self.assert_fetch_offset(0, start_offset0, [ + # Wait until producer has pulled all messages from internal queue + # this should signal that the first batch was sent, and the producer + # is now waiting for enough messages to batch again (or a timeout) + timeout = 5 + start = time.time() + while not producer.queue.empty(): + if time.time() - start > timeout: + self.fail('timeout waiting for producer queue to empty') + time.sleep(0.1) + + # send messages groups all *msgs in a single call to the same partition + # so we should see all messages from the first call in one partition + self.assert_fetch_offset(partitions[0], start_offsets[0], [ self.msg("one"), self.msg("two"), self.msg("three"), self.msg("four"), ]) - self.assert_fetch_offset(1, start_offset1, [ + # Because we are batching every 5 messages, we should only see one + self.assert_fetch_offset(partitions[1], start_offsets[1], [ self.msg("five"), - # self.msg("six"), - # self.msg("seven"), ]) producer.stop() - @kafka_versions("all") def test_batched_simple_producer__triggers_by_time(self): - start_offset0 = self.current_offset(self.topic, 0) - start_offset1 = self.current_offset(self.topic, 1) + partitions = self.client.get_partition_ids_for_topic(self.topic) + start_offsets = [self.current_offset(self.topic, p) for p in partitions] - producer = SimpleProducer(self.client, - batch_send=True, - batch_send_every_n=100, - batch_send_every_t=5) + batch_interval = 5 + producer = SimpleProducer( + self.client, + async=True, + batch_send_every_n=100, + batch_send_every_t=batch_interval, + random_start=False) # Send 5 messages and do a fetch - resp = producer.send_messages(self.topic, + resp = producer.send_messages( + self.topic, self.msg("one"), self.msg("two"), self.msg("three"), @@ -324,11 +285,11 @@ ) # Batch mode is async. No ack - self.assertEquals(len(resp), 0) + self.assertEqual(len(resp), 0) # It hasn't sent yet - self.assert_fetch_offset(0, start_offset0, []) - self.assert_fetch_offset(1, start_offset1, []) + self.assert_fetch_offset(partitions[0], start_offsets[0], []) + self.assert_fetch_offset(partitions[1], start_offsets[1], []) resp = producer.send_messages(self.topic, self.msg("five"), @@ -337,19 +298,19 @@ ) # Batch mode is async. 
No ack - self.assertEquals(len(resp), 0) + self.assertEqual(len(resp), 0) # Wait the timeout out - time.sleep(5) + time.sleep(batch_interval) - self.assert_fetch_offset(0, start_offset0, [ + self.assert_fetch_offset(partitions[0], start_offsets[0], [ self.msg("one"), self.msg("two"), self.msg("three"), self.msg("four"), ]) - self.assert_fetch_offset(1, start_offset1, [ + self.assert_fetch_offset(partitions[1], start_offsets[1], [ self.msg("five"), self.msg("six"), self.msg("seven"), @@ -357,42 +318,162 @@ producer.stop() - @kafka_versions("all") - def test_async_simple_producer(self): - start_offset0 = self.current_offset(self.topic, 0) - start_offset1 = self.current_offset(self.topic, 1) - producer = SimpleProducer(self.client, async=True) - resp = producer.send_messages(self.topic, self.msg("one")) - self.assertEquals(len(resp), 0) + ############################ + # KeyedProducer Tests # + ############################ + + @kafka_versions('>=0.8.1') + def test_keyedproducer_null_payload(self): + partitions = self.client.get_partition_ids_for_topic(self.topic) + start_offsets = [self.current_offset(self.topic, p) for p in partitions] + + producer = KeyedProducer(self.client, partitioner=RoundRobinPartitioner) + key = "test" - self.assert_fetch_offset(0, start_offset0, [ self.msg("one") ]) + resp = producer.send_messages(self.topic, self.key("key1"), self.msg("one")) + self.assert_produce_response(resp, start_offsets[0]) + resp = producer.send_messages(self.topic, self.key("key2"), None) + self.assert_produce_response(resp, start_offsets[1]) + resp = producer.send_messages(self.topic, self.key("key3"), None) + self.assert_produce_response(resp, start_offsets[0]+1) + resp = producer.send_messages(self.topic, self.key("key4"), self.msg("four")) + self.assert_produce_response(resp, start_offsets[1]+1) + + self.assert_fetch_offset(partitions[0], start_offsets[0], [ self.msg("one"), None ]) + self.assert_fetch_offset(partitions[1], start_offsets[1], [ None, self.msg("four") ]) + + producer.stop() + + def test_round_robin_partitioner(self): + partitions = self.client.get_partition_ids_for_topic(self.topic) + start_offsets = [self.current_offset(self.topic, p) for p in partitions] + + producer = KeyedProducer(self.client, partitioner=RoundRobinPartitioner) + resp1 = producer.send_messages(self.topic, self.key("key1"), self.msg("one")) + resp2 = producer.send_messages(self.topic, self.key("key2"), self.msg("two")) + resp3 = producer.send_messages(self.topic, self.key("key3"), self.msg("three")) + resp4 = producer.send_messages(self.topic, self.key("key4"), self.msg("four")) + + self.assert_produce_response(resp1, start_offsets[0]+0) + self.assert_produce_response(resp2, start_offsets[1]+0) + self.assert_produce_response(resp3, start_offsets[0]+1) + self.assert_produce_response(resp4, start_offsets[1]+1) + + self.assert_fetch_offset(partitions[0], start_offsets[0], [ self.msg("one"), self.msg("three") ]) + self.assert_fetch_offset(partitions[1], start_offsets[1], [ self.msg("two"), self.msg("four") ]) + + producer.stop() + + def test_hashed_partitioner(self): + partitions = self.client.get_partition_ids_for_topic(self.topic) + start_offsets = [self.current_offset(self.topic, p) for p in partitions] + + producer = KeyedProducer(self.client, partitioner=HashedPartitioner) + resp1 = producer.send_messages(self.topic, self.key("1"), self.msg("one")) + resp2 = producer.send_messages(self.topic, self.key("2"), self.msg("two")) + resp3 = producer.send_messages(self.topic, self.key("3"), self.msg("three")) 
+ resp4 = producer.send_messages(self.topic, self.key("3"), self.msg("four")) + resp5 = producer.send_messages(self.topic, self.key("4"), self.msg("five")) + + offsets = {partitions[0]: start_offsets[0], partitions[1]: start_offsets[1]} + messages = {partitions[0]: [], partitions[1]: []} + + keys = [self.key(k) for k in ["1", "2", "3", "3", "4"]] + resps = [resp1, resp2, resp3, resp4, resp5] + msgs = [self.msg(m) for m in ["one", "two", "three", "four", "five"]] + + for key, resp, msg in zip(keys, resps, msgs): + k = hash(key) % 2 + partition = partitions[k] + offset = offsets[partition] + self.assert_produce_response(resp, offset) + offsets[partition] += 1 + messages[partition].append(msg) + + self.assert_fetch_offset(partitions[0], start_offsets[0], messages[partitions[0]]) + self.assert_fetch_offset(partitions[1], start_offsets[1], messages[partitions[1]]) producer.stop() - @kafka_versions("all") def test_async_keyed_producer(self): - start_offset0 = self.current_offset(self.topic, 0) - start_offset1 = self.current_offset(self.topic, 1) + partition = self.client.get_partition_ids_for_topic(self.topic)[0] + start_offset = self.current_offset(self.topic, partition) producer = KeyedProducer(self.client, partitioner = RoundRobinPartitioner, async=True) - resp = producer.send(self.topic, "key1", self.msg("one")) - self.assertEquals(len(resp), 0) + resp = producer.send_messages(self.topic, self.key("key1"), self.msg("one")) + self.assertEqual(len(resp), 0) + + # wait for the server to report a new highwatermark + while self.current_offset(self.topic, partition) == start_offset: + time.sleep(0.1) + + self.assert_fetch_offset(partition, start_offset, [ self.msg("one") ]) + + producer.stop() + + ############################ + # Producer ACK Tests # + ############################ + + def test_acks_none(self): + partition = self.client.get_partition_ids_for_topic(self.topic)[0] + start_offset = self.current_offset(self.topic, partition) + + producer = Producer( + self.client, + req_acks=Producer.ACK_NOT_REQUIRED, + ) + resp = producer.send_messages(self.topic, partition, self.msg("one")) + + # No response from produce request with no acks required + self.assertEqual(len(resp), 0) + + # But the message should still have been delivered + self.assert_fetch_offset(partition, start_offset, [ self.msg("one") ]) + producer.stop() + + def test_acks_local_write(self): + partition = self.client.get_partition_ids_for_topic(self.topic)[0] + start_offset = self.current_offset(self.topic, partition) + + producer = Producer( + self.client, + req_acks=Producer.ACK_AFTER_LOCAL_WRITE, + ) + resp = producer.send_messages(self.topic, partition, self.msg("one")) + + self.assert_produce_response(resp, start_offset) + self.assert_fetch_offset(partition, start_offset, [ self.msg("one") ]) + + producer.stop() + + def test_acks_cluster_commit(self): + partition = self.client.get_partition_ids_for_topic(self.topic)[0] + start_offset = self.current_offset(self.topic, partition) + + producer = Producer( + self.client, + req_acks=Producer.ACK_AFTER_CLUSTER_COMMIT, + ) - self.assert_fetch_offset(0, start_offset0, [ self.msg("one") ]) + resp = producer.send_messages(self.topic, partition, self.msg("one")) + self.assert_produce_response(resp, start_offset) + self.assert_fetch_offset(partition, start_offset, [ self.msg("one") ]) producer.stop() - def assert_produce_request(self, messages, initial_offset, message_ct): - produce = ProduceRequest(self.topic, 0, messages=messages) + def assert_produce_request(self, messages, 
initial_offset, message_ct, + partition=0): + produce = ProduceRequestPayload(self.topic, partition, messages=messages) # There should only be one response message from the server. # This will throw an exception if there's more than one. resp = self.client.send_produce_request([ produce ]) self.assert_produce_response(resp, initial_offset) - self.assertEqual(self.current_offset(self.topic, 0), initial_offset + message_ct) + self.assertEqual(self.current_offset(self.topic, partition), initial_offset + message_ct) def assert_produce_response(self, resp, initial_offset): self.assertEqual(len(resp), 1) @@ -403,11 +484,11 @@ # There should only be one response message from the server. # This will throw an exception if there's more than one. - resp, = self.client.send_fetch_request([ FetchRequest(self.topic, partition, start_offset, 1024) ]) + resp, = self.client.send_fetch_request([FetchRequestPayload(self.topic, partition, start_offset, 1024)]) - self.assertEquals(resp.error, 0) - self.assertEquals(resp.partition, partition) + self.assertEqual(resp.error, 0) + self.assertEqual(resp.partition, partition) messages = [ x.message.value for x in resp.messages ] self.assertEqual(messages, expected_messages) - self.assertEquals(resp.highwaterMark, start_offset+len(expected_messages)) + self.assertEqual(resp.highwaterMark, start_offset+len(expected_messages)) diff -Nru python-kafka-python-0.9.2/test/test_producer_legacy.py python-kafka-python-1.0.1/test/test_producer_legacy.py --- python-kafka-python-0.9.2/test/test_producer_legacy.py 1970-01-01 00:00:00.000000000 +0000 +++ python-kafka-python-1.0.1/test/test_producer_legacy.py 2016-02-17 18:37:58.000000000 +0000 @@ -0,0 +1,257 @@ +# -*- coding: utf-8 -*- + +import collections +import logging +import threading +import time + +from mock import MagicMock, patch +from . 
import unittest + +from kafka import SimpleClient, SimpleProducer, KeyedProducer +from kafka.common import ( + AsyncProducerQueueFull, FailedPayloadsError, NotLeaderForPartitionError, + ProduceResponsePayload, RetryOptions, TopicPartition +) +from kafka.producer.base import Producer, _send_upstream +from kafka.protocol import CODEC_NONE + +from six.moves import queue, xrange + + +class TestKafkaProducer(unittest.TestCase): + def test_producer_message_types(self): + + producer = Producer(MagicMock()) + topic = b"test-topic" + partition = 0 + + bad_data_types = (u'你怎么样?', 12, ['a', 'list'], + ('a', 'tuple'), {'a': 'dict'}, None,) + for m in bad_data_types: + with self.assertRaises(TypeError): + logging.debug("attempting to send message of type %s", type(m)) + producer.send_messages(topic, partition, m) + + good_data_types = (b'a string!',) + for m in good_data_types: + # This should not raise an exception + producer.send_messages(topic, partition, m) + + def test_keyedproducer_message_types(self): + client = MagicMock() + client.get_partition_ids_for_topic.return_value = [0, 1] + producer = KeyedProducer(client) + topic = b"test-topic" + key = b"testkey" + + bad_data_types = (u'你怎么样?', 12, ['a', 'list'], + ('a', 'tuple'), {'a': 'dict'},) + for m in bad_data_types: + with self.assertRaises(TypeError): + logging.debug("attempting to send message of type %s", type(m)) + producer.send_messages(topic, key, m) + + good_data_types = (b'a string!', None,) + for m in good_data_types: + # This should not raise an exception + producer.send_messages(topic, key, m) + + def test_topic_message_types(self): + client = MagicMock() + + def partitions(topic): + return [0, 1] + + client.get_partition_ids_for_topic = partitions + + producer = SimpleProducer(client, random_start=False) + topic = b"test-topic" + producer.send_messages(topic, b'hi') + assert client.send_produce_request.called + + @patch('kafka.producer.base._send_upstream') + def test_producer_async_queue_overfilled(self, mock): + queue_size = 2 + producer = Producer(MagicMock(), async=True, + async_queue_maxsize=queue_size) + + topic = b'test-topic' + partition = 0 + message = b'test-message' + + with self.assertRaises(AsyncProducerQueueFull): + message_list = [message] * (queue_size + 1) + producer.send_messages(topic, partition, *message_list) + self.assertEqual(producer.queue.qsize(), queue_size) + for _ in xrange(producer.queue.qsize()): + producer.queue.get() + + def test_producer_sync_fail_on_error(self): + error = FailedPayloadsError('failure') + with patch.object(SimpleClient, 'load_metadata_for_topics'): + with patch.object(SimpleClient, 'ensure_topic_exists'): + with patch.object(SimpleClient, 'get_partition_ids_for_topic', return_value=[0, 1]): + with patch.object(SimpleClient, '_send_broker_aware_request', return_value = [error]): + + client = SimpleClient(MagicMock()) + producer = SimpleProducer(client, async=False, sync_fail_on_error=False) + + # This should not raise + (response,) = producer.send_messages('foobar', b'test message') + self.assertEqual(response, error) + + producer = SimpleProducer(client, async=False, sync_fail_on_error=True) + with self.assertRaises(FailedPayloadsError): + producer.send_messages('foobar', b'test message') + + def test_cleanup_is_not_called_on_stopped_producer(self): + producer = Producer(MagicMock(), async=True) + producer.stopped = True + with patch.object(producer, 'stop') as mocked_stop: + producer._cleanup_func(producer) + self.assertEqual(mocked_stop.call_count, 0) + + def 
test_cleanup_is_called_on_running_producer(self): + producer = Producer(MagicMock(), async=True) + producer.stopped = False + with patch.object(producer, 'stop') as mocked_stop: + producer._cleanup_func(producer) + self.assertEqual(mocked_stop.call_count, 1) + + +class TestKafkaProducerSendUpstream(unittest.TestCase): + + def setUp(self): + self.client = MagicMock() + self.queue = queue.Queue() + + def _run_process(self, retries_limit=3, sleep_timeout=1): + # run _send_upstream process with the queue + stop_event = threading.Event() + retry_options = RetryOptions(limit=retries_limit, + backoff_ms=50, + retry_on_timeouts=False) + self.thread = threading.Thread( + target=_send_upstream, + args=(self.queue, self.client, CODEC_NONE, + 0.3, # batch time (seconds) + 3, # batch length + Producer.ACK_AFTER_LOCAL_WRITE, + Producer.DEFAULT_ACK_TIMEOUT, + retry_options, + stop_event)) + self.thread.daemon = True + self.thread.start() + time.sleep(sleep_timeout) + stop_event.set() + + def test_wo_retries(self): + + # lets create a queue and add 10 messages for 1 partition + for i in range(10): + self.queue.put((TopicPartition("test", 0), "msg %i", "key %i")) + + self._run_process() + + # the queue should be void at the end of the test + self.assertEqual(self.queue.empty(), True) + + # there should be 4 non-void cals: + # 3 batches of 3 msgs each + 1 batch of 1 message + self.assertEqual(self.client.send_produce_request.call_count, 4) + + def test_first_send_failed(self): + + # lets create a queue and add 10 messages for 10 different partitions + # to show how retries should work ideally + for i in range(10): + self.queue.put((TopicPartition("test", i), "msg %i", "key %i")) + + # Mock offsets counter for closure + offsets = collections.defaultdict(lambda: collections.defaultdict(lambda: 0)) + self.client.is_first_time = True + def send_side_effect(reqs, *args, **kwargs): + if self.client.is_first_time: + self.client.is_first_time = False + return [FailedPayloadsError(req) for req in reqs] + responses = [] + for req in reqs: + offset = offsets[req.topic][req.partition] + offsets[req.topic][req.partition] += len(req.messages) + responses.append( + ProduceResponsePayload(req.topic, req.partition, 0, offset) + ) + return responses + + self.client.send_produce_request.side_effect = send_side_effect + + self._run_process(2) + + # the queue should be void at the end of the test + self.assertEqual(self.queue.empty(), True) + + # there should be 5 non-void calls: 1st failed batch of 3 msgs + # plus 3 batches of 3 msgs each + 1 batch of 1 message + self.assertEqual(self.client.send_produce_request.call_count, 5) + + def test_with_limited_retries(self): + + # lets create a queue and add 10 messages for 10 different partitions + # to show how retries should work ideally + for i in range(10): + self.queue.put((TopicPartition("test", i), "msg %i" % i, "key %i" % i)) + + def send_side_effect(reqs, *args, **kwargs): + return [FailedPayloadsError(req) for req in reqs] + + self.client.send_produce_request.side_effect = send_side_effect + + self._run_process(3, 3) + + # the queue should be void at the end of the test + self.assertEqual(self.queue.empty(), True) + + # there should be 16 non-void calls: + # 3 initial batches of 3 msgs each + 1 initial batch of 1 msg + + # 3 retries of the batches above = (1 + 3 retries) * 4 batches = 16 + self.assertEqual(self.client.send_produce_request.call_count, 16) + + def test_async_producer_not_leader(self): + + for i in range(10): + self.queue.put((TopicPartition("test", i), "msg 
%i", "key %i")) + + # Mock offsets counter for closure + offsets = collections.defaultdict(lambda: collections.defaultdict(lambda: 0)) + self.client.is_first_time = True + def send_side_effect(reqs, *args, **kwargs): + if self.client.is_first_time: + self.client.is_first_time = False + return [ProduceResponsePayload(req.topic, req.partition, + NotLeaderForPartitionError.errno, -1) + for req in reqs] + + responses = [] + for req in reqs: + offset = offsets[req.topic][req.partition] + offsets[req.topic][req.partition] += len(req.messages) + responses.append( + ProduceResponsePayload(req.topic, req.partition, 0, offset) + ) + return responses + + self.client.send_produce_request.side_effect = send_side_effect + + self._run_process(2) + + # the queue should be void at the end of the test + self.assertEqual(self.queue.empty(), True) + + # there should be 5 non-void calls: 1st failed batch of 3 msgs + # + 3 batches of 3 msgs each + 1 batch of 1 msg = 1 + 3 + 1 = 5 + self.assertEqual(self.client.send_produce_request.call_count, 5) + + def tearDown(self): + for _ in xrange(self.queue.qsize()): + self.queue.get() diff -Nru python-kafka-python-0.9.2/test/test_producer.py python-kafka-python-1.0.1/test/test_producer.py --- python-kafka-python-0.9.2/test/test_producer.py 2014-08-27 21:24:39.000000000 +0000 +++ python-kafka-python-1.0.1/test/test_producer.py 2016-02-17 18:37:58.000000000 +0000 @@ -1,31 +1,48 @@ -# -*- coding: utf-8 -*- +import sys -import logging -import os -import random -import struct -import unittest2 - -from mock import MagicMock, patch - -from kafka import KafkaClient -from kafka.producer import Producer - -class TestKafkaProducer(unittest2.TestCase): - def test_producer_message_types(self): - - producer = Producer(MagicMock()) - topic = "test-topic" - partition = 0 - - bad_data_types = (u'你怎么样?', 12, ['a','list'], ('a','tuple'), {'a': 'dict'}) - for m in bad_data_types: - with self.assertRaises(TypeError): - logging.debug("attempting to send message of type %s", type(m)) - producer.send_messages(topic, partition, m) - - good_data_types = ('a string!',) - for m in good_data_types: - # This should not raise an exception - producer.send_messages(topic, partition, m) +import pytest +from kafka import KafkaConsumer, KafkaProducer +from test.conftest import version +from test.testutil import random_string + + +@pytest.mark.skipif(not version(), reason="No KAFKA_VERSION set") +@pytest.mark.parametrize("compression", [None, 'gzip', 'snappy', 'lz4']) +def test_end_to_end(kafka_broker, compression): + + if compression == 'lz4': + # LZ4 requires 0.8.2 + if version() < (0, 8, 2): + return + # LZ4 python libs dont work on python2.6 + elif sys.version_info < (2, 7): + return + + connect_str = 'localhost:' + str(kafka_broker.port) + producer = KafkaProducer(bootstrap_servers=connect_str, + max_block_ms=10000, + compression_type=compression, + value_serializer=str.encode) + consumer = KafkaConsumer(bootstrap_servers=connect_str, + group_id=None, + consumer_timeout_ms=10000, + auto_offset_reset='earliest', + value_deserializer=bytes.decode) + + topic = random_string(5) + + for i in range(1000): + producer.send(topic, 'msg %d' % i) + producer.flush() + producer.close() + + consumer.subscribe([topic]) + msgs = set() + for i in range(1000): + try: + msgs.add(next(consumer).value) + except StopIteration: + break + + assert msgs == set(['msg %d' % i for i in range(1000)]) diff -Nru python-kafka-python-0.9.2/test/test_protocol.py python-kafka-python-1.0.1/test/test_protocol.py --- 
python-kafka-python-0.9.2/test/test_protocol.py 2014-08-27 21:24:39.000000000 +0000 +++ python-kafka-python-1.0.1/test/test_protocol.py 2016-02-17 18:37:58.000000000 +0000 @@ -1,33 +1,29 @@ -import contextlib +#pylint: skip-file from contextlib import contextmanager import struct -import unittest2 -import mock -from mock import sentinel +import six +from mock import patch, sentinel +from . import unittest -from kafka import KafkaClient +from kafka.codec import has_snappy, gzip_decode, snappy_decode from kafka.common import ( - OffsetRequest, OffsetCommitRequest, OffsetFetchRequest, - OffsetResponse, OffsetCommitResponse, OffsetFetchResponse, - ProduceRequest, FetchRequest, Message, ChecksumError, - ConsumerFetchSizeTooSmall, ProduceResponse, FetchResponse, OffsetAndMessage, - BrokerMetadata, PartitionMetadata, TopicAndPartition, KafkaUnavailableError, - ProtocolError, LeaderUnavailableError, PartitionUnavailableError, - UnsupportedCodecError + OffsetRequestPayload, OffsetResponsePayload, + OffsetCommitRequestPayload, OffsetCommitResponsePayload, + OffsetFetchRequestPayload, OffsetFetchResponsePayload, + ProduceRequestPayload, ProduceResponsePayload, + FetchRequestPayload, FetchResponsePayload, + Message, ChecksumError, OffsetAndMessage, BrokerMetadata, + KafkaUnavailableError, UnsupportedCodecError, ConsumerFetchSizeTooSmall, + ProtocolError, ConsumerMetadataResponse ) -from kafka.codec import ( - has_snappy, gzip_encode, gzip_decode, - snappy_encode, snappy_decode -) -import kafka.protocol from kafka.protocol import ( ATTRIBUTE_CODEC_MASK, CODEC_NONE, CODEC_GZIP, CODEC_SNAPPY, KafkaProtocol, create_message, create_gzip_message, create_snappy_message, create_message_set ) -class TestProtocol(unittest2.TestCase): +class TestProtocol(unittest.TestCase): def test_create_message(self): payload = "test" key = "key" @@ -38,21 +34,21 @@ self.assertEqual(msg.value, payload) def test_create_gzip(self): - payloads = ["v1", "v2"] + payloads = [(b"v1", None), (b"v2", None)] msg = create_gzip_message(payloads) self.assertEqual(msg.magic, 0) self.assertEqual(msg.attributes, ATTRIBUTE_CODEC_MASK & CODEC_GZIP) self.assertEqual(msg.key, None) # Need to decode to check since gzipped payload is non-deterministic decoded = gzip_decode(msg.value) - expect = "".join([ + expect = b"".join([ struct.pack(">q", 0), # MsgSet offset struct.pack(">i", 16), # MsgSet size struct.pack(">i", 1285512130), # CRC struct.pack(">bb", 0, 0), # Magic, flags struct.pack(">i", -1), # -1 indicates a null key struct.pack(">i", 2), # Msg length (bytes) - "v1", # Message contents + b"v1", # Message contents struct.pack(">q", 0), # MsgSet offset struct.pack(">i", 16), # MsgSet size @@ -60,27 +56,57 @@ struct.pack(">bb", 0, 0), # Magic, flags struct.pack(">i", -1), # -1 indicates a null key struct.pack(">i", 2), # Msg length (bytes) - "v2", # Message contents + b"v2", # Message contents ]) self.assertEqual(decoded, expect) - @unittest2.skipUnless(has_snappy(), "Snappy not available") + def test_create_gzip_keyed(self): + payloads = [(b"v1", b"k1"), (b"v2", b"k2")] + msg = create_gzip_message(payloads) + self.assertEqual(msg.magic, 0) + self.assertEqual(msg.attributes, ATTRIBUTE_CODEC_MASK & CODEC_GZIP) + self.assertEqual(msg.key, None) + # Need to decode to check since gzipped payload is non-deterministic + decoded = gzip_decode(msg.value) + expect = b"".join([ + struct.pack(">q", 0), # MsgSet Offset + struct.pack(">i", 18), # Msg Size + struct.pack(">i", 1474775406), # CRC + struct.pack(">bb", 0, 0), # Magic, flags + struct.pack(">i", 
2), # Length of key + b"k1", # Key + struct.pack(">i", 2), # Length of value + b"v1", # Value + + struct.pack(">q", 0), # MsgSet Offset + struct.pack(">i", 18), # Msg Size + struct.pack(">i", -16383415), # CRC + struct.pack(">bb", 0, 0), # Magic, flags + struct.pack(">i", 2), # Length of key + b"k2", # Key + struct.pack(">i", 2), # Length of value + b"v2", # Value + ]) + + self.assertEqual(decoded, expect) + + @unittest.skipUnless(has_snappy(), "Snappy not available") def test_create_snappy(self): - payloads = ["v1", "v2"] + payloads = [(b"v1", None), (b"v2", None)] msg = create_snappy_message(payloads) self.assertEqual(msg.magic, 0) self.assertEqual(msg.attributes, ATTRIBUTE_CODEC_MASK & CODEC_SNAPPY) self.assertEqual(msg.key, None) decoded = snappy_decode(msg.value) - expect = "".join([ + expect = b"".join([ struct.pack(">q", 0), # MsgSet offset struct.pack(">i", 16), # MsgSet size struct.pack(">i", 1285512130), # CRC struct.pack(">bb", 0, 0), # Magic, flags struct.pack(">i", -1), # -1 indicates a null key struct.pack(">i", 2), # Msg length (bytes) - "v1", # Message contents + b"v1", # Message contents struct.pack(">q", 0), # MsgSet offset struct.pack(">i", 16), # MsgSet size @@ -88,105 +114,138 @@ struct.pack(">bb", 0, 0), # Magic, flags struct.pack(">i", -1), # -1 indicates a null key struct.pack(">i", 2), # Msg length (bytes) - "v2", # Message contents + b"v2", # Message contents + ]) + + self.assertEqual(decoded, expect) + + @unittest.skipUnless(has_snappy(), "Snappy not available") + def test_create_snappy_keyed(self): + payloads = [(b"v1", b"k1"), (b"v2", b"k2")] + msg = create_snappy_message(payloads) + self.assertEqual(msg.magic, 0) + self.assertEqual(msg.attributes, ATTRIBUTE_CODEC_MASK & CODEC_SNAPPY) + self.assertEqual(msg.key, None) + decoded = snappy_decode(msg.value) + expect = b"".join([ + struct.pack(">q", 0), # MsgSet Offset + struct.pack(">i", 18), # Msg Size + struct.pack(">i", 1474775406), # CRC + struct.pack(">bb", 0, 0), # Magic, flags + struct.pack(">i", 2), # Length of key + b"k1", # Key + struct.pack(">i", 2), # Length of value + b"v1", # Value + + struct.pack(">q", 0), # MsgSet Offset + struct.pack(">i", 18), # Msg Size + struct.pack(">i", -16383415), # CRC + struct.pack(">bb", 0, 0), # Magic, flags + struct.pack(">i", 2), # Length of key + b"k2", # Key + struct.pack(">i", 2), # Length of value + b"v2", # Value ]) self.assertEqual(decoded, expect) def test_encode_message_header(self): - expect = "".join([ + expect = b"".join([ struct.pack(">h", 10), # API Key struct.pack(">h", 0), # API Version struct.pack(">i", 4), # Correlation Id struct.pack(">h", len("client3")), # Length of clientId - "client3", # ClientId + b"client3", # ClientId ]) - encoded = KafkaProtocol._encode_message_header("client3", 4, 10) + encoded = KafkaProtocol._encode_message_header(b"client3", 4, 10) self.assertEqual(encoded, expect) def test_encode_message(self): - message = create_message("test", "key") + message = create_message(b"test", b"key") encoded = KafkaProtocol._encode_message(message) - expect = "".join([ + expect = b"".join([ struct.pack(">i", -1427009701), # CRC struct.pack(">bb", 0, 0), # Magic, flags struct.pack(">i", 3), # Length of key - "key", # key + b"key", # key struct.pack(">i", 4), # Length of value - "test", # value + b"test", # value ]) self.assertEqual(encoded, expect) + @unittest.skip('needs updating for new protocol classes') def test_decode_message(self): - encoded = "".join([ + encoded = b"".join([ struct.pack(">i", -1427009701), # CRC struct.pack(">bb", 0, 0), 
# Magic, flags struct.pack(">i", 3), # Length of key - "key", # key + b"key", # key struct.pack(">i", 4), # Length of value - "test", # value + b"test", # value ]) offset = 10 (returned_offset, decoded_message) = list(KafkaProtocol._decode_message(encoded, offset))[0] self.assertEqual(returned_offset, offset) - self.assertEqual(decoded_message, create_message("test", "key")) + self.assertEqual(decoded_message, create_message(b"test", b"key")) def test_encode_message_failure(self): with self.assertRaises(ProtocolError): KafkaProtocol._encode_message(Message(1, 0, "key", "test")) + @unittest.skip('needs updating for new protocol classes') def test_encode_message_set(self): message_set = [ - create_message("v1", "k1"), - create_message("v2", "k2") + create_message(b"v1", b"k1"), + create_message(b"v2", b"k2") ] encoded = KafkaProtocol._encode_message_set(message_set) - expect = "".join([ + expect = b"".join([ struct.pack(">q", 0), # MsgSet Offset struct.pack(">i", 18), # Msg Size struct.pack(">i", 1474775406), # CRC struct.pack(">bb", 0, 0), # Magic, flags struct.pack(">i", 2), # Length of key - "k1", # Key + b"k1", # Key struct.pack(">i", 2), # Length of value - "v1", # Value + b"v1", # Value struct.pack(">q", 0), # MsgSet Offset struct.pack(">i", 18), # Msg Size struct.pack(">i", -16383415), # CRC struct.pack(">bb", 0, 0), # Magic, flags struct.pack(">i", 2), # Length of key - "k2", # Key + b"k2", # Key struct.pack(">i", 2), # Length of value - "v2", # Value + b"v2", # Value ]) self.assertEqual(encoded, expect) + @unittest.skip('needs updating for new protocol classes') def test_decode_message_set(self): - encoded = "".join([ + encoded = b"".join([ struct.pack(">q", 0), # MsgSet Offset struct.pack(">i", 18), # Msg Size struct.pack(">i", 1474775406), # CRC struct.pack(">bb", 0, 0), # Magic, flags struct.pack(">i", 2), # Length of key - "k1", # Key + b"k1", # Key struct.pack(">i", 2), # Length of value - "v1", # Value + b"v1", # Value struct.pack(">q", 1), # MsgSet Offset struct.pack(">i", 18), # Msg Size struct.pack(">i", -16383415), # CRC struct.pack(">bb", 0, 0), # Magic, flags struct.pack(">i", 2), # Length of key - "k2", # Key + b"k2", # Key struct.pack(">i", 2), # Length of value - "v2", # Value + b"v2", # Value ]) msgs = list(KafkaProtocol._decode_message_set_iter(encoded)) @@ -197,17 +256,18 @@ returned_offset2, decoded_message2 = msg2 self.assertEqual(returned_offset1, 0) - self.assertEqual(decoded_message1, create_message("v1", "k1")) + self.assertEqual(decoded_message1, create_message(b"v1", b"k1")) self.assertEqual(returned_offset2, 1) - self.assertEqual(decoded_message2, create_message("v2", "k2")) + self.assertEqual(decoded_message2, create_message(b"v2", b"k2")) + @unittest.skip('needs updating for new protocol classes') def test_decode_message_gzip(self): - gzip_encoded = ('\xc0\x11\xb2\xf0\x00\x01\xff\xff\xff\xff\x00\x00\x000' - '\x1f\x8b\x08\x00\xa1\xc1\xc5R\x02\xffc`\x80\x03\x01' - '\x9f\xf9\xd1\x87\x18\x18\xfe\x03\x01\x90\xc7Tf\xc8' - '\x80$wu\x1aW\x05\x92\x9c\x11\x00z\xc0h\x888\x00\x00' - '\x00') + gzip_encoded = (b'\xc0\x11\xb2\xf0\x00\x01\xff\xff\xff\xff\x00\x00\x000' + b'\x1f\x8b\x08\x00\xa1\xc1\xc5R\x02\xffc`\x80\x03\x01' + b'\x9f\xf9\xd1\x87\x18\x18\xfe\x03\x01\x90\xc7Tf\xc8' + b'\x80$wu\x1aW\x05\x92\x9c\x11\x00z\xc0h\x888\x00\x00' + b'\x00') offset = 11 messages = list(KafkaProtocol._decode_message(gzip_encoded, offset)) @@ -216,18 +276,19 @@ returned_offset1, decoded_message1 = msg1 self.assertEqual(returned_offset1, 0) - self.assertEqual(decoded_message1, 
create_message("v1")) + self.assertEqual(decoded_message1, create_message(b"v1")) returned_offset2, decoded_message2 = msg2 self.assertEqual(returned_offset2, 0) - self.assertEqual(decoded_message2, create_message("v2")) + self.assertEqual(decoded_message2, create_message(b"v2")) - @unittest2.skipUnless(has_snappy(), "Snappy not available") + @unittest.skip('needs updating for new protocol classes') + @unittest.skipUnless(has_snappy(), "Snappy not available") def test_decode_message_snappy(self): - snappy_encoded = ('\xec\x80\xa1\x95\x00\x02\xff\xff\xff\xff\x00\x00' - '\x00,8\x00\x00\x19\x01@\x10L\x9f[\xc2\x00\x00\xff' - '\xff\xff\xff\x00\x00\x00\x02v1\x19\x1bD\x00\x10\xd5' - '\x96\nx\x00\x00\xff\xff\xff\xff\x00\x00\x00\x02v2') + snappy_encoded = (b'\xec\x80\xa1\x95\x00\x02\xff\xff\xff\xff\x00\x00' + b'\x00,8\x00\x00\x19\x01@\x10L\x9f[\xc2\x00\x00\xff' + b'\xff\xff\xff\x00\x00\x00\x02v1\x19\x1bD\x00\x10\xd5' + b'\x96\nx\x00\x00\xff\xff\xff\xff\x00\x00\x00\x02v2') offset = 11 messages = list(KafkaProtocol._decode_message(snappy_encoded, offset)) self.assertEqual(len(messages), 2) @@ -236,87 +297,93 @@ returned_offset1, decoded_message1 = msg1 self.assertEqual(returned_offset1, 0) - self.assertEqual(decoded_message1, create_message("v1")) + self.assertEqual(decoded_message1, create_message(b"v1")) returned_offset2, decoded_message2 = msg2 self.assertEqual(returned_offset2, 0) - self.assertEqual(decoded_message2, create_message("v2")) + self.assertEqual(decoded_message2, create_message(b"v2")) + @unittest.skip('needs updating for new protocol classes') def test_decode_message_checksum_error(self): - invalid_encoded_message = "This is not a valid encoded message" + invalid_encoded_message = b"This is not a valid encoded message" iter = KafkaProtocol._decode_message(invalid_encoded_message, 0) self.assertRaises(ChecksumError, list, iter) # NOTE: The error handling in _decode_message_set_iter() is questionable. # If it's modified, the next two tests might need to be fixed. 
+ @unittest.skip('needs updating for new protocol classes') def test_decode_message_set_fetch_size_too_small(self): with self.assertRaises(ConsumerFetchSizeTooSmall): list(KafkaProtocol._decode_message_set_iter('a')) + @unittest.skip('needs updating for new protocol classes') def test_decode_message_set_stop_iteration(self): - encoded = "".join([ + encoded = b"".join([ struct.pack(">q", 0), # MsgSet Offset struct.pack(">i", 18), # Msg Size struct.pack(">i", 1474775406), # CRC struct.pack(">bb", 0, 0), # Magic, flags struct.pack(">i", 2), # Length of key - "k1", # Key + b"k1", # Key struct.pack(">i", 2), # Length of value - "v1", # Value + b"v1", # Value struct.pack(">q", 1), # MsgSet Offset struct.pack(">i", 18), # Msg Size struct.pack(">i", -16383415), # CRC struct.pack(">bb", 0, 0), # Magic, flags struct.pack(">i", 2), # Length of key - "k2", # Key + b"k2", # Key struct.pack(">i", 2), # Length of value - "v2", # Value - "@1$%(Y!", # Random padding + b"v2", # Value + b"@1$%(Y!", # Random padding ]) - msgs = list(KafkaProtocol._decode_message_set_iter(encoded)) + msgs = MessageSet.decode(io.BytesIO(encoded)) self.assertEqual(len(msgs), 2) msg1, msg2 = msgs - returned_offset1, decoded_message1 = msg1 - returned_offset2, decoded_message2 = msg2 + returned_offset1, msg_size1, decoded_message1 = msg1 + returned_offset2, msg_size2, decoded_message2 = msg2 self.assertEqual(returned_offset1, 0) - self.assertEqual(decoded_message1, create_message("v1", "k1")) + self.assertEqual(decoded_message1.value, b"v1") + self.assertEqual(decoded_message1.key, b"k1") self.assertEqual(returned_offset2, 1) - self.assertEqual(decoded_message2, create_message("v2", "k2")) + self.assertEqual(decoded_message2.value, b"v2") + self.assertEqual(decoded_message2.key, b"k2") + @unittest.skip('needs updating for new protocol classes') def test_encode_produce_request(self): requests = [ - ProduceRequest("topic1", 0, [ - create_message("a"), - create_message("b") + ProduceRequestPayload("topic1", 0, [ + kafka.protocol.message.Message(b"a"), + kafka.protocol.message.Message(b"b") ]), - ProduceRequest("topic2", 1, [ - create_message("c") + ProduceRequestPayload("topic2", 1, [ + kafka.protocol.message.Message(b"c") ]) ] - msg_a_binary = KafkaProtocol._encode_message(create_message("a")) - msg_b_binary = KafkaProtocol._encode_message(create_message("b")) - msg_c_binary = KafkaProtocol._encode_message(create_message("c")) + msg_a_binary = KafkaProtocol._encode_message(create_message(b"a")) + msg_b_binary = KafkaProtocol._encode_message(create_message(b"b")) + msg_c_binary = KafkaProtocol._encode_message(create_message(b"c")) - header = "".join([ + header = b"".join([ struct.pack('>i', 0x94), # The length of the message overall struct.pack('>h', 0), # Msg Header, Message type = Produce struct.pack('>h', 0), # Msg Header, API version struct.pack('>i', 2), # Msg Header, Correlation ID - struct.pack('>h7s', 7, "client1"), # Msg Header, The client ID + struct.pack('>h7s', 7, b"client1"), # Msg Header, The client ID struct.pack('>h', 2), # Num acks required struct.pack('>i', 100), # Request Timeout struct.pack('>i', 2), # The number of requests ]) total_len = len(msg_a_binary) + len(msg_b_binary) - topic1 = "".join([ - struct.pack('>h6s', 6, 'topic1'), # The topic1 + topic1 = b"".join([ + struct.pack('>h6s', 6, b'topic1'), # The topic1 struct.pack('>i', 1), # One message set struct.pack('>i', 0), # Partition 0 struct.pack('>i', total_len + 24), # Size of the incoming message set @@ -328,8 +395,8 @@ msg_b_binary, # Actual message ]) - 
topic2 = "".join([ - struct.pack('>h6s', 6, 'topic2'), # The topic1 + topic2 = b"".join([ + struct.pack('>h6s', 6, b'topic2'), # The topic1 struct.pack('>i', 1), # One message set struct.pack('>i', 1), # Partition 1 struct.pack('>i', len(msg_c_binary) + 12), # Size of the incoming message set @@ -338,68 +405,75 @@ msg_c_binary, # Actual message ]) - expected1 = "".join([ header, topic1, topic2 ]) - expected2 = "".join([ header, topic2, topic1 ]) + expected1 = b"".join([ header, topic1, topic2 ]) + expected2 = b"".join([ header, topic2, topic1 ]) - encoded = KafkaProtocol.encode_produce_request("client1", 2, requests, 2, 100) + encoded = KafkaProtocol.encode_produce_request(b"client1", 2, requests, 2, 100) self.assertIn(encoded, [ expected1, expected2 ]) + @unittest.skip('needs updating for new protocol classes') def test_decode_produce_response(self): - t1 = "topic1" - t2 = "topic2" + t1 = b"topic1" + t2 = b"topic2" + _long = int + if six.PY2: + _long = long encoded = struct.pack('>iih%dsiihqihqh%dsiihq' % (len(t1), len(t2)), - 2, 2, len(t1), t1, 2, 0, 0, 10L, 1, 1, 20L, - len(t2), t2, 1, 0, 0, 30L) + 2, 2, len(t1), t1, 2, 0, 0, _long(10), 1, 1, _long(20), + len(t2), t2, 1, 0, 0, _long(30)) responses = list(KafkaProtocol.decode_produce_response(encoded)) self.assertEqual(responses, - [ProduceResponse(t1, 0, 0, 10L), - ProduceResponse(t1, 1, 1, 20L), - ProduceResponse(t2, 0, 0, 30L)]) + [ProduceResponse(t1, 0, 0, _long(10)), + ProduceResponse(t1, 1, 1, _long(20)), + ProduceResponse(t2, 0, 0, _long(30))]) + @unittest.skip('needs updating for new protocol classes') def test_encode_fetch_request(self): requests = [ - FetchRequest("topic1", 0, 10, 1024), - FetchRequest("topic2", 1, 20, 100), + FetchRequest(b"topic1", 0, 10, 1024), + FetchRequest(b"topic2", 1, 20, 100), ] - header = "".join([ + header = b"".join([ struct.pack('>i', 89), # The length of the message overall struct.pack('>h', 1), # Msg Header, Message type = Fetch struct.pack('>h', 0), # Msg Header, API version struct.pack('>i', 3), # Msg Header, Correlation ID - struct.pack('>h7s', 7, "client1"), # Msg Header, The client ID + struct.pack('>h7s', 7, b"client1"),# Msg Header, The client ID struct.pack('>i', -1), # Replica Id struct.pack('>i', 2), # Max wait time struct.pack('>i', 100), # Min bytes struct.pack('>i', 2), # Num requests ]) - topic1 = "".join([ - struct.pack('>h6s', 6, 'topic1'), # Topic + topic1 = b"".join([ + struct.pack('>h6s', 6, b'topic1'),# Topic struct.pack('>i', 1), # Num Payloads struct.pack('>i', 0), # Partition 0 struct.pack('>q', 10), # Offset struct.pack('>i', 1024), # Max Bytes ]) - topic2 = "".join([ - struct.pack('>h6s', 6, 'topic2'), # Topic + topic2 = b"".join([ + struct.pack('>h6s', 6, b'topic2'),# Topic struct.pack('>i', 1), # Num Payloads struct.pack('>i', 1), # Partition 0 struct.pack('>q', 20), # Offset struct.pack('>i', 100), # Max Bytes ]) - expected1 = "".join([ header, topic1, topic2 ]) - expected2 = "".join([ header, topic2, topic1 ]) + expected1 = b"".join([ header, topic1, topic2 ]) + expected2 = b"".join([ header, topic2, topic1 ]) - encoded = KafkaProtocol.encode_fetch_request("client1", 3, requests, 2, 100) + encoded = KafkaProtocol.encode_fetch_request(b"client1", 3, requests, 2, 100) self.assertIn(encoded, [ expected1, expected2 ]) + @unittest.skip('needs updating for new protocol classes') def test_decode_fetch_response(self): - t1 = "topic1" - t2 = "topic2" - msgs = map(create_message, ["message1", "hi", "boo", "foo", "so fun!"]) + t1 = b"topic1" + t2 = b"topic2" + msgs = 
[create_message(msg) + for msg in [b"message1", b"hi", b"boo", b"foo", b"so fun!"]] ms1 = KafkaProtocol._encode_message_set([msgs[0], msgs[1]]) ms2 = KafkaProtocol._encode_message_set([msgs[2]]) ms3 = KafkaProtocol._encode_message_set([msgs[3], msgs[4]]) @@ -412,130 +486,157 @@ responses = list(KafkaProtocol.decode_fetch_response(encoded)) def expand_messages(response): - return FetchResponse(response.topic, response.partition, - response.error, response.highwaterMark, - list(response.messages)) - - expanded_responses = map(expand_messages, responses) - expect = [FetchResponse(t1, 0, 0, 10, [OffsetAndMessage(0, msgs[0]), - OffsetAndMessage(0, msgs[1])]), - FetchResponse(t1, 1, 1, 20, [OffsetAndMessage(0, msgs[2])]), - FetchResponse(t2, 0, 0, 30, [OffsetAndMessage(0, msgs[3]), - OffsetAndMessage(0, msgs[4])])] + return FetchResponsePayload(response.topic, response.partition, + response.error, response.highwaterMark, + list(response.messages)) + + expanded_responses = list(map(expand_messages, responses)) + expect = [FetchResponsePayload(t1, 0, 0, 10, [OffsetAndMessage(0, msgs[0]), + OffsetAndMessage(0, msgs[1])]), + FetchResponsePayload(t1, 1, 1, 20, [OffsetAndMessage(0, msgs[2])]), + FetchResponsePayload(t2, 0, 0, 30, [OffsetAndMessage(0, msgs[3]), + OffsetAndMessage(0, msgs[4])])] self.assertEqual(expanded_responses, expect) + @unittest.skip('needs updating for new protocol classes') def test_encode_metadata_request_no_topics(self): - expected = "".join([ + expected = b"".join([ struct.pack(">i", 17), # Total length of the request struct.pack('>h', 3), # API key metadata fetch struct.pack('>h', 0), # API version struct.pack('>i', 4), # Correlation ID - struct.pack('>h3s', 3, "cid"), # The client ID + struct.pack('>h3s', 3, b"cid"),# The client ID struct.pack('>i', 0), # No topics, give all the data! 
]) - encoded = KafkaProtocol.encode_metadata_request("cid", 4) + encoded = KafkaProtocol.encode_metadata_request(b"cid", 4) self.assertEqual(encoded, expected) + @unittest.skip('needs updating for new protocol classes') def test_encode_metadata_request_with_topics(self): - expected = "".join([ + expected = b"".join([ struct.pack(">i", 25), # Total length of the request struct.pack('>h', 3), # API key metadata fetch struct.pack('>h', 0), # API version struct.pack('>i', 4), # Correlation ID - struct.pack('>h3s', 3, "cid"), # The client ID + struct.pack('>h3s', 3, b"cid"),# The client ID struct.pack('>i', 2), # Number of topics in the request - struct.pack('>h2s', 2, "t1"), # Topic "t1" - struct.pack('>h2s', 2, "t2"), # Topic "t2" + struct.pack('>h2s', 2, b"t1"), # Topic "t1" + struct.pack('>h2s', 2, b"t2"), # Topic "t2" ]) - encoded = KafkaProtocol.encode_metadata_request("cid", 4, ["t1", "t2"]) + encoded = KafkaProtocol.encode_metadata_request(b"cid", 4, [b"t1", b"t2"]) self.assertEqual(encoded, expected) - def _create_encoded_metadata_response(self, broker_data, topic_data, - topic_errors, partition_errors): - encoded = struct.pack('>ii', 3, len(broker_data)) - for node_id, broker in broker_data.iteritems(): - encoded += struct.pack('>ih%dsi' % len(broker.host), node_id, - len(broker.host), broker.host, broker.port) - - encoded += struct.pack('>i', len(topic_data)) - for topic, partitions in topic_data.iteritems(): - encoded += struct.pack('>hh%dsi' % len(topic), topic_errors[topic], - len(topic), topic, len(partitions)) - for partition, metadata in partitions.iteritems(): - encoded += struct.pack('>hiii', - partition_errors[(topic, partition)], - partition, metadata.leader, - len(metadata.replicas)) + def _create_encoded_metadata_response(self, brokers, topics): + encoded = [] + encoded.append(struct.pack('>ii', 3, len(brokers))) + for broker in brokers: + encoded.append(struct.pack('>ih%dsi' % len(broker.host), + broker.nodeId, len(broker.host), + broker.host, broker.port)) + + encoded.append(struct.pack('>i', len(topics))) + for topic in topics: + encoded.append(struct.pack('>hh%dsi' % len(topic.topic), + topic.error, len(topic.topic), + topic.topic, len(topic.partitions))) + for metadata in topic.partitions: + encoded.append(struct.pack('>hiii', metadata.error, + metadata.partition, metadata.leader, + len(metadata.replicas))) if len(metadata.replicas) > 0: - encoded += struct.pack('>%di' % len(metadata.replicas), - *metadata.replicas) + encoded.append(struct.pack('>%di' % len(metadata.replicas), + *metadata.replicas)) - encoded += struct.pack('>i', len(metadata.isr)) + encoded.append(struct.pack('>i', len(metadata.isr))) if len(metadata.isr) > 0: - encoded += struct.pack('>%di' % len(metadata.isr), - *metadata.isr) - - return encoded + encoded.append(struct.pack('>%di' % len(metadata.isr), + *metadata.isr)) + return b''.join(encoded) + @unittest.skip('needs updating for new protocol classes') def test_decode_metadata_response(self): - node_brokers = { - 0: BrokerMetadata(0, "brokers1.kafka.rdio.com", 1000), - 1: BrokerMetadata(1, "brokers1.kafka.rdio.com", 1001), - 3: BrokerMetadata(3, "brokers2.kafka.rdio.com", 1000) - } - - topic_partitions = { - "topic1": { - 0: PartitionMetadata("topic1", 0, 1, (0, 2), (2,)), - 1: PartitionMetadata("topic1", 1, 3, (0, 1), (0, 1)) - }, - "topic2": { - 0: PartitionMetadata("topic2", 0, 0, (), ()) - } - } - topic_errors = {"topic1": 0, "topic2": 1} - partition_errors = { - ("topic1", 0): 0, - ("topic1", 1): 1, - ("topic2", 0): 0 - } + node_brokers = [ 
+ BrokerMetadata(0, b"brokers1.kafka.rdio.com", 1000), + BrokerMetadata(1, b"brokers1.kafka.rdio.com", 1001), + BrokerMetadata(3, b"brokers2.kafka.rdio.com", 1000) + ] + + ''' + topic_partitions = [ + TopicMetadata(b"topic1", 0, [ + PartitionMetadata(b"topic1", 0, 1, (0, 2), (2,), 0), + PartitionMetadata(b"topic1", 1, 3, (0, 1), (0, 1), 1) + ]), + TopicMetadata(b"topic2", 1, [ + PartitionMetadata(b"topic2", 0, 0, (), (), 0), + ]), + ] encoded = self._create_encoded_metadata_response(node_brokers, - topic_partitions, - topic_errors, - partition_errors) + topic_partitions) decoded = KafkaProtocol.decode_metadata_response(encoded) self.assertEqual(decoded, (node_brokers, topic_partitions)) + ''' + + def test_encode_consumer_metadata_request(self): + expected = b"".join([ + struct.pack(">i", 17), # Total length of the request + struct.pack('>h', 10), # API key consumer metadata + struct.pack('>h', 0), # API version + struct.pack('>i', 4), # Correlation ID + struct.pack('>h3s', 3, b"cid"),# The client ID + struct.pack('>h2s', 2, b"g1"), # Group "g1" + ]) + + encoded = KafkaProtocol.encode_consumer_metadata_request(b"cid", 4, b"g1") + self.assertEqual(encoded, expected) + + def test_decode_consumer_metadata_response(self): + encoded = b"".join([ + struct.pack(">i", 42), # Correlation ID + struct.pack(">h", 0), # No Error + struct.pack(">i", 1), # Broker ID + struct.pack(">h23s", 23, b"brokers1.kafka.rdio.com"), # Broker Host + struct.pack(">i", 1000), # Broker Port + ]) + + results = KafkaProtocol.decode_consumer_metadata_response(encoded) + self.assertEqual(results, + ConsumerMetadataResponse(error = 0, nodeId = 1, host = b'brokers1.kafka.rdio.com', port = 1000) + ) + + @unittest.skip('needs updating for new protocol classes') def test_encode_offset_request(self): - expected = "".join([ + expected = b"".join([ struct.pack(">i", 21), # Total length of the request struct.pack('>h', 2), # Message type = offset fetch struct.pack('>h', 0), # API version struct.pack('>i', 4), # Correlation ID - struct.pack('>h3s', 3, "cid"), # The client ID + struct.pack('>h3s', 3, b"cid"), # The client ID struct.pack('>i', -1), # Replica Id struct.pack('>i', 0), # No topic/partitions ]) - encoded = KafkaProtocol.encode_offset_request("cid", 4) + encoded = KafkaProtocol.encode_offset_request(b"cid", 4) self.assertEqual(encoded, expected) + @unittest.skip('needs updating for new protocol classes') def test_encode_offset_request__no_payload(self): - expected = "".join([ + expected = b"".join([ struct.pack(">i", 65), # Total length of the request struct.pack('>h', 2), # Message type = offset fetch struct.pack('>h', 0), # API version struct.pack('>i', 4), # Correlation ID - struct.pack('>h3s', 3, "cid"), # The client ID + struct.pack('>h3s', 3, b"cid"), # The client ID struct.pack('>i', -1), # Replica Id struct.pack('>i', 1), # Num topics - struct.pack(">h6s", 6, "topic1"), # Topic for the request + struct.pack(">h6s", 6, b"topic1"),# Topic for the request struct.pack(">i", 2), # Two partitions struct.pack(">i", 3), # Partition 3 @@ -547,18 +648,19 @@ struct.pack(">i", 1), # One offset requested ]) - encoded = KafkaProtocol.encode_offset_request("cid", 4, [ - OffsetRequest('topic1', 3, -1, 1), - OffsetRequest('topic1', 4, -1, 1), + encoded = KafkaProtocol.encode_offset_request(b"cid", 4, [ + OffsetRequest(b'topic1', 3, -1, 1), + OffsetRequest(b'topic1', 4, -1, 1), ]) self.assertEqual(encoded, expected) + @unittest.skip('needs updating for new protocol classes') def test_decode_offset_response(self): - encoded = "".join([ 
+ encoded = b"".join([ struct.pack(">i", 42), # Correlation ID struct.pack(">i", 1), # One topics - struct.pack(">h6s", 6, "topic1"), # First topic + struct.pack(">h6s", 6, b"topic1"),# First topic struct.pack(">i", 2), # Two partitions struct.pack(">i", 2), # Partition 2 @@ -574,24 +676,25 @@ results = KafkaProtocol.decode_offset_response(encoded) self.assertEqual(set(results), set([ - OffsetResponse(topic = 'topic1', partition = 2, error = 0, offsets=(4,)), - OffsetResponse(topic = 'topic1', partition = 4, error = 0, offsets=(8,)), + OffsetResponse(topic = b'topic1', partition = 2, error = 0, offsets=(4,)), + OffsetResponse(topic = b'topic1', partition = 4, error = 0, offsets=(8,)), ])) + @unittest.skip('needs updating for new protocol classes') def test_encode_offset_commit_request(self): - header = "".join([ + header = b"".join([ struct.pack('>i', 99), # Total message length struct.pack('>h', 8), # Message type = offset commit struct.pack('>h', 0), # API version struct.pack('>i', 42), # Correlation ID - struct.pack('>h9s', 9, "client_id"), # The client ID - struct.pack('>h8s', 8, "group_id"), # The group to commit for + struct.pack('>h9s', 9, b"client_id"),# The client ID + struct.pack('>h8s', 8, b"group_id"), # The group to commit for struct.pack('>i', 2), # Num topics ]) - topic1 = "".join([ - struct.pack(">h6s", 6, "topic1"), # Topic for the request + topic1 = b"".join([ + struct.pack(">h6s", 6, b"topic1"), # Topic for the request struct.pack(">i", 2), # Two partitions struct.pack(">i", 0), # Partition 0 struct.pack(">q", 123), # Offset 123 @@ -601,30 +704,31 @@ struct.pack(">h", -1), # Null metadata ]) - topic2 = "".join([ - struct.pack(">h6s", 6, "topic2"), # Topic for the request + topic2 = b"".join([ + struct.pack(">h6s", 6, b"topic2"), # Topic for the request struct.pack(">i", 1), # One partition struct.pack(">i", 2), # Partition 2 struct.pack(">q", 345), # Offset 345 struct.pack(">h", -1), # Null metadata ]) - expected1 = "".join([ header, topic1, topic2 ]) - expected2 = "".join([ header, topic2, topic1 ]) + expected1 = b"".join([ header, topic1, topic2 ]) + expected2 = b"".join([ header, topic2, topic1 ]) - encoded = KafkaProtocol.encode_offset_commit_request("client_id", 42, "group_id", [ - OffsetCommitRequest("topic1", 0, 123, None), - OffsetCommitRequest("topic1", 1, 234, None), - OffsetCommitRequest("topic2", 2, 345, None), + encoded = KafkaProtocol.encode_offset_commit_request(b"client_id", 42, b"group_id", [ + OffsetCommitRequest(b"topic1", 0, 123, None), + OffsetCommitRequest(b"topic1", 1, 234, None), + OffsetCommitRequest(b"topic2", 2, 345, None), ]) self.assertIn(encoded, [ expected1, expected2 ]) + @unittest.skip('needs updating for new protocol classes') def test_decode_offset_commit_response(self): - encoded = "".join([ + encoded = b"".join([ struct.pack(">i", 42), # Correlation ID struct.pack(">i", 1), # One topic - struct.pack(">h6s", 6, "topic1"), # First topic + struct.pack(">h6s", 6, b"topic1"),# First topic struct.pack(">i", 2), # Two partitions struct.pack(">i", 2), # Partition 2 @@ -636,85 +740,84 @@ results = KafkaProtocol.decode_offset_commit_response(encoded) self.assertEqual(set(results), set([ - OffsetCommitResponse(topic = 'topic1', partition = 2, error = 0), - OffsetCommitResponse(topic = 'topic1', partition = 4, error = 0), + OffsetCommitResponse(topic = b'topic1', partition = 2, error = 0), + OffsetCommitResponse(topic = b'topic1', partition = 4, error = 0), ])) + @unittest.skip('needs updating for new protocol classes') def 
test_encode_offset_fetch_request(self): - header = "".join([ + header = b"".join([ struct.pack('>i', 69), # Total message length struct.pack('>h', 9), # Message type = offset fetch struct.pack('>h', 0), # API version struct.pack('>i', 42), # Correlation ID - struct.pack('>h9s', 9, "client_id"), # The client ID - struct.pack('>h8s', 8, "group_id"), # The group to commit for + struct.pack('>h9s', 9, b"client_id"),# The client ID + struct.pack('>h8s', 8, b"group_id"), # The group to commit for struct.pack('>i', 2), # Num topics ]) - topic1 = "".join([ - struct.pack(">h6s", 6, "topic1"), # Topic for the request + topic1 = b"".join([ + struct.pack(">h6s", 6, b"topic1"), # Topic for the request struct.pack(">i", 2), # Two partitions struct.pack(">i", 0), # Partition 0 struct.pack(">i", 1), # Partition 1 ]) - topic2 = "".join([ - struct.pack(">h6s", 6, "topic2"), # Topic for the request + topic2 = b"".join([ + struct.pack(">h6s", 6, b"topic2"), # Topic for the request struct.pack(">i", 1), # One partitions struct.pack(">i", 2), # Partition 2 ]) - expected1 = "".join([ header, topic1, topic2 ]) - expected2 = "".join([ header, topic2, topic1 ]) + expected1 = b"".join([ header, topic1, topic2 ]) + expected2 = b"".join([ header, topic2, topic1 ]) - encoded = KafkaProtocol.encode_offset_fetch_request("client_id", 42, "group_id", [ - OffsetFetchRequest("topic1", 0), - OffsetFetchRequest("topic1", 1), - OffsetFetchRequest("topic2", 2), + encoded = KafkaProtocol.encode_offset_fetch_request(b"client_id", 42, b"group_id", [ + OffsetFetchRequest(b"topic1", 0), + OffsetFetchRequest(b"topic1", 1), + OffsetFetchRequest(b"topic2", 2), ]) self.assertIn(encoded, [ expected1, expected2 ]) + @unittest.skip('needs updating for new protocol classes') def test_decode_offset_fetch_response(self): - encoded = "".join([ + encoded = b"".join([ struct.pack(">i", 42), # Correlation ID struct.pack(">i", 1), # One topics - struct.pack(">h6s", 6, "topic1"), # First topic + struct.pack(">h6s", 6, b"topic1"),# First topic struct.pack(">i", 2), # Two partitions struct.pack(">i", 2), # Partition 2 struct.pack(">q", 4), # Offset 4 - struct.pack(">h4s", 4, "meta"), # Metadata + struct.pack(">h4s", 4, b"meta"), # Metadata struct.pack(">h", 0), # No error struct.pack(">i", 4), # Partition 4 struct.pack(">q", 8), # Offset 8 - struct.pack(">h4s", 4, "meta"), # Metadata + struct.pack(">h4s", 4, b"meta"), # Metadata struct.pack(">h", 0), # No error ]) results = KafkaProtocol.decode_offset_fetch_response(encoded) self.assertEqual(set(results), set([ - OffsetFetchResponse(topic = 'topic1', partition = 2, offset = 4, error = 0, metadata = "meta"), - OffsetFetchResponse(topic = 'topic1', partition = 4, offset = 8, error = 0, metadata = "meta"), + OffsetFetchResponse(topic = b'topic1', partition = 2, offset = 4, error = 0, metadata = b"meta"), + OffsetFetchResponse(topic = b'topic1', partition = 4, offset = 8, error = 0, metadata = b"meta"), ])) @contextmanager def mock_create_message_fns(self): - patches = contextlib.nested( - mock.patch.object(kafka.protocol, "create_message", - return_value=sentinel.message), - mock.patch.object(kafka.protocol, "create_gzip_message", - return_value=sentinel.gzip_message), - mock.patch.object(kafka.protocol, "create_snappy_message", - return_value=sentinel.snappy_message), - ) - - with patches: - yield + import kafka.protocol + with patch.object(kafka.protocol.legacy, "create_message", + return_value=sentinel.message): + with patch.object(kafka.protocol.legacy, "create_gzip_message", + 
return_value=sentinel.gzip_message): + with patch.object(kafka.protocol.legacy, "create_snappy_message", + return_value=sentinel.snappy_message): + yield def test_create_message_set(self): - messages = [1, 2, 3] + messages = [(1, "k1"), (2, "k2"), (3, "k3")] # Default codec is CODEC_NONE. Expect list of regular messages. expect = [sentinel.message] * len(messages) diff -Nru python-kafka-python-0.9.2/test/test_util.py python-kafka-python-1.0.1/test/test_util.py --- python-kafka-python-0.9.2/test/test_util.py 2014-08-27 21:24:39.000000000 +0000 +++ python-kafka-python-1.0.1/test/test_util.py 2016-02-17 18:37:58.000000000 +0000 @@ -1,19 +1,22 @@ # -*- coding: utf-8 -*- import struct -import unittest2 -import kafka.util + +import six +from . import unittest + import kafka.common +import kafka.util -class UtilTest(unittest2.TestCase): - @unittest2.skip("Unwritten") +class UtilTest(unittest.TestCase): + @unittest.skip("Unwritten") def test_relative_unpack(self): pass def test_write_int_string(self): self.assertEqual( - kafka.util.write_int_string('some string'), - '\x00\x00\x00\x0bsome string' + kafka.util.write_int_string(b'some string'), + b'\x00\x00\x00\x0bsome string' ) def test_write_int_string__unicode(self): @@ -21,34 +24,37 @@ kafka.util.write_int_string(u'unicode') #: :type: TypeError te = cm.exception - self.assertIn('unicode', te.message) - self.assertIn('to be str', te.message) + if six.PY2: + self.assertIn('unicode', str(te)) + else: + self.assertIn('str', str(te)) + self.assertIn('to be bytes', str(te)) def test_write_int_string__empty(self): self.assertEqual( - kafka.util.write_int_string(''), - '\x00\x00\x00\x00' + kafka.util.write_int_string(b''), + b'\x00\x00\x00\x00' ) def test_write_int_string__null(self): self.assertEqual( kafka.util.write_int_string(None), - '\xff\xff\xff\xff' + b'\xff\xff\xff\xff' ) def test_read_int_string(self): - self.assertEqual(kafka.util.read_int_string('\xff\xff\xff\xff', 0), (None, 4)) - self.assertEqual(kafka.util.read_int_string('\x00\x00\x00\x00', 0), ('', 4)) - self.assertEqual(kafka.util.read_int_string('\x00\x00\x00\x0bsome string', 0), ('some string', 15)) + self.assertEqual(kafka.util.read_int_string(b'\xff\xff\xff\xff', 0), (None, 4)) + self.assertEqual(kafka.util.read_int_string(b'\x00\x00\x00\x00', 0), (b'', 4)) + self.assertEqual(kafka.util.read_int_string(b'\x00\x00\x00\x0bsome string', 0), (b'some string', 15)) def test_read_int_string__insufficient_data(self): with self.assertRaises(kafka.common.BufferUnderflowError): - kafka.util.read_int_string('\x00\x00\x00\x021', 0) + kafka.util.read_int_string(b'\x00\x00\x00\x021', 0) def test_write_short_string(self): self.assertEqual( - kafka.util.write_short_string('some string'), - '\x00\x0bsome string' + kafka.util.write_short_string(b'some string'), + b'\x00\x0bsome string' ) def test_write_short_string__unicode(self): @@ -56,29 +62,32 @@ kafka.util.write_short_string(u'hello') #: :type: TypeError te = cm.exception - self.assertIn('unicode', te.message) - self.assertIn('to be str', te.message) + if six.PY2: + self.assertIn('unicode', str(te)) + else: + self.assertIn('str', str(te)) + self.assertIn('to be bytes', str(te)) def test_write_short_string__empty(self): self.assertEqual( - kafka.util.write_short_string(''), - '\x00\x00' + kafka.util.write_short_string(b''), + b'\x00\x00' ) def test_write_short_string__null(self): self.assertEqual( kafka.util.write_short_string(None), - '\xff\xff' + b'\xff\xff' ) def test_write_short_string__too_long(self): with self.assertRaises(struct.error): - 
kafka.util.write_short_string(' ' * 33000) + kafka.util.write_short_string(b' ' * 33000) def test_read_short_string(self): - self.assertEqual(kafka.util.read_short_string('\xff\xff', 0), (None, 2)) - self.assertEqual(kafka.util.read_short_string('\x00\x00', 0), ('', 2)) - self.assertEqual(kafka.util.read_short_string('\x00\x0bsome string', 0), ('some string', 13)) + self.assertEqual(kafka.util.read_short_string(b'\xff\xff', 0), (None, 2)) + self.assertEqual(kafka.util.read_short_string(b'\x00\x00', 0), (b'', 2)) + self.assertEqual(kafka.util.read_short_string(b'\x00\x0bsome string', 0), (b'some string', 13)) def test_read_int_string__insufficient_data2(self): with self.assertRaises(kafka.common.BufferUnderflowError): @@ -86,7 +95,7 @@ def test_relative_unpack2(self): self.assertEqual( - kafka.util.relative_unpack('>hh', '\x00\x01\x00\x00\x02', 0), + kafka.util.relative_unpack('>hh', b'\x00\x01\x00\x00\x02', 0), ((1, 0), 4) ) @@ -95,11 +104,10 @@ kafka.util.relative_unpack('>hh', '\x00', 0) def test_group_by_topic_and_partition(self): - t = kafka.common.TopicAndPartition + t = kafka.common.TopicPartition l = [ t("a", 1), - t("a", 1), t("a", 2), t("a", 3), t("b", 3), @@ -115,3 +123,8 @@ 3: t("b", 3), } }) + + # should not be able to group duplicate topic-partitions + t1 = t("a", 1) + with self.assertRaises(AssertionError): + kafka.util.group_by_topic_and_partition([t1, t1]) diff -Nru python-kafka-python-0.9.2/test/testutil.py python-kafka-python-1.0.1/test/testutil.py --- python-kafka-python-0.9.2/test/testutil.py 2014-08-27 21:24:39.000000000 +0000 +++ python-kafka-python-1.0.1/test/testutil.py 2016-02-17 18:37:58.000000000 +0000 @@ -1,15 +1,18 @@ import functools import logging +import operator import os import random import socket import string import time -import unittest2 import uuid -from kafka.common import OffsetRequest -from kafka import KafkaClient +from six.moves import xrange +from . import unittest + +from kafka import SimpleClient +from kafka.common import OffsetRequestPayload __all__ = [ 'random_string', @@ -20,19 +23,51 @@ ] def random_string(l): - s = "".join(random.choice(string.letters) for i in xrange(l)) - return s + return "".join(random.choice(string.ascii_letters) for i in xrange(l)) def kafka_versions(*versions): + + def version_str_to_list(s): + return list(map(int, s.split('.'))) # e.g., [0, 8, 1, 1] + + def construct_lambda(s): + if s[0].isdigit(): + op_str = '=' + v_str = s + elif s[1].isdigit(): + op_str = s[0] # ! 
< > = + v_str = s[1:] + elif s[2].isdigit(): + op_str = s[0:2] # >= <= + v_str = s[2:] + else: + raise ValueError('Unrecognized kafka version / operator: %s' % s) + + op_map = { + '=': operator.eq, + '!': operator.ne, + '>': operator.gt, + '<': operator.lt, + '>=': operator.ge, + '<=': operator.le + } + op = op_map[op_str] + version = version_str_to_list(v_str) + return lambda a: op(version_str_to_list(a), version) + + validators = map(construct_lambda, versions) + def kafka_versions(func): @functools.wraps(func) def wrapper(self): kafka_version = os.environ.get('KAFKA_VERSION') if not kafka_version: - self.skipTest("no kafka version specified") - elif 'all' not in versions and kafka_version not in versions: - self.skipTest("unsupported kafka version") + self.skipTest("no kafka version set in KAFKA_VERSION env var") + + for f in validators: + if not f(kafka_version): + self.skipTest("unsupported kafka version") return func(self) return wrapper @@ -45,21 +80,23 @@ sock.close() return port -class KafkaIntegrationTestCase(unittest2.TestCase): +class KafkaIntegrationTestCase(unittest.TestCase): create_client = True topic = None + zk = None server = None def setUp(self): super(KafkaIntegrationTestCase, self).setUp() if not os.environ.get('KAFKA_VERSION'): - return + self.skipTest('Integration test requires KAFKA_VERSION') if not self.topic: - self.topic = "%s-%s" % (self.id()[self.id().rindex(".") + 1:], random_string(10)) + topic = "%s-%s" % (self.id()[self.id().rindex(".") + 1:], random_string(10)) + self.topic = topic if self.create_client: - self.client = KafkaClient('%s:%d' % (self.server.host, self.server.port)) + self.client = SimpleClient('%s:%d' % (self.server.host, self.server.port)) self.client.ensure_topic_exists(self.topic) @@ -74,8 +111,15 @@ self.client.close() def current_offset(self, topic, partition): - offsets, = self.client.send_offset_request([ OffsetRequest(topic, partition, -1, 1) ]) - return offsets.offsets[0] + try: + offsets, = self.client.send_offset_request([OffsetRequestPayload(topic, partition, -1, 1)]) + except: + # XXX: We've seen some UnknownErrors here and cant debug w/o server logs + self.zk.child.dump_logs() + self.server.child.dump_logs() + raise + else: + return offsets.offsets[0] def msgs(self, iterable): return [ self.msg(x) for x in iterable ] @@ -84,7 +128,11 @@ if s not in self._messages: self._messages[s] = '%s-%s-%s' % (s, self.id(), str(uuid.uuid4())) - return self._messages[s] + return self._messages[s].encode('utf-8') + + def key(self, k): + return k.encode('utf-8') + class Timer(object): def __enter__(self): @@ -96,3 +144,5 @@ self.interval = self.end - self.start logging.basicConfig(level=logging.DEBUG) +logging.getLogger('test.fixtures').setLevel(logging.ERROR) +logging.getLogger('test.service').setLevel(logging.ERROR) diff -Nru python-kafka-python-0.9.2/VERSION python-kafka-python-1.0.1/VERSION --- python-kafka-python-0.9.2/VERSION 2014-08-27 21:24:39.000000000 +0000 +++ python-kafka-python-1.0.1/VERSION 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ -0.9.2