mapper.py
#!/usr/bin/env python
"""A more advanced Mapper, using Python iterators and generators.

Reads text from standard input and writes one ``<word><separator>1``
record per word to standard output.  That output is meant to be the
input of the Reduce step (reducer.py).
"""

import sys


def read_input(file):
    """Yield the list of whitespace-separated words of each line.

    Args:
        file: any iterable of text lines (e.g. ``sys.stdin``).

    Yields:
        A list of words per input line; a blank line yields ``[]``.
    """
    for line in file:
        # str.split() with no argument splits on runs of any whitespace
        # and never produces empty strings.
        yield line.split()


def main(separator=" "):
    """Emit ``<word><separator>1`` for every word read from STDIN.

    Args:
        separator: string written between the word and its count.
            The reducer must be run with the same separator.
    """
    # NOTE(review): the comment that used to sit here described the output
    # as tab-delimited, but the default separator visible in this file is a
    # single space (possibly whitespace mangled in transit).  Pass
    # separator="\t" explicitly -- to mapper and reducer alike -- if
    # tab-delimited records are required.
    for words in read_input(sys.stdin):
        for word in words:
            # The trivial per-occurrence word count is 1.
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("%s%s%d" % (word, separator, 1))


if __name__ == "__main__":
    main()


# reducer.py
#!/usr/bin/env python
"""A more advanced Reducer, using Python iterators and generators.

Reads ``<word><separator><count>`` records from standard input and writes
one ``<word><separator><total>`` record per run of identical words to
standard output.
"""

from itertools import groupby
from operator import itemgetter
import sys


def read_mapper_output(file, separator=" "):
    """Yield each input line split into at most two fields.

    Args:
        file: any iterable of text lines (e.g. ``sys.stdin``).
        separator: string that separates the word from its count.

    Yields:
        ``[word, count]`` as strings; a line that does not contain the
        separator yields a one-element list.
    """
    for line in file:
        # maxsplit=1: only the first separator divides key from value.
        yield line.rstrip().split(separator, 1)


def main(separator=" "):
    """Sum the counts of consecutive records sharing the same word.

    Args:
        separator: string between word and count, for both input and
            output.  Must match the separator the mapper was run with.
    """
    # Input comes from STDIN (standard input).
    data = read_mapper_output(sys.stdin, separator=separator)
    # groupby only merges *consecutive* records with the same key, so the
    # input has to arrive sorted by word for the totals to be complete.
    #   current_word - string containing a word (the key)
    #   group        - iterator over the [word, count] records of that word
    for current_word, group in groupby(data, itemgetter(0)):
        try:
            total_count = sum(int(count) for _, count in group)
        except ValueError:
            # A count was not a number (or the record had no separator):
            # the whole group for this word is discarded, as before.
            continue
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("%s%s%d" % (current_word, separator, total_count))


if __name__ == "__main__":
    main()
轉自:http://www.michael-noll.com/tutorials/writing-an-hadoop-mapreduce-program-in-python/
文章版權歸作者所有,未經允許請勿轉載,若此文章存在違規行為,您可以聯繫管理員刪除。
轉載請注明本文地址:http://specialneedsforspecialkids.com/yun/45308.html
摘要: Caching Libraries for caching data. Beaker - A library for caching and sessions for use with web applications and stand-alone Python scripts and applications. dogpile.cache - dogpile.cache...
閱讀 3063·2021-11-24 10:34
閱讀 3322·2021-11-22 13:53
閱讀 2630·2021-11-22 12:03
閱讀 3598·2021-09-26 09:47
閱讀 3005·2021-09-23 11:21
閱讀 4772·2021-09-22 15:08
閱讀 3290·2021-07-23 10:59
閱讀 1258·2019-08-29 18:31