前段时间做leetcode题直接用到了python collections库里的Counter类(计数字典),自己骚操作了半天搞了一堆字典数组,还是没有优雅的达成目的,最终屈服调用Counter类,结果不需要我任何其他手段,内置方法就搞定了,也勾起了我的好奇心,来看看Counter类的源码。
两百多行代码不一下全贴,有碍观瞻,找些重要的研究一下
4
5 class Counter(dict):
6 '''Dict subclass for counting hashable items. Sometimes called a bag
7 or multiset. Elements are stored as dictionary keys and their counts
8 are stored as dictionary values.
9
可见Counter父类为dict
51 def __init__(self, iterable=None, **kwds):
52 '''Create a new, empty Counter object. And if given, count elements
53 from an input iterable. Or, initialize the count from another mapping
54 of elements to their counts.
55
56 >>> c = Counter() # a new, empty counter
57 >>> c = Counter('gallahad') # a new counter from an iterable
58 >>> c = Counter({'a': 4, 'b': 2}) # a new counter from a mapping
59 >>> c = Counter(a=4, b=2) # a new counter from keyword args
60
61 '''
62 super(Counter, self).__init__()
63 self.update(iterable, **kwds)
64
看到super,去研究了一番,发现是调用父类方法,比如这里的super就是调用了父类dict的init方法,即生成了一个字典,随后用到了自己类的update方法,下面来看一下
116 def update(self, iterable=None, **kwds):
117 """ 更新计数器,其实就是增加;如果原来没有,则新建,如果有则加一 """
118 '''Like dict.update() but add counts instead of replacing them.
119
120 Source can be an iterable, a dictionary, or another Counter instance.
121
122 >>> c = Counter('which')
123 >>> c.update('witch') # add elements from another iterable
124 >>> d = Counter('watch')
125 >>> c.update(d) # add elements from another counter
126 >>> c['h'] # four 'h' in which, witch, and watch
127
128 '''
129 # The regular dict.update() operation makes no sense here because the
130 # replace behavior results in the some of original untouched counts
131 # being mixed-in with all of the other counts for a mismash that
132 # doesn't have a straight-forward interpretation in most counting
133 # contexts. Instead, we implement straight-addition. Both the inputs
134 # and outputs are allowed to contain zero and negative counts.
135
136 if iterable is not None:
137 if isinstance(iterable, Mapping):
138 if self:
139 self_get = self.get
140 for elem, count in iterable.iteritems():
141 self[elem] = self_get(elem, 0) + count
142 else:
143 super(Counter, self).update(iterable) # fast path when counter is empty
144 else:
145 self_get = self.get
146 for elem in iterable:
147 self[elem] = self_get(elem, 0) + 1
148 if kwds:
149 self.update(kwds)
update算是该类的核心方法之一了,判断iterable变量是否传入,用isinstance函数判断是否是Mapping类(如dict),如果是这类数据{:},就直接用mapping的elem值加上本身Counter[elem]值来更新,否则就是加一
71 def most_common(self, n=None):
72
73 '''List the n most common elements and their counts from the most
74 common to the least. If n is None, then list all element counts.
75
76 >>> Counter('abcdeabcdabcaba').most_common(3)
77 [('a', 5), ('b', 4), ('c', 3)]
78
79 '''
80 # Emulate Bag.sortedByCount from Smalltalk
81 if n is None:
82 return sorted(self.iteritems(), key=_itemgetter(1), reverse=True)
83 return _heapq.nlargest(n, self.iteritems(), key=_itemgetter(1))
84
又一个counter类的重要方法,意义很明确,注意到这里利用iteritems()返回一个迭代器,指定key=_itemgetter(1),即使用迭代器第二个域,即counter的value来排序
不指定n时使用sorted函数,指定n时则使用 _heapq.nlargest函数,这个之前没有见过,应该是堆排序,度一下它的具体实现
heap数据类型源码:https://github.com/python/cpython/tree/2.7/Lib/heapq.py
python3.5的heapq模块代码比2.7的要清晰很多,本质没什么差别,主要方法都是建堆(最小堆与最大堆两种方法),push,pop,pushpop操作,在这些过程中调用sftup与siftdown方法,维护堆的性质,另外就是merge(合并两个堆)以及常用到ksmallest与klargest方法,他们的操作时间都是O(klgk),加上建堆时间为O(nlgn),总体时间复杂度为O(nlgn),看上去并没有什么优越性,然而在实际应用中是快一些的,回头具体研究下这个问题。
由上可见,Counter类除了字典以外没有使用什么复杂的数据结构,其most_common方法,实际上是调用heap模块进行了O(nlgn)的操作,设计还是较为简单清晰的,那么如果我直接在建字典的时候就对其维持堆的性质,会不会效率比Counter更高呢?
剩余的Counter类代码列在下面
85 def elements(self):
86 """ 计数器中的所有元素,注:此处非所有元素集合,而是包含所有元素集合的迭代器 """
87 '''Iterator over elements repeating each as many times as its count.
88
89 >>> c = Counter('ABCABC')
90 >>> sorted(c.elements())
91 ['A', 'A', 'B', 'B', 'C', 'C']
92
93 # Knuth's example for prime factors of 1836: 2**2 * 3**3 * 17**1
94 >>> prime_factors = Counter({2: 2, 3: 3, 17: 1})
95 >>> product = 1
96 >>> for factor in prime_factors.elements(): # loop over factors
97 ... product *= factor # and multiply them
98 >>> product
99
100 Note, if an element's count has been set to zero or is a negative
101 number, elements() will ignore it.
102
103 '''
104 # Emulate Bag.do from Smalltalk and Multiset.begin from C++.
105 return _chain.from_iterable(_starmap(_repeat, self.iteritems()))
106
107 # Override dict methods where necessary
108
109 @classmethod
110 def fromkeys(cls, iterable, v=None):
111 # There is no equivalent method for counters because setting v=1
112 # means that no element can have a count greater than one.
113 raise NotImplementedError(
114 'Counter.fromkeys() is undefined. Use Counter(iterable) instead.')
115
150
151 def subtract(self, iterable=None, **kwds):
152 """ 相减,原来的计数器中的每一个元素的数量减去后添加的元素的数量 """
153 '''Like dict.update() but subtracts counts instead of replacing them.
154 Counts can be reduced below zero. Both the inputs and outputs are
155 allowed to contain zero and negative counts.
156
157 Source can be an iterable, a dictionary, or another Counter instance.
158
159 >>> c = Counter('which')
160 >>> c.subtract('witch') # subtract elements from another iterable
161 >>> c.subtract(Counter('watch')) # subtract elements from another counter
162 >>> c['h'] # 2 in which, minus 1 in witch, minus 1 in watch
163 >>> c['w'] # 1 in which, minus 1 in witch, minus 1 in watch
164 -1
165
166 '''
167 if iterable is not None:
168 self_get = self.get
169 if isinstance(iterable, Mapping):
170 for elem, count in iterable.items():
171 self[elem] = self_get(elem, 0) - count
172 else:
173 for elem in iterable:
174 self[elem] = self_get(elem, 0) - 1
175 if kwds:
176 self.subtract(kwds)
177
178 def copy(self):
179 """ 拷贝 """
180 'Return a shallow copy.'
181 return self.__class__(self)
182
183 def __reduce__(self):
184 """ 返回一个元组(类型,元组) """
185 return self.__class__, (dict(self),)
186
187 def __delitem__(self, elem):
188 """ 删除元素 """
189 'Like dict.__delitem__() but does not raise KeyError for missing values.'
190 if elem in self:
191 super(Counter, self).__delitem__(elem)
192
193 def __repr__(self):
194 if not self:
195 return '%s()' % self.__class__.__name__
196 items = ', '.join(map('%r: %r'.__mod__, self.most_common()))
197 return '%s({%s})' % (self.__class__.__name__, items)
198
199 # Multiset-style mathematical operations discussed in:
200 # Knuth TAOCP Volume II section 4.6.3 exercise 19
201 # and at http://en.wikipedia.org/wiki/Multiset
202 #
203 # Outputs guaranteed to only include positive counts.
204 #
205 # To strip negative and zero counts, add-in an empty counter:
206 # c += Counter()
207
208 def __add__(self, other):
209 '''Add counts from two counters.
210
211 >>> Counter('abbb') + Counter('bcc')
212 Counter({'b': 4, 'c': 2, 'a': 1})
213
214 '''
215 if not isinstance(other, Counter):
216 return NotImplemented
217 result = Counter()
218 for elem, count in self.items():
219 newcount = count + other[elem]
220 if newcount > 0:
221 result[elem] = newcount
222 for elem, count in other.items():
223 if elem not in self and count > 0:
224 result[elem] = count
225 return result
226
227 def __sub__(self, other):
228 ''' Subtract count, but keep only results with positive counts.
229
230 >>> Counter('abbbc') - Counter('bccd')
231 Counter({'b': 2, 'a': 1})
232
233 '''
234 if not isinstance(other, Counter):
235 return NotImplemented
236 result = Counter()
237 for elem, count in self.items():
238 newcount = count - other[elem]
239 if newcount > 0:
240 result[elem] = newcount
241 for elem, count in other.items():
242 if elem not in self and count < 0:
243 result[elem] = 0 - count
244 return result
245
246 def __or__(self, other):
247 '''Union is the maximum of value in either of the input counters.
248
249 >>> Counter('abbb') | Counter('bcc')
250 Counter({'b': 3, 'c': 2, 'a': 1})
251
252 '''
253 if not isinstance(other, Counter):
254 return NotImplemented
255 result = Counter()
256 for elem, count in self.items():
257 other_count = other[elem]
258 newcount = other_count if count < other_count else count
259 if newcount > 0:
260 result[elem] = newcount
261 for elem, count in other.items():
262 if elem not in self and count > 0:
263 result[elem] = count
264 return result
265
266 def __and__(self, other):
267 ''' Intersection is the minimum of corresponding counts.
268
269 >>> Counter('abbb') & Counter('bcc')
270 Counter({'b': 1})
271
272 '''
273 if not isinstance(other, Counter):
274 return NotImplemented
275 result = Counter()
276 for elem, count in self.items():
277 other_count = other[elem]
278 newcount = count if count < other_count else other_count
279 if newcount > 0:
280 result[elem] = newcount
281 return result
282
283 Counter