1 /*
2  * Copyright (C) 2020 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.internal.util;
18 
19 import android.annotation.NonNull;
20 import android.annotation.Nullable;
21 import android.util.SparseArray;
22 import android.util.SparseIntArray;
23 
24 import java.util.ArrayList;
25 import java.util.Collections;
26 import java.util.List;
27 
28 /**
29  * <p>A utility which processes a sequence of input (stream) and output the heavy hitters
30  * (the frequent ones).</p>
31  *
32  * @param <T> The type of the input.
33  * @see <a href="https://en.wikipedia.org/wiki/Streaming_algorithm">Stream Algorithm</a> for
34  * the definion of heavy hitters and the list of algorithms for detecting it.
35  * <p>
36  * {@hide}
37  */
38 @android.ravenwood.annotation.RavenwoodKeepWholeClass
39 public interface HeavyHitterSketch<T> {
40     /**
41      * Return the default implementation.
42      *
43      * @return The default implementation.
44      */
newDefault()45     static <V> @NonNull HeavyHitterSketch<V> newDefault() {
46         return new HeavyHitterSketchImpl<V>();
47     }
48 
49     /**
50      * Set the configuration with given parameters
51      *
52      * @param inputSize The amount of the input.
53      * @param capacity  The maximum number of distinct input it should track; it defines the lower
54      *                  bound of the output.
55      */
setConfig(int inputSize, int capacity)56     void setConfig(int inputSize, int capacity);
57 
58     /**
59      * Add a new input to the current sketch.
60      *
61      * @param newInstance The new input
62      */
add(@ullable T newInstance)63     void add(@Nullable T newInstance);
64 
65     /**
66      * @param k      The number of heavy hitters it should return, k &lt; capacity, a value of 0
67      *               will be equivalent to capacity - 1
68      * @param holder The list array into which the elements of the tops are to be stored; a new list
69      *               would be created and returned if this parameter is null.
70      * @param freqs  Optional, the frequencies of each items in the returned result
71      * @return The top K heavy hitters(frequent input)
72      */
73     @Nullable
getTopHeavyHitters(int k, @Nullable List<T> holder, @Nullable List<Float> freqs)74     List<T> getTopHeavyHitters(int k, @Nullable List<T> holder, @Nullable List<Float> freqs);
75 
76     /**
77      * @param holder The list array into which the elements of the candidates are to be stored; a
78      *               new list would be created and returned if this parameter is null.
79      * @return The candidate heavy hitters so far, it could include false postives.
80      */
81     @Nullable
getCandidates(@ullable List<T> holder)82     List<T> getCandidates(@Nullable List<T> holder);
83 
84     /**
85      * Reset this heavy hitter counter
86      */
reset()87     void reset();
88 
89     /**
90      * @return The ratio of the input to be used as the validation data, Float.NaN means no
91      * validation is needed.
92      */
getRequiredValidationInputRatio()93     float getRequiredValidationInputRatio();
94 
95     /**
96      * The default implementation of the {@link HeavyHitterSketch}.
97      *
98      * <p>Currently it consists of two passes: the first pass will take the input into
99      * the MG(Misra–Gries) summary; while the secondary pass will validate the output against the
100      * input in order to eliminate false postivies.</p>
101      *
102      * <p>For sure there are better approaches which takes only one pass, but also comes along with
103      * overheads in terms of cpu/memory cost; the MG summary would be a trivial and good enough
104      * pick.</p>
105      *
106      * @param <T> The type of the input.
107      * @see <a href="https://en.wikipedia.org/wiki/Misra%E2%80%93Gries_summary">Misra–Gries
108      * summary</a> for the detailed explanation of the algorithm.
109      */
110     final class HeavyHitterSketchImpl<T> implements HeavyHitterSketch<T> {
111         /**
112          * The array to track the current heavy hitters, its size &lt; {@link #mCapacity}.
113          */
114         private final SparseArray<T> mObjects = new SparseArray<>();
115 
116         /**
117          * The frequencies of the current heavy hitters, its size &lt; {@link #mCapacity}.
118          */
119         private final SparseIntArray mFrequencies = new SparseIntArray();
120 
121         /**
122          * The amount of the input of each pass
123          */
124         private int mPassSize;
125 
126         /**
127          * The amount of the total input it expects
128          */
129         private int mTotalSize;
130 
131         /**
132          * The maximum number of distinct input it should track
133          */
134         private int mCapacity;
135 
136         /**
137          * The amount of inputs it already received.
138          */
139         private int mNumInputs;
140 
141         /**
142          * Whether or not it's configured properly.
143          */
144         private boolean mConfigured;
145 
146         /**
147          * Set the configuration with given parameters
148          *
149          * @param inputSize The amount of the input.
150          * @param capacity  The maximum number of distinct input it should track; it defines the
151          *                  lower bound of the output.
152          */
setConfig(final int inputSize, final int capacity)153         public void setConfig(final int inputSize, final int capacity) {
154             if (inputSize < capacity || inputSize <= 1) {
155                 mConfigured = false;
156                 throw new IllegalArgumentException();
157             }
158             reset();
159             mTotalSize = inputSize;
160             mPassSize = inputSize >> 1;
161             mCapacity = capacity;
162             mConfigured = true;
163         }
164 
165         /**
166          * Add a new input to the current sketch.
167          *
168          * @param newInstance The new input
169          */
170         @Override
add(@ullable final T newInstance)171         public void add(@Nullable final T newInstance) {
172             if (!mConfigured) {
173                 throw new IllegalStateException();
174             }
175             if (mNumInputs < mPassSize) {
176                 addToMGSummary(newInstance);
177             } else if (mNumInputs < mTotalSize) {
178                 // Validation pass
179                 validate(newInstance);
180             }
181         }
182 
183         /**
184          * Add an input to the MG summary.
185          *
186          * <p>Note the frequency in the result set is an estimation. Every (obj, freq') pair
187          * in the result set, will have the following property:
188          * <code>(freq - inputSize / capacity) &le; freq' &le; freq</code>
189          * The above freq' is the estimated frequency, while the freq is the actual frequency.
190          * </p>
191          */
addToMGSummary(@ullable final T newInstance)192         private void addToMGSummary(@Nullable final T newInstance) {
193             final int hashCode = newInstance != null ? newInstance.hashCode() : 0;
194             final int index = mObjects.indexOfKey(hashCode);
195             // MG summary
196             if (index >= 0) {
197                 mFrequencies.setValueAt(index, mFrequencies.valueAt(index) + 1);
198             } else if (mObjects.size() < mCapacity - 1) {
199                 mObjects.put(hashCode, newInstance);
200                 mFrequencies.put(hashCode, 1);
201             } else {
202                 for (int i = mFrequencies.size() - 1; i >= 0; i--) {
203                     final int val = mFrequencies.valueAt(i) - 1;
204                     if (val == 0) {
205                         mObjects.removeAt(i);
206                         mFrequencies.removeAt(i);
207                     } else {
208                         mFrequencies.setValueAt(i, val);
209                     }
210                 }
211             }
212             if (++mNumInputs == mPassSize) {
213                 // Clear all the frequencies as we are going to validate them next
214                 for (int i = mFrequencies.size() - 1; i >= 0; i--) {
215                     mFrequencies.setValueAt(i, 0);
216                 }
217             }
218         }
219 
220         /**
221          * Validate the results from MG summary; the ones with frequencies less than lower boundary
222          * will be removed from the set.
223          */
validate(@ullable final T newInstance)224         private void validate(@Nullable final T newInstance) {
225             final int hashCode = newInstance != null ? newInstance.hashCode() : 0;
226             final int index = mObjects.indexOfKey(hashCode);
227             if (index >= 0) {
228                 mFrequencies.setValueAt(index, mFrequencies.valueAt(index) + 1);
229             }
230             if (++mNumInputs == mTotalSize) {
231                 final int lower = mPassSize / mCapacity;
232                 // Remove any occurrences with frequencies less than lower boundary
233                 for (int i = mFrequencies.size() - 1; i >= 0; i--) {
234                     final int val = mFrequencies.valueAt(i);
235                     if (val < lower) {
236                         mFrequencies.removeAt(i);
237                         mObjects.removeAt(i);
238                     }
239                 }
240             }
241         }
242 
243         /**
244          * @param k      The number of heavy hitters it should return, k &lt; capacity, a value of 0
245          *               will be equivalent to capacity - 1
246          * @param holder The list array into which the elements of the tops are to be stored; a new
247          *               list would be created and returned if this parameter is null.
248          * @param freqs  Optional, the frequencies of each items in the returned result
249          * @return The top K heavy hitters(frequent input)
250          */
251         @Override
252         @Nullable
getTopHeavyHitters(final int k, final @Nullable List<T> holder, final @Nullable List<Float> freqs)253         public List<T> getTopHeavyHitters(final int k, final @Nullable List<T> holder,
254                 final @Nullable List<Float> freqs) {
255             if (!mConfigured) {
256                 throw new IllegalStateException();
257             }
258 
259             if (k >= mCapacity) {
260                 throw new IllegalArgumentException();
261             }
262 
263             if (mNumInputs < mTotalSize) {
264                 // It hasn't had all the inputs yet.
265                 throw new IllegalStateException();
266             }
267 
268             ArrayList<Integer> indexes = null;
269             for (int i = mFrequencies.size() - 1; i >= 0; i--) {
270                 final int val = mFrequencies.valueAt(i);
271                 if (val > 0) {
272                     if (indexes == null) {
273                         indexes = new ArrayList<>();
274                     }
275                     indexes.add(i);
276                 }
277             }
278             if (indexes == null) {
279                 return null;
280             }
281 
282             Collections.sort(indexes, (a, b) -> mFrequencies.valueAt(b) - mFrequencies.valueAt(a));
283 
284             final List<T> result = holder != null ? holder : new ArrayList<T>();
285             final int max = Math.min(k == 0 ? (mCapacity - 1) : k, indexes.size());
286             for (int i = 0; i < max; i++) {
287                 final int index = indexes.get(i);
288                 final T obj = mObjects.valueAt(index);
289                 if (obj != null) {
290                     result.add(obj);
291                     if (freqs != null) {
292                         freqs.add((float) mFrequencies.valueAt(index) / mPassSize);
293                     }
294                 }
295             }
296             return result;
297         }
298 
299         /**
300          * @param holder The list array into which the elements of the candidates are to be stored;
301          *               a new list would be created and returned if this parameter is null.
302          * @return The candidate heavy hitters so far, it could include false postives.
303          */
304         @Nullable
getCandidates(final @Nullable List<T> holder)305         public List<T> getCandidates(final @Nullable List<T> holder) {
306             if (!mConfigured) {
307                 throw new IllegalStateException();
308             }
309             if (mNumInputs < mPassSize) {
310                 // It hasn't done with the first pass yet, return nothing
311                 return null;
312             }
313 
314             List<T> result = holder != null ? holder : new ArrayList<T>();
315             for (int i = mObjects.size() - 1; i >= 0; i--) {
316                 final T obj = mObjects.valueAt(i);
317                 if (obj != null) {
318                     result.add(obj);
319                 }
320             }
321             return result;
322         }
323 
324         /**
325          * Reset this heavy hitter counter
326          */
327         @Override
reset()328         public void reset() {
329             mNumInputs = 0;
330             mObjects.clear();
331             mFrequencies.clear();
332         }
333 
334         /**
335          * @return The ratio of the input to be used as the validation data, Float.NaN means no
336          * validation is needed.
337          */
getRequiredValidationInputRatio()338         public float getRequiredValidationInputRatio() {
339             return 0.5f;
340         }
341     }
342 }
343