1 /* 2 * Copyright (C) 2020 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.internal.util; 18 19 import android.annotation.NonNull; 20 import android.annotation.Nullable; 21 import android.util.SparseArray; 22 import android.util.SparseIntArray; 23 24 import java.util.ArrayList; 25 import java.util.Collections; 26 import java.util.List; 27 28 /** 29 * <p>A utility which processes a sequence of input (stream) and output the heavy hitters 30 * (the frequent ones).</p> 31 * 32 * @param <T> The type of the input. 33 * @see <a href="https://en.wikipedia.org/wiki/Streaming_algorithm">Stream Algorithm</a> for 34 * the definion of heavy hitters and the list of algorithms for detecting it. 35 * <p> 36 * {@hide} 37 */ 38 @android.ravenwood.annotation.RavenwoodKeepWholeClass 39 public interface HeavyHitterSketch<T> { 40 /** 41 * Return the default implementation. 42 * 43 * @return The default implementation. 44 */ newDefault()45 static <V> @NonNull HeavyHitterSketch<V> newDefault() { 46 return new HeavyHitterSketchImpl<V>(); 47 } 48 49 /** 50 * Set the configuration with given parameters 51 * 52 * @param inputSize The amount of the input. 53 * @param capacity The maximum number of distinct input it should track; it defines the lower 54 * bound of the output. 55 */ setConfig(int inputSize, int capacity)56 void setConfig(int inputSize, int capacity); 57 58 /** 59 * Add a new input to the current sketch. 60 * 61 * @param newInstance The new input 62 */ add(@ullable T newInstance)63 void add(@Nullable T newInstance); 64 65 /** 66 * @param k The number of heavy hitters it should return, k < capacity, a value of 0 67 * will be equivalent to capacity - 1 68 * @param holder The list array into which the elements of the tops are to be stored; a new list 69 * would be created and returned if this parameter is null. 70 * @param freqs Optional, the frequencies of each items in the returned result 71 * @return The top K heavy hitters(frequent input) 72 */ 73 @Nullable getTopHeavyHitters(int k, @Nullable List<T> holder, @Nullable List<Float> freqs)74 List<T> getTopHeavyHitters(int k, @Nullable List<T> holder, @Nullable List<Float> freqs); 75 76 /** 77 * @param holder The list array into which the elements of the candidates are to be stored; a 78 * new list would be created and returned if this parameter is null. 79 * @return The candidate heavy hitters so far, it could include false postives. 80 */ 81 @Nullable getCandidates(@ullable List<T> holder)82 List<T> getCandidates(@Nullable List<T> holder); 83 84 /** 85 * Reset this heavy hitter counter 86 */ reset()87 void reset(); 88 89 /** 90 * @return The ratio of the input to be used as the validation data, Float.NaN means no 91 * validation is needed. 92 */ getRequiredValidationInputRatio()93 float getRequiredValidationInputRatio(); 94 95 /** 96 * The default implementation of the {@link HeavyHitterSketch}. 97 * 98 * <p>Currently it consists of two passes: the first pass will take the input into 99 * the MG(Misra–Gries) summary; while the secondary pass will validate the output against the 100 * input in order to eliminate false postivies.</p> 101 * 102 * <p>For sure there are better approaches which takes only one pass, but also comes along with 103 * overheads in terms of cpu/memory cost; the MG summary would be a trivial and good enough 104 * pick.</p> 105 * 106 * @param <T> The type of the input. 107 * @see <a href="https://en.wikipedia.org/wiki/Misra%E2%80%93Gries_summary">Misra–Gries 108 * summary</a> for the detailed explanation of the algorithm. 109 */ 110 final class HeavyHitterSketchImpl<T> implements HeavyHitterSketch<T> { 111 /** 112 * The array to track the current heavy hitters, its size < {@link #mCapacity}. 113 */ 114 private final SparseArray<T> mObjects = new SparseArray<>(); 115 116 /** 117 * The frequencies of the current heavy hitters, its size < {@link #mCapacity}. 118 */ 119 private final SparseIntArray mFrequencies = new SparseIntArray(); 120 121 /** 122 * The amount of the input of each pass 123 */ 124 private int mPassSize; 125 126 /** 127 * The amount of the total input it expects 128 */ 129 private int mTotalSize; 130 131 /** 132 * The maximum number of distinct input it should track 133 */ 134 private int mCapacity; 135 136 /** 137 * The amount of inputs it already received. 138 */ 139 private int mNumInputs; 140 141 /** 142 * Whether or not it's configured properly. 143 */ 144 private boolean mConfigured; 145 146 /** 147 * Set the configuration with given parameters 148 * 149 * @param inputSize The amount of the input. 150 * @param capacity The maximum number of distinct input it should track; it defines the 151 * lower bound of the output. 152 */ setConfig(final int inputSize, final int capacity)153 public void setConfig(final int inputSize, final int capacity) { 154 if (inputSize < capacity || inputSize <= 1) { 155 mConfigured = false; 156 throw new IllegalArgumentException(); 157 } 158 reset(); 159 mTotalSize = inputSize; 160 mPassSize = inputSize >> 1; 161 mCapacity = capacity; 162 mConfigured = true; 163 } 164 165 /** 166 * Add a new input to the current sketch. 167 * 168 * @param newInstance The new input 169 */ 170 @Override add(@ullable final T newInstance)171 public void add(@Nullable final T newInstance) { 172 if (!mConfigured) { 173 throw new IllegalStateException(); 174 } 175 if (mNumInputs < mPassSize) { 176 addToMGSummary(newInstance); 177 } else if (mNumInputs < mTotalSize) { 178 // Validation pass 179 validate(newInstance); 180 } 181 } 182 183 /** 184 * Add an input to the MG summary. 185 * 186 * <p>Note the frequency in the result set is an estimation. Every (obj, freq') pair 187 * in the result set, will have the following property: 188 * <code>(freq - inputSize / capacity) ≤ freq' ≤ freq</code> 189 * The above freq' is the estimated frequency, while the freq is the actual frequency. 190 * </p> 191 */ addToMGSummary(@ullable final T newInstance)192 private void addToMGSummary(@Nullable final T newInstance) { 193 final int hashCode = newInstance != null ? newInstance.hashCode() : 0; 194 final int index = mObjects.indexOfKey(hashCode); 195 // MG summary 196 if (index >= 0) { 197 mFrequencies.setValueAt(index, mFrequencies.valueAt(index) + 1); 198 } else if (mObjects.size() < mCapacity - 1) { 199 mObjects.put(hashCode, newInstance); 200 mFrequencies.put(hashCode, 1); 201 } else { 202 for (int i = mFrequencies.size() - 1; i >= 0; i--) { 203 final int val = mFrequencies.valueAt(i) - 1; 204 if (val == 0) { 205 mObjects.removeAt(i); 206 mFrequencies.removeAt(i); 207 } else { 208 mFrequencies.setValueAt(i, val); 209 } 210 } 211 } 212 if (++mNumInputs == mPassSize) { 213 // Clear all the frequencies as we are going to validate them next 214 for (int i = mFrequencies.size() - 1; i >= 0; i--) { 215 mFrequencies.setValueAt(i, 0); 216 } 217 } 218 } 219 220 /** 221 * Validate the results from MG summary; the ones with frequencies less than lower boundary 222 * will be removed from the set. 223 */ validate(@ullable final T newInstance)224 private void validate(@Nullable final T newInstance) { 225 final int hashCode = newInstance != null ? newInstance.hashCode() : 0; 226 final int index = mObjects.indexOfKey(hashCode); 227 if (index >= 0) { 228 mFrequencies.setValueAt(index, mFrequencies.valueAt(index) + 1); 229 } 230 if (++mNumInputs == mTotalSize) { 231 final int lower = mPassSize / mCapacity; 232 // Remove any occurrences with frequencies less than lower boundary 233 for (int i = mFrequencies.size() - 1; i >= 0; i--) { 234 final int val = mFrequencies.valueAt(i); 235 if (val < lower) { 236 mFrequencies.removeAt(i); 237 mObjects.removeAt(i); 238 } 239 } 240 } 241 } 242 243 /** 244 * @param k The number of heavy hitters it should return, k < capacity, a value of 0 245 * will be equivalent to capacity - 1 246 * @param holder The list array into which the elements of the tops are to be stored; a new 247 * list would be created and returned if this parameter is null. 248 * @param freqs Optional, the frequencies of each items in the returned result 249 * @return The top K heavy hitters(frequent input) 250 */ 251 @Override 252 @Nullable getTopHeavyHitters(final int k, final @Nullable List<T> holder, final @Nullable List<Float> freqs)253 public List<T> getTopHeavyHitters(final int k, final @Nullable List<T> holder, 254 final @Nullable List<Float> freqs) { 255 if (!mConfigured) { 256 throw new IllegalStateException(); 257 } 258 259 if (k >= mCapacity) { 260 throw new IllegalArgumentException(); 261 } 262 263 if (mNumInputs < mTotalSize) { 264 // It hasn't had all the inputs yet. 265 throw new IllegalStateException(); 266 } 267 268 ArrayList<Integer> indexes = null; 269 for (int i = mFrequencies.size() - 1; i >= 0; i--) { 270 final int val = mFrequencies.valueAt(i); 271 if (val > 0) { 272 if (indexes == null) { 273 indexes = new ArrayList<>(); 274 } 275 indexes.add(i); 276 } 277 } 278 if (indexes == null) { 279 return null; 280 } 281 282 Collections.sort(indexes, (a, b) -> mFrequencies.valueAt(b) - mFrequencies.valueAt(a)); 283 284 final List<T> result = holder != null ? holder : new ArrayList<T>(); 285 final int max = Math.min(k == 0 ? (mCapacity - 1) : k, indexes.size()); 286 for (int i = 0; i < max; i++) { 287 final int index = indexes.get(i); 288 final T obj = mObjects.valueAt(index); 289 if (obj != null) { 290 result.add(obj); 291 if (freqs != null) { 292 freqs.add((float) mFrequencies.valueAt(index) / mPassSize); 293 } 294 } 295 } 296 return result; 297 } 298 299 /** 300 * @param holder The list array into which the elements of the candidates are to be stored; 301 * a new list would be created and returned if this parameter is null. 302 * @return The candidate heavy hitters so far, it could include false postives. 303 */ 304 @Nullable getCandidates(final @Nullable List<T> holder)305 public List<T> getCandidates(final @Nullable List<T> holder) { 306 if (!mConfigured) { 307 throw new IllegalStateException(); 308 } 309 if (mNumInputs < mPassSize) { 310 // It hasn't done with the first pass yet, return nothing 311 return null; 312 } 313 314 List<T> result = holder != null ? holder : new ArrayList<T>(); 315 for (int i = mObjects.size() - 1; i >= 0; i--) { 316 final T obj = mObjects.valueAt(i); 317 if (obj != null) { 318 result.add(obj); 319 } 320 } 321 return result; 322 } 323 324 /** 325 * Reset this heavy hitter counter 326 */ 327 @Override reset()328 public void reset() { 329 mNumInputs = 0; 330 mObjects.clear(); 331 mFrequencies.clear(); 332 } 333 334 /** 335 * @return The ratio of the input to be used as the validation data, Float.NaN means no 336 * validation is needed. 337 */ getRequiredValidationInputRatio()338 public float getRequiredValidationInputRatio() { 339 return 0.5f; 340 } 341 } 342 } 343