1 /*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #ifndef KMEANS_H
18 #define KMEANS_H
19
20 #include <cstdlib>
21 #include <math.h>
22
23 // Helper functions
24
25 template <typename T, typename N>
sum(T values[],int len,int dimension,int stride,N dst[])26 inline void sum(T values[], int len, int dimension, int stride, N dst[]) {
27 int x, y;
28 // zero out dst vector
29 for (x = 0; x < dimension; x++) {
30 dst[x] = 0;
31 }
32 for (x = 0; x < len; x+= stride) {
33 for (y = 0; y < dimension; y++) {
34 dst[y] += values[x + y];
35 }
36 }
37 }
38
39 template <typename T, typename N>
set(T val1[],N val2[],int dimension)40 inline void set(T val1[], N val2[], int dimension) {
41 int x;
42 for (x = 0; x < dimension; x++) {
43 val1[x] = val2[x];
44 }
45 }
46
47 template <typename T, typename N>
add(T val[],N dst[],int dimension)48 inline void add(T val[], N dst[], int dimension) {
49 int x;
50 for (x = 0; x < dimension; x++) {
51 dst[x] += val[x];
52 }
53 }
54
55 template <typename T, typename N>
divide(T dst[],N divisor,int dimension)56 inline void divide(T dst[], N divisor, int dimension) {
57 int x;
58 if (divisor == 0) {
59 return;
60 }
61 for (x = 0; x < dimension; x++) {
62 dst[x] /= divisor;
63 }
64 }
65
66 /**
67 * Calculates euclidean distance.
68 */
69
70 template <typename T, typename N>
euclideanDist(T val1[],T val2[],int dimension)71 inline N euclideanDist(T val1[], T val2[], int dimension) {
72 int x;
73 N sum = 0;
74 for (x = 0; x < dimension; x++) {
75 N diff = (N) val1[x] - (N) val2[x];
76 sum += diff * diff;
77 }
78 return sqrt(sum);
79 }
80
81 // K-Means
82
83
84 /**
85 * Picks k random starting points from the data set.
86 */
87 template <typename T>
initialPickHeuristicRandom(int k,T values[],int len,int dimension,int stride,T dst[],unsigned int seed)88 void initialPickHeuristicRandom(int k, T values[], int len, int dimension, int stride, T dst[],
89 unsigned int seed) {
90 int x, z, num_vals, cntr;
91 num_vals = len / stride;
92 cntr = 0;
93 srand(seed);
94 unsigned int r_vals[k];
95 unsigned int r;
96
97 for (x = 0; x < k; x++) {
98
99 // ensure randomly chosen value is unique
100 int r_check = 0;
101 while (r_check == 0) {
102 r = (unsigned int) rand() % num_vals;
103 r_check = 1;
104 for (z = 0; z < x; z++) {
105 if (r == r_vals[z]) {
106 r_check = 0;
107 }
108 }
109 }
110 r_vals[x] = r;
111 r *= stride;
112
113 // set dst to be randomly chosen value
114 set<T,T>(dst + cntr, values + r, dimension);
115 cntr += stride;
116 }
117 }
118
119 /**
120 * Finds index of closet centroid to a value
121 */
122 template <typename T, typename N>
findClosest(T values[],T oldCenters[],int dimension,int stride,int pop_size)123 inline int findClosest(T values[], T oldCenters[], int dimension, int stride, int pop_size) {
124 int best_ind = 0;
125 N best_len = euclideanDist <T, N>(values, oldCenters, dimension);
126 int y;
127 for (y = stride; y < pop_size; y+=stride) {
128 N l = euclideanDist <T, N>(values, oldCenters + y, dimension);
129 if (l < best_len) {
130 best_len = l;
131 best_ind = y;
132 }
133 }
134 return best_ind;
135 }
136
137 /**
138 * Calculates new centroids by averaging value clusters for old centroids.
139 */
140 template <typename T, typename N>
calculateNewCentroids(int k,T values[],int len,int dimension,int stride,T oldCenters[],T dst[])141 int calculateNewCentroids(int k, T values[], int len, int dimension, int stride, T oldCenters[],
142 T dst[]) {
143 int x, pop_size;
144 pop_size = k * stride;
145 int popularities[k];
146 N tmp[pop_size];
147
148 //zero popularities
149 memset(popularities, 0, sizeof(int) * k);
150 // zero dst, and tmp
151 for (x = 0; x < pop_size; x++) {
152 tmp[x] = 0;
153 }
154
155 // put summation for each k in tmp
156 for (x = 0; x < len; x+=stride) {
157 int best = findClosest<T, N>(values + x, oldCenters, dimension, stride, pop_size);
158 add<T, N>(values + x, tmp + best, dimension);
159 popularities[best / stride]++;
160
161 }
162
163 int ret = 0;
164 int y;
165 // divide to get centroid and set dst to result
166 for (x = 0; x < pop_size; x+=stride) {
167 divide<N, int>(tmp + x, popularities[x / stride], dimension);
168 for (y = 0; y < dimension; y++) {
169 if ((dst + x)[y] != (T) ((tmp + x)[y])) {
170 ret = 1;
171 }
172 }
173 set(dst + x, tmp + x, dimension);
174 }
175 return ret;
176 }
177
178 template <typename T, typename N>
runKMeansWithPicks(int k,T finalCentroids[],T values[],int len,int dimension,int stride,int iterations,T initialPicks[])179 void runKMeansWithPicks(int k, T finalCentroids[], T values[], int len, int dimension, int stride,
180 int iterations, T initialPicks[]){
181 int k_len = k * stride;
182 int x;
183
184 // zero newCenters
185 for (x = 0; x < k_len; x++) {
186 finalCentroids[x] = 0;
187 }
188
189 T * c1 = initialPicks;
190 T * c2 = finalCentroids;
191 T * temp;
192 int ret = 1;
193 for (x = 0; x < iterations; x++) {
194 ret = calculateNewCentroids<T, N>(k, values, len, dimension, stride, c1, c2);
195 temp = c1;
196 c1 = c2;
197 c2 = temp;
198 if (ret == 0) {
199 x = iterations;
200 }
201 }
202 set<T, T>(finalCentroids, c1, dimension);
203 }
204
205 /**
206 * Runs the k-means algorithm on dataset values with some initial centroids.
207 */
208 template <typename T, typename N>
runKMeans(int k,T finalCentroids[],T values[],int len,int dimension,int stride,int iterations,unsigned int seed)209 void runKMeans(int k, T finalCentroids[], T values[], int len, int dimension, int stride,
210 int iterations, unsigned int seed){
211 int k_len = k * stride;
212 T initialPicks [k_len];
213 initialPickHeuristicRandom<T>(k, values, len, dimension, stride, initialPicks, seed);
214
215 runKMeansWithPicks<T, N>(k, finalCentroids, values, len, dimension, stride,
216 iterations, initialPicks);
217 }
218
219 /**
220 * Sets each value in values to the closest centroid.
221 */
222 template <typename T, typename N>
applyCentroids(int k,T centroids[],T values[],int len,int dimension,int stride)223 void applyCentroids(int k, T centroids[], T values[], int len, int dimension, int stride) {
224 int x, pop_size;
225 pop_size = k * stride;
226 for (x = 0; x < len; x+= stride) {
227 int best = findClosest<T, N>(values + x, centroids, dimension, stride, pop_size);
228 set<T, T>(values + x, centroids + best, dimension);
229 }
230 }
231
232 #endif // KMEANS_H
233