1 /*
2  * Copyright (C) 2012 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef KMEANS_H
18 #define KMEANS_H
19 
20 #include <cstdlib>
21 #include <math.h>
22 
23 // Helper functions
24 
25 template <typename T, typename N>
sum(T values[],int len,int dimension,int stride,N dst[])26 inline void sum(T values[], int len, int dimension, int stride, N dst[]) {
27     int x, y;
28     // zero out dst vector
29     for (x = 0; x < dimension; x++) {
30         dst[x] = 0;
31     }
32     for (x = 0; x < len; x+= stride) {
33         for (y = 0; y < dimension; y++) {
34             dst[y] += values[x + y];
35         }
36     }
37 }
38 
39 template <typename T, typename N>
set(T val1[],N val2[],int dimension)40 inline void set(T val1[], N val2[], int dimension) {
41     int x;
42     for (x = 0; x < dimension; x++) {
43         val1[x] = val2[x];
44     }
45 }
46 
47 template <typename T, typename N>
add(T val[],N dst[],int dimension)48 inline void add(T val[], N dst[], int dimension) {
49     int x;
50     for (x = 0; x < dimension; x++) {
51         dst[x] += val[x];
52     }
53 }
54 
55 template <typename T, typename N>
divide(T dst[],N divisor,int dimension)56 inline void divide(T dst[], N divisor, int dimension) {
57    int x;
58    if (divisor == 0) {
59        return;
60    }
61    for (x = 0; x < dimension; x++) {
62        dst[x] /= divisor;
63    }
64 }
65 
66 /**
67  * Calculates euclidean distance.
68  */
69 
70 template <typename T, typename N>
euclideanDist(T val1[],T val2[],int dimension)71 inline N euclideanDist(T val1[], T val2[], int dimension) {
72     int x;
73     N sum = 0;
74     for (x = 0; x < dimension; x++) {
75         N diff = (N) val1[x] - (N) val2[x];
76         sum += diff * diff;
77     }
78     return sqrt(sum);
79 }
80 
81 // K-Means
82 
83 
84 /**
85  * Picks k random starting points from the data set.
86  */
87 template <typename T>
initialPickHeuristicRandom(int k,T values[],int len,int dimension,int stride,T dst[],unsigned int seed)88 void initialPickHeuristicRandom(int k, T values[], int len, int dimension, int stride, T dst[],
89         unsigned int seed) {
90     int x, z, num_vals, cntr;
91     num_vals = len / stride;
92     cntr = 0;
93     srand(seed);
94     unsigned int r_vals[k];
95     unsigned int r;
96 
97     for (x = 0; x < k; x++) {
98 
99         // ensure randomly chosen value is unique
100         int r_check = 0;
101         while (r_check == 0) {
102             r = (unsigned int) rand() % num_vals;
103             r_check = 1;
104             for (z = 0; z < x; z++) {
105                 if (r == r_vals[z]) {
106                     r_check = 0;
107                 }
108             }
109         }
110         r_vals[x] = r;
111         r *= stride;
112 
113         // set dst to be randomly chosen value
114         set<T,T>(dst + cntr, values + r, dimension);
115         cntr += stride;
116     }
117 }
118 
119 /**
120  * Finds index of closet centroid to a value
121  */
122 template <typename T, typename N>
findClosest(T values[],T oldCenters[],int dimension,int stride,int pop_size)123 inline int findClosest(T values[], T oldCenters[], int dimension, int stride, int pop_size) {
124     int best_ind = 0;
125     N best_len = euclideanDist <T, N>(values, oldCenters, dimension);
126     int y;
127     for (y = stride; y < pop_size; y+=stride) {
128         N l = euclideanDist <T, N>(values, oldCenters + y, dimension);
129         if (l < best_len) {
130             best_len = l;
131             best_ind = y;
132         }
133     }
134     return best_ind;
135 }
136 
137 /**
138  * Calculates new centroids by averaging value clusters for old centroids.
139  */
140 template <typename T, typename N>
calculateNewCentroids(int k,T values[],int len,int dimension,int stride,T oldCenters[],T dst[])141 int calculateNewCentroids(int k, T values[], int len, int dimension, int stride, T oldCenters[],
142         T dst[]) {
143     int x, pop_size;
144     pop_size = k * stride;
145     int popularities[k];
146     N tmp[pop_size];
147 
148     //zero popularities
149     memset(popularities, 0, sizeof(int) * k);
150     // zero dst, and tmp
151     for (x = 0; x < pop_size; x++) {
152         tmp[x] = 0;
153     }
154 
155     // put summation for each k in tmp
156     for (x = 0; x < len; x+=stride) {
157         int best = findClosest<T, N>(values + x, oldCenters, dimension, stride, pop_size);
158         add<T, N>(values + x, tmp + best, dimension);
159         popularities[best / stride]++;
160 
161     }
162 
163     int ret = 0;
164     int y;
165     // divide to get centroid and set dst to result
166     for (x = 0; x < pop_size; x+=stride) {
167         divide<N, int>(tmp + x, popularities[x / stride], dimension);
168         for (y = 0; y < dimension; y++) {
169             if ((dst + x)[y] != (T) ((tmp + x)[y])) {
170                 ret = 1;
171             }
172         }
173         set(dst + x, tmp + x, dimension);
174     }
175     return ret;
176 }
177 
178 template <typename T, typename N>
runKMeansWithPicks(int k,T finalCentroids[],T values[],int len,int dimension,int stride,int iterations,T initialPicks[])179 void runKMeansWithPicks(int k, T finalCentroids[], T values[], int len, int dimension, int stride,
180         int iterations, T initialPicks[]){
181         int k_len = k * stride;
182         int x;
183 
184         // zero newCenters
185         for (x = 0; x < k_len; x++) {
186             finalCentroids[x] = 0;
187         }
188 
189         T * c1 = initialPicks;
190         T * c2 = finalCentroids;
191         T * temp;
192         int ret = 1;
193         for (x = 0; x < iterations; x++) {
194             ret = calculateNewCentroids<T, N>(k, values, len, dimension, stride, c1, c2);
195             temp = c1;
196             c1 = c2;
197             c2 = temp;
198             if (ret == 0) {
199                 x = iterations;
200             }
201         }
202         set<T, T>(finalCentroids, c1, dimension);
203 }
204 
205 /**
206  * Runs the k-means algorithm on dataset values with some initial centroids.
207  */
208 template <typename T, typename N>
runKMeans(int k,T finalCentroids[],T values[],int len,int dimension,int stride,int iterations,unsigned int seed)209 void runKMeans(int k, T finalCentroids[], T values[], int len, int dimension, int stride,
210         int iterations, unsigned int seed){
211     int k_len = k * stride;
212     T initialPicks [k_len];
213     initialPickHeuristicRandom<T>(k, values, len, dimension, stride, initialPicks, seed);
214 
215     runKMeansWithPicks<T, N>(k, finalCentroids, values, len, dimension, stride,
216         iterations, initialPicks);
217 }
218 
219 /**
220  * Sets each value in values to the closest centroid.
221  */
222 template <typename T, typename N>
applyCentroids(int k,T centroids[],T values[],int len,int dimension,int stride)223 void applyCentroids(int k, T centroids[], T values[], int len, int dimension, int stride) {
224     int x, pop_size;
225     pop_size = k * stride;
226     for (x = 0; x < len; x+= stride) {
227         int best = findClosest<T, N>(values + x, centroids, dimension, stride, pop_size);
228         set<T, T>(values + x, centroids + best, dimension);
229     }
230 }
231 
232 #endif // KMEANS_H
233