GCC Code Coverage Report


Directory: ./
File: src/util/numerical/natBreaks.cc
Date: 2025-09-01 06:19:01
Exec Total Coverage
Lines: 0 169 0.0%
Functions: 0 16 0.0%
Branches: 0 58 0.0%

Line Branch Exec Source
1 # include "na64util/numerical/natBreaks.hh"
2
3 # include <cassert>
4 # include <algorithm>
5
6 namespace na64dp {
7 namespace util {
8 namespace jenks {
9
10 const SizeT BUFFER_SIZE = 1024;
11
12 // helper funcs
13 template <typename T> T Min(T a, T b) { return (a<b) ? a : b; }
14
15 SizeT
16 GetTotalCount( const ValueCountPairContainer & vcpc ) {
17 SizeT sum = 0;
18 ValueCountPairContainer::const_iterator
19 i = vcpc.begin(),
20 e = vcpc.end();
21 for(sum = 0; i!=e; ++i)
22 sum += (*i).second;
23 return sum;
24 }
25
26 // helper struct JenksFisher
27
28 // captures the intermediate data and methods for the calculation of Natural
29 // Class Breaks.
30 JenksFisher::JenksFisher( const ValueCountPairContainer & vcpc
31 , SizeT k )
32 : m_M(vcpc.size())
33 , m_K(k)
34 , m_BufSize(vcpc.size()-(k-1))
35 , m_PrevSSM(m_BufSize)
36 , m_CurrSSM(m_BufSize)
37 , m_CB(m_BufSize * (m_K-1))
38 , m_CBPtr() {
39 m_CumulValues.reserve(vcpc.size());
40 double cwv=0;
41 CountType cw = 0, w;
42
43 for(SizeT i=0; i!=m_M; ++i) {
44 // PRECONDITION: the value sequence must be strictly increasing
45 assert(!i || vcpc[i].first > vcpc[i-1].first);
46
47 w = vcpc[i].second;
48 // PRECONDITION: all weights must be positive
49 assert(w > 0);
50
51 cw += w;
52 // No overflow? No loss of precision?
53 assert(cw > w || !i);
54
55 cwv += w * vcpc[i].first;
56 m_CumulValues.push_back(ValueCountPair(cwv, cw));
57 if (i < m_BufSize) {
58 // prepare SSM for first class. Last (k-1) values are omitted
59 m_PrevSSM[i] = cwv * cwv / cw;
60 }
61 }
62 }
63
64 double
65 JenksFisher::GetW(SizeT b, SizeT e) {
66 assert(b); // First element always belongs to class 0, thus queries should never include it.
67 assert(b<=e);
68 assert(e<m_M);
69
70 double res = m_CumulValues[e].second;
71 res -= m_CumulValues[b-1].second;
72 return res;
73 }
74
75 double
76 JenksFisher::GetWV(SizeT b, SizeT e) {
77 assert(b);
78 assert(b<=e);
79 assert(e<m_M);
80
81 double res = m_CumulValues[e].first;
82 res -= m_CumulValues[b-1].first;
83 return res;
84 }
85
86 double
87 JenksFisher::GetSSM(SizeT b, SizeT e) {
88 double res = GetWV(b,e);
89 return res * res / GetW(b,e);
90 }
91
92 SizeT
93 JenksFisher::FindMaxBreakIndex(SizeT i, SizeT bp, SizeT ep) {
94 assert(bp < ep);
95 assert(bp <= i);
96 assert(ep <= i+1);
97 assert(i < m_BufSize);
98 assert(ep <= m_BufSize);
99
100 double minSSM = m_PrevSSM[bp] + GetSSM(bp+m_NrCompletedRows, i+m_NrCompletedRows);
101 SizeT foundP = bp;
102 while (++bp < ep)
103 {
104 double currSSM = m_PrevSSM[bp] + GetSSM(bp+m_NrCompletedRows, i+m_NrCompletedRows);
105 if (currSSM > minSSM)
106 {
107 minSSM = currSSM;
108 foundP = bp;
109 }
110 }
111 m_CurrSSM[i] = minSSM;
112 return foundP;
113 }
114
115 void
116 JenksFisher::CalcRange(SizeT bi, SizeT ei, SizeT bp, SizeT ep) {
117 assert(bi <= ei);
118
119 assert(ep <= ei);
120 assert(bp <= bi);
121
122 if (bi == ei)
123 return;
124 assert(bp < ep);
125
126 SizeT mi = (bi + ei)/2;
127 SizeT mp = FindMaxBreakIndex(mi, bp, Min<SizeT>(ep, mi+1));
128
129 assert(bp <= mp);
130 assert(mp < ep);
131 assert(mp <= mi);
132
133 // solve first half of the sub-problems with lower 'half' of possible outcomes
134 CalcRange(bi, mi, bp, Min<SizeT>(mi, mp+1));
135
136 m_CBPtr[ mi ] = mp; // store result for the middle element.
137
138 // solve second half of the sub-problems with upper 'half' of possible outcomes
139 CalcRange(mi+1, ei, mp, ep);
140 }
141
142 void
143 JenksFisher::CalcAll() {
144 if (m_K>=2) {
145 m_CBPtr = m_CB.begin();
146 for (m_NrCompletedRows=1; m_NrCompletedRows<m_K-1; ++m_NrCompletedRows) {
147 CalcRange(0, m_BufSize, 0, m_BufSize); // complexity: O(m*log(m))
148
149 m_PrevSSM.swap(m_CurrSSM);
150 m_CBPtr += m_BufSize;
151 }
152 }
153 }
154
155
156 void
157 GetCountsDirect( ValueCountPairContainer & vcpc
158 , const double * values
159 , SizeT size ) {
160 assert(size <= BUFFER_SIZE);
161 assert(size > 0);
162 assert(vcpc.empty());
163
164 double buffer[BUFFER_SIZE];
165
166 std::copy(values, values+size, buffer);
167 std::sort(buffer, buffer+size);
168
169 double currValue = buffer[0];
170 SizeT currCount = 1;
171 for (SizeT index = 1; index != size; ++index) {
172 if (currValue < buffer[index]) {
173 vcpc.push_back(ValueCountPair(currValue, currCount));
174 currValue = buffer[index];
175 currCount = 1;
176 }
177 else
178 ++currCount;
179 }
180 vcpc.push_back(ValueCountPair(currValue, currCount));
181 }
182
183 struct CompareFirst {
184 bool operator () (const ValueCountPair& lhs, const ValueCountPair& rhs) {
185 return lhs.first < rhs.first;
186 }
187 };
188
189 static void
190 MergeToLeft( ValueCountPairContainer & vcpcLeft
191 , const ValueCountPairContainer & vcpcRight
192 , ValueCountPairContainer& vcpcDummy ) {
193 assert(vcpcDummy.empty());
194 vcpcDummy.swap(vcpcLeft);
195 vcpcLeft.resize(vcpcRight.size() + vcpcDummy.size());
196
197 std::merge( vcpcRight.begin(), vcpcRight.end()
198 , vcpcDummy.begin(), vcpcDummy.end()
199 , vcpcLeft.begin(), CompareFirst() );
200
201 ValueCountPairContainer::iterator
202 currPair = vcpcLeft.begin(),
203 lastPair = vcpcLeft.end();
204
205 ValueCountPairContainer::iterator index = currPair+1;
206 while (index != lastPair && currPair->first < index->first) {
207 currPair = index;
208 ++index;
209 }
210
211 double currValue = currPair->first;
212 SizeT currCount = currPair->second;
213 for (; index != lastPair;++index) {
214 if (currValue < index->first) {
215 *currPair++ = ValueCountPair(currValue, currCount);
216 currValue = index->first;
217 currCount = index->second;
218 }
219 else
220 currCount += index->second;
221 }
222 *currPair++ = ValueCountPair(currValue, currCount);
223 vcpcLeft.erase(currPair, lastPair);
224
225 vcpcDummy.clear();
226 }
227
228
229 void
230 ValueCountPairContainerArray::resize(SizeT k) {
231 assert(capacity() >= k);
232 while (size() < k) {
233 push_back(ValueCountPairContainer());
234 back().reserve(BUFFER_SIZE);
235 }
236 }
237
238 void
239 ValueCountPairContainerArray::GetValueCountPairs( ValueCountPairContainer & vcpc
240 , const double * values
241 , SizeT size
242 , unsigned int nrUsedContainers ) {
243 assert(vcpc.empty());
244 if (size <= BUFFER_SIZE) {
245 GetCountsDirect(vcpc, values, size);
246 } else {
247 resize(nrUsedContainers+2);
248
249 unsigned int m = size/2;
250
251 GetValueCountPairs( vcpc, values, m, nrUsedContainers );
252 GetValueCountPairs( begin()[nrUsedContainers]
253 , values + m
254 , size - m
255 , nrUsedContainers+1 );
256
257 MergeToLeft( vcpc
258 , begin()[nrUsedContainers]
259 , begin()[nrUsedContainers+1]
260 );
261 begin()[nrUsedContainers].clear();
262 }
263 assert(GetTotalCount(vcpc) == size);
264 }
265
266
267 void
268 GetValueCountPairs( ValueCountPairContainer & vcpc
269 , const double * values
270 , SizeT n ) {
271 vcpc.clear();
272 if( n ) {
273 ValueCountPairContainerArray vcpca;
274 // max nr halving is log2(max cardinality / BUFFER_SIZE); max cardinality is SizeT(-1)
275 vcpca.reserve(3 + 8*sizeof(SizeT) - 10);
276 vcpca.GetValueCountPairs(vcpc, values, n, 0);
277
278 assert(vcpc.size());
279 }
280 }
281
282 void
283 ClassifyJenksFisherFromValueCountPairs( LimitsContainer & breaksArray
284 , SizeT k
285 , const ValueCountPairContainer& vcpc ) {
286 breaksArray.resize(k);
287 SizeT m = vcpc.size();
288
289 assert(k <= m); // PRECONDITION
290
291 if (!k)
292 return;
293
294 JenksFisher jf(vcpc, k);
295
296 if (k > 1) {
297 jf.CalcAll();
298
299 SizeT lastClassBreakIndex = jf.FindMaxBreakIndex(jf.m_BufSize-1, 0, jf.m_BufSize);
300
301 while (--k) {
302 breaksArray[k] = vcpc[lastClassBreakIndex+k].first;
303 assert(lastClassBreakIndex < jf.m_BufSize);
304 if (k > 1)
305 {
306 jf.m_CBPtr -= jf.m_BufSize;
307 lastClassBreakIndex = jf.m_CBPtr[lastClassBreakIndex];
308 }
309 }
310 assert(jf.m_CBPtr == jf.m_CB.begin());
311 }
312 assert( k == 0 );
313 breaksArray[0] = vcpc[0].first; // break for the first class is the minimum of the dataset.
314 }
315
316 } // namespace jenks
317 } // util
318 }
319
320
321 # ifdef TEST_BUILD
322 // test code
323 #include "boost/random.hpp"
324
325 int
326 main(int c, char** argv) {
327 const double rangeMin = 0.0;
328 const double rangeMax = 10.0;
329 typedef boost::uniform_real<double> NumberDistribution;
330 typedef boost::mt19937 RandomNumberGenerator;
331 typedef boost::variate_generator<RandomNumberGenerator&, NumberDistribution> Generator;
332
333 NumberDistribution distribution(rangeMin, rangeMax);
334 RandomNumberGenerator generator;
335 generator.seed(0); // seed with the current time
336 Generator numberGenerator(generator, distribution);
337
338 const int n = 1000000;
339 const int k = 10;
340
341 std::cout << "Generating random numbers..." << std::endl;
342
343 std::vector<double> values;
344 values.reserve(n);
345 // populate a distribuiton slightly more interesting than uniform,
346 // with a lower density at higher values.
347 for( int i=0; i!=n; ++i ) {
348 double v = numberGenerator();
349 values.push_back(v*v);
350 }
351 assert(values.size() == n);
352
353 std::cout << "Generating sortedUniqueValueCounts ..." << std::endl;
354 ValueCountPairContainer sortedUniqueValueCounts;
355 GetValueCountPairs( sortedUniqueValueCounts, &values[0], n );
356
357 std::cout << sortedUniqueValueCounts.size() << " unique values generated." << std::endl;
358 //for( auto p : sortedUniqueValueCounts )
359
360 std::cout << "Finding Jenks ClassBreaks..." << std::endl;
361 LimitsContainer resultingbreaksArray;
362 ClassifyJenksFisherFromValueCountPairs( resultingbreaksArray, k, sortedUniqueValueCounts );
363
364 std::cout << "Reporting results..." << std::endl;
365 for( double breakValue : resultingbreaksArray ) {
366 std::cout << breakValue << std::endl;
367 }
368 } // main
369 # endif // TEST_BUILD
370