| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | # include "na64util/numerical/natBreaks.hh" | ||
| 2 | |||
| 3 | # include <cassert> | ||
| 4 | # include <algorithm> | ||
| 5 | |||
| 6 | namespace na64dp { | ||
| 7 | namespace util { | ||
| 8 | namespace jenks { | ||
| 9 | |||
| 10 | const SizeT BUFFER_SIZE = 1024; | ||
| 11 | |||
| 12 | // helper funcs | ||
| 13 | ✗ | template <typename T> T Min(T a, T b) { return (a<b) ? a : b; } | |
| 14 | |||
| 15 | SizeT | ||
| 16 | ✗ | GetTotalCount( const ValueCountPairContainer & vcpc ) { | |
| 17 | ✗ | SizeT sum = 0; | |
| 18 | ValueCountPairContainer::const_iterator | ||
| 19 | ✗ | i = vcpc.begin(), | |
| 20 | ✗ | e = vcpc.end(); | |
| 21 | ✗ | for(sum = 0; i!=e; ++i) | |
| 22 | ✗ | sum += (*i).second; | |
| 23 | ✗ | return sum; | |
| 24 | } | ||
| 25 | |||
| 26 | // helper struct JenksFisher | ||
| 27 | |||
| 28 | // captures the intermediate data and methods for the calculation of Natural | ||
| 29 | // Class Breaks. | ||
| 30 | ✗ | JenksFisher::JenksFisher( const ValueCountPairContainer & vcpc | |
| 31 | ✗ | , SizeT k ) | |
| 32 | ✗ | : m_M(vcpc.size()) | |
| 33 | ✗ | , m_K(k) | |
| 34 | ✗ | , m_BufSize(vcpc.size()-(k-1)) | |
| 35 | ✗ | , m_PrevSSM(m_BufSize) | |
| 36 | ✗ | , m_CurrSSM(m_BufSize) | |
| 37 | ✗ | , m_CB(m_BufSize * (m_K-1)) | |
| 38 | ✗ | , m_CBPtr() { | |
| 39 | ✗ | m_CumulValues.reserve(vcpc.size()); | |
| 40 | ✗ | double cwv=0; | |
| 41 | ✗ | CountType cw = 0, w; | |
| 42 | |||
| 43 | ✗ | for(SizeT i=0; i!=m_M; ++i) { | |
| 44 | // PRECONDITION: the value sequence must be strictly increasing | ||
| 45 | ✗ | assert(!i || vcpc[i].first > vcpc[i-1].first); | |
| 46 | |||
| 47 | ✗ | w = vcpc[i].second; | |
| 48 | // PRECONDITION: all weights must be positive | ||
| 49 | ✗ | assert(w > 0); | |
| 50 | |||
| 51 | ✗ | cw += w; | |
| 52 | // No overflow? No loss of precision? | ||
| 53 | ✗ | assert(cw > w || !i); | |
| 54 | |||
| 55 | ✗ | cwv += w * vcpc[i].first; | |
| 56 | ✗ | m_CumulValues.push_back(ValueCountPair(cwv, cw)); | |
| 57 | ✗ | if (i < m_BufSize) { | |
| 58 | // prepare SSM for first class. Last (k-1) values are omitted | ||
| 59 | ✗ | m_PrevSSM[i] = cwv * cwv / cw; | |
| 60 | } | ||
| 61 | } | ||
| 62 | } | ||
| 63 | |||
| 64 | double | ||
| 65 | ✗ | JenksFisher::GetW(SizeT b, SizeT e) { | |
| 66 | ✗ | assert(b); // First element always belongs to class 0, thus queries should never include it. | |
| 67 | ✗ | assert(b<=e); | |
| 68 | ✗ | assert(e<m_M); | |
| 69 | |||
| 70 | ✗ | double res = m_CumulValues[e].second; | |
| 71 | ✗ | res -= m_CumulValues[b-1].second; | |
| 72 | ✗ | return res; | |
| 73 | } | ||
| 74 | |||
| 75 | double | ||
| 76 | ✗ | JenksFisher::GetWV(SizeT b, SizeT e) { | |
| 77 | ✗ | assert(b); | |
| 78 | ✗ | assert(b<=e); | |
| 79 | ✗ | assert(e<m_M); | |
| 80 | |||
| 81 | ✗ | double res = m_CumulValues[e].first; | |
| 82 | ✗ | res -= m_CumulValues[b-1].first; | |
| 83 | ✗ | return res; | |
| 84 | } | ||
| 85 | |||
| 86 | double | ||
| 87 | ✗ | JenksFisher::GetSSM(SizeT b, SizeT e) { | |
| 88 | ✗ | double res = GetWV(b,e); | |
| 89 | ✗ | return res * res / GetW(b,e); | |
| 90 | } | ||
| 91 | |||
| 92 | SizeT | ||
| 93 | ✗ | JenksFisher::FindMaxBreakIndex(SizeT i, SizeT bp, SizeT ep) { | |
| 94 | ✗ | assert(bp < ep); | |
| 95 | ✗ | assert(bp <= i); | |
| 96 | ✗ | assert(ep <= i+1); | |
| 97 | ✗ | assert(i < m_BufSize); | |
| 98 | ✗ | assert(ep <= m_BufSize); | |
| 99 | |||
| 100 | ✗ | double minSSM = m_PrevSSM[bp] + GetSSM(bp+m_NrCompletedRows, i+m_NrCompletedRows); | |
| 101 | ✗ | SizeT foundP = bp; | |
| 102 | ✗ | while (++bp < ep) | |
| 103 | { | ||
| 104 | ✗ | double currSSM = m_PrevSSM[bp] + GetSSM(bp+m_NrCompletedRows, i+m_NrCompletedRows); | |
| 105 | ✗ | if (currSSM > minSSM) | |
| 106 | { | ||
| 107 | ✗ | minSSM = currSSM; | |
| 108 | ✗ | foundP = bp; | |
| 109 | } | ||
| 110 | } | ||
| 111 | ✗ | m_CurrSSM[i] = minSSM; | |
| 112 | ✗ | return foundP; | |
| 113 | } | ||
| 114 | |||
| 115 | void | ||
| 116 | ✗ | JenksFisher::CalcRange(SizeT bi, SizeT ei, SizeT bp, SizeT ep) { | |
| 117 | ✗ | assert(bi <= ei); | |
| 118 | |||
| 119 | ✗ | assert(ep <= ei); | |
| 120 | ✗ | assert(bp <= bi); | |
| 121 | |||
| 122 | ✗ | if (bi == ei) | |
| 123 | ✗ | return; | |
| 124 | ✗ | assert(bp < ep); | |
| 125 | |||
| 126 | ✗ | SizeT mi = (bi + ei)/2; | |
| 127 | ✗ | SizeT mp = FindMaxBreakIndex(mi, bp, Min<SizeT>(ep, mi+1)); | |
| 128 | |||
| 129 | ✗ | assert(bp <= mp); | |
| 130 | ✗ | assert(mp < ep); | |
| 131 | ✗ | assert(mp <= mi); | |
| 132 | |||
| 133 | // solve first half of the sub-problems with lower 'half' of possible outcomes | ||
| 134 | ✗ | CalcRange(bi, mi, bp, Min<SizeT>(mi, mp+1)); | |
| 135 | |||
| 136 | ✗ | m_CBPtr[ mi ] = mp; // store result for the middle element. | |
| 137 | |||
| 138 | // solve second half of the sub-problems with upper 'half' of possible outcomes | ||
| 139 | ✗ | CalcRange(mi+1, ei, mp, ep); | |
| 140 | } | ||
| 141 | |||
| 142 | void | ||
| 143 | ✗ | JenksFisher::CalcAll() { | |
| 144 | ✗ | if (m_K>=2) { | |
| 145 | ✗ | m_CBPtr = m_CB.begin(); | |
| 146 | ✗ | for (m_NrCompletedRows=1; m_NrCompletedRows<m_K-1; ++m_NrCompletedRows) { | |
| 147 | ✗ | CalcRange(0, m_BufSize, 0, m_BufSize); // complexity: O(m*log(m)) | |
| 148 | |||
| 149 | ✗ | m_PrevSSM.swap(m_CurrSSM); | |
| 150 | ✗ | m_CBPtr += m_BufSize; | |
| 151 | } | ||
| 152 | } | ||
| 153 | } | ||
| 154 | |||
| 155 | |||
| 156 | void | ||
| 157 | ✗ | GetCountsDirect( ValueCountPairContainer & vcpc | |
| 158 | , const double * values | ||
| 159 | , SizeT size ) { | ||
| 160 | ✗ | assert(size <= BUFFER_SIZE); | |
| 161 | ✗ | assert(size > 0); | |
| 162 | ✗ | assert(vcpc.empty()); | |
| 163 | |||
| 164 | double buffer[BUFFER_SIZE]; | ||
| 165 | |||
| 166 | ✗ | std::copy(values, values+size, buffer); | |
| 167 | ✗ | std::sort(buffer, buffer+size); | |
| 168 | |||
| 169 | ✗ | double currValue = buffer[0]; | |
| 170 | ✗ | SizeT currCount = 1; | |
| 171 | ✗ | for (SizeT index = 1; index != size; ++index) { | |
| 172 | ✗ | if (currValue < buffer[index]) { | |
| 173 | ✗ | vcpc.push_back(ValueCountPair(currValue, currCount)); | |
| 174 | ✗ | currValue = buffer[index]; | |
| 175 | ✗ | currCount = 1; | |
| 176 | } | ||
| 177 | else | ||
| 178 | ✗ | ++currCount; | |
| 179 | } | ||
| 180 | ✗ | vcpc.push_back(ValueCountPair(currValue, currCount)); | |
| 181 | } | ||
| 182 | |||
| 183 | struct CompareFirst { | ||
| 184 | ✗ | bool operator () (const ValueCountPair& lhs, const ValueCountPair& rhs) { | |
| 185 | ✗ | return lhs.first < rhs.first; | |
| 186 | } | ||
| 187 | }; | ||
| 188 | |||
| 189 | static void | ||
| 190 | ✗ | MergeToLeft( ValueCountPairContainer & vcpcLeft | |
| 191 | , const ValueCountPairContainer & vcpcRight | ||
| 192 | , ValueCountPairContainer& vcpcDummy ) { | ||
| 193 | ✗ | assert(vcpcDummy.empty()); | |
| 194 | ✗ | vcpcDummy.swap(vcpcLeft); | |
| 195 | ✗ | vcpcLeft.resize(vcpcRight.size() + vcpcDummy.size()); | |
| 196 | |||
| 197 | ✗ | std::merge( vcpcRight.begin(), vcpcRight.end() | |
| 198 | , vcpcDummy.begin(), vcpcDummy.end() | ||
| 199 | , vcpcLeft.begin(), CompareFirst() ); | ||
| 200 | |||
| 201 | ValueCountPairContainer::iterator | ||
| 202 | ✗ | currPair = vcpcLeft.begin(), | |
| 203 | ✗ | lastPair = vcpcLeft.end(); | |
| 204 | |||
| 205 | ✗ | ValueCountPairContainer::iterator index = currPair+1; | |
| 206 | ✗ | while (index != lastPair && currPair->first < index->first) { | |
| 207 | ✗ | currPair = index; | |
| 208 | ✗ | ++index; | |
| 209 | } | ||
| 210 | |||
| 211 | ✗ | double currValue = currPair->first; | |
| 212 | ✗ | SizeT currCount = currPair->second; | |
| 213 | ✗ | for (; index != lastPair;++index) { | |
| 214 | ✗ | if (currValue < index->first) { | |
| 215 | ✗ | *currPair++ = ValueCountPair(currValue, currCount); | |
| 216 | ✗ | currValue = index->first; | |
| 217 | ✗ | currCount = index->second; | |
| 218 | } | ||
| 219 | else | ||
| 220 | ✗ | currCount += index->second; | |
| 221 | } | ||
| 222 | ✗ | *currPair++ = ValueCountPair(currValue, currCount); | |
| 223 | ✗ | vcpcLeft.erase(currPair, lastPair); | |
| 224 | |||
| 225 | ✗ | vcpcDummy.clear(); | |
| 226 | } | ||
| 227 | |||
| 228 | |||
| 229 | void | ||
| 230 | ✗ | ValueCountPairContainerArray::resize(SizeT k) { | |
| 231 | ✗ | assert(capacity() >= k); | |
| 232 | ✗ | while (size() < k) { | |
| 233 | ✗ | push_back(ValueCountPairContainer()); | |
| 234 | ✗ | back().reserve(BUFFER_SIZE); | |
| 235 | } | ||
| 236 | } | ||
| 237 | |||
| 238 | void | ||
| 239 | ✗ | ValueCountPairContainerArray::GetValueCountPairs( ValueCountPairContainer & vcpc | |
| 240 | , const double * values | ||
| 241 | , SizeT size | ||
| 242 | , unsigned int nrUsedContainers ) { | ||
| 243 | ✗ | assert(vcpc.empty()); | |
| 244 | ✗ | if (size <= BUFFER_SIZE) { | |
| 245 | ✗ | GetCountsDirect(vcpc, values, size); | |
| 246 | } else { | ||
| 247 | ✗ | resize(nrUsedContainers+2); | |
| 248 | |||
| 249 | ✗ | unsigned int m = size/2; | |
| 250 | |||
| 251 | ✗ | GetValueCountPairs( vcpc, values, m, nrUsedContainers ); | |
| 252 | ✗ | GetValueCountPairs( begin()[nrUsedContainers] | |
| 253 | ✗ | , values + m | |
| 254 | ✗ | , size - m | |
| 255 | , nrUsedContainers+1 ); | ||
| 256 | |||
| 257 | ✗ | MergeToLeft( vcpc | |
| 258 | ✗ | , begin()[nrUsedContainers] | |
| 259 | ✗ | , begin()[nrUsedContainers+1] | |
| 260 | ); | ||
| 261 | ✗ | begin()[nrUsedContainers].clear(); | |
| 262 | } | ||
| 263 | ✗ | assert(GetTotalCount(vcpc) == size); | |
| 264 | } | ||
| 265 | |||
| 266 | |||
| 267 | void | ||
| 268 | ✗ | GetValueCountPairs( ValueCountPairContainer & vcpc | |
| 269 | , const double * values | ||
| 270 | , SizeT n ) { | ||
| 271 | ✗ | vcpc.clear(); | |
| 272 | ✗ | if( n ) { | |
| 273 | ✗ | ValueCountPairContainerArray vcpca; | |
| 274 | // max nr halving is log2(max cardinality / BUFFER_SIZE); max cardinality is SizeT(-1) | ||
| 275 | ✗ | vcpca.reserve(3 + 8*sizeof(SizeT) - 10); | |
| 276 | ✗ | vcpca.GetValueCountPairs(vcpc, values, n, 0); | |
| 277 | |||
| 278 | ✗ | assert(vcpc.size()); | |
| 279 | } | ||
| 280 | } | ||
| 281 | |||
| 282 | void | ||
| 283 | ✗ | ClassifyJenksFisherFromValueCountPairs( LimitsContainer & breaksArray | |
| 284 | , SizeT k | ||
| 285 | , const ValueCountPairContainer& vcpc ) { | ||
| 286 | ✗ | breaksArray.resize(k); | |
| 287 | ✗ | SizeT m = vcpc.size(); | |
| 288 | |||
| 289 | ✗ | assert(k <= m); // PRECONDITION | |
| 290 | |||
| 291 | ✗ | if (!k) | |
| 292 | ✗ | return; | |
| 293 | |||
| 294 | ✗ | JenksFisher jf(vcpc, k); | |
| 295 | |||
| 296 | ✗ | if (k > 1) { | |
| 297 | ✗ | jf.CalcAll(); | |
| 298 | |||
| 299 | ✗ | SizeT lastClassBreakIndex = jf.FindMaxBreakIndex(jf.m_BufSize-1, 0, jf.m_BufSize); | |
| 300 | |||
| 301 | ✗ | while (--k) { | |
| 302 | ✗ | breaksArray[k] = vcpc[lastClassBreakIndex+k].first; | |
| 303 | ✗ | assert(lastClassBreakIndex < jf.m_BufSize); | |
| 304 | ✗ | if (k > 1) | |
| 305 | { | ||
| 306 | ✗ | jf.m_CBPtr -= jf.m_BufSize; | |
| 307 | ✗ | lastClassBreakIndex = jf.m_CBPtr[lastClassBreakIndex]; | |
| 308 | } | ||
| 309 | } | ||
| 310 | ✗ | assert(jf.m_CBPtr == jf.m_CB.begin()); | |
| 311 | } | ||
| 312 | ✗ | assert( k == 0 ); | |
| 313 | ✗ | breaksArray[0] = vcpc[0].first; // break for the first class is the minimum of the dataset. | |
| 314 | } | ||
| 315 | |||
| 316 | } // namespace jenks | ||
| 317 | } // util | ||
| 318 | } | ||
| 319 | |||
| 320 | |||
| 321 | # ifdef TEST_BUILD | ||
| 322 | // test code | ||
| 323 | #include "boost/random.hpp" | ||
| 324 | |||
| 325 | int | ||
| 326 | main(int c, char** argv) { | ||
| 327 | const double rangeMin = 0.0; | ||
| 328 | const double rangeMax = 10.0; | ||
| 329 | typedef boost::uniform_real<double> NumberDistribution; | ||
| 330 | typedef boost::mt19937 RandomNumberGenerator; | ||
| 331 | typedef boost::variate_generator<RandomNumberGenerator&, NumberDistribution> Generator; | ||
| 332 | |||
| 333 | NumberDistribution distribution(rangeMin, rangeMax); | ||
| 334 | RandomNumberGenerator generator; | ||
| 335 | generator.seed(0); // seed with the current time | ||
| 336 | Generator numberGenerator(generator, distribution); | ||
| 337 | |||
| 338 | const int n = 1000000; | ||
| 339 | const int k = 10; | ||
| 340 | |||
| 341 | std::cout << "Generating random numbers..." << std::endl; | ||
| 342 | |||
| 343 | std::vector<double> values; | ||
| 344 | values.reserve(n); | ||
| 345 | // populate a distribuiton slightly more interesting than uniform, | ||
| 346 | // with a lower density at higher values. | ||
| 347 | for( int i=0; i!=n; ++i ) { | ||
| 348 | double v = numberGenerator(); | ||
| 349 | values.push_back(v*v); | ||
| 350 | } | ||
| 351 | assert(values.size() == n); | ||
| 352 | |||
| 353 | std::cout << "Generating sortedUniqueValueCounts ..." << std::endl; | ||
| 354 | ValueCountPairContainer sortedUniqueValueCounts; | ||
| 355 | GetValueCountPairs( sortedUniqueValueCounts, &values[0], n ); | ||
| 356 | |||
| 357 | std::cout << sortedUniqueValueCounts.size() << " unique values generated." << std::endl; | ||
| 358 | //for( auto p : sortedUniqueValueCounts ) | ||
| 359 | |||
| 360 | std::cout << "Finding Jenks ClassBreaks..." << std::endl; | ||
| 361 | LimitsContainer resultingbreaksArray; | ||
| 362 | ClassifyJenksFisherFromValueCountPairs( resultingbreaksArray, k, sortedUniqueValueCounts ); | ||
| 363 | |||
| 364 | std::cout << "Reporting results..." << std::endl; | ||
| 365 | for( double breakValue : resultingbreaksArray ) { | ||
| 366 | std::cout << breakValue << std::endl; | ||
| 367 | } | ||
| 368 | } // main | ||
| 369 | # endif // TEST_BUILD | ||
| 370 |