| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | #pragma once | ||
| 2 | |||
| 3 | /**\file | ||
| 4 | * \brief Routines for online statistics | ||
| 5 | * | ||
| 6 | * Contains various application classes implementing windowed (sliding, | ||
| 7 | * rolling) algorithms. | ||
| 8 | * | ||
| 9 | * \ingroup numerical-utils | ||
| 10 | */ | ||
| 11 | |||
| 12 | ///\defgroup numerical-utils-online-stats Numerical utils for online statistics operations | ||
| 13 | |||
| 14 | #include "na64util/numerical/sums.h" | ||
| 15 | |||
| 16 | #include <deque> | ||
| 17 | #include <vector> | ||
| 18 | |||
| 19 | #include <stddef.h> // size_t | ||
| 20 | |||
| 21 | namespace na64dp { | ||
| 22 | namespace numerical { | ||
| 23 | |||
| 24 | ///\brief C++ interface over C implementation of Klein's scorer | ||
| 25 | /// | ||
| 26 | ///\see na64dp_KleinScorer | ||
| 27 | ///\ingroup numerical-utils-online-stats | ||
| 28 | class KleinScorer : protected na64dp_KleinScorer_t { | ||
| 29 | public: | ||
| 30 | /// Constructs initialized scorer instance | ||
| 31 | 3 | KleinScorer() { | |
| 32 | 3 | na64dp_sum_klein_init(this); | |
| 33 | 3 | } | |
| 34 | /// Adds the value to sum | ||
| 35 | 4157399 | void add(double v) { | |
| 36 | 4157399 | na64dp_sum_klein_add(this, v); | |
| 37 | 4157399 | } | |
| 38 | /// Adds the value to sum (operator shortcut) | ||
| 39 | 4157399 | KleinScorer & operator+=(double v) { | |
| 40 | 4157399 | add(v); | |
| 41 | 4157399 | return *this; | |
| 42 | } | ||
| 43 | /// Returns computed sum | ||
| 44 | 39201 | double result() const { | |
| 45 | 39201 | return na64dp_sum_klein_result(this); | |
| 46 | } | ||
| 47 | /// Returns computed sum (operator shortcut) | ||
| 48 | 39201 | operator double() const { | |
| 49 | 39201 | return result(); | |
| 50 | } | ||
| 51 | /// Drops the scorer value to 0. | ||
| 52 | 19386 | void reset() { na64dp_sum_klein_init(this); } | |
| 53 | }; | ||
| 54 | |||
| 55 | ///\brief A C++ wrapper around `na64dp_Covariance` | ||
| 56 | /// | ||
| 57 | ///\ingroup numerical-utils-online-stats | ||
| 58 | class CorrelationScorer : protected na64dp_Covariance { | ||
| 59 | public: | ||
| 60 | /// Initializes and constructs covariance scorer instance | ||
| 61 | 1 | CorrelationScorer() { | |
| 62 | 1 | na64dp_cov_init(this); | |
| 63 | 1 | } | |
| 64 | /// Accounts pair of values | ||
| 65 | 7 | void account( double x, double y ) { | |
| 66 | 7 | na64dp_cov_account( this, x, y ); | |
| 67 | 7 | } | |
| 68 | /// Returns unbiased covariance value | ||
| 69 | 6 | double covariance() const { | |
| 70 | 6 | return na64dp_covariance_get(this); | |
| 71 | } | ||
| 72 | |||
| 73 | /// Returns Pearson's correlation coefficient | ||
| 74 | 6 | double pearson_correlation() const { | |
| 75 | 6 | return na64dp_covariance_get(this)/sqrt(variance_x()*variance_y()); | |
| 76 | } | ||
| 77 | |||
| 78 | /// Returns number of accounted pairs | ||
| 79 | unsigned long n_accounted() const { return n; } | ||
| 80 | /// Returns mean X value | ||
| 81 | 7 | double mean_x() const { return na64dp_sum_klein_result(&xMean); } | |
| 82 | /// Returns mean Y value | ||
| 83 | 7 | double mean_y() const { return na64dp_sum_klein_result(&yMean); } | |
| 84 | /// Returns sum of squared deviations of X value | ||
| 85 | 12 | double sq_dev_x() const { return na64dp_sum_klein_result(&d2x); } | |
| 86 | /// Returns variance of X | ||
| 87 | 12 | double variance_x() const { return sq_dev_x()/(n-1); } | |
| 88 | /// Returns sum of squared deviations of Y value | ||
| 89 | 12 | double sq_dev_y() const { return na64dp_sum_klein_result(&d2y); } | |
| 90 | /// Returns variance of Y | ||
| 91 | 12 | double variance_y() const { return sq_dev_y()/(n-1); } | |
| 92 | }; | ||
| 93 | |||
| 94 | /**\brief Covariance matrix wrapper around `na64dp_CovarianceMatrix` C struct | ||
| 95 | * | ||
| 96 | * This implementation is a bit more efficient than multiple | ||
| 97 | * `CorrelationScorer` instances though it explicitly requires that none of the | ||
| 98 | * considered vector components may be NaN. | ||
| 99 | * | ||
| 100 | *\ingroup numerical-utils-online-stats | ||
| 101 | * */ | ||
| 102 | class CovarianceMatrix : protected na64dp_CovarianceMatrix_t { | ||
| 103 | public: | ||
| 104 | /// Creates pairwise covariance scorer for vector of `d` | ||
| 105 | 1 | CovarianceMatrix( unsigned short v ) { | |
| 106 | 1 | na64dp_mcov_init( this, v ); | |
| 107 | 1 | } | |
| 108 | /// Frees covariance scorer (heap used) | ||
| 109 | 1 | ~CovarianceMatrix() { | |
| 110 | 1 | na64dp_mcov_free(this); | |
| 111 | 1 | } | |
| 112 | /// Takes into account given sample vector | ||
| 113 | 7 | void account( const double * v ) { | |
| 114 | 7 | na64dp_mcov_account(this, v); | |
| 115 | 7 | } | |
| 116 | /// Returns i-th mean value | ||
| 117 | double mean(int i) const { return na64dp_sum_klein_result(means + i); } | ||
| 118 | /// Returns number of accounted vectors | ||
| 119 | unsigned long n_accounted() const { return n; } | ||
| 120 | /// Returns dimensionality | ||
| 121 | unsigned short n_dimensions() const { return d; } | ||
| 122 | /// Returns i-th mean value | ||
| 123 | 35 | double mean(int i) { return na64dp_sum_klein_result(means + i); } // sqds, covs | |
| 124 | /// Returns i-th squared deviation value | ||
| 125 | 210 | double sq_dev(int i) { return na64dp_sum_klein_result(sqds + i); } | |
| 126 | /// Returns variance value of i-th component | ||
| 127 | 210 | double variance(int i) { return sq_dev(i)/(n-1); } | |
| 128 | /// Returns covariance value of i-th and j-th component | ||
| 129 | 180 | double covariance(int i, int j) { return na64dp_mcov( this, i, j ); } | |
| 130 | /// Returns Pearson's correlation value of i-th and j-th component | ||
| 131 | 90 | double pearson_correlation(int i, int j) { | |
| 132 | 90 | return covariance(i, j)/sqrt(variance(i)*variance(j)); | |
| 133 | } | ||
| 134 | }; | ||
| 135 | |||
| 136 | ///\brief Utility interface class representing windowed (sliding) statistics. | ||
| 137 | /// | ||
| 138 | ///\ingroup numerical-utils-online-stats | ||
| 139 | class iMovStats { | ||
| 140 | public: | ||
| 141 | virtual bool is_outlier(double, double threshold=3.) const = 0; | ||
| 142 | virtual double mean() const = 0; | ||
| 143 | virtual void account(double) = 0; | ||
| 144 | virtual std::vector<double> data_copy() const = 0; | ||
| 145 | virtual size_t n_samples() const = 0; | ||
| 146 | }; | ||
| 147 | |||
| 148 | namespace aux { | ||
| 149 | |||
| 150 | template<typename ScorerT> void reset_summation_scorer(ScorerT &); | ||
| 151 | |||
| 152 | template<> void reset_summation_scorer<double>(double &); | ||
| 153 | template<> void reset_summation_scorer<KleinScorer>(KleinScorer &); | ||
| 154 | |||
| 155 | } | ||
| 156 | |||
| 157 | /** Implements moving statistics using strightforward approach | ||
| 158 | * | ||
| 159 | * Maintains a deque of N samples, dynamically recalculating mean/stddev values | ||
| 160 | * each time new sample is accounted. | ||
| 161 | * | ||
| 162 | *\ingroup numerical-utils-online-stats | ||
| 163 | */ | ||
| 164 | template<typename ScorerT> | ||
| 165 | class MovingStats : public iMovStats | ||
| 166 | , public std::deque<double> { | ||
| 167 | private: | ||
| 168 | /// Numbor ef entries since last recaching | ||
| 169 | size_t _recacheCounter; | ||
| 170 | /// Threshold number of entries for recaching | ||
| 171 | const size_t _recacheCounterThreshold; | ||
| 172 | protected: | ||
| 173 | /// Threshold (in units of \f$\sigma\f$) of the values to be omitted | ||
| 174 | double _excludeOutliersThreshold; | ||
| 175 | /// Threshold (in units of \f$\sigma\f$) of the values to cause recache | ||
| 176 | double _recacheOnOutliersThreshold; | ||
| 177 | /// \f$\sum\limits_{i=1}^N x_i\f$ | ||
| 178 | ScorerT _runningSum; | ||
| 179 | /// \f$\sum\limits_{i=1}^N x_i^2\f$ | ||
| 180 | ScorerT _runningSqSum; | ||
| 181 | const size_t _windowSize; | ||
| 182 | |||
| 183 | /// Re-calculates the sums | ||
| 184 | 9693 | void _recache() { | |
| 185 | 9693 | aux::reset_summation_scorer<ScorerT>(_runningSum); | |
| 186 | 9693 | aux::reset_summation_scorer<ScorerT>(_runningSqSum); | |
| 187 |
2/2✓ Branch 5 taken 1918700 times.
✓ Branch 6 taken 9693 times.
|
1928393 | for( auto v : *this ) { |
| 188 |
1/1✓ Branch 1 taken 1918700 times.
|
1918700 | _runningSum += v; |
| 189 |
1/1✓ Branch 1 taken 1918700 times.
|
1918700 | _runningSqSum += v*v; |
| 190 | } | ||
| 191 | 9693 | _recacheCounter = 0; | |
| 192 | 9693 | } | |
| 193 | public: | ||
| 194 | /**\brief Constructs empty sliding scorer | ||
| 195 | * | ||
| 196 | * Used to create new instances of the scorer in cases when no initial data | ||
| 197 | * is available. Various estimations derived by this instance (like mean | ||
| 198 | * and stdev) at first `N` events will not be properly conditioned because | ||
| 199 | * of lack of the data, so wherever is possible an alternative form of | ||
| 200 | * constructor has to be used. | ||
| 201 | * | ||
| 202 | * \param N size of sliding window | ||
| 203 | * \param excludeOutliers \f$k\f$ in \f$k \times \sigma\f$ to classify outliers | ||
| 204 | * \param recacheOnOutliers \f$k\f$ in \f$k \times \sigma\f$ to recache | ||
| 205 | * \param recachingFrac fraction of events to force sums recache | ||
| 206 | * */ | ||
| 207 | 1 | MovingStats( size_t N | |
| 208 | , double excludeOutliers=6. | ||
| 209 | , double recacheOnOutliers=6. | ||
| 210 | , double recachingFrac=1. | ||
| 211 | 1 | ) : _recacheCounter(0) | |
| 212 | 1 | , _recacheCounterThreshold(N*recachingFrac) | |
| 213 | 1 | , _excludeOutliersThreshold(excludeOutliers) | |
| 214 | 1 | , _recacheOnOutliersThreshold(recacheOnOutliers) | |
| 215 |
2/2✓ Branch 3 taken 1 times.
✓ Branch 6 taken 1 times.
|
1 | , _windowSize(N) {} |
| 216 | |||
| 217 | /**\brief Constructs empty sliding scorer | ||
| 218 | * | ||
| 219 | * Used to create new instances of the scorer in cases when no initial data | ||
| 220 | * is available. Various estimations derived by this instance (like mean | ||
| 221 | * and stdev) at first `N` events will not be properly conditioned because | ||
| 222 | * of lack of the data, so wherever is possible an alternative form of | ||
| 223 | * constructor has to be used. | ||
| 224 | * | ||
| 225 | * \param initial shall contain at least `N` samples for initial state of accumulator | ||
| 226 | * \param N size of sliding window | ||
| 227 | * \param excludeOutliers \f$k\f$ in \f$k \times \sigma\f$ to classify outliers | ||
| 228 | * \param recacheOnOutliers \f$k\f$ in \f$k \times \sigma\f$ to recache | ||
| 229 | * \param recachingFrac fraction of events to force sums recache | ||
| 230 | * */ | ||
| 231 | MovingStats( double * initial | ||
| 232 | , size_t N | ||
| 233 | , double excludeOutliers=6. | ||
| 234 | , double recacheOnOutliers=6. | ||
| 235 | , double recachingFrac=1. | ||
| 236 | ) : std::deque<double>(initial, initial + N) | ||
| 237 | , _recacheCounter(0) | ||
| 238 | , _recacheCounterThreshold(N*recachingFrac) | ||
| 239 | , _excludeOutliersThreshold(excludeOutliers) | ||
| 240 | , _recacheOnOutliersThreshold(recacheOnOutliers) | ||
| 241 | , _windowSize(N) {} | ||
| 242 | |||
| 243 | /// Returns true if given sample is beyond $6 \f$\sigma\f$ | ||
| 244 | 9800 | virtual bool is_outlier( double sample, double threshold=3.) const override { | |
| 245 |
1/2✗ Branch 1 not taken.
✓ Branch 2 taken 9800 times.
|
9800 | if( empty() ) return false; |
| 246 | 9800 | return fabs(sample - mean()) > sqrt(sigma_sq())*threshold; | |
| 247 | } | ||
| 248 | |||
| 249 | /// Returns mean value \f$(1/N) \sum\limits_{i=1}^N x_i\f$ | ||
| 250 | 29400 | virtual double mean() const override { | |
| 251 |
1/2✓ Branch 1 taken 29400 times.
✗ Branch 2 not taken.
|
29400 | if( !empty() ) |
| 252 | 29400 | return _runningSum/size(); | |
| 253 | ✗ | return std::nan("0"); | |
| 254 | } | ||
| 255 | |||
| 256 | /// Takes into account sample \f$x_i\f$ | ||
| 257 | 10000 | virtual void account(double sample) override { | |
| 258 | 20000 | if( _excludeOutliersThreshold | |
| 259 |
2/5✗ Branch 0 not taken.
✓ Branch 1 taken 10000 times.
✗ Branch 4 not taken.
✗ Branch 5 not taken.
✓ Branch 6 taken 10000 times.
|
10000 | && is_outlier(sample, _excludeOutliersThreshold) ) return; |
| 260 |
2/5✗ Branch 0 not taken.
✓ Branch 1 taken 10000 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 10000 times.
|
10000 | if( _recacheCounterThreshold && ++_recacheCounter > _recacheCounterThreshold ) { |
| 261 | // Recache sums to avoid (mitigate) catstrophic cancellation | ||
| 262 | ✗ | _recache(); | |
| 263 | } | ||
| 264 |
2/2✓ Branch 1 taken 200 times.
✓ Branch 2 taken 9800 times.
|
10000 | if( size() < _windowSize ) { |
| 265 | 200 | _runningSum += sample; | |
| 266 | 200 | _runningSqSum += sample*sample; | |
| 267 | 200 | push_back( sample ); | |
| 268 |
1/2✓ Branch 0 taken 200 times.
✗ Branch 1 not taken.
|
200 | if( !_recacheCounterThreshold ) _recache(); |
| 269 | 200 | return; | |
| 270 | } | ||
| 271 | 9800 | const double prevFirst = *begin(); | |
| 272 | 9800 | _runningSum += sample - prevFirst; | |
| 273 | 9800 | _runningSqSum += sample*sample - prevFirst*prevFirst; | |
| 274 | 9800 | pop_front(); | |
| 275 | 9800 | push_back( sample ); | |
| 276 | 19600 | if( _recacheOnOutliersThreshold | |
| 277 |
5/6✓ Branch 0 taken 9800 times.
✗ Branch 1 not taken.
✓ Branch 3 taken 9493 times.
✓ Branch 4 taken 307 times.
✓ Branch 5 taken 9493 times.
✓ Branch 6 taken 307 times.
|
9800 | && is_outlier(sample, _recacheOnOutliersThreshold) ) { |
| 278 | 9493 | _recache(); | |
| 279 | } | ||
| 280 | } | ||
| 281 | |||
| 282 | /// Returns (unbiased, sliding) \f$ \sigma^2 \f$ | ||
| 283 | 9800 | double sigma_sq() const { | |
| 284 |
1/2✓ Branch 1 taken 9800 times.
✗ Branch 2 not taken.
|
9800 | if( size() > 1 ) |
| 285 | 9800 | return (_runningSqSum/size() - mean()*mean())/(size()-1); | |
| 286 | ✗ | return std::nan("0"); | |
| 287 | } | ||
| 288 | |||
| 289 | /// Copies entries from deque to vector | ||
| 290 | ✗ | virtual std::vector<double> data_copy() const override { | |
| 291 | ✗ | return std::vector<double>( this->begin(), this->end() ); | |
| 292 | } | ||
| 293 | |||
| 294 | ✗ | virtual size_t n_samples() const { | |
| 295 | ✗ | return size(); | |
| 296 | } | ||
| 297 | }; | ||
| 298 | |||
| 299 | } | ||
| 300 | } | ||
| 301 | |||
| 302 |