From 815c181503b7a405e4ba4e44fe06447ad9ad6312 Mon Sep 17 00:00:00 2001 From: rob tillaart Date: Wed, 13 May 2020 15:58:15 +0200 Subject: [PATCH] 0.4.0 add useStdDev param of constructor and clear --- LICENSE | 2 +- README.md | 79 ++++++++++++ Statistic.cpp | 134 +++++++++++++++++++++ Statistic.h | 45 +++++++ examples/Average/Average.ino | 53 ++++++++ examples/StatisticArray/StatisticArray.ino | 45 +++++++ examples/TimingTest/TimingTest.ino | 60 +++++++++ keywords.txt | 21 ++++ library.json | 27 +++++ library.properties | 11 ++ 10 files changed, 476 insertions(+), 1 deletion(-) create mode 100644 Statistic.cpp create mode 100644 Statistic.h create mode 100644 examples/Average/Average.ino create mode 100644 examples/StatisticArray/StatisticArray.ino create mode 100644 examples/TimingTest/TimingTest.ino create mode 100644 keywords.txt create mode 100644 library.json create mode 100644 library.properties diff --git a/LICENSE b/LICENSE index 3d51ae3..fbd4df5 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2020 Rob Tillaart +Copyright (c) 2010-2020 Rob Tillaart Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index ef78972..1328550 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,81 @@ # Statistic + Statistic library for Arduino includes sum, average, variance and std deviation + +# Description + +The statistic library is made to get basic statistical information from a +one dimensional set of data, e.g. a stream of values of a sensor. + +The stability of the formulas is improved by the help of Gil Ross (Thanks!) 
+ +The functions implemented are: + +* **clear(useStdDev)** +* **add(value)** +* **count()** returns zero if count == zero (of course) +* **sum()** returns zero if count == zero +* **minimum()** returns zero if count == zero +* **maximum()** returns zero if count == zero +* **average()** returns NAN if count == zero + +These three functions only work if useStdDev == true: + +* **variance()** returns NAN if count == zero +* **pop_stdev()** population stdev, returns NAN if count == zero +* **unbiased_stdev()** returns NAN if count == zero + + +# Operational + +See examples + +# FAQ + +### Q: Are individual samples still available? +The values added to the library are not stored in the lib as it would use lots +of memory quite fast. Instead a few calculated values are kept to be able to +calculate the most important statistics. + + +### Q: How many samples can the lib hold? (internal variables and overflow) +The counter of samples is a **uint32_t**, implying a maximum of about **4 billion** samples. +In practice 'strange' things might happen before this number is reached. +There are two internal variables, **_sum** which is the sum of the values and **_ssq** +which is the sum of the squared values. Both can overflow especially **_ssq** +can and probably will grow fast. The library does not protect against it. + +There is a workaround for this (to some extent) if one knows the approx +average of the samples before. Before adding values to the lib subtract +the expected average. The sum of the samples would move to around zero. +This workaround has no influence on the standard deviation. + +!! Do not forget to add the expected average to the calculated average. + +*(Q: should this subtraction trick be built into the lib?)* + + +### Q: How about the precision of the library? +The precision of the internal variables is restricted due to the fact +that they are 32 bit float (IEEE754). 
If the internal variable **_sum** has +a large value, adding relatively small values to the dataset wouldn't +change its value any more. Same is true for **_ssq**. One might argue that +statistically speaking these values are less significant, but in fact it is wrong. + +There is a workaround for this (to some extent). If one has the samples in an +array or on disk, one can sort the samples in increasing order (abs value) +and add them from this sorted list. This will minimize the error, +but it works only if the samples are available and they can be added +in the sorted increasing order. + + +### Q: When will internal var's overflow? esp. squared sum +IEEE754 floats have a max value of about **+-3.4028235E+38** + + +### Q: Why are there two functions for stdev? +There are two stdev functions: the population stdev and the unbiased stdev. +See Wikipedia for an elaborate description of the difference between these two. + + + diff --git a/Statistic.cpp b/Statistic.cpp new file mode 100644 index 0000000..44ac9c1 --- /dev/null +++ b/Statistic.cpp @@ -0,0 +1,134 @@ +// +// FILE: Statistic.cpp +// AUTHOR: Rob dot Tillaart at gmail dot com +// modified at 0.3 by Gil Ross at physics dot org +// VERSION: 0.4.0 +// PURPOSE: Recursive statistical library for Arduino +// +// NOTE: 2011-01-07 Gil Ross +// Rob Tillaart's Statistic library uses one-pass of the data (allowing +// each value to be discarded), but expands the Sum of Squares Differences to +// difference the Sum of Squares and the Average Squared. This is susceptible +// to bit length precision errors with the float type (only 5 or 6 digits +// absolute precision) so for long runs and high ratios of +// the average value to standard deviation the estimate of the +// standard error (deviation) becomes the difference of two large +// numbers and will tend to zero. +// +// For small numbers of iterations and small Average/SE the original code is +// likely to work fine. 
+// It should also be recognised that for very large samples, questions +// of stability of the sample assume greater importance than the +// correctness of the asymptotic estimators. +// +// This recursive algorithm, which takes slightly more computation per +// iteration, is numerically stable. +// It updates the number, mean, max, min and SumOfSquaresDiff each step to +// deliver max min average, population standard error (standard deviation) and +// unbiased SE. +// ------------- +// +// HISTORY: +// 0.1 2010-10-29 initial version +// 0.2 2010-10-29 stripped to minimal functionality +// 0.2.01 2010-10-30 +// added minimum, maximum, unbiased stdev, +// changed counter to long -> int overflows @32K samples +// 0.3 2011-01-07 +// branched from 0.2.01 version of Rob Tillaart's code +// 0.3.1 2012-11-10 minor edits +// 0.3.2 2012-11-10 minor edits +// changed count -> unsigned long allows for 2^32 samples +// added variance() +// 0.3.3 2015-03-07 +// float -> double to support ARM (compiles) +// moved count() sum() min() max() to .h; for optimizing compiler +// 0.3.4 2017-07-31 +// Refactored const in many places +// [reverted] double to float on request as float is 99.99% of the cases +// good enough and float(32 bit) is supported in HW for some processors. +// 0.3.5 2017-09-27 +// Added #include to fix uint32_t bug +// 0.4.0 2020-05-13 +// refactor +// Added flag to switch on the use of stdDev runtime. [idea marc.recksiedl] + + +#include "Statistic.h" + +Statistic::Statistic(bool useStdDev) +{ + clear(useStdDev); +} + +void Statistic::clear(bool useStdDev) // useStdDev default true. 
+{ + _cnt = 0; + _sum = 0; + _min = 0; + _max = 0; + _useStdDev = useStdDev; + _ssqdif = 0.0; + // note not _ssq but sum of square differences + // which is SUM(from i = 1 to N) of f(i)-_ave_N)**2 +} + +// adds a new value to the data-set +void Statistic::add(const float value) +{ + if (_cnt == 0) + { + _min = value; + _max = value; + } else { + if (value < _min) _min = value; + else if (value > _max) _max = value; + } + _sum += value; + _cnt++; + + if (_useStdDev && (_cnt > 1)) + { + float _store = (_sum / _cnt - value); + _ssqdif = _ssqdif + _cnt * _store * _store / (_cnt - 1); + + // ~10% faster but limits the amount of samples to 65K as _cnt*_cnt overflows + // float _store = _sum - _cnt * value; + // _ssqdif = _ssqdif + _store * _store / (_cnt*_cnt - _cnt); + // + // solution: TODO verify + // _ssqdif = _ssqdif + (_store * _store / _cnt) / (_cnt - 1); + } +} + +// returns the average of the data-set added sofar +float Statistic::average() const +{ + if (_cnt == 0) return NAN; // prevent DIV0 error + return _sum / _cnt; +} + +// Population standard deviation = s = sqrt [ S ( Xi - µ )2 / N ] +// http://www.suite101.com/content/how-is-standard-deviation-used-a99084 +float Statistic::variance() const +{ + if (!_useStdDev) return NAN; + if (_cnt == 0) return NAN; // prevent DIV0 error + return _ssqdif / _cnt; +} + +float Statistic::pop_stdev() const +{ + if (!_useStdDev) return NAN; + if (_cnt == 0) return NAN; // prevent DIV0 error + return sqrt( _ssqdif / _cnt); +} + +float Statistic::unbiased_stdev() const +{ + if (!_useStdDev) return NAN; + if (_cnt < 2) return NAN; // prevent DIV0 error + return sqrt( _ssqdif / (_cnt - 1)); +} + +// -- END OF FILE -- diff --git a/Statistic.h b/Statistic.h new file mode 100644 index 0000000..9805705 --- /dev/null +++ b/Statistic.h @@ -0,0 +1,45 @@ +#pragma once +// +// FILE: Statistic.h +// AUTHOR: Rob dot Tillaart at gmail dot com +// modified at 0.3 by Gil Ross at physics dot org +// VERSION: 0.4.0 +// PURPOSE: Recursive 
Statistical library for Arduino +// HISTORY: See Statistic.cpp +// + +#include +#include + +#define STATISTIC_LIB_VERSION "0.4.0" + +class Statistic +{ +public: + Statistic(bool useStdDev = true); // "switches on/off" stdev run time + void clear(bool useStdDev = true); // "switches on/off" stdev run time + void add(const float); + + // returns the number of values added + uint32_t count() const { return _cnt; }; // zero if count == zero + float sum() const { return _sum; }; // zero if count == zero + float minimum() const { return _min; }; // zero if count == zero + float maximum() const { return _max; }; // zero if count == zero + float average() const; // NAN if count == zero + + // useStdDev must be true to use next three + float variance() const; // NAN if count == zero + float pop_stdev() const; // population stdev // NAN if count == zero + float unbiased_stdev() const; // NAN if count == zero + +protected: + uint32_t _cnt; + float _sum; + float _min; + float _max; + bool _useStdDev; + float _ssqdif; // sum of squares difference + +}; + +// -- END OF FILE -- diff --git a/examples/Average/Average.ino b/examples/Average/Average.ino new file mode 100644 index 0000000..90e755a --- /dev/null +++ b/examples/Average/Average.ino @@ -0,0 +1,53 @@ +// +// FILE: Average.ino +// AUTHOR: Rob dot Tillaart at gmail dot com +// VERSION: 0.4 +// PURPOSE: Sample sketch for statistic library Arduino +// + +#include "Statistic.h" + +Statistic myStats; + +uint32_t start; +uint32_t stop; + +void setup(void) +{ + Serial.begin(115200); + Serial.println(__FILE__); + Serial.print("Demo Statistics lib "); + Serial.println(STATISTIC_LIB_VERSION); + myStats.clear(); //explicitly start clean + start = millis(); +} + +void loop(void) +{ + long rn = random(0, 9999); + myStats.add(rn / 100.0 + 1); + if (myStats.count() == 10000) + { + stop = millis(); + Serial.print(" Count: "); + Serial.println(myStats.count()); + Serial.print(" Min: "); + Serial.println(myStats.minimum(), 4); + 
Serial.print(" Max: "); + Serial.println(myStats.maximum(), 4); + Serial.print(" Average: "); + Serial.println(myStats.average(), 4); + Serial.print(" variance: "); + Serial.println(myStats.variance(), 4); + Serial.print(" pop stdev: "); + Serial.println(myStats.pop_stdev(), 4); + Serial.print(" unbias stdev: "); + Serial.println(myStats.unbiased_stdev(), 4); + Serial.print(" time(ms): "); + Serial.println(stop - start); + Serial.println("====================================="); + myStats.clear(); + delay(1000); + start = millis(); + } +} \ No newline at end of file diff --git a/examples/StatisticArray/StatisticArray.ino b/examples/StatisticArray/StatisticArray.ino new file mode 100644 index 0000000..3a83216 --- /dev/null +++ b/examples/StatisticArray/StatisticArray.ino @@ -0,0 +1,45 @@ +// +// FILE: StatisticArray.ino +// AUTHOR: Rob dot Tillaart at gmail dot com +// VERSION: 0.1 +// PURPOSE: Sample sketch for statistic library Arduino +// + +#include "Statistic.h" + +Statistic stats[4]; + +void setup(void) +{ + Serial.begin(115200); + Serial.println(__FILE__); + Serial.print("Demo Statistics lib "); + Serial.println(STATISTIC_LIB_VERSION); + for (int i=0; i<4; i++) + { + stats[i].clear(); //explicitly start clean + } +} + +void loop(void) +{ + long rn = random(0, 9999); + int idx = random(0, 4); + stats[idx].add(rn / 100.0 + 1); + + if (stats[idx].count() == 10000) + { + Serial.print("IDX: "); + Serial.println(idx); + Serial.print(" Count: "); + Serial.println(stats[idx].count()); + Serial.print(" Min: "); + Serial.println(stats[idx].minimum(), 4); + Serial.print(" Max: "); + Serial.println(stats[idx].maximum(), 4); + Serial.print(" Average: "); + Serial.println(stats[idx].average(), 4); + Serial.println("====================================="); + stats[idx].clear(); + } +} diff --git a/examples/TimingTest/TimingTest.ino b/examples/TimingTest/TimingTest.ino new file mode 100644 index 0000000..9d2ebe8 --- /dev/null +++ b/examples/TimingTest/TimingTest.ino @@ -0,0 
+1,60 @@ +// +// FILE: TimingTest.ino +// AUTHOR: Rob dot Tillaart at gmail dot com +// VERSION: 0.2.0 +// PURPOSE: measure time difference for runtime stddev toggle. +// add is 1024 millis faster for 10K adds ==> ~ 100uSec per add faster. + +#include "Statistic.h" + +Statistic myStats; + +uint32_t start; +uint32_t stop; + +bool useStdDev = true; + +void setup(void) +{ + Serial.begin(115200); + Serial.println(__FILE__); + Serial.print("Demo Statistics lib "); + Serial.println(STATISTIC_LIB_VERSION); + myStats.clear(useStdDev); + start = millis(); +} + +void loop(void) +{ + long rn = random(0, 9999); + myStats.add(rn / 100.0 + 1); + if (myStats.count() == 10000) + { + stop = millis(); + Serial.print(" Count: "); + Serial.println(myStats.count()); + Serial.print(" Min: "); + Serial.println(myStats.minimum(), 4); + Serial.print(" Max: "); + Serial.println(myStats.maximum(), 4); + Serial.print(" Average: "); + Serial.println(myStats.average(), 4); + if (useStdDev) + { + Serial.print(" variance: "); + Serial.println(myStats.variance(), 4); + Serial.print(" pop stdev: "); + Serial.println(myStats.pop_stdev(), 4); + Serial.print(" unbias stdev: "); + Serial.println(myStats.unbiased_stdev(), 4); + } + Serial.print(" time(ms): "); + Serial.println(stop - start); + Serial.println("====================================="); + useStdDev = !useStdDev; + myStats.clear(useStdDev); + start = millis(); + } +} + +// -- END OF FILE -- diff --git a/keywords.txt b/keywords.txt new file mode 100644 index 0000000..c910a4b --- /dev/null +++ b/keywords.txt @@ -0,0 +1,21 @@ +# Syntax Coloring Map For Statistic + +# Datatypes (KEYWORD1) +Statistic KEYWORD1 + +# Methods and Functions (KEYWORD2) +clear KEYWORD2 +add KEYWORD2 +count KEYWORD2 +sum KEYWORD2 +minimum KEYWORD2 +maximum KEYWORD2 +average KEYWORD2 +variance KEYWORD2 +pop_stdev KEYWORD2 +unbiased_stdev KEYWORD2 + +# Instances (KEYWORD2) + +# Constants (LITERAL1) +STATISTIC_LIB_VERSION LITERAL1 diff --git a/library.json b/library.json 
new file mode 100644 index 0000000..1b0b407 --- /dev/null +++ b/library.json @@ -0,0 +1,27 @@ +{ + "name": "Statistic", + "keywords": "Statistic,sum,min,max,average,variance,standard,deviation,population,unbiased", + "description": "Library with basic statistical functions for Arduino.", + "authors": + [ + { + "name": "Rob Tillaart", + "email": "Rob.Tillaart@gmail.com", + "maintainer": true + }, + { + "name": "Gil Ross" + } + ], + "repository": + { + "type": "git", + "url": "https://github.com/RobTillaart/Arduino.git" + }, + "version":"0.4.0", + "frameworks": "arduino", + "platforms": "*", + "export": { + "include": "libraries/Statistic" + } +} diff --git a/library.properties b/library.properties new file mode 100644 index 0000000..fc6b9f9 --- /dev/null +++ b/library.properties @@ -0,0 +1,11 @@ +name=Statistic +version=0.4.0 +author=Rob Tillaart +maintainer=Rob Tillaart +sentence=Library with basic statistical functions for Arduino. +paragraph=Supports +category=Data Processing +url=https://github.com/RobTillaart/Statistic +architectures=* +includes=Statistic.h +depends=