18 #define _USE_MATH_DEFINES
1231 {6.895, 4.824, 3.984, 3.513, 3.206, 2.988, 2.823, 2.694, 2.590, 2.503}};
1237 #define MINVARIANCE 0.0004
1245 #define MINSAMPLESPERBUCKET 5
1246 #define MINSAMPLES (MINBUCKETS * MINSAMPLESPERBUCKET)
1247 #define MINSAMPLESNEEDED 1
1255 #define BUCKETTABLESIZE 1024
1256 #define NORMALEXTENT 3.0
1315 #define Odd(N) ((N) % 2)
1316 #define Mirror(N, R) ((R) - (N)-1)
1317 #define Abs(N) (((N) < 0) ? (-(N)) : (N))
1327 #define SqrtOf2Pi 2.506628275
1329 static const double kNormalVariance =
1336 #define LOOKUPTABLESIZE 8
1337 #define MAXDEGREESOFFREEDOM MAXBUCKETS
1348 static void CreateClusterTree(CLUSTERER *Clusterer);
1350 static void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, int32_t Level);
1352 static CLUSTER *FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster,
float *Distance);
1354 static CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster);
1356 static void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *
Config);
1358 static PROTOTYPE *MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *
Config, CLUSTER *Cluster);
1360 static PROTOTYPE *MakeDegenerateProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics,
1363 static PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer, CLUSTERCONFIG *
Config, CLUSTER *Cluster,
1364 STATISTICS *Statistics);
1366 static PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
1369 static PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer, CLUSTER *Cluster,
1370 STATISTICS *Statistics, BUCKETS *Buckets);
1372 static PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
1373 BUCKETS *NormalBuckets,
double Confidence);
1375 static void MakeDimRandom(uint16_t i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc);
1377 static void MakeDimUniform(uint16_t i, PROTOTYPE *Proto, STATISTICS *Statistics);
1379 static STATISTICS *ComputeStatistics(int16_t N, PARAM_DESC ParamDesc[], CLUSTER *Cluster);
1381 static PROTOTYPE *NewSphericalProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics);
1383 static PROTOTYPE *NewEllipticalProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics);
1385 static PROTOTYPE *NewMixedProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics);
1387 static PROTOTYPE *NewSimpleProto(int16_t N, CLUSTER *Cluster);
1389 static bool Independent(PARAM_DESC *ParamDesc, int16_t N,
float *CoVariance,
float Independence);
1391 static BUCKETS *GetBuckets(CLUSTERER *clusterer,
DISTRIBUTION Distribution, uint32_t SampleCount,
1394 static BUCKETS *MakeBuckets(
DISTRIBUTION Distribution, uint32_t SampleCount,
double Confidence);
1396 static uint16_t OptimumNumberOfBuckets(uint32_t SampleCount);
1398 static double ComputeChiSquared(uint16_t DegreesOfFreedom,
double Alpha);
1400 static double NormalDensity(int32_t x);
1402 static double UniformDensity(int32_t x);
1404 static double Integral(
double f1,
double f2,
double Dx);
1406 static void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uint16_t Dim, PARAM_DESC *ParamDesc,
1407 float Mean,
float StdDev);
1409 static uint16_t NormalBucket(PARAM_DESC *ParamDesc,
float x,
float Mean,
float StdDev);
1411 static uint16_t UniformBucket(PARAM_DESC *ParamDesc,
float x,
float Mean,
float StdDev);
1413 static bool DistributionOK(BUCKETS *Buckets);
1415 static uint16_t DegreesOfFreedom(
DISTRIBUTION Distribution, uint16_t HistogramBuckets);
1417 static void AdjustBuckets(BUCKETS *Buckets, uint32_t NewSampleCount);
1419 static void InitBuckets(BUCKETS *Buckets);
1421 static int AlphaMatch(
void *arg1,
1424 static double Solve(
SOLVEFUNC Function,
void *FunctionParams,
double InitialGuess,
double Accuracy);
1426 static double ChiArea(CHISTRUCT *ChiParams,
double x);
1428 static bool MultipleCharSamples(CLUSTERER *Clusterer, CLUSTER *Cluster,
float MaxIllegal);
1430 static double InvertMatrix(
const float *input,
int size,
float *inv);
1447 Clusterer->NumberOfSamples = 0;
1448 Clusterer->NumChar = 0;
1451 Clusterer->Root =
nullptr;
1455 Clusterer->ParamDesc =
new PARAM_DESC[SampleSize];
1456 for (i = 0; i < SampleSize; i++) {
1458 Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].
NonEssential;
1459 Clusterer->ParamDesc[i].Min = ParamDesc[i].
Min;
1460 Clusterer->ParamDesc[i].Max = ParamDesc[i].
Max;
1461 Clusterer->ParamDesc[i].Range = ParamDesc[i].
Max - ParamDesc[i].
Min;
1462 Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2;
1463 Clusterer->ParamDesc[i].MidRange = (ParamDesc[i].
Max + ParamDesc[i].
Min) / 2;
1467 Clusterer->KDTree =
MakeKDTree(SampleSize, ParamDesc);
1470 for (
auto &d : Clusterer->bucket_cache) {
1501 Sample->Clustered =
false;
1502 Sample->Prototype =
false;
1503 Sample->SampleCount = 1;
1504 Sample->Left =
nullptr;
1505 Sample->Right =
nullptr;
1506 Sample->CharID = CharID;
1508 for (i = 0; i < Clusterer->
SampleSize; i++) {
1509 Sample->Mean[i] = Feature[i];
1515 if (CharID >= Clusterer->
NumChar) {
1516 Clusterer->
NumChar = CharID + 1;
1546 if (Clusterer->
Root ==
nullptr) {
1547 CreateClusterTree(Clusterer);
1555 ComputePrototypes(Clusterer,
Config);
1577 if (Clusterer !=
nullptr) {
1579 delete Clusterer->
KDTree;
1580 delete Clusterer->
Root;
1610 auto *Prototype =
static_cast<PROTOTYPE *
>(arg);
1613 if (Prototype->Cluster !=
nullptr) {
1619 delete[] Prototype->Variance.Elliptical;
1620 delete[] Prototype->Magnitude.Elliptical;
1621 delete[] Prototype->Weight.Elliptical;
1645 Cluster =
reinterpret_cast<CLUSTER *
>((*SearchState)->first_node());
1646 *SearchState =
pop(*SearchState);
1648 if (
Cluster->Left ==
nullptr) {
1651 *SearchState =
push(*SearchState,
Cluster->Right);
1664 return (Proto->
Mean[Dimension]);
1675 switch (Proto->
Style) {
1681 switch (Proto->
Distrib[Dimension]) {
1710 static void CreateClusterTree(CLUSTERER *Clusterer) {
1711 ClusteringContext context;
1716 context.tree = Clusterer->KDTree;
1717 context.candidates =
new TEMPCLUSTER[Clusterer->NumberOfSamples];
1719 context.heap =
new ClusterHeap(Clusterer->NumberOfSamples);
1720 KDWalk(context.tree,
reinterpret_cast<void_proc>(MakePotentialClusters), &context);
1723 while (context.heap->Pop(&HeapEntry)) {
1724 TEMPCLUSTER *PotentialCluster = HeapEntry.data();
1728 if (PotentialCluster->Cluster->Clustered) {
1734 else if (PotentialCluster->Neighbor->Clustered) {
1735 PotentialCluster->Neighbor =
1736 FindNearestNeighbor(context.tree, PotentialCluster->Cluster, &HeapEntry.key());
1737 if (PotentialCluster->Neighbor !=
nullptr) {
1738 context.heap->Push(&HeapEntry);
1744 PotentialCluster->Cluster = MakeNewCluster(Clusterer, PotentialCluster);
1745 PotentialCluster->Neighbor =
1746 FindNearestNeighbor(context.tree, PotentialCluster->Cluster, &HeapEntry.key());
1747 if (PotentialCluster->Neighbor !=
nullptr) {
1748 context.heap->Push(&HeapEntry);
1754 Clusterer->Root =
static_cast<CLUSTER *
> RootOf(Clusterer->KDTree);
1757 delete context.tree;
1758 Clusterer->KDTree =
nullptr;
1759 delete context.heap;
1760 delete[] context.candidates;
1772 static void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, int32_t ) {
1774 int next = context->next;
1775 context->candidates[next].Cluster = Cluster;
1776 HeapEntry.data() = &(context->candidates[next]);
1777 context->candidates[next].Neighbor =
1778 FindNearestNeighbor(context->tree, context->candidates[next].Cluster, &HeapEntry.key());
1779 if (context->candidates[next].Neighbor !=
nullptr) {
1780 context->heap->Push(&HeapEntry);
1798 static CLUSTER *FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster,
float *Distance)
1799 #define MAXNEIGHBORS 2
1800 #define MAXDISTANCE FLT_MAX
1804 int NumberOfNeighbors;
1806 CLUSTER *BestNeighbor;
1810 reinterpret_cast<void **
>(Neighbor), Dist);
1814 BestNeighbor =
nullptr;
1815 for (i = 0; i < NumberOfNeighbors; i++) {
1816 if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) {
1817 *Distance = Dist[i];
1818 BestNeighbor = Neighbor[i];
1821 return BestNeighbor;
1833 static CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster) {
1835 auto Cluster =
new CLUSTER(Clusterer->SampleSize);
1836 Cluster->Clustered =
false;
1837 Cluster->Prototype =
false;
1838 Cluster->Left = TempCluster->Cluster;
1839 Cluster->Right = TempCluster->Neighbor;
1840 Cluster->CharID = -1;
1843 Cluster->Left->Clustered =
true;
1844 Cluster->Right->Clustered =
true;
1845 KDDelete(Clusterer->KDTree, &Cluster->Left->Mean[0], Cluster->Left);
1846 KDDelete(Clusterer->KDTree, &Cluster->Right->Mean[0], Cluster->Right);
1849 Cluster->SampleCount =
MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc,
1850 Cluster->Left->SampleCount, Cluster->Right->SampleCount,
1851 &Cluster->Mean[0], &Cluster->Left->Mean[0], &Cluster->Right->Mean[0]);
1854 KDStore(Clusterer->KDTree, &Cluster->Mean[0], Cluster);
1872 float m1[],
float m2[]) {
1876 for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
1881 if ((*m2 - *m1) > ParamDesc->
HalfRange) {
1882 *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->
Range)) / n;
1883 if (*m < ParamDesc->Min) {
1884 *m += ParamDesc->
Range;
1886 }
else if ((*m1 - *m2) > ParamDesc->
HalfRange) {
1887 *m = (n1 * (*m1 - ParamDesc->
Range) + n2 * *m2) / n;
1888 if (*m < ParamDesc->Min) {
1889 *m += ParamDesc->
Range;
1892 *m = (n1 * *m1 + n2 * *m2) / n;
1895 *m = (n1 * *m1 + n2 * *m2) / n;
1909 static void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *
Config) {
1912 PROTOTYPE *Prototype;
1916 if (Clusterer->Root !=
nullptr) {
1925 Cluster =
reinterpret_cast<CLUSTER *
>(ClusterStack->first_node());
1926 ClusterStack =
pop(ClusterStack);
1927 Prototype = MakePrototype(Clusterer,
Config, Cluster);
1928 if (Prototype !=
nullptr) {
1929 Clusterer->ProtoList =
push(Clusterer->ProtoList, Prototype);
1931 ClusterStack =
push(ClusterStack, Cluster->Right);
1932 ClusterStack =
push(ClusterStack, Cluster->Left);
1952 static PROTOTYPE *MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *
Config, CLUSTER *Cluster) {
1962 auto Statistics = ComputeStatistics(Clusterer->SampleSize, Clusterer->ParamDesc, Cluster);
1967 Proto = MakeDegenerateProto(Clusterer->SampleSize, Cluster, Statistics,
Config->
ProtoStyle,
1969 if (Proto !=
nullptr) {
1974 if (!Independent(Clusterer->ParamDesc, Clusterer->SampleSize, &Statistics->CoVariance[0],
1981 Proto = TestEllipticalProto(Clusterer,
Config, Cluster, Statistics);
1982 if (Proto !=
nullptr) {
1994 Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
1997 Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
2000 Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,
Config->
Confidence);
2003 Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
2004 if (Proto !=
nullptr) {
2007 Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
2008 if (Proto !=
nullptr) {
2011 Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,
Config->
Confidence);
2037 static PROTOTYPE *MakeDegenerateProto(
2038 uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics,
PROTOSTYLE Style, int32_t MinSamples) {
2039 PROTOTYPE *Proto =
nullptr;
2045 if (Cluster->SampleCount < MinSamples) {
2048 Proto = NewSphericalProto(N, Cluster, Statistics);
2052 Proto = NewEllipticalProto(N, Cluster, Statistics);
2055 Proto = NewMixedProto(N, Cluster, Statistics);
2058 Proto->Significant =
false;
2076 static PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer, CLUSTERCONFIG *
Config, CLUSTER *Cluster,
2077 STATISTICS *Statistics) {
2083 const double kMagicSampleMargin = 0.0625;
2084 const double kFTableBoostMargin = 2.0;
2086 int N = Clusterer->SampleSize;
2087 CLUSTER *Left = Cluster->Left;
2088 CLUSTER *Right = Cluster->Right;
2089 if (Left ==
nullptr || Right ==
nullptr) {
2092 int TotalDims = Left->SampleCount + Right->SampleCount;
2093 if (TotalDims < N + 1 || TotalDims < 2) {
2096 std::vector<float> Covariance(
static_cast<size_t>(N) * N);
2097 std::vector<float> Inverse(
static_cast<size_t>(N) * N);
2098 std::vector<float> Delta(N);
2100 for (
int i = 0; i < N; ++i) {
2101 int row_offset = i * N;
2102 if (!Clusterer->ParamDesc[i].NonEssential) {
2103 for (
int j = 0; j < N; ++j) {
2104 if (!Clusterer->ParamDesc[j].NonEssential) {
2105 Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];
2107 Covariance[j + row_offset] = 0.0f;
2111 for (
int j = 0; j < N; ++j) {
2113 Covariance[j + row_offset] = 1.0f;
2115 Covariance[j + row_offset] = 0.0f;
2120 double err = InvertMatrix(&Covariance[0], N, &Inverse[0]);
2122 tprintf(
"Clustering error: Matrix inverse failed with error %g\n", err);
2125 for (
int dim = 0; dim < N; ++dim) {
2126 if (!Clusterer->ParamDesc[dim].NonEssential) {
2127 Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
2135 for (
int x = 0; x < N; ++x) {
2137 for (
int y = 0; y < N; ++y) {
2138 temp +=
static_cast<double>(Inverse[y + N * x]) * Delta[y];
2140 Tsq += Delta[x] * temp;
2146 double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2) * EssentialN);
2147 int Fx = EssentialN;
2152 int Fy = TotalDims - EssentialN - 1;
2157 double FTarget =
FTable[Fy][Fx];
2161 FTarget += kFTableBoostMargin;
2164 return NewEllipticalProto(Clusterer->SampleSize, Cluster, Statistics);
2180 static PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
2182 PROTOTYPE *Proto =
nullptr;
2186 for (i = 0; i < Clusterer->SampleSize; i++) {
2187 if (Clusterer->ParamDesc[i].NonEssential) {
2191 FillBuckets(Buckets, Cluster, i, &(Clusterer->ParamDesc[i]), Cluster->Mean[i],
2192 sqrt(
static_cast<double>(Statistics->AvgVariance)));
2193 if (!DistributionOK(Buckets)) {
2198 if (i >= Clusterer->SampleSize) {
2199 Proto = NewSphericalProto(Clusterer->SampleSize, Cluster, Statistics);
2215 static PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer, CLUSTER *Cluster,
2216 STATISTICS *Statistics, BUCKETS *Buckets) {
2217 PROTOTYPE *Proto =
nullptr;
2221 for (i = 0; i < Clusterer->SampleSize; i++) {
2222 if (Clusterer->ParamDesc[i].NonEssential) {
2226 FillBuckets(Buckets, Cluster, i, &(Clusterer->ParamDesc[i]), Cluster->Mean[i],
2227 sqrt(
static_cast<double>(Statistics->CoVariance[i * (Clusterer->SampleSize + 1)])));
2228 if (!DistributionOK(Buckets)) {
2233 if (i >= Clusterer->SampleSize) {
2234 Proto = NewEllipticalProto(Clusterer->SampleSize, Cluster, Statistics);
2254 static PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
2255 BUCKETS *NormalBuckets,
double Confidence) {
2258 BUCKETS *UniformBuckets =
nullptr;
2259 BUCKETS *RandomBuckets =
nullptr;
2262 Proto = NewMixedProto(Clusterer->SampleSize, Cluster, Statistics);
2265 for (i = 0; i < Clusterer->SampleSize; i++) {
2266 if (Clusterer->ParamDesc[i].NonEssential) {
2270 FillBuckets(NormalBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), Proto->Mean[i],
2271 std::sqrt(Proto->Variance.Elliptical[i]));
2272 if (DistributionOK(NormalBuckets)) {
2276 if (RandomBuckets ==
nullptr) {
2277 RandomBuckets = GetBuckets(Clusterer,
D_random, Cluster->SampleCount, Confidence);
2279 MakeDimRandom(i, Proto, &(Clusterer->ParamDesc[i]));
2280 FillBuckets(RandomBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), Proto->Mean[i],
2281 Proto->Variance.Elliptical[i]);
2282 if (DistributionOK(RandomBuckets)) {
2286 if (UniformBuckets ==
nullptr) {
2287 UniformBuckets = GetBuckets(Clusterer,
uniform, Cluster->SampleCount, Confidence);
2289 MakeDimUniform(i, Proto, Statistics);
2290 FillBuckets(UniformBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), Proto->Mean[i],
2291 Proto->Variance.Elliptical[i]);
2292 if (DistributionOK(UniformBuckets)) {
2298 if (i < Clusterer->SampleSize) {
2312 static void MakeDimRandom(uint16_t i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc) {
2314 Proto->Mean[i] = ParamDesc->MidRange;
2315 Proto->Variance.Elliptical[i] = ParamDesc->HalfRange;
2318 Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
2319 Proto->Magnitude.Elliptical[i] = 1.0 / ParamDesc->Range;
2320 Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
2321 Proto->LogMagnitude = log(
static_cast<double>(Proto->TotalMagnitude));
2333 static void MakeDimUniform(uint16_t i, PROTOTYPE *Proto, STATISTICS *Statistics) {
2335 Proto->Mean[i] = Proto->Cluster->Mean[i] + (Statistics->Min[i] + Statistics->Max[i]) / 2;
2336 Proto->Variance.Elliptical[i] = (Statistics->Max[i] - Statistics->Min[i]) / 2;
2337 if (Proto->Variance.Elliptical[i] <
MINVARIANCE) {
2342 Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
2343 Proto->Magnitude.Elliptical[i] = 1.0 / (2.0 * Proto->Variance.Elliptical[i]);
2344 Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
2345 Proto->LogMagnitude = log(
static_cast<double>(Proto->TotalMagnitude));
2364 static STATISTICS *ComputeStatistics(int16_t N, PARAM_DESC ParamDesc[], CLUSTER *Cluster) {
2368 uint32_t SampleCountAdjustedForBias;
2371 auto Statistics =
new STATISTICS(N);
2374 std::vector<float> Distance(N);
2378 while ((Sample =
NextSample(&SearchState)) !=
nullptr) {
2379 for (i = 0; i < N; i++) {
2380 Distance[i] = Sample->Mean[i] - Cluster->Mean[i];
2381 if (ParamDesc[i].Circular) {
2382 if (Distance[i] > ParamDesc[i].HalfRange) {
2383 Distance[i] -= ParamDesc[i].Range;
2385 if (Distance[i] < -ParamDesc[i].HalfRange) {
2386 Distance[i] += ParamDesc[i].Range;
2389 if (Distance[i] < Statistics->Min[i]) {
2390 Statistics->Min[i] = Distance[i];
2392 if (Distance[i] > Statistics->Max[i]) {
2393 Statistics->Max[i] = Distance[i];
2396 auto CoVariance = &Statistics->CoVariance[0];
2397 for (i = 0; i < N; i++) {
2398 for (j = 0; j < N; j++, CoVariance++) {
2399 *CoVariance += Distance[i] * Distance[j];
2407 if (Cluster->SampleCount > 1) {
2408 SampleCountAdjustedForBias = Cluster->SampleCount - 1;
2410 SampleCountAdjustedForBias = 1;
2412 auto CoVariance = &Statistics->CoVariance[0];
2413 for (i = 0; i < N; i++) {
2414 for (j = 0; j < N; j++, CoVariance++) {
2415 *CoVariance /= SampleCountAdjustedForBias;
2420 Statistics->AvgVariance *= *CoVariance;
2424 Statistics->AvgVariance =
2425 static_cast<float>(pow(
static_cast<double>(Statistics->AvgVariance), 1.0 / N));
2441 static PROTOTYPE *NewSphericalProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics) {
2444 Proto = NewSimpleProto(N, Cluster);
2446 Proto->Variance.Spherical = Statistics->AvgVariance;
2451 Proto->Magnitude.Spherical = 1.0 / sqrt(2.0 * M_PI * Proto->Variance.Spherical);
2452 Proto->TotalMagnitude =
static_cast<float>(
2453 pow(
static_cast<double>(Proto->Magnitude.Spherical),
static_cast<double>(N)));
2454 Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
2455 Proto->LogMagnitude = log(
static_cast<double>(Proto->TotalMagnitude));
2470 static PROTOTYPE *NewEllipticalProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics) {
2474 Proto = NewSimpleProto(N, Cluster);
2475 Proto->Variance.Elliptical =
new float[N];
2476 Proto->Magnitude.Elliptical =
new float[N];
2477 Proto->Weight.Elliptical =
new float[N];
2479 auto CoVariance = &Statistics->CoVariance[0];
2480 Proto->TotalMagnitude = 1.0;
2481 for (i = 0; i < N; i++, CoVariance += N + 1) {
2482 Proto->Variance.Elliptical[i] = *CoVariance;
2483 if (Proto->Variance.Elliptical[i] <
MINVARIANCE) {
2487 Proto->Magnitude.Elliptical[i] = 1.0f / sqrt(2.0f * M_PI * Proto->Variance.Elliptical[i]);
2488 Proto->Weight.Elliptical[i] = 1.0f / Proto->Variance.Elliptical[i];
2489 Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
2491 Proto->LogMagnitude = log(
static_cast<double>(Proto->TotalMagnitude));
2509 static PROTOTYPE *NewMixedProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics) {
2510 auto Proto = NewEllipticalProto(N, Cluster, Statistics);
2511 Proto->Distrib.clear();
2512 Proto->Distrib.resize(N,
normal);
2513 Proto->Style =
mixed;
2525 static PROTOTYPE *NewSimpleProto(int16_t N, CLUSTER *Cluster) {
2526 auto Proto =
new PROTOTYPE;
2528 Proto->Mean = Cluster->Mean;
2529 Proto->Distrib.clear();
2530 Proto->Significant =
true;
2531 Proto->Merged =
false;
2533 Proto->NumSamples = Cluster->SampleCount;
2534 Proto->Cluster = Cluster;
2535 Proto->Cluster->Prototype =
true;
2557 static bool Independent(PARAM_DESC *ParamDesc, int16_t N,
float *CoVariance,
float Independence) {
2561 float CorrelationCoeff;
2564 for (i = 0; i < N; i++, VARii += N + 1) {
2565 if (ParamDesc[i].NonEssential) {
2569 VARjj = VARii + N + 1;
2570 CoVariance = VARii + 1;
2571 for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) {
2572 if (ParamDesc[j].NonEssential) {
2576 if ((*VARii == 0.0) || (*VARjj == 0.0)) {
2577 CorrelationCoeff = 0.0;
2579 CorrelationCoeff = sqrt(std::sqrt(*CoVariance * *CoVariance / (*VARii * *VARjj)));
2581 if (CorrelationCoeff > Independence) {
2604 static BUCKETS *GetBuckets(CLUSTERER *clusterer,
DISTRIBUTION Distribution, uint32_t SampleCount,
2605 double Confidence) {
2607 uint16_t NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
2608 BUCKETS *Buckets = clusterer->bucket_cache[Distribution][NumberOfBuckets -
MINBUCKETS];
2611 if (Buckets ==
nullptr) {
2612 Buckets = MakeBuckets(Distribution, SampleCount, Confidence);
2613 clusterer->bucket_cache[Distribution][NumberOfBuckets -
MINBUCKETS] = Buckets;
2616 if (SampleCount != Buckets->SampleCount) {
2617 AdjustBuckets(Buckets, SampleCount);
2619 if (Confidence != Buckets->Confidence) {
2620 Buckets->Confidence = Confidence;
2621 Buckets->ChiSquared =
2622 ComputeChiSquared(DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence);
2624 InitBuckets(Buckets);
2645 static BUCKETS *MakeBuckets(
DISTRIBUTION Distribution, uint32_t SampleCount,
double Confidence) {
2646 const DENSITYFUNC DensityFunction[] = {NormalDensity, UniformDensity, UniformDensity};
2648 double BucketProbability;
2649 double NextBucketBoundary;
2651 double ProbabilityDelta;
2652 double LastProbDensity;
2654 uint16_t CurrentBucket;
2658 auto Buckets =
new BUCKETS(OptimumNumberOfBuckets(SampleCount));
2659 Buckets->SampleCount = SampleCount;
2660 Buckets->Confidence = Confidence;
2663 Buckets->Distribution = Distribution;
2667 Buckets->ChiSquared =
2668 ComputeChiSquared(DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence);
2672 BucketProbability = 1.0 /
static_cast<double>(Buckets->NumberOfBuckets);
2675 CurrentBucket = Buckets->NumberOfBuckets / 2;
2676 if (
Odd(Buckets->NumberOfBuckets)) {
2677 NextBucketBoundary = BucketProbability / 2;
2679 NextBucketBoundary = BucketProbability;
2683 LastProbDensity = (*DensityFunction[
static_cast<int>(Distribution)])(
BUCKETTABLESIZE / 2);
2685 ProbDensity = (*DensityFunction[
static_cast<int>(Distribution)])(i + 1);
2686 ProbabilityDelta = Integral(LastProbDensity, ProbDensity, 1.0);
2687 Probability += ProbabilityDelta;
2688 if (Probability > NextBucketBoundary) {
2689 if (CurrentBucket < Buckets->NumberOfBuckets - 1) {
2692 NextBucketBoundary += BucketProbability;
2694 Buckets->Bucket[i] = CurrentBucket;
2695 Buckets->ExpectedCount[CurrentBucket] +=
static_cast<float>(ProbabilityDelta * SampleCount);
2696 LastProbDensity = ProbDensity;
2699 Buckets->ExpectedCount[CurrentBucket] +=
static_cast<float>((0.5 - Probability) * SampleCount);
2703 Buckets->Bucket[i] =
Mirror(Buckets->Bucket[j], Buckets->NumberOfBuckets);
2707 for (i = 0, j = Buckets->NumberOfBuckets - 1; i <= j; i++, j--) {
2708 Buckets->ExpectedCount[i] += Buckets->ExpectedCount[j];
2727 static uint16_t OptimumNumberOfBuckets(uint32_t SampleCount) {
2731 if (SampleCount < kCountTable[0]) {
2732 return kBucketsTable[0];
2736 if (SampleCount <= kCountTable[Next]) {
2737 Slope =
static_cast<float>(kBucketsTable[Next] - kBucketsTable[Last]) /
2738 static_cast<float>(kCountTable[Next] - kCountTable[Last]);
2740 static_cast<uint16_t
>(kBucketsTable[Last] + Slope * (SampleCount - kCountTable[Last])));
2743 return kBucketsTable[Last];
2762 static double ComputeChiSquared(uint16_t DegreesOfFreedom,
double Alpha)
2763 #define CHIACCURACY 0.01
2764 #define MINALPHA (1e-200)
2771 if (
Odd(DegreesOfFreedom)) {
2778 CHISTRUCT SearchKey(0.0, Alpha);
2779 auto OldChiSquared =
reinterpret_cast<CHISTRUCT *
>(
2782 if (OldChiSquared ==
nullptr) {
2783 OldChiSquared =
new CHISTRUCT(DegreesOfFreedom, Alpha);
2784 OldChiSquared->ChiSquared =
2785 Solve(ChiArea, OldChiSquared,
static_cast<double>(DegreesOfFreedom),
CHIACCURACY);
2786 ChiWith[DegreesOfFreedom] =
push(ChiWith[DegreesOfFreedom], OldChiSquared);
2791 return (OldChiSquared->ChiSquared);
2808 static double NormalDensity(int32_t x) {
2811 Distance = x - kNormalMean;
2812 return kNormalMagnitude * exp(-0.5 * Distance * Distance / kNormalVariance);
2822 static double UniformDensity(int32_t x) {
2826 return UniformDistributionDensity;
2840 static double Integral(
double f1,
double f2,
double Dx) {
2841 return (f1 + f2) * Dx / 2.0;
2864 static void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uint16_t Dim, PARAM_DESC *ParamDesc,
2865 float Mean,
float StdDev) {
2872 for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2873 Buckets->Count[i] = 0;
2876 if (StdDev == 0.0) {
2885 while ((Sample =
NextSample(&SearchState)) !=
nullptr) {
2886 if (Sample->Mean[Dim] >
Mean) {
2887 BucketID = Buckets->NumberOfBuckets - 1;
2888 }
else if (Sample->Mean[Dim] <
Mean) {
2893 Buckets->Count[BucketID] += 1;
2895 if (i >= Buckets->NumberOfBuckets) {
2902 while ((Sample =
NextSample(&SearchState)) !=
nullptr) {
2903 switch (Buckets->Distribution) {
2905 BucketID = NormalBucket(ParamDesc, Sample->Mean[Dim],
Mean, StdDev);
2909 BucketID = UniformBucket(ParamDesc, Sample->Mean[Dim],
Mean, StdDev);
2914 Buckets->Count[Buckets->Bucket[BucketID]] += 1;
2930 static uint16_t NormalBucket(PARAM_DESC *ParamDesc,
float x,
float Mean,
float StdDev) {
2934 if (ParamDesc->Circular) {
2935 if (x -
Mean > ParamDesc->HalfRange) {
2936 x -= ParamDesc->Range;
2937 }
else if (x - Mean < -ParamDesc->HalfRange) {
2938 x += ParamDesc->Range;
2942 X = ((x -
Mean) / StdDev) * kNormalStdDev + kNormalMean;
2949 return static_cast<uint16_t
>(floor(
static_cast<double>(X)));
2963 static uint16_t UniformBucket(PARAM_DESC *ParamDesc,
float x,
float Mean,
float StdDev) {
2967 if (ParamDesc->Circular) {
2968 if (x -
Mean > ParamDesc->HalfRange) {
2969 x -= ParamDesc->Range;
2970 }
else if (x - Mean < -ParamDesc->HalfRange) {
2971 x += ParamDesc->Range;
2982 return static_cast<uint16_t
>(floor(
static_cast<double>(X)));
2995 static bool DistributionOK(BUCKETS *Buckets) {
2996 float FrequencyDifference;
2997 float TotalDifference;
3001 TotalDifference = 0.0;
3002 for (i = 0; i < Buckets->NumberOfBuckets; i++) {
3003 FrequencyDifference = Buckets->Count[i] - Buckets->ExpectedCount[i];
3004 TotalDifference += (FrequencyDifference * FrequencyDifference) / Buckets->ExpectedCount[i];
3008 if (TotalDifference > Buckets->ChiSquared) {
3027 static uint16_t DegreesOfFreedom(
DISTRIBUTION Distribution, uint16_t HistogramBuckets) {
3028 static uint8_t DegreeOffsets[] = {3, 3, 1};
3030 uint16_t AdjustedNumBuckets;
3032 AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[
static_cast<int>(Distribution)];
3033 if (
Odd(AdjustedNumBuckets)) {
3034 AdjustedNumBuckets++;
3036 return (AdjustedNumBuckets);
3047 static void AdjustBuckets(BUCKETS *Buckets, uint32_t NewSampleCount) {
3049 double AdjustFactor;
3052 ((
static_cast<double>(NewSampleCount)) / (
static_cast<double>(Buckets->SampleCount)));
3054 for (i = 0; i < Buckets->NumberOfBuckets; i++) {
3055 Buckets->ExpectedCount[i] *= AdjustFactor;
3058 Buckets->SampleCount = NewSampleCount;
3067 static void InitBuckets(BUCKETS *Buckets) {
3070 for (i = 0; i < Buckets->NumberOfBuckets; i++) {
3071 Buckets->Count[i] = 0;
3088 static int AlphaMatch(
void *arg1,
3090 auto *ChiStruct =
static_cast<CHISTRUCT *
>(arg1);
3091 auto *SearchKey =
static_cast<CHISTRUCT *
>(arg2);
3093 return (ChiStruct->Alpha == SearchKey->Alpha);
3110 static double Solve(
SOLVEFUNC Function,
void *FunctionParams,
double InitialGuess,
double Accuracy)
3111 #define INITIALDELTA 0.1
3112 #define DELTARATIO 0.1
3120 double LastPosX, LastNegX;
3125 LastNegX = -FLT_MAX;
3126 f = (*Function)(
static_cast<CHISTRUCT *
>(FunctionParams), x);
3127 while (
Abs(LastPosX - LastNegX) > Accuracy) {
3136 Slope = ((*Function)(
static_cast<CHISTRUCT *
>(FunctionParams), x + Delta) - f) / Delta;
3145 if (NewDelta < Delta) {
3150 f = (*Function)(
static_cast<CHISTRUCT *
>(FunctionParams), x);
3174 static double ChiArea(CHISTRUCT *ChiParams,
double x) {
3180 N = ChiParams->DegreesOfFreedom / 2 - 1;
3184 for (i = 1; i <= N; i++) {
3185 Denominator *= 2 * i;
3187 SeriesTotal += PowerOfx / Denominator;
3189 return ((SeriesTotal * exp(-0.5 * x)) - ChiParams->Alpha);
3216 static bool MultipleCharSamples(CLUSTERER *Clusterer, CLUSTER *Cluster,
float MaxIllegal)
3217 #define ILLEGAL_CHAR 2
3219 static std::vector<uint8_t> CharFlags;
3223 int32_t NumCharInCluster;
3224 int32_t NumIllegalInCluster;
3225 float PercentIllegal;
3229 NumIllegalInCluster = 0;
3231 if (Clusterer->NumChar > CharFlags.size()) {
3232 CharFlags.resize(Clusterer->NumChar);
3235 for (
auto &CharFlag : CharFlags) {
3241 while ((Sample =
NextSample(&SearchState)) !=
nullptr) {
3242 CharID = Sample->CharID;
3243 if (CharFlags[CharID] ==
false) {
3244 CharFlags[CharID] =
true;
3246 if (CharFlags[CharID] ==
true) {
3247 NumIllegalInCluster++;
3251 PercentIllegal =
static_cast<float>(NumIllegalInCluster) / NumCharInCluster;
3252 if (PercentIllegal > MaxIllegal) {
3267 static double InvertMatrix(
const float *input,
int size,
float *inv) {
3269 GENERIC_2D_ARRAY<double> U(size, size, 0.0);
3270 GENERIC_2D_ARRAY<double> U_inv(size, size, 0.0);
3271 GENERIC_2D_ARRAY<double> L(size, size, 0.0);
3276 for (row = 0; row < size; row++) {
3277 for (col = 0; col < size; col++) {
3278 U[row][col] = input[row * size + col];
3279 L[row][col] = row == col ? 1.0 : 0.0;
3280 U_inv[row][col] = 0.0;
3285 for (col = 0; col < size; ++col) {
3288 double best_pivot = -1.0;
3289 for (row = col; row < size; ++row) {
3290 if (
Abs(U[row][col]) > best_pivot) {
3291 best_pivot =
Abs(U[row][col]);
3296 if (best_row != col) {
3297 for (
int k = 0; k < size; ++k) {
3298 double tmp = U[best_row][k];
3299 U[best_row][k] = U[col][k];
3301 tmp = L[best_row][k];
3302 L[best_row][k] = L[col][k];
3307 for (row = col + 1; row < size; ++row) {
3308 double ratio = -U[row][col] / U[col][col];
3309 for (
int j = col; j < size; ++j) {
3310 U[row][j] += U[col][j] * ratio;
3312 for (
int k = 0; k < size; ++k) {
3313 L[row][k] += L[col][k] * ratio;
3318 for (col = 0; col < size; ++col) {
3319 U_inv[col][col] = 1.0 / U[col][col];
3320 for (row = col - 1; row >= 0; --row) {
3322 for (
int k = col; k > row; --k) {
3323 total += U[row][k] * U_inv[k][col];
3325 U_inv[row][col] = -total / U[row][row];
3329 for (row = 0; row < size; row++) {
3330 for (col = 0; col < size; col++) {
3332 for (
int k = row; k < size; ++k) {
3333 sum += U_inv[row][k] * L[k][col];
3335 inv[row * size + col] = sum;
3339 double error_sum = 0.0;
3340 for (row = 0; row < size; row++) {
3341 for (col = 0; col < size; col++) {
3343 for (
int k = 0; k < size; ++k) {
3344 sum +=
static_cast<double>(input[row * size + k]) * inv[k * size + col];
3347 error_sum +=
Abs(sum);
#define MAXDEGREESOFFREEDOM
#define InitSampleSearch(S, C)
void KDDelete(KDTREE *Tree, float Key[], void *Data)
void KDStore(KDTREE *Tree, float *Key, void *Data)
void tprintf(const char *format,...)
int32_t MergeClusters(int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, float m[], float m1[], float m2[])
float Mean(PROTOTYPE *Proto, uint16_t Dimension)
tesseract::GenericHeap< ClusterPair > ClusterHeap
CLUSTER * NextSample(LIST *SearchState)
double(*)(int32_t) DENSITYFUNC
tesseract::KDPairInc< float, TEMPCLUSTER * > ClusterPair
KDTREE * MakeKDTree(int16_t KeySize, const PARAM_DESC KeyDesc[])
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
double(*)(CHISTRUCT *, double) SOLVEFUNC
const double FTable[FTABLE_Y][FTABLE_X]
void KDWalk(KDTREE *Tree, void_proc action, void *context)
void FreeProtoList(LIST *ProtoList)
void FreePrototype(void *arg)
CLUSTERER * MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[])
float StandardDeviation(PROTOTYPE *Proto, uint16_t Dimension)
void FreeClusterer(CLUSTERER *Clusterer)
LIST search(LIST list, void *key, int_compare is_equal)
void KDNearestNeighborSearch(KDTREE *Tree, float Query[], int QuerySize, float MaxDistance, int *NumberOfResults, void **NBuffer, float DBuffer[])
LIST push(LIST list, void *element)
void destroy_nodes(LIST list, void_dest destructor)
SAMPLE * MakeSample(CLUSTERER *Clusterer, const float *Feature, uint32_t CharID)
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
std::vector< float > CoVariance
uint16_t Bucket[BUCKETTABLESIZE]
DISTRIBUTION Distribution
std::vector< uint32_t > Count
std::vector< float > ExpectedCount
uint16_t DegreesOfFreedom
CHISTRUCT(uint16_t degreesOfFreedom, double alpha)
std::vector< float > Mean
std::vector< DISTRIBUTION > Distrib
BUCKETS * bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS+1 - MINBUCKETS]