// curfil — random_tree.h (random decision tree: split functions, training
// configuration, tree nodes, samplers, and the tree trainer)
1 #ifndef CURFIL_RANDOMTREE_H
2 #define CURFIL_RANDOMTREE_H
3 
4 #include <boost/format.hpp>
5 #include <boost/lexical_cast.hpp>
6 #include <boost/random.hpp>
7 #include <boost/shared_ptr.hpp>
8 #include <boost/weak_ptr.hpp>
9 #include <cassert>
10 #include <cmath>
11 #include <cuv/ndarray.hpp>
12 #include <functional>
13 #include <limits>
14 #include <map>
15 #include <ostream>
16 #include <set>
17 #include <tbb/concurrent_vector.h>
18 #include <tbb/parallel_for.h>
19 #include <utility>
20 #include <vector>
21 
22 #include "score.h"
23 #include "utils.h"
24 
25 namespace curfil {
26 
// Basic scalar types used throughout the forest implementation.
27 typedef uint8_t LabelType;
28 typedef unsigned int WeightType;
29 typedef double FeatureResponseType;
30 
// How a sample is evaluated for horizontal-flip data augmentation:
// unflipped only, flipped only, or both orientations.
31 enum HorizontalFlipSetting {
32  NoFlip = 0, Flip = 1, Both = 2
33 };
34 
35 namespace detail {
36 
// Decides whether 'score' improves on 'bestScore'.
// NOTE(review): exact tie-breaking role of 'featureNr' is defined in the .cpp — confirm there.
37 bool isScoreBetter(const ScoreType bestScore, const ScoreType score, const int featureNr);
38 
// NOTE(review): the head of this declaration (original lines 39-40) is missing
// from this extraction; RandomTree::normalizeHistograms calls it as
// detail::normalizeHistogram(histogram, priorDistribution, histogramBias).
41  const cuv::ndarray<WeightType, cuv::host_memory_space>& priorDistribution,
42  const double histogramBias);
43 
44 }
45 
46 // Test/training-time classes
// Branch taken by a split function when routing a sample down the tree.
47 enum SplitBranch {
48  LEFT, RIGHT
49 };
50 
// Where feature evaluation runs during training (GPU_AND_CPU_COMPARE runs
// both and is used for verification).
51 enum AccelerationMode {
52  CPU_ONLY,
53  GPU_ONLY,
54  GPU_AND_CPU_COMPARE
55 };
56 
// A trained decision: a feature, a threshold, and the score the split achieved.
// Routes an Instance LEFT when the feature response is <= threshold.
61 template<class Instance, class FeatureFunction> class SplitFunction {
62 public:
63 
67  // Note: feat is copied and SplitFunction assumes ownership.
68  // To be able to test on any sample, FeatureFunction must be implemented
69  // such that it can lookup these Instances dynamically.
70  SplitFunction(size_t featureId, const FeatureFunction& feature, float threshold, ScoreType score) :
71  featureId(featureId), feature(feature), threshold(threshold), score(score) {
72  }
73 
// Default-constructed split: NaN threshold/score act as "uninitialized"
// sentinels; the getters assert against them.
74  SplitFunction() :
75  featureId(0), feature(), threshold(std::numeric_limits<float>::quiet_NaN()), score(
76  std::numeric_limits<float>::quiet_NaN()) {
77  }
78 
// NOTE(review): the signature line of this copy-assignment operator
// (original line ~82, presumably 'SplitFunction& operator=(const SplitFunction& other) {')
// is missing from this extraction. It refuses to copy from an
// uninitialized (NaN) source.
83  assert(!isnan(other.threshold));
84  assert(!isnan(other.score));
85  featureId = other.featureId;
86  feature = other.feature;
87  threshold = other.threshold;
88  score = other.score;
89  return (*this);
90  }
// Routes 'instance' to LEFT/RIGHT. For a 'Both' flip setting the feature is
// evaluated on both orientations; 'flippedSameSplit' is set to false only
// when the two orientations land on different sides (the branch returned is
// the unflipped one). For all other settings flippedSameSplit stays true.
94  SplitBranch split(const Instance& instance, bool& flippedSameSplit) const {
95  HorizontalFlipSetting horFlipSetting = instance.getHorFlipSetting();
96  flippedSameSplit = true;
97  bool splitValue, value2;
98  switch (horFlipSetting) {
99  case NoFlip:
100  splitValue = feature.calculateFeatureResponse(instance) <= getThreshold();
101  break;
102  case Flip:
103  splitValue = feature.calculateFeatureResponse(instance, true) <= getThreshold();
104  break;
105  case Both:
106  splitValue = feature.calculateFeatureResponse(instance) <= getThreshold();
107  value2 = feature.calculateFeatureResponse(instance, true) <= getThreshold();
108  if (splitValue != value2)
109  flippedSameSplit = false;
110  break;
111  default:
112  splitValue = feature.calculateFeatureResponse(instance) <= getThreshold();
113  break;
114  }
115  return ((splitValue) ? LEFT : RIGHT);
116  }
117 
// The feature this split evaluates.
121  const FeatureFunction& getFeature() const {
122  return (feature);
123  }
124 
// Threshold the feature response is compared against; asserts the split
// has been initialized (non-NaN).
128  float getThreshold() const {
129  assert(!isnan(threshold));
130  return threshold;
131  }
132 
// Score achieved by this split during training; asserts initialized.
136  ScoreType getScore() const {
137  assert(!isnan(score));
138  return score;
139  }
140 
// Identifier of the feature within the candidate feature set.
144  size_t getFeatureId() const {
145  return featureId;
146  }
147 
148 private:
149  size_t featureId;
150  FeatureFunction feature;
151  float threshold;
152  ScoreType score;
153 };
154 
160 
161 public:
162 
163  explicit TrainingConfiguration() :
164  randomSeed(0),
165  samplesPerImage(0),
166  featureCount(0),
167  minSampleCount(0),
168  maxDepth(0),
169  boxRadius(0),
170  regionSize(0),
171  thresholds(0),
172  numThreads(0),
173  maxImages(0),
174  imageCacheSize(0),
175  maxSamplesPerBatch(0),
176  accelerationMode(GPU_ONLY),
177  useCIELab(0),
178  useDepthFilling(0),
179  deviceIds(),
180  subsamplingType(),
181  ignoredColors(),
182  useDepthImages(0),
183  horizontalFlipping(0) {
184  }
185 
190 
// Fully-specified training configuration.
// Validates its inputs: every ignored color must be non-empty, and when
// maxImages is limited (> 0) it must be at least imageCacheSize; violations
// throw std::runtime_error.
194  TrainingConfiguration(int randomSeed,
195  unsigned int samplesPerImage,
196  unsigned int featureCount,
197  unsigned int minSampleCount,
198  int maxDepth,
199  uint16_t boxRadius,
200  uint16_t regionSize,
201  uint16_t thresholds,
202  int numThreads,
203  int maxImages,
204  int imageCacheSize,
205  unsigned int maxSamplesPerBatch,
206  AccelerationMode accelerationMode,
207  bool useCIELab = true,
208  bool useDepthFilling = false,
209  const std::vector<int> deviceIds = std::vector<int>(1, 0),
210  const std::string subsamplingType = "classUniform",
211  const std::vector<std::string>& ignoredColors = std::vector<std::string>(),
212  bool useDepthImages = true,
213  bool horizontalFlipping = false) :
214  randomSeed(randomSeed),
215  samplesPerImage(samplesPerImage),
216  featureCount(featureCount),
217  minSampleCount(minSampleCount),
218  maxDepth(maxDepth),
219  boxRadius(boxRadius),
220  regionSize(regionSize),
221  thresholds(thresholds),
222  numThreads(numThreads),
223  maxImages(maxImages),
224  imageCacheSize(imageCacheSize),
225  maxSamplesPerBatch(maxSamplesPerBatch),
226  accelerationMode(accelerationMode),
227  useCIELab(useCIELab),
228  useDepthFilling(useDepthFilling),
229  deviceIds(deviceIds),
230  subsamplingType(subsamplingType),
231  ignoredColors(ignoredColors),
232  useDepthImages(useDepthImages),
233  horizontalFlipping(horizontalFlipping)
234  {
// Reject empty color strings early — they would silently match nothing later.
235  for (size_t c = 0; c < ignoredColors.size(); c++) {
236  if (ignoredColors[c].empty()) {
237  throw std::runtime_error(std::string("illegal color: '") + ignoredColors[c] + "'");
238  }
239  }
// maxImages <= 0 means "unlimited" and skips the cache-size consistency check.
240  if (maxImages > 0 && maxImages < imageCacheSize) {
241  throw std::runtime_error(
242  (boost::format("illegal configuration: maxImages (%d) must not be lower than imageCacheSize (%d)")
243  % maxImages % imageCacheSize).str());
244  }
245  }
246 
// --- trivial accessors for the configuration fields below ---
250  void setRandomSeed(int randomSeed) {
251  this->randomSeed = randomSeed;
252  }
253 
257  int getRandomSeed() const {
258  return randomSeed;
259  }
260 
264  unsigned int getSamplesPerImage() const {
265  return samplesPerImage;
266  }
267 
271  unsigned int getFeatureCount() const {
272  return featureCount;
273  }
274 
278  unsigned int getMinSampleCount() const {
279  return minSampleCount;
280  }
281 
285  int getMaxDepth() const {
286  return maxDepth;
287  }
288 
292  uint16_t getBoxRadius() const {
293  return boxRadius;
294  }
295 
299  uint16_t getRegionSize() const {
300  return regionSize;
301  }
302 
306  uint16_t getThresholds() const {
307  return thresholds;
308  }
309 
313  int getNumThreads() const {
314  return numThreads;
315  }
316 
320  int getMaxImages() const {
321  return maxImages;
322  }
323 
327  int getImageCacheSize() const {
328  return imageCacheSize;
329  }
330 
334  unsigned int getMaxSamplesPerBatch() const {
335  return maxSamplesPerBatch;
336  }
337 
341  AccelerationMode getAccelerationMode() const {
342  return accelerationMode;
343  }
344 
348  void setAccelerationMode(const AccelerationMode& accelerationMode) {
349  this->accelerationMode = accelerationMode;
350  }
351 
// Parses a mode string into the enum (defined in the .cpp).
355  static AccelerationMode parseAccelerationModeString(const std::string& modeString);
356 
// String form of the current acceleration mode (defined in the .cpp).
360  std::string getAccelerationModeString() const;
361 
365  const std::vector<int>& getDeviceIds() const {
366  return deviceIds;
367  }
368 
372  void setDeviceIds(const std::vector<int>& deviceIds) {
373  this->deviceIds = deviceIds;
374  }
375 
// NOTE(review): returns by value, unlike getDeviceIds()/getIgnoredColors()
// which return const references — inconsistent but harmless.
379  std::string getSubsamplingType() const {
380  return subsamplingType;
381  }
382 
386  bool isUseCIELab() const {
387  return useCIELab;
388  }
389 
393  bool isUseDepthFilling() const {
394  return useDepthFilling;
395  }
396 
400  bool isUseDepthImages() const {
401  return useDepthImages;
402  }
403 
407  bool doHorizontalFlipping() const {
408  return horizontalFlipping;
409  }
410 
414  const std::vector<std::string>& getIgnoredColors() const {
415  return ignoredColors;
416  }
417 
422 
// Field-wise comparison (defined in the .cpp); 'strict' widens the set of
// fields compared — operator== uses strict = true.
426  bool equals(const TrainingConfiguration& other, bool strict = false) const;
427 
431  bool operator==(const TrainingConfiguration& other) const {
432  return this->equals(other, true);
433  }
434 
438  bool operator!=(const TrainingConfiguration& other) const {
439  return (!(*this == other));
440  }
441 
442 private:
443 
444  // do not forget to update operator==/operator= as well!
// Training hyper-parameters and runtime settings; see the accessor of the
// same name for each field's meaning.
445  int randomSeed;
446  unsigned int samplesPerImage;
447  unsigned int featureCount;
448  unsigned int minSampleCount;
449  int maxDepth;
450  uint16_t boxRadius;
451  uint16_t regionSize;
452  uint16_t thresholds;
453  int numThreads;
454  int maxImages;
455  int imageCacheSize;
456  unsigned int maxSamplesPerBatch;
457  AccelerationMode accelerationMode;
458  bool useCIELab;
459  bool useDepthFilling;
460  std::vector<int> deviceIds;
461  std::string subsamplingType;
462  std::vector<std::string> ignoredColors;
463  bool useDepthImages;
464  bool horizontalFlipping;
465 };
466 
// One node of a random decision tree (doubles as the tree itself when it is
// the root). Leaf nodes carry class histograms; interior nodes carry a split
// and two children. Parent links are weak_ptrs to avoid ownership cycles.
471 template<class Instance, class FeatureFunction>
472 class RandomTree {
473 
474 public:
475 
// Constructs a (leaf) node from training samples. Builds the class
// histogram from sample weights; samples with the 'Both' flip setting are
// counted twice (once per orientation).
// NOTE(review): the tail of the default 'parent' argument (original line
// 482) is missing from this extraction.
479  RandomTree(const size_t& nodeId, const int level,
480  const std::vector<const Instance*>& samples, size_t numClasses,
481  const boost::shared_ptr<RandomTree<Instance, FeatureFunction> >& parent = boost::shared_ptr<
483  nodeId(nodeId), level(level), parent(parent), leaf(true), trainSamples(),
484  numClasses(numClasses), histogram(numClasses), allPixelsHistogram(numClasses), timers(),
485  split(), left(), right() {
486 
487  assert(histogram.ndim() == 1);
488  for (size_t label = 0; label < numClasses; label++) {
489  histogram[label] = 0;
490  allPixelsHistogram[label] = 0;
491  }
492 
493  for (size_t i = 0; i < samples.size(); i++) {
494  histogram[samples[i]->getLabel()] += samples[i]->getWeight();
495  trainSamples.push_back(*samples[i]);
// 'Both' samples contribute their weight twice: unflipped + flipped.
496  if (samples[i]->getHorFlipSetting() == Both) {
497  histogram[samples[i]->getLabel()] += samples[i]->getWeight();
498  }
499  }
500  }
501 
// Constructs a node from a precomputed histogram (e.g. when loading a
// serialized tree); keeps no training samples.
// NOTE(review): unlike the first constructor, the init list does not
// mention allPixelsHistogram — confirm it is safe to leave it empty here.
505  RandomTree(const size_t& nodeId, const int level,
506  const boost::shared_ptr<RandomTree<Instance, FeatureFunction> >& parent,
507  const std::vector<WeightType>& histogram) :
508  nodeId(nodeId), level(level), parent(parent), leaf(true), trainSamples(),
509  numClasses(histogram.size()), histogram(histogram.size()), timers(),
510  split(), left(), right() {
511 
512  for (size_t i = 0; i < histogram.size(); i++) {
513  this->histogram[i] = histogram[i];
514  }
515 
516  }
517 
// True iff exactly one class has non-zero mass (no point splitting further).
521  bool hasPureHistogram() const {
522  size_t nonZeroClasses = 0;
523  for (size_t i = 0; i < histogram.size(); i++) {
524  const WeightType classCount = histogram[i];
525  if (classCount == 0) {
526  continue;
527  }
528  nonZeroClasses++;
// Early exit: two non-empty classes already disprove purity.
529  if (nonZeroClasses > 1) {
530  return false;
531  }
532  }
533  assert(nonZeroClasses > 0);
534  return (nonZeroClasses == 1);
535  }
536 
540  size_t getNumClasses() const {
541  assert(histogram.size() == numClasses);
542  return numClasses;
543  }
544 
// Collects the ids of all nodes 'instance' visits on its way to a leaf.
// NOTE(review): calls split.split(instance) with ONE argument, but
// SplitFunction::split above takes (instance, bool&) — confirm an
// overload exists elsewhere or this call site is stale.
548  void collectNodeIndices(const Instance& instance,
549  std::set<unsigned int>& nodeSet, bool includeRoot) const {
550 
551  if (!isRoot() || includeRoot) {
552  nodeSet.insert(nodeId);
553  }
554  if (isLeaf())
555  return;
556 
557  boost::shared_ptr<RandomTree<Instance, FeatureFunction> > node;
558  if (split.split(instance) == LEFT) {
559  node = left;
560  } else {
561  node = right;
562  }
563  assert(node != NULL);
564  node->collectNodeIndices(instance, nodeSet, includeRoot);
565  }
566 
// Appends the node ids of all leaves in this subtree to 'leafSet'.
570  void collectLeafNodes(std::vector<size_t> &leafSet) {
571  if (isLeaf())
572  {
573  leafSet.push_back(this->getNodeId());
574  }
575  else
576  {
577  left->collectLeafNodes(leafSet);
578  right->collectLeafNodes(leafSet);
579  }
580  }
581 
// Hard classification: dominant class of the leaf the instance reaches.
585  LabelType classify(const Instance& instance) const {
586  return traverseToLeaf(instance)->getDominantClass();
587  }
588 
// Soft classification: normalized class distribution of the reached leaf.
592  const cuv::ndarray<double, cuv::host_memory_space>& classifySoft(const Instance& instance) const {
593  return (traverseToLeaf(instance)->getNormalizedHistogram());
594  }
595 
599  size_t getNumTrainSamples() const {
600  return trainSamples.size();
601  }
602 
606  const std::map<std::string, double>& getTimerValues() const {
607  return timers;
608  }
609 
613  const std::map<std::string, std::string>& getTimerAnnotations() const {
614  return timerAnnotations;
615  }
616 
// Records the elapsed seconds of 'timer' under 'key' (overwrites).
620  void setTimerValue(const std::string& key, utils::Timer& timer) {
621  setTimerValue(key, timer.getSeconds());
622  }
623 
// Stores an arbitrary annotation as a string under 'key'.
627  template<class V>
628  void setTimerAnnotation(const std::string& key, const V& annotation) {
629  timerAnnotations[key] = boost::lexical_cast<std::string>(annotation);
630  }
631 
// Accumulates the elapsed seconds of 'timer' under 'key'.
635  void addTimerValue(const std::string& key, utils::Timer& timer) {
636  addTimerValue(key, timer.getSeconds());
637  }
638 
642  void setTimerValue(const std::string& key, const double timeInSeconds) {
643  timers[key] = timeInSeconds;
644  }
645 
649  void addTimerValue(const std::string& key, const double timeInSeconds) {
650  timers[key] += timeInSeconds;
651  }
652 
656  boost::shared_ptr<const RandomTree<Instance, FeatureFunction> > getLeft() const {
657  return left;
658  }
659 
663  boost::shared_ptr<const RandomTree<Instance, FeatureFunction> > getRight() const {
664  return right;
665  }
666 
670  int getLevel() const {
671  return level;
672  }
673 
// Root nodes have no (living) parent.
677  bool isRoot() const {
678  return (parent.lock().get() == NULL);
679  }
680 
// Walks parent links up to the root.
// NOTE(review): the signature line (original ~684, presumably
// 'const RandomTree<Instance, FeatureFunction>* getRoot() const {')
// is missing from this extraction.
685  if (isRoot()) {
686  return this;
687  }
688  return parent.lock()->getRoot();
689  }
690 
// Counts, per feature-type string, how often each feature type is used by
// the splits of this subtree.
694  void countFeatures(std::map<std::string, size_t>& featureCounts) const {
695  if (isLeaf()) {
696  return;
697  }
698  std::string featureType(split.getFeature().getTypeString());
699  featureCounts[featureType]++;
700 
701  left->countFeatures(featureCounts);
702  right->countFeatures(featureCounts);
703  }
704 
// Total number of nodes in this subtree (including this one).
708  size_t countNodes() const {
709  return (isLeaf() ? 1 : (1 + left->countNodes() + right->countNodes()));
710  }
711 
// Number of leaves in this subtree.
715  size_t countLeafNodes() const {
716  return (isLeaf() ? 1 : (left->countLeafNodes() + right->countLeafNodes()));
717  }
718 
// Depth of this subtree; a single leaf has depth 1.
722  size_t getTreeDepth() const {
723  if (isLeaf())
724  return 1;
725  return (1 + std::max(left->getTreeDepth(), right->getTreeDepth()));
726  }
727 
// Turns this leaf into an interior node with the given split and children.
// NOTE(review): the signature head (original lines ~735, presumably
// 'void addChildren(const SplitFunction<...>& split, ...') is missing
// from this extraction.
736  boost::shared_ptr<RandomTree<Instance, FeatureFunction> > left,
737  boost::shared_ptr<RandomTree<Instance, FeatureFunction> > right) {
738  assert(isLeaf());
739 
740  assert(left.get());
741  assert(right.get());
742 
// Children must have been allocated after (and linked back to) this node.
743  assert(left->getNodeId() > this->getNodeId());
744  assert(right->getNodeId() > this->getNodeId());
745 
746  assert(left->parent.lock().get() == this);
747  assert(right->parent.lock().get() == this);
748 
749  assert(this->left.get() == 0);
750  assert(this->right.get() == 0);
751 
752  // Link
753  this->left = left;
754  this->right = right;
755  this->split = split;
756 
757  // This node becomes interior to the tree
758  leaf = false;
759  }
760 
764  bool isLeaf() const {
765  return leaf;
766  }
767 
// NOTE(review): the signature line (original ~771, presumably
// 'const SplitFunction<Instance, FeatureFunction>& getSplit() const {')
// is missing from this extraction.
772  return split;
773  }
774 
778  size_t getNodeId() const {
779  return nodeId;
780  }
781 
// The tree id is the node id of the root (ids increase down the tree).
785  size_t getTreeId() const {
786  size_t rootNodeId = getRoot()->getNodeId();
787  assert(rootNodeId <= getNodeId());
788  return rootNodeId;
789  }
790 
// Recursively normalizes the histograms of this subtree via
// detail::normalizeHistogram, applying the prior and histogram bias.
// NOTE(review): the signature head (original lines 793-794) is missing
// from this extraction; only the trailing parameter is visible.
795  const double histogramBias) {
796 
797  normalizedHistogram = detail::normalizeHistogram(histogram, priorDistribution, histogramBias);
798  assert(normalizedHistogram.shape() == histogram.shape());
799 
800  if (isLeaf()) {
// A zero-sum leaf distribution is suspicious but not fatal — log only.
801  double sum = 0;
802  for (size_t i = 0; i < normalizedHistogram.size(); i++) {
803  sum += normalizedHistogram[i];
804  }
805  if (sum == 0) {
806  CURFIL_INFO("normalized histogram of node " << getNodeId() << ", level " << getLevel() << " has zero sum");
807  }
808  } else {
809  left->normalizeHistograms(priorDistribution, histogramBias);
810  right->normalizeHistograms(priorDistribution, histogramBias);
811  }
812 
813  if (normalizedHistogram.shape() != histogram.shape()) {
814  CURFIL_ERROR("node: " << nodeId << " (level " << level << ")");
815  CURFIL_ERROR("histogram: " << histogram);
816  CURFIL_ERROR("normalized histogram: " << normalizedHistogram);
817  throw std::runtime_error("failed to normalize histogram");
818  }
819  }
820 
// NOTE(review): the signature line (original ~824, presumably the
// raw-histogram getter) is missing from this extraction.
825  return histogram;
826  }
827 
// Returns the normalized histogram; throws if normalizeHistograms has not
// produced a shape-consistent result yet.
// NOTE(review): the signature line (original ~831) is missing from this
// extraction.
832  if (normalizedHistogram.shape() != histogram.shape()) {
833  CURFIL_ERROR("node: " << nodeId << " (level " << level << ")");
834  CURFIL_ERROR("histogram: " << histogram);
835  CURFIL_ERROR("normalized histogram: " << normalizedHistogram);
836  throw std::runtime_error("histogram not normalized");
837  }
838  return normalizedHistogram;
839  }
840 
844  const std::vector<Instance>& getTrainSamples() const {
845  return trainSamples;
846  }
847 
// Accumulates 'value' into the all-pixels histogram for 'label'.
// NOTE(review): despite the 'set' name this ADDS to the bin.
851  void setAllPixelsHistogram(size_t label, double value) {
852 
853  allPixelsHistogram[label] += value;
854  }
855 
// Routes 'instance' to its leaf and increments that leaf's all-pixels
// histogram for the instance's label.
// NOTE(review): the signature line (original ~859) is missing from this
// extraction.
860  if (isLeaf())
861  {
862  size_t label = instance.getLabel();
863  allPixelsHistogram[label] += 1;
864  return this;
865  }
866 
867  assert(left.get());
868  assert(right.get());
869 
870  bool flippedSameSplit;
871  if (split.split(instance,flippedSameSplit) == LEFT) {
872  return left->setAllPixelsHistogram(instance);
873  } else {
874  return right->setAllPixelsHistogram(instance);
875  }
876  }
877 
// Replaces each non-empty leaf histogram bin with the corresponding
// all-pixels count, recursively over the subtree.
// NOTE(review): the signature line (original ~881, presumably
// 'void updateHistograms()') is missing from this extraction.
882  {
883 
884  if (isLeaf()) {
885  for (size_t label = 0; label < numClasses; label++) {
886  // CURFIL_INFO(label<<" histogram[label] "<<histogram[label]<<" allPixelsHistogram[label] "<<allPixelsHistogram[label]);
887  if (histogram[label] != 0)
888  { histogram[label] = allPixelsHistogram[label];}
889  }
890  return;
891  }
892  left->updateHistograms();
893  right->updateHistograms();
894 
895  }
896 
// Rebuilds the histogram from 'samples' counting each sample once
// (i.e. ignoring the double-count applied to 'Both' flip samples).
900  void recomputeHistogramNoFlipping(const std::vector<const Instance*>& samples)
901  {
902  for (size_t label = 0; label < numClasses; label++) {
903  histogram[label] = 0;
904  }
905 
906  for (size_t i = 0; i < samples.size(); i++) {
907  histogram[samples[i]->getLabel()] += samples[i]->getWeight();
908  }
909  }
910 
911 
912 private:
913  // A unique node identifier within this tree
914  const size_t nodeId;
915  const int level;
916 
917  /*
918  * Parent node or NULL if this is the root
919  *
920  * Having a weak pointer here is important to avoid loops which cause a memory leak
921  */
922  const boost::weak_ptr<RandomTree<Instance, FeatureFunction> > parent;
923 
924  // If true, this node is a leaf node
925  bool leaf;
926 
927  std::vector<Instance> trainSamples;
928 
929  size_t numClasses;
930 
// NOTE(review): the member declarations for 'histogram',
// 'normalizedHistogram' and 'allPixelsHistogram' (original lines 931-934)
// are missing from this extraction; the methods above use all three.
933 
935 
936  std::map<std::string, double> timers;
937  std::map<std::string, std::string> timerAnnotations;
938 
939  // If isLeaf is false, split and both left and right must be non-NULL
940  // If isLeaf is true, split = left = right = NULL
// NOTE(review): the 'split' member declaration (original line 941) is
// missing from this extraction.
942  boost::shared_ptr<RandomTree<Instance, FeatureFunction> > left;
943  boost::shared_ptr<RandomTree<Instance, FeatureFunction> > right;
944 
// Index of the histogram bin with the largest count (first wins on ties).
945  LabelType getDominantClass() const {
// NaN marks "no candidate seen yet"; first iteration always takes the branch.
946  double max = std::numeric_limits<double>::quiet_NaN();
947  assert(histogram.size() == numClasses);
948  LabelType maxClass = 0;
949 
950  for (LabelType classNr = 0; classNr < histogram.size(); classNr++) {
951  const WeightType& count = histogram[classNr];
// NOTE(review): WeightType is unsigned, so this assert is a tautology.
952  assert(count >= 0.0);
953  if (isnan(max) || count > max) {
954  max = count;
955  maxClass = classNr;
956  }
957  }
958  return maxClass;
959  }
960 
// Follows the splits from this node down to the leaf 'instance' ends in.
961  const RandomTree<Instance, FeatureFunction>* traverseToLeaf(const Instance& instance) const {
962  if (isLeaf())
963  return this;
964 
965  assert(left.get());
966  assert(right.get());
967 
968  bool flippedSameSplit;
969  if (split.split(instance, flippedSameSplit) == LEFT) {
970  return left->traverseToLeaf(instance);
971  } else {
972  return right->traverseToLeaf(instance);
973  }
974  }
975 };
976 
981 class Sampler {
982 public:
983 
987  Sampler(int seed, int lower, int upper) :
988  seed(seed), lower(lower), upper(upper), rng(seed), distribution(lower, upper) {
989  assert(upper >= lower);
990  }
991 
995  Sampler(const Sampler& other) :
996  seed(other.seed), lower(other.lower), upper(other.upper), rng(seed), distribution(lower, upper) {
997  }
998 
1002  int getNext();
1003 
1007  int getSeed() const {
1008  return seed;
1009  }
1010 
1014  int getLower() const {
1015  return upper;
1016  }
1017 
1021  int getUpper() const {
1022  return upper;
1023  }
1024 
1025 private:
1026  Sampler& operator=(const Sampler&);
1027  Sampler();
1028 
1029  int seed;
1030  int lower;
1031  int upper;
1032 
1033  boost::mt19937 rng;
1034  boost::uniform_int<> distribution;
1035 };
1036 
// Uniform reservoir sampler holding at most 'samples' elements of type T.
1041 template<class T>
// NOTE(review): the class declaration line (original 1042, presumably
// 'class ReservoirSampler {') is missing from this extraction.
1043 public:
1044 
// Zero-capacity sampler; sample() throws until constructed with a capacity.
1045  ReservoirSampler() :
1046  count(0), samples(0), reservoir() {
1047  }
1048 
// Reservoir with room for 'samples' elements; storage is pre-reserved.
1052  ReservoirSampler(size_t samples) :
1053  count(0), samples(samples), reservoir() {
1054  reservoir.reserve(samples);
1055  assert(reservoir.empty());
1056  }
1057 
1061  void sample(Sampler& sampler, const T& sample) {
1062 
1063  if (samples == 0) {
1064  throw std::runtime_error("no samples to sample");
1065  }
1066 
1067  assert(sampler.getUpper() > static_cast<int>(count));
1068 
1069  if (reservoir.size() < samples) {
1070  reservoir.push_back(sample);
1071  } else {
1072  assert(count >= samples);
1073  // draw a random number from the interval (0, count) including count!
1074  size_t rand = sampler.getNext() % (count + 1);
1075  if (rand < samples) {
1076  reservoir[rand] = sample;
1077  }
1078  }
1079 
1080  count++;
1081  }
1082 
// Read-only access to the currently retained elements (at most 'samples').
1086  const std::vector<T>& getReservoir() const {
1087  return reservoir;
1088  }
1089 
1090 private:
1091  size_t count; // elements offered so far
1092  size_t samples; // reservoir capacity
1093  std::vector<T> reservoir;
1094 };
1095 
1100 // Training-time classes
// Hands out Sampler instances with distinct, reproducible seeds.
// NOTE(review): the class declaration line (original 1101, presumably
// 'class RandomSource {') is missing from this extraction.
1102 
1103 private:
1104  int seed;
1105 
1106 public:
1107 
// The master seed from which all handed-out samplers derive.
1111  RandomSource(int seed) :
1112  seed(seed) {
1113  }
1114 
// NOTE(review): the signature of this overload (original line ~1118,
// presumably 'Sampler uniformSampler(int val)') is missing from this
// extraction; it delegates to the two-argument overload for [0, val-1].
1119  return uniformSampler(0, val - 1);
1120  }
1121 
// Creates a sampler over [lower, upper]; the seed is post-incremented so
// successive samplers are decorrelated yet deterministic.
1125  Sampler uniformSampler(int lower, int upper) {
1126  return Sampler(seed++, lower, upper);
1127  }
1128 };
1129 
// Trains a single random decision tree level by level.
1134 template<class Instance, class FeatureEvaluation, class FeatureFunction>
// NOTE(review): the class declaration line (original 1135, presumably
// 'class RandomTreeTrain {') is missing from this extraction.
1136 public:
// @param id            identifier of the tree being trained
// @param numClasses    number of classification labels
// @param configuration training settings; maxDepth must be positive
1147  RandomTreeTrain(int id, size_t numClasses, const TrainingConfiguration& configuration) :
1148  id(id), numClasses(numClasses), configuration(configuration) {
1149  assert(configuration.getMaxDepth() > 0);
1150  }
1151 
1152 private:
1153 
1154  bool shouldContinueGrowing(const boost::shared_ptr<RandomTree<Instance, FeatureFunction> > node) const {
1155  if (node->hasPureHistogram()) {
1156  return false;
1157  }
1158  if (node->getNumTrainSamples() < configuration.getMinSampleCount()) {
1159  return false;
1160  }
1161 
1162  return true;
1163  }
1164 
// Convenience aliases: a tree node, and the training samples routed to it.
1165  typedef boost::shared_ptr<RandomTree<Instance, FeatureFunction> > RandomTreePointer;
1166  typedef std::vector<const Instance*> Samples;
1167 
// Debug check: recomputes the split score on the CPU from the accumulated
// histograms and throws if it deviates from the evaluated best-split score
// by more than 0.02.
// NOTE(review): the parameter lines for 'leftHistogram' and 'rightHistogram'
// (original lines 1169-1170) are missing from this extraction.
1168  void compareHistograms(cuv::ndarray<WeightType, cuv::host_memory_space> allHistogram,
1171  const SplitFunction<Instance, FeatureFunction>& bestSplit, size_t histogramSize) const {
1172 
1173  const unsigned int leftRightStride = 1; // consecutive in memory
1174 
1175  // assert that actual score is the same as the calculated score
1176  double totalLeft = sum(leftHistogram);
1177  double totalRight = sum(rightHistogram);
1178 
// NOTE(review): variable-length arrays are a compiler extension, not
// standard C++ — consider std::vector<WeightType> instead.
1179  WeightType leftHistogramArray[histogramSize];
1180  WeightType rightHistogramArray[histogramSize];
1181  WeightType allHistogramArray[histogramSize];
1182 
1183  std::stringstream strLeft;
1184  std::stringstream strRight;
1185  std::stringstream strAll;
1186 
// Copy ndarrays into flat arrays and build printable forms for diagnostics.
1187  for (size_t i=0; i<histogramSize; i++)
1188  {
1189  leftHistogramArray[i] = leftHistogram[i];
1190  rightHistogramArray[i] = rightHistogram[i];
1191  allHistogramArray[i] = allHistogram[i];
1192 
1193  strLeft<<leftHistogramArray[i]<<",";
1194  strRight<<rightHistogramArray[i]<<",";
1195  strAll<<allHistogramArray[i]<<",";
1196  }
1197  double actualScore = NormalizedInformationGainScore::calculateScore(histogramSize, leftHistogramArray, rightHistogramArray,
1198  leftRightStride,
1199  allHistogramArray, totalLeft, totalRight);
1200  double diff = std::fabs(actualScore - bestSplit.getScore());
1201  if (diff > 0.02) {
1202  std::ostringstream o;
1203  o.precision(10);
1204  o << "actual score and best split score differ: " << diff << std::endl;
1205  o << "actual score: " << actualScore << std::endl;
1206  o << "best split score: " << bestSplit.getScore() << std::endl;
1207  o << "total left: " << totalLeft << std::endl;
1208  o << "total right: " << totalRight << std::endl;
1209  o << "histogram: " << strAll.str() << std::endl;
1210  o << "histogram left: " << strLeft.str() << std::endl;
1211  o << "histogram right: " << strRight.str() << std::endl;
1212  throw std::runtime_error(o.str());
1213  }
1214  }
1215 
1216 public:
1217 
// Trains one tree level: evaluates the best split per node, partitions each
// node's samples into left/right children (handling horizontal-flip
// duplication), attaches the children, and recurses on the next level.
// 'idNode' is the running node-id counter; 'currentLevel' starts at 1.
1221  void train(FeatureEvaluation& featureEvaluation,
1222  RandomSource& randomSource,
1223  const std::vector<std::pair<RandomTreePointer, Samples> >& samplesPerNode,
1224  int idNode, int currentLevel = 1) const {
1225 
1226  // Depth exhausted: leaf node
1227  if (currentLevel == configuration.getMaxDepth()) {
1228  return;
1229  }
1230 
1231  CURFIL_INFO("training level " << currentLevel << ". nodes: " << samplesPerNode.size());
1232 
1233  utils::Timer trainTimer;
1234 
1235  std::vector<std::pair<RandomTreePointer, Samples> > samplesPerNodeNextLevel;
1236 
// One best split per node, evaluated in a single batch.
1237  std::vector<SplitFunction<Instance, FeatureFunction> > bestSplits = featureEvaluation.evaluateBestSplits(
1238  randomSource, samplesPerNode);
1239 
1240  assert(bestSplits.size() == samplesPerNode.size());
1241 
// Owns flipped duplicates created below so their raw pointers stay valid
// for the lifetime of this level's recursion.
1242  std::vector<boost::shared_ptr<Instance> > flipped;
1243 
1244  for (size_t i = 0; i < samplesPerNode.size(); i++) {
1245 
1246  const std::pair<RandomTreePointer, Samples>& it = samplesPerNode[i];
1247 
1248  const std::vector<const Instance*>& samples = it.second;
1249 
1250  boost::shared_ptr<RandomTree<Instance, FeatureFunction> > currentNode = it.first;
1251  assert(currentNode);
1252 
1253  const SplitFunction<Instance, FeatureFunction>& bestSplit = bestSplits[i];
1254 
1255  // Split all training instances into the subtrees. The way we train
1256  // the trees ensures we have no more than 2*N instances to manage in
1257  // memory.
1258  std::vector<const Instance*> samplesLeft;
1259  std::vector<const Instance*> samplesRight;
1260 
1261  size_t numClasses = currentNode->getHistogram().size();
1262  cuv::ndarray<WeightType, cuv::host_memory_space> allHistogram(numClasses);
1263  cuv::ndarray<WeightType, cuv::host_memory_space> leftHistogram(numClasses);
1264  cuv::ndarray<WeightType, cuv::host_memory_space> rightHistogram(numClasses);
1265 
1266  for (size_t c=0; c<numClasses; c++)
1267  {
1268  leftHistogram[c] = 0;
1269  rightHistogram[c] = 0;
1270  allHistogram[c] = 0;
1271  }
1272 
1273  unsigned int totalFlipped = 0;
1274  unsigned int rightFlipped = 0;
1275  unsigned int leftFlipped = 0;
1276 
// 'off' counts 'Both' samples whose two orientations split apart; each
// such sample contributes an extra (cloned) instance to the children.
1277  int off = 0;
1278 
1279  for (size_t sample = 0; sample < samples.size(); sample++) {
1280  assert(samples[sample] != NULL);
1281  bool flippedSameSplit;
1282  Instance* ptr = const_cast<Instance *>(samples[sample]);
1283 
1284  SplitBranch splitResult = bestSplit.split(*ptr, flippedSameSplit);
1285 
1286  HorizontalFlipSetting flipSetting = samples[sample]->getHorFlipSetting();
1287 
// When the orientations diverge, the original keeps only its unflipped
// role from here on; the flipped twin becomes a separate instance below.
1288  if (flipSetting == Both && !flippedSameSplit)
1289  {ptr->setHorFlipSetting(NoFlip);
1290  off +=1 ;}
1291 
1292  allHistogram[samples[sample]->getLabel()] += samples[sample]->getWeight();
1293  if (splitResult == LEFT) {
1294  samplesLeft.push_back(ptr);
1295  leftHistogram[samples[sample]->getLabel()] += samples[sample]->getWeight();
1296  if (flipSetting == Both) {
1297  totalFlipped += 1;
// 'Both' samples count twice in the parent histogram.
1298  allHistogram[samples[sample]->getLabel()] += samples[sample]->getWeight();
1299  if (flippedSameSplit)
1300  leftHistogram[samples[sample]->getLabel()] += samples[sample]->getWeight();
1301  else
1302  {rightHistogram[samples[sample]->getLabel()] += samples[sample]->getWeight();
1303  rightFlipped += 1;
1304  flipped.push_back(boost::shared_ptr<Instance>(new Instance((samples[sample]->getRGBDImage()), samples[sample]->getLabel(), samples[sample]->getX(), samples[sample]->getY(), Flip)));
1305  samplesRight.push_back(flipped.back().get());
1306  }
1307  }
1308  } else {
// Mirror image of the LEFT case above.
1309  samplesRight.push_back(ptr);
1310  rightHistogram[samples[sample]->getLabel()] += samples[sample]->getWeight();
1311  if (flipSetting == Both) {
1312  totalFlipped += 1;
1313  allHistogram[samples[sample]->getLabel()] += samples[sample]->getWeight();
1314  if (flippedSameSplit)
1315  rightHistogram[samples[sample]->getLabel()] += samples[sample]->getWeight();
1316  else
1317  {leftHistogram[samples[sample]->getLabel()] += samples[sample]->getWeight();
1318  leftFlipped += 1;
1319  flipped.push_back(boost::shared_ptr<Instance>(new Instance((samples[sample]->getRGBDImage()), samples[sample]->getLabel(), samples[sample]->getX(), samples[sample]->getY(), Flip)));
1320  samplesLeft.push_back(flipped.back().get());
1321  }
1322  }
1323  }
1324  }
1325 
1326  assert(samplesLeft.size() + samplesRight.size() == samples.size() + off);
1327 
1328  boost::shared_ptr<RandomTree<Instance, FeatureFunction> > leftNode = boost::make_shared<RandomTree<Instance,
1329  FeatureFunction> >(++idNode, currentNode->getLevel() + 1, samplesLeft, numClasses, currentNode);
1330 
1331  boost::shared_ptr<RandomTree<Instance, FeatureFunction> > rightNode = boost::make_shared<
1332  RandomTree<Instance, FeatureFunction> >(++idNode, currentNode->getLevel() + 1, samplesRight,
1333  numClasses, currentNode);
1334 
// Debug builds cross-check the accumulated histograms against the
// evaluated split score.
1335 #ifndef NDEBUG
1336  compareHistograms(allHistogram, leftHistogram, rightHistogram, bestSplit, numClasses);
1337 #endif
1338 
// An empty child with no flipped contribution indicates a degenerate
// split; dump diagnostics and abort training.
1339  bool errorEmptyChildren = false;
1340  if (samplesLeft.empty() && leftFlipped == 0)
1341  {
1342  errorEmptyChildren = true;
1343  }
1344  if (samplesRight.empty() && rightFlipped == 0)
1345  {
1346  errorEmptyChildren = true;
1347  }
1348  if (errorEmptyChildren) {
1349  CURFIL_ERROR("best split score: " << bestSplit.getScore());
1350  CURFIL_ERROR("samples: " << samples.size());
1351  CURFIL_ERROR("threshold: " << bestSplit.getThreshold());
1352  CURFIL_ERROR("feature: " << bestSplit.getFeature());
1353  CURFIL_ERROR("histogram: " << currentNode->getHistogram());
1354  CURFIL_ERROR("samplesLeft: " << samplesLeft.size());
1355  CURFIL_ERROR("samplesRight: " << samplesRight.size());
1356  CURFIL_ERROR("leftFlipped "<<leftFlipped<<" rightFlipped "<<rightFlipped<<" totalFlipped "<<totalFlipped)
1357 
1358  compareHistograms(allHistogram, leftHistogram, rightHistogram, bestSplit, numClasses);
1359 
1360  if (samplesLeft.empty()) {
1361  throw std::runtime_error("no samples in left node");
1362  }
1363  if (samplesRight.empty()) {
1364  throw std::runtime_error("no samples in right node");
1365  }
1366  }
1367 
1368  if (!samplesLeft.empty() && !samplesRight.empty()) {
1369  currentNode->addChildren(bestSplit, leftNode, rightNode);
1370 
1371  if (shouldContinueGrowing(leftNode)) {
1372  samplesPerNodeNextLevel.push_back(std::make_pair(leftNode, samplesLeft));
1373  }
1374 
1375  if (shouldContinueGrowing(rightNode)) {
1376  samplesPerNodeNextLevel.push_back(std::make_pair(rightNode, samplesRight));
1377  }
1378  }
1379  else
// Children were not attached: release the two node ids allocated above.
1380  idNode = idNode - 2;
1381  }
1382 
1383  CURFIL_INFO("training level " << currentLevel << " took " << trainTimer.format(3));
1384  if (!samplesPerNodeNextLevel.empty()) {
1385  train(featureEvaluation, randomSource, samplesPerNodeNextLevel, idNode, currentLevel + 1);
1386  }
1387  }
1388 
1389 private:
1390 
1391  template<class T>
1392  T sum(const cuv::ndarray<T, cuv::host_memory_space>& vector, const T initial = static_cast<T>(0)) const {
1393  T v = initial;
1394  for (size_t i = 0; i < vector.size(); i++) {
1395  v += vector[i];
1396  }
1397  return v;
1398  }
1399 
1400  int id; // identifier of the tree being trained
1401  size_t numClasses; // number of classification labels
1402  const TrainingConfiguration configuration; // immutable copy of the training settings
1403 }
1404 ;
1405 
1406 }
1407 
// Streams a human-readable summary of the configuration (defined in the .cpp).
1408 std::ostream& operator<<(std::ostream& os, const curfil::TrainingConfiguration& configuration);
1409 
1410 #endif