Tuesday, October 15, 2013

Endianess conversions performance

Endianness is always a problem when trying to write portable programs. For that reason there are functions implemented in endian.h that handles that. Normally, it is implemented using shifts and bit-wise or to get the bytes in the right order for the machine, something like this:


1
2
3
4
5
6
uint32_t be32toh(const void* ptr) {
  uint32_t val = *static_cast<const uint32_t*>(ptr);
  const uint32_t lower = (val >> 24) | ((val >> 8) & 0xFF00);
  const uint32_t upper = (val << 24) | ((val << 8) & 0xFF0000);
  return lower | upper;
}

Here we assume that we read from some buffer where we have read bytes into from an external source that stores everything in big-endian format. (I picked big-endian format because I use an Intel, which is little-endian.) This deviate from the definition in endian.h on purpose, bear with me for a while.
There are other implementations, however, that read the bytes from memory directly and shift the bytes; something like this:


1
2
3
4
uint32_t be32toh(const void* buf) {
  const uint8_t *ptr = static_cast<const uint8_t>(buf);
  return (ptr[0] << 24) | (ptr[1] << 16) | (ptr[2] << 8) | ptr[3];
}

Noting that most modern architecture have plenty of registers and efficient instructions on registers, I wondered which one is fastest. Here are the results:


ProgramSecondsPercent
Register0.6480449%
Pointer1.32808100%

The results come from the following program (compiled with g++ -O4 -std=c++0x).


Update: there were an error in the code leading to different loop count. I added a variable to contain the loop count instead and ran the measurements again.


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#include <cstdlib>
#include <sys/time.h>
#include <sys/resource.h>
#include <vector>
#include <functional>
#include <numeric>
#include <iostream>
#include <memory>
#include <algorithm>
#include <stdint.h>
 
double operator-(rusage const& a, rusage const& b) {
    double result = (a.ru_utime.tv_usec - b.ru_utime.tv_usec) / 1.0e6;
    result += (a.ru_utime.tv_sec - b.ru_utime.tv_sec);
    return result;
}
 
template <class Func>
double measure(Func func)
{
    rusage before, after;
    getrusage(RUSAGE_SELF, &before);
    func();
    getrusage(RUSAGE_SELF, &after);
    return (after - before);
}
 
uint32_t mk1_be32toh(const void* buf) {
  uint32_t val = *static_cast<const uint32_t*>(buf);
  const uint32_t lower = (val >> 24) | ((val >> 8) & 0xFF00);
  const uint32_t upper = (val << 24) | ((val << 8) & 0xFF0000);
  return lower | upper;
}
 
uint32_t mk2_be32toh(const void* buf) {
  const uint8_t *ptr = static_cast<const uint8_t*>(buf);
  return (ptr[0] << 24) | (ptr[1] << 16) | (ptr[2] << 8) | ptr[3];
}
 
int main() {
  std::vector<uint32_t> array;
  size_t sum = 0;
  double result;
  const int loop_count = 100000;
 
  for (int i = 0 ; i < 10000 ; ++i)
    array.push_back(random());
 
  result = measure([&array, &sum]() {
      for (unsigned int n = 0 ; n < loop_count ; ++n)
        for (unsigned int i = 0 ; i < array.size() ; ++i)
          sum += mk1_be32toh(&array[i]);
    });
 
  std::cout << "mk1 exec time is: " << result
            << "(sum is " << sum << ")"
            << std::endl;
 
  result = measure([&array, &sum]() {
      for (unsigned int n = 0 ; n < loop_count ; ++n)
        for (unsigned int i = 0 ; i < array.size() ; ++i)
          sum += mk2_be32toh(&array[i]);
    });
 
  std::cout << "mk2 exec time is: " << result
            << "(sum is " << sum << ")"
            << std::endl;
 
}

Tuesday, October 08, 2013

Virtual functions or member variables (micro-benchmark)

This blog has been revived as a general dump for various measurements of C/C++ programs.

If you have some property you would like to make available for some the objects of a hierarchy, there are two approaches:
  1. Define a pure virtual function in the base class and override it in the subclasses where it computes (or returns) the value.
  2. Add a member variable to the base class and compute the value in the constructor of the subclasses. Add a member function in the base class to fetch that value.
The result of running the program below is:

Virtual Functions 1.05207 seconds 100%
Member Variable 0.136009 seconds 13%

Update: Adding -O4 improved the results significantly.

C++ code

Compiled using g++ -O4 -std=c++0x vfunc-attr.cc -o vfunc-attr

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#include <cstdlib>
#include <sys/time.h>
#include <sys/resource.h>
#include <vector>
#include <functional>
#include <numeric>
#include <iostream>
#include <memory>
#include <algorithm>
 
class VBase {
public:
  virtual size_t size() const = 0;
};
 
class VInt : public VBase {
public:
  template <class T> VInt(T value) : m_value(value) {}
  virtual size_t size() const {
    return sizeof(m_value);
  }
private:
  int m_value;
};
 
class VFloat : public VBase {
public:
  template <class T> VFloat(T value) : m_value(value) {}
 
  virtual size_t size() const {
    return sizeof(m_value);
  }
private:
  float m_value;
};
 
class MBase {
public:
  MBase(size_t size) : m_size(size) { }
  size_t size() const {
    return m_size;
  }
private:
  size_t m_size;
};
 
class MInt : public MBase {
public:
  MInt(int value) : MBase(sizeof(m_value)), m_value(value) { }
private:
  int m_value;
};
 
class MFloat : public MBase {
public:
  MFloat(int value) : MBase(sizeof(m_value)), m_value(value) { }
private:
  float m_value;
};
 
double operator-(rusage const& a, rusage const& b) {
    double result = (a.ru_utime.tv_usec - b.ru_utime.tv_usec) / 1.0e6;
    result += (a.ru_utime.tv_sec - b.ru_utime.tv_sec);
    return result;
}
 
template <class Func>
double measure(Func func)
{
    rusage before, after;
    getrusage(RUSAGE_SELF, &before);
    func();
    getrusage(RUSAGE_SELF, &after);
    return (after - before);
}
 
int main() {
  {
    std::vector<VBase*> array;
    size_t sum = 0;
    for (int i = 0 ; i < 100000000 ; ++i) {
      if (i % 2 == 0)
        array.push_back(new VFloat(i));
      else
        array.push_back(new VInt(i));
    }
 
    double result = measure([&array, &sum](){
        sum = std::accumulate(array.begin(), array.end(), 0,
                              [](size_t x, VBase* ptr){
                                return x + ptr->size();
                              });
      });
    std::for_each(array.begin(), array.end(),
                  std::default_delete<VBase>());
 
    std::cout << "VBase - Sum is: " << sum << ", "
              << "Exec time is: " << result
              << std::endl;
  }
  {
    std::vector<MBase*> array;
    size_t sum = 0;
    for (int i = 0 ; i < 100000000 ; ++i) {
      if (i % 2 == 0)
        array.push_back(new MFloat(i));
      else
        array.push_back(new MInt(i));
    }
 
    double result = measure([&array, &sum](){
        sum = std::accumulate(array.begin(), array.end(), 0,
                              [](size_t x, MBase* ptr){
                                return x + ptr->size();
                              });
      });
 
    std::for_each(array.begin(), array.end(),
                  std::default_delete<MBase>());
 
    std::cout << "MBase - Sum is: " << sum << ", "
              << "Exec time is: " << result
              << std::endl;
  }
}