Tuesday, October 15, 2013

Endianess conversions performance

Endianness is always a problem when trying to write portable programs. For that reason there are functions implemented in endian.h that handles that. Normally, it is implemented using shifts and bit-wise or to get the bytes in the right order for the machine, something like this:


uint32_t be32toh(const void* ptr) {
  uint32_t val = *static_cast<const uint32_t*>(ptr);
  const uint32_t lower = (val >> 24) | ((val >> 8) & 0xFF00);
  const uint32_t upper = (val << 24) | ((val << 8) & 0xFF0000);
  return lower | upper;
}

Here we assume that we read from some buffer where we have read bytes into from an external source that stores everything in big-endian format. (I picked big-endian format because I use an Intel, which is little-endian.) This deviate from the definition in endian.h on purpose, bear with me for a while.
There are other implementations, however, that read the bytes from memory directly and shift the bytes; something like this:


uint32_t be32toh(const void* buf) {
  const uint8_t *ptr = static_cast<const uint8_t>(buf);
  return (ptr[0] << 24) | (ptr[1] << 16) | (ptr[2] << 8) | ptr[3];
}

Noting that most modern architecture have plenty of registers and efficient instructions on registers, I wondered which one is fastest. Here are the results:


ProgramSecondsPercent
Register0.6480449%
Pointer1.32808100%

The results come from the following program (compiled with g++ -O4 -std=c++0x).


Update: there were an error in the code leading to different loop count. I added a variable to contain the loop count instead and ran the measurements again.


#include <cstdlib>
#include <sys/time.h>
#include <sys/resource.h>
#include <vector>
#include <functional>
#include <numeric>
#include <iostream>
#include <memory>
#include <algorithm>
#include <stdint.h>

double operator-(rusage const& a, rusage const& b) {
    double result = (a.ru_utime.tv_usec - b.ru_utime.tv_usec) / 1.0e6;
    result += (a.ru_utime.tv_sec - b.ru_utime.tv_sec);
    return result;
}

template <class Func>
double measure(Func func)
{
    rusage before, after;
    getrusage(RUSAGE_SELF, &before);
    func();
    getrusage(RUSAGE_SELF, &after);
    return (after - before);
}

uint32_t mk1_be32toh(const void* buf) {
  uint32_t val = *static_cast<const uint32_t*>(buf);
  const uint32_t lower = (val >> 24) | ((val >> 8) & 0xFF00);
  const uint32_t upper = (val << 24) | ((val << 8) & 0xFF0000);
  return lower | upper;
}

uint32_t mk2_be32toh(const void* buf) {
  const uint8_t *ptr = static_cast<const uint8_t*>(buf);
  return (ptr[0] << 24) | (ptr[1] << 16) | (ptr[2] << 8) | ptr[3];
}

int main() {
  std::vector<uint32_t> array;
  size_t sum = 0;
  double result;
  const int loop_count = 100000;

  for (int i = 0 ; i < 10000 ; ++i)
    array.push_back(random());

  result = measure([&array, &sum]() {
      for (unsigned int n = 0 ; n < loop_count ; ++n)
        for (unsigned int i = 0 ; i < array.size() ; ++i)
          sum += mk1_be32toh(&array[i]);
    });

  std::cout << "mk1 exec time is: " << result
            << "(sum is " << sum << ")"
            << std::endl;

  result = measure([&array, &sum]() {
      for (unsigned int n = 0 ; n < loop_count ; ++n)
        for (unsigned int i = 0 ; i < array.size() ; ++i)
          sum += mk2_be32toh(&array[i]);
    });

  std::cout << "mk2 exec time is: " << result
            << "(sum is " << sum << ")"
            << std::endl;

}

Tuesday, October 08, 2013

Virtual functions or member variables (micro-benchmark)

This blog has been revived as a general dump for various measurements of C/C++ programs.

If you have some property you would like to make available for some the objects of a hierarchy, there are two approaches:
  1. Define a pure virtual function in the base class and override it in the subclasses where it computes (or returns) the value.
  2. Add a member variable to the base class and compute the value in the constructor of the subclasses. Add a member function in the base class to fetch that value.
The result of running the program below is:

Virtual Functions 1.05207 seconds 100%
Member Variable 0.136009 seconds 13%

Update: Adding -O4 improved the results significantly.

C++ code

Compiled using g++ -O4 -std=c++0x vfunc-attr.cc -o vfunc-attr

#include <cstdlib>
#include <sys/time.h>
#include <sys/resource.h>
#include <vector>
#include <functional>
#include <numeric>
#include <iostream>
#include <memory>
#include <algorithm>

class VBase {
public:
  virtual size_t size() const = 0;
};

class VInt : public VBase {
public:
  template <class T> VInt(T value) : m_value(value) {}
  virtual size_t size() const {
    return sizeof(m_value);
  }
private:
  int m_value;
};

class VFloat : public VBase {
public:
  template <class T> VFloat(T value) : m_value(value) {}

  virtual size_t size() const {
    return sizeof(m_value);
  }
private:
  float m_value;
};

class MBase {
public:
  MBase(size_t size) : m_size(size) { }
  size_t size() const {
    return m_size;
  }
private:
  size_t m_size;
};

class MInt : public MBase {
public:
  MInt(int value) : MBase(sizeof(m_value)), m_value(value) { }
private:
  int m_value;
};

class MFloat : public MBase {
public:
  MFloat(int value) : MBase(sizeof(m_value)), m_value(value) { }
private:
  float m_value;
};

double operator-(rusage const& a, rusage const& b) {
    double result = (a.ru_utime.tv_usec - b.ru_utime.tv_usec) / 1.0e6;
    result += (a.ru_utime.tv_sec - b.ru_utime.tv_sec);
    return result;
}

template <class Func>
double measure(Func func)
{
    rusage before, after;
    getrusage(RUSAGE_SELF, &before);
    func();
    getrusage(RUSAGE_SELF, &after);
    return (after - before);
}

int main() {
  {
    std::vector<VBase*> array;
    size_t sum = 0;
    for (int i = 0 ; i < 100000000 ; ++i) {
      if (i % 2 == 0)
        array.push_back(new VFloat(i));
      else
        array.push_back(new VInt(i));
    }

    double result = measure([&array, &sum](){
        sum = std::accumulate(array.begin(), array.end(), 0,
                              [](size_t x, VBase* ptr){
                                return x + ptr->size();
                              });
      });
    std::for_each(array.begin(), array.end(),
                  std::default_delete<VBase>());

    std::cout << "VBase - Sum is: " << sum << ", "
              << "Exec time is: " << result
              << std::endl;
  }
  {
    std::vector<MBase*> array;
    size_t sum = 0;
    for (int i = 0 ; i < 100000000 ; ++i) {
      if (i % 2 == 0)
        array.push_back(new MFloat(i));
      else
        array.push_back(new MInt(i));
    }

    double result = measure([&array, &sum](){
        sum = std::accumulate(array.begin(), array.end(), 0,
                              [](size_t x, MBase* ptr){
                                return x + ptr->size();
                              });
      });

    std::for_each(array.begin(), array.end(),
                  std::default_delete<MBase>());

    std::cout << "MBase - Sum is: " << sum << ", "
              << "Exec time is: " << result
              << std::endl;
  }
}