/**
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "hardware-counter.h"

#ifndef NO_HARDWARE_COUNTERS

#define _GNU_SOURCE 1
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <assert.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <asm/unistd.h>
#include <sys/prctl.h>
#include <linux/perf_event.h>

namespace HPHP {
///////////////////////////////////////////////////////////////////////////////

IMPLEMENT_THREAD_LOCAL_NO_CHECK(HardwareCounter,
    HardwareCounter::s_counter);

static bool s_recordSubprocessTimes = false;
static bool s_profileHWEnable;
static std::string s_profileHWEvents;

static inline bool useCounters() {
#ifdef VALGRIND
  return false;
#else
  return s_profileHWEnable;
#endif
}

class HardwareCounterImpl {
public:
  HardwareCounterImpl(int type, unsigned long config,
                      const char* desc = nullptr)
    : m_desc(desc ? desc : ""), m_err(0), m_fd(-1), inited(false) {
    memset (&pe, 0, sizeof (struct perf_event_attr));
    pe.type = type;
    pe.size = sizeof (struct perf_event_attr);
    pe.config = config;
    pe.inherit = s_recordSubprocessTimes;
    pe.disabled = 1;
    pe.pinned = 0;
    pe.exclude_kernel = 0;
    pe.exclude_hv = 1;
    pe.read_format =
      PERF_FORMAT_TOTAL_TIME_ENABLED|PERF_FORMAT_TOTAL_TIME_RUNNING;
    }

  ~HardwareCounterImpl() {
    close();
  }

  void init_if_not() {
    /*
     * perf_event_open(struct perf_event_attr *hw_event_uptr, pid_t pid,
     *                 int cpu, int group_fd, unsigned long flags)
     */
    if (inited) return;
    inited = true;
    m_fd = syscall(__NR_perf_event_open, &pe, 0, -1, -1, 0);
    if (m_fd < 0) {
      // Logger::Verbose("perf_event_open failed with: %s",
      //                 folly::errnoStr(errno).c_str());
      m_err = -1;
      return;
    }
    if (ioctl(m_fd, PERF_EVENT_IOC_ENABLE, 0) < 0) {
      // Logger::Warning("perf_event failed to enable: %s",
      //                 folly::errnoStr(errno).c_str());
      close();
      m_err = -1;
      return;
    }
    reset();
  }

  int64_t read() {
    uint64_t values[3];
    if (readRaw(values)) {
      if (!values[2]) return 0;
      int64_t value = (double)values[0] * values[1] / values[2];
      return value + extra;
    }
    return 0;
  }

  void incCount(int64_t amount) {
    extra += amount;
  }

  bool readRaw(uint64_t* values) {
    if (m_err || !useCounters()) return false;
    init_if_not();

    if (m_fd > 0) {
      /*
       * read the count + scaling values
       *
       * It is not necessary to stop an event to read its value
       */
      auto ret = ::read(m_fd, values, sizeof(*values) * 3);
      if (ret == sizeof(*values) * 3) {
        values[0] -= reset_values[0];
        values[1] -= reset_values[1];
        values[2] -= reset_values[2];
        return true;
      }
    }
    return false;
  }

  void reset() {
    if (m_err || !useCounters()) return;
    init_if_not();
    extra = 0;
    if (m_fd > 0) {
      if (ioctl (m_fd, PERF_EVENT_IOC_RESET, 0) < 0) {
        // Logger::Warning("perf_event failed to reset with: %s",
        //                 folly::errnoStr(errno).c_str());
        m_err = -1;
        return;
      }
      auto ret = ::read(m_fd, reset_values, sizeof(reset_values));
      if (ret != sizeof(reset_values)) {
        // Logger::Warning("perf_event failed to reset with: %s",
        //                 folly::errnoStr(errno).c_str());
        m_err = -1;
        return;
      }
    }
  }

public:
  std::string m_desc;
  int m_err;
private:
  int m_fd;
  struct perf_event_attr pe;
  bool inited;
  uint64_t reset_values[3];
  uint64_t extra{0};

  void close() {
    if (m_fd > 0) {
      ::close(m_fd);
      m_fd = -1;
    }
  }
};

class InstructionCounter : public HardwareCounterImpl {
public:
  InstructionCounter() :
    HardwareCounterImpl(PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS) {}
};

class LoadCounter : public HardwareCounterImpl {
public:
  LoadCounter() :
    HardwareCounterImpl(PERF_TYPE_HW_CACHE,
        (PERF_COUNT_HW_CACHE_L1D | ((PERF_COUNT_HW_CACHE_OP_READ) << 8))) {}
};

class StoreCounter : public HardwareCounterImpl {
public:
  StoreCounter() :
    HardwareCounterImpl(PERF_TYPE_HW_CACHE,
        PERF_COUNT_HW_CACHE_L1D | ((PERF_COUNT_HW_CACHE_OP_WRITE) << 8)) {}
};

HardwareCounter::HardwareCounter()
  : m_countersSet(false) {
  m_instructionCounter.reset(new InstructionCounter());
  if (s_profileHWEvents.empty()) {
    m_loadCounter.reset(new LoadCounter());
    m_storeCounter.reset(new StoreCounter());
  } else {
    m_countersSet = true;
    setPerfEvents(s_profileHWEvents);
  }
}

HardwareCounter::~HardwareCounter() {
}

void HardwareCounter::Init(bool enable, const std::string& events,
                           bool subProc) {
  s_profileHWEnable = enable;
  s_profileHWEvents = events;
  s_recordSubprocessTimes = subProc;
}

void HardwareCounter::Reset() {
  s_counter->reset();
}

void HardwareCounter::reset() {
  m_instructionCounter->reset();
  if (!m_countersSet) {
    m_storeCounter->reset();
    m_loadCounter->reset();
  }
  for (unsigned i = 0; i < m_counters.size(); i++) {
    m_counters[i]->reset();
  }
}

int64_t HardwareCounter::GetInstructionCount() {
  return s_counter->getInstructionCount();
}

int64_t HardwareCounter::getInstructionCount() {
  return m_instructionCounter->read();
}

int64_t HardwareCounter::GetLoadCount() {
  return s_counter->getLoadCount();
}

int64_t HardwareCounter::getLoadCount() {
  return m_loadCounter->read();
}

int64_t HardwareCounter::GetStoreCount() {
  return s_counter->getStoreCount();
}

int64_t HardwareCounter::getStoreCount() {
  return m_storeCounter->read();
}

void HardwareCounter::IncInstructionCount(int64_t amount) {
  s_counter->m_instructionCounter->incCount(amount);
}

void HardwareCounter::IncLoadCount(int64_t amount) {
  if (!s_counter->m_countersSet) {
    s_counter->m_loadCounter->incCount(amount);
  }
}

void HardwareCounter::IncStoreCount(int64_t amount) {
  if (!s_counter->m_countersSet) {
    s_counter->m_storeCounter->incCount(amount);
  }
}

struct PerfTable perfTable[] = {
  /* PERF_TYPE_HARDWARE events */
#define PC(n)    PERF_TYPE_HARDWARE, PERF_COUNT_HW_ ## n
  { "cpu-cycles",              PC(CPU_CYCLES)              },
  { "cycles",                  PC(CPU_CYCLES)              },
  { "instructions",            PC(INSTRUCTIONS)            },
  { "cache-references",        PC(CACHE_REFERENCES)        },
  { "cache-misses",            PC(CACHE_MISSES)            },
  { "branch-instructions",     PC(BRANCH_INSTRUCTIONS)     },
  { "branches",                PC(BRANCH_INSTRUCTIONS)     },
  { "branch-misses",           PC(BRANCH_MISSES)           },
  { "bus-cycles",              PC(BUS_CYCLES)              },
  { "stalled-cycles-frontend", PC(STALLED_CYCLES_FRONTEND) },
  { "stalled-cycles-backend",  PC(STALLED_CYCLES_BACKEND)  },

  /* PERF_TYPE_HW_CACHE hw_cache_id */
#define PCC(n)   PERF_TYPE_HW_CACHE, PERF_COUNT_HW_CACHE_ ## n
  { "L1-dcache-",          PCC(L1D)                },
  { "L1-icache-",          PCC(L1I)                },
  { "LLC-",                PCC(LL)                 },
  { "dTLB-",               PCC(DTLB)               },
  { "iTLB-",               PCC(ITLB)               },
  { "branch-",             PCC(BPU)                },

  /* PERF_TYPE_HW_CACHE hw_cache_op, hw_cache_result */
#define PCCO(n, m)  PERF_TYPE_HW_CACHE, \
                    ((PERF_COUNT_HW_CACHE_OP_ ## n) << 8 | \
                    (PERF_COUNT_HW_CACHE_RESULT_ ## m) << 16)
  { "loads",               PCCO(READ, ACCESS)      },
  { "load-misses",         PCCO(READ, MISS)        },
  { "stores",              PCCO(WRITE, ACCESS)     },
  { "store-misses",        PCCO(WRITE, MISS)       },
  { "prefetches",          PCCO(PREFETCH, ACCESS)  },
  { "prefetch-misses",     PCCO(PREFETCH, MISS)    }
};

static int findEvent(const char *event, struct PerfTable *t,
                     int len, int *match_len) {
  int i;

  for (i = 0; i < len; i++) {
    if (!strncmp(event, t[i].name, strlen(t[i].name))) {
      *match_len = strlen(t[i].name);
      return i;
    }
  }
  return -1;
}

#define CPUID_STEPPING(x)  ((x) & 0xf)
#define CPUID_MODEL(x)     (((x) & 0xf0) >> 4)
#define CPUID_FAMILY(x)    (((x) & 0xf00) >> 8)
#define CPUID_TYPE(x)      (((x) & 0x3000) >> 12)

// hack to get LLC counters on perflab frc machines
static bool isIntelE5_2670() {
#ifdef __x86_64__
  unsigned long x;
  asm volatile ("cpuid" : "=a"(x): "a"(1) : "ebx", "ecx", "edx");
  return CPUID_STEPPING(x) == 6 && CPUID_MODEL(x) == 0xd
         && CPUID_FAMILY(x) == 6 && CPUID_TYPE(x) == 0;
#else
  return false;
#endif
}

static void checkLLCHack(const char* event, uint32_t& type, uint64_t& config) {
  if (!strncmp(event, "LLC-load", 8) && isIntelE5_2670()) {
    type = PERF_TYPE_RAW;
    if (!strncmp(&event[4], "loads", 5)) {
      config = 0x534f2e;
    } else if (!strncmp(&event[4], "load-misses", 11)) {
      config = 0x53412e;
    }
  }
}

bool HardwareCounter::addPerfEvent(const char* event) {
  uint32_t type = 0;
  uint64_t config = 0;
  int i, match_len;
  bool found = false;
  const char* ev = event;

  while ((i = findEvent(ev, perfTable,
                        sizeof(perfTable)/sizeof(struct PerfTable),
                        &match_len))
       != -1) {
    if (!found) {
      found = true;
      type = perfTable[i].type;
    } else if (type != perfTable[i].type) {
      // Logger::Warning("failed to find perf event: %s", event);
      return false;
    }
    config |= perfTable[i].config;
    ev = &ev[match_len];
  }

  checkLLCHack(event, type, config);

  // Check if we have a raw spec.
  if (!found && event[0] == 'r' && event[1] != 0) {
    config = strtoull(event + 1, const_cast<char**>(&ev), 16);
    if (*ev == 0) {
      found = true;
      type = PERF_TYPE_RAW;
    }
  }

  if (!found || *ev) {
    // Logger::Warning("failed to find perf event: %s", event);
    return false;
  }
  std::unique_ptr<HardwareCounterImpl> hwc(
      new HardwareCounterImpl(type, config, event));
  if (hwc->m_err) {
    // Logger::Warning("failed to set perf event: %s", event);
    return false;
  }
  m_counters.emplace_back(std::move(hwc));
  if (!m_countersSet) {
    // reset load and store counters. This is because
    // perf does not seem to handle more than three counters
    // very well.
    m_loadCounter.reset();
    m_storeCounter.reset();
    m_countersSet = true;
  }
  return true;
}

bool HardwareCounter::eventExists(const char *event) {
  // hopefully m_counters set is small, so a linear scan does not hurt
  for(unsigned i = 0; i < m_counters.size(); i++) {
    if (!strcmp(event, m_counters[i]->m_desc.c_str())) {
      return true;
    }
  }
  return false;
}

bool HardwareCounter::setPerfEvents(std::string sevents) {
  // Make a copy of the string for use with strtok.
  auto const sevents_buf = static_cast<char*>(malloc(sevents.size() + 1));
  memcpy(sevents_buf, sevents.data(), sevents.size());
  sevents_buf[sevents.size()] = '\0';

  char* strtok_buf = nullptr;
  char* s = strtok_r(sevents_buf, ",", &strtok_buf);
  bool success = true;
  while (s) {
    if (!eventExists(s) && !addPerfEvent(s)) {
      success = false;
      break;
    }
    s = strtok_r(nullptr, ",", &strtok_buf);
  }
  free(sevents_buf);
  return success;
}

bool HardwareCounter::SetPerfEvents(std::string events) {
  return s_counter->setPerfEvents(events);
}

void HardwareCounter::clearPerfEvents() {
  m_counters.clear();
}

void HardwareCounter::ClearPerfEvents() {
  s_counter->clearPerfEvents();
}

const std::string
  s_instructions("instructions"),
  s_loads("loads"),
  s_stores("stores");

void HardwareCounter::getPerfEvents(PerfEventCallback f, void* data) {
  f(s_instructions, getInstructionCount(), data);
  if (!m_countersSet) {
    f(s_loads, getLoadCount(), data);
    f(s_stores, getStoreCount(), data);
  }
  for (unsigned i = 0; i < m_counters.size(); i++) {
    f(m_counters[i]->m_desc, m_counters[i]->read(), data);
  }
}

void HardwareCounter::GetPerfEvents(PerfEventCallback f, void* data) {
  s_counter->getPerfEvents(f, data);
}

///////////////////////////////////////////////////////////////////////////////
}


#else // NO_HARDWARE_COUNTERS

namespace HPHP {
///////////////////////////////////////////////////////////////////////////////

HardwareCounter HardwareCounter::s_counter;

///////////////////////////////////////////////////////////////////////////////
}

#endif // NO_HARDWARE_COUNTERS
