// Copyright 2010-2012 RethinkDB, all rights reserved.
#ifndef ARCH_RUNTIME_CORO_PROFILER_HPP_
#define	ARCH_RUNTIME_CORO_PROFILER_HPP_


#ifdef ENABLE_CORO_PROFILER

#include <array>
#include <fstream>
#include <map>
#include <string>
#include <utility>
#include <vector>

#include "backtrace.hpp"
#include "utils.hpp"
#include "arch/spinlock.hpp"
#include "config/args.hpp"


/* Depth of the stack traces generated by the coro profiler. */
#define CORO_PROFILER_BACKTRACE_DEPTH            10

/* How frequently should the coro profiler aggregate data and generate a report? */
#define CORO_PROFILER_REPORTING_INTERVAL        (secs_to_ticks(1.0) * 5)

/* If you set CORO_PROFILER_ADDRESS_TO_LINE to 1, the coro profiler prints
 * the source file name and line number at which each sample was recorded.
 * Unfortunately that conversion is pretty slow. */
#define CORO_PROFILER_ADDRESS_TO_LINE           0


/*
 * The `coro_profiler_t` collects information about where coroutines spend time.
 * In order to turn it on, `ENABLE_CORO_PROFILER` must be defined at compile time.
 * It will only work reliably in debug mode, even though it can provide some
 * data in release mode as well.
 * You should compile as follows: `make CORO_PROFILING=1 DEBUG=1`
 * If you want to use the profiler in release mode, compile with
 * `make CORO_PROFILING=1 SYMBOLS=1 NO_OMIT_FRAME_POINTER=1`
 * Keep in mind though that backtraces can be unreliable in release.
 *
 * The coro profiler records a sample whenever it encounters a `PROFILER_RECORD_SAMPLE`
 * and also every time a coroutine yields.
 *
 * The following data is aggregated:
 *      - How often a certain recording point has been reached within the past
 *        `CORO_PROFILER_REPORTING_INTERVAL`.
 *      - How much time has passed since the coroutine has resumed running
 *        (this is useful to identify coroutines that run for long
 *        periods of time without yielding control)
 *      - How much time has passed on a coroutine since the previous recording
 *        point
 *      - The priority of the coroutine
 *
 * A combination of coro_type (signature of the function that spawned the coroutine)
 * and a limited-depth backtrace (see `CORO_PROFILER_BACKTRACE_DEPTH`) is used to
 * identify an "execution point". Data is recorded and reported for each such
 * execution point.
 *
 * The aggregated data is written to the file "coro_profiler_out.py" in the working
 * directory. Data is written every `CORO_PROFILER_REPORTING_INTERVAL` ticks.
 */
class coro_profiler_t {
public:
    // Should you ever want to make this a true singleton, just make the
    // constructor private.
    coro_profiler_t();

    // Accessor used by the PROFILER_* short-cut macros to reach the profiler
    // instance they record into.
    static coro_profiler_t &get_global_profiler();

    // Records a sample at the point of the call. This is what
    // `PROFILER_RECORD_SAMPLE` expands to (using the default argument).
    // NOTE(review): `levels_to_strip_from_backtrace` presumably is the number
    // of innermost stack frames to drop from the recorded backtrace -- confirm
    // against the implementation.
    void record_sample(size_t levels_to_strip_from_backtrace = 0);

    // coroutine execution is resumed
    void record_coro_resume();
    // coroutine execution yields
    void record_coro_yield(size_t levels_to_strip_from_backtrace);

private:
    // A backtrace truncated to exactly CORO_PROFILER_BACKTRACE_DEPTH entries.
    typedef std::array<void *, CORO_PROFILER_BACKTRACE_DEPTH> small_trace_t;
    // We identify an execution point of a coroutine by a pair of
    // the coro's coroutine_type (the function which spawned it) and
    // a small_trace_t of its current execution point.
    typedef std::pair<std::string, small_trace_t> coro_execution_point_key_t;
    // A single recorded data point for one execution point.
    struct coro_sample_t {
        coro_sample_t(ticks_t _ticks_since_resume, ticks_t _ticks_since_previous, int _priority) :
            ticks_since_resume(_ticks_since_resume),
            ticks_since_previous(_ticks_since_previous),
            priority(_priority) { }
        // Time since the coroutine resumed running.
        ticks_t ticks_since_resume;
        // Time since the previous recording point on the same coroutine.
        ticks_t ticks_since_previous;
        // Priority of the coroutine.
        int priority;
    };
    // The samples gathered for one execution point (on one thread, see
    // `per_thread_samples_t`).
    struct per_execution_point_samples_t {
        per_execution_point_samples_t() : num_samples_total(0) { }
        // NOTE(review): presumably a running count of samples for this
        // execution point, which may differ from `samples.size()` -- confirm
        // how the two relate in the implementation.
        int num_samples_total;
        std::vector<coro_sample_t> samples;
    };
    // Sample storage belonging to a single thread.
    struct per_thread_samples_t {
        // Starts the reporting interval at the time of construction.
        per_thread_samples_t() : ticks_at_last_report(get_ticks()) { }
        // Samples grouped by the execution point they were recorded at.
        std::map<coro_execution_point_key_t, per_execution_point_samples_t> per_execution_point_samples;
        // Participates in the locking order documented at
        // `report_interval_spinlock`; presumably guards the fields of this
        // struct -- confirm.
        spinlock_t spinlock;
        // This field is a duplicate of the global `ticks_at_last_report` in
        // `coro_profiler_t`. We copy it in each thread in order to avoid having
        // to lock and access the global field from different threads.
        ticks_t ticks_at_last_report;
    };

    // Represents the distribution of data points as a normal distribution,
    // plus minimum and maximum values.
    struct data_distribution_t {
        data_distribution_t() : min(0.0), max(0.0), mean(0.0), stddev(0.0) { }
        double min, max;
        double mean, stddev;
    };
    // Report data for one execution point: the raw samples plus one
    // `data_distribution_t` per recorded quantity.
    struct per_execution_point_collected_report_t {
        per_execution_point_collected_report_t() : num_samples(0) { }
        // NOTE(review): presumably derives the three distributions below from
        // `collected_samples` -- confirm in the implementation.
        void compute_stats();
        std::vector<coro_sample_t> collected_samples;
        size_t num_samples;
        // Distribution of `coro_sample_t::ticks_since_previous`.
        // NOTE(review): unit (ticks vs. converted time) is not visible here.
        data_distribution_t time_since_previous;
        // Distribution of `coro_sample_t::ticks_since_resume`.
        // NOTE(review): unit (ticks vs. converted time) is not visible here.
        data_distribution_t time_since_resume;
        // Distribution of `coro_sample_t::priority`.
        data_distribution_t priority;

    private:
        // Helper functions for compute_stats
        // NOTE(review): the names suggest a two-pass computation (pass 1 for
        // min/max/mean, pass 2 for the standard deviation). All helpers are
        // const and write only through `current_out`.
        void update_min_max(const double new_sample, data_distribution_t *current_out) const;
        void accumulate_sample_pass1(const double new_sample,
                                     data_distribution_t *current_out) const;
        void divide_mean(data_distribution_t *current_out) const;
        void accumulate_sample_pass2(const double new_sample,
                                     data_distribution_t *current_out) const;
        void divide_stddev(data_distribution_t *current_out) const;
    };

    // Reporting helpers.
    // NOTE(review): presumably `generate_report` aggregates the per-thread
    // samples and hands the result to `print_to_reql`, which writes to
    // `reql_output_file` -- the call structure is not visible in this header.
    void generate_report();
    void print_to_reql(const std::map<coro_execution_point_key_t,
                       per_execution_point_collected_report_t> &execution_point_reports);
    void write_reql_header();
    // String formatting helpers for the report output.
    std::string distribution_to_object_str(const data_distribution_t &distribution);
    std::string trace_to_array_str(const small_trace_t &trace);
    // Returns a reference to a textual description of the frame at `addr`.
    // NOTE(review): the returned reference presumably points into
    // `frame_description_cache` -- confirm before relying on its lifetime.
    const std::string &get_frame_description(void *addr);
    // Builds the key identifying the execution point of the current call.
    // NOTE(review): see `record_sample` for the assumed meaning of
    // `levels_to_strip_from_backtrace`.
    coro_execution_point_key_t get_current_execution_point(size_t levels_to_strip_from_backtrace);

    // Would be nice if we could use one_per_thread here. However
    // that makes the construction order tricky.
    // One padded slot for each of up to MAX_THREADS threads.
    std::array<cache_line_padded_t<per_thread_samples_t >, MAX_THREADS> per_thread_samples;

    // The global counterpart of `per_thread_samples_t::ticks_at_last_report`.
    ticks_t ticks_at_last_report;
    /* Locking order is always:
     * 1. report_interval_spinlock
     * 2. per_thread_samples.spinlock in ascending order of thread num
     * You can safely skip some of the locks in this order.
     * Acquiring locks in different orders can dead-lock.
     */
    spinlock_t report_interval_spinlock;

    // Frame descriptions keyed by frame address (see `get_frame_description`).
    std::map<void *, std::string> frame_description_cache;
    // NOTE(review): presumably performs the address -> file/line lookup that
    // CORO_PROFILER_ADDRESS_TO_LINE enables -- confirm.
    address_to_line_t address_to_line;

    // Output stream for the generated reports.
    // NOTE(review): presumably the "coro_profiler_out.py" file in the working
    // directory; the file is opened outside this header.
    std::ofstream reql_output_file;

    DISABLE_COPYING(coro_profiler_t);
};

// Short-cuts
//
// PROFILER_CORO_RESUME and PROFILER_CORO_YIELD are meant to be used in
// the internal coroutine implementation to notify the profiler about when a coroutine
// resumes and yields execution respectively.
//
// PROFILER_RECORD_SAMPLE on the other hand can be used throughout the code to
// increase the granularity of profiling. By default, the coro profiler collects
// data only when a coroutine yields (assuming that PROFILER_CORO_YIELD gets called).
// PROFILER_RECORD_SAMPLE adds an additional point for data collection in between
// such yields and can be used to "trace" execution times through different
// sections of a given piece of code.
// Records an additional sample at the point of use; relies on the default
// argument of `coro_profiler_t::record_sample()`.
#define PROFILER_RECORD_SAMPLE \
    (coro_profiler_t::get_global_profiler().record_sample())
// Notifies the global profiler that a coroutine has resumed execution.
#define PROFILER_CORO_RESUME \
    (coro_profiler_t::get_global_profiler().record_coro_resume())
// Notifies the global profiler that a coroutine yields. STRIP_FRAMES is passed
// on as `levels_to_strip_from_backtrace`.
#define PROFILER_CORO_YIELD(STRIP_FRAMES) \
    (coro_profiler_t::get_global_profiler().record_coro_yield((STRIP_FRAMES)))

#else /* ENABLE_CORO_PROFILER */

// Stand-ins for the short-cuts when the coro profiler is compiled out. Each
// one expands to an empty statement that still has to be terminated with a
// semicolon; PROFILER_CORO_YIELD drops its argument without evaluating it.
#define PROFILER_RECORD_SAMPLE              do { } while (false)
#define PROFILER_CORO_RESUME                do { } while (false)
#define PROFILER_CORO_YIELD(STRIP_FRAMES)   do { } while (false)

#endif /* not ENABLE_CORO_PROFILER */

#endif	/* ARCH_RUNTIME_CORO_PROFILER_HPP_ */
