﻿/*---------------------------------------------------------------------------*
    Copyright (C)2015 Nintendo Co., Ltd.  All rights reserved.

    These coded instructions, statements, and computer programs contain
    proprietary information of Nintendo of America Inc. and/or Nintendo
    Company Ltd., and are protected by Federal copyright law.  They may
    not be disclosed to third parties or copied or duplicated in any form,
    in whole or in part, without the prior written consent of Nintendo.
 *---------------------------------------------------------------------------*/

#define _BSD_SOURCE 1
#include <string.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <time.h>
#include <inttypes.h>
#include "ntd-tests/ntd-tests.h"

// Element count of a true array; not valid when `a` is a pointer.
#define ARRAY_COUNT(a) (sizeof(a)/sizeof(a[0]))

// #define MIN_PAGE_SIZE 1024*1024
#define MIN_PAGE_SIZE 1024
// Size in bytes of each of the three test buffers (buf1, buf2, buf3).
#define BUF_SIZE (16*MIN_PAGE_SIZE)
// Length of the large "performance" copies: BUF_SIZE minus 128 bytes.
#define PERF_TEST_SIZE (BUF_SIZE - (2*64))

#define DUMP_WINDOW 80
// On a verification failure, print DUMP_WINDOW bytes around the problem
// and mark the beginning/end of the copy range.

// Random start offsets into the buffers are drawn from [0, INDEX_LIMIT).
#define INDEX_LIMIT 128
// NOTE: both macros evaluate their arguments more than once; do not pass
// expressions with side effects.
#define max(a,b) ((a) > (b) ? (a) : (b))
#define min(a,b) ((a) < (b) ? (a) : (b))
#define TEST_DIST 1
// Alignment coverage is only tracked modulo DIST_ALIGN for each pointer
// (original note: 10000 tries almost covers all of 32x32 but sometimes misses one).
// Since our implementation only aligns to 16 bytes we don't need to verify more;
// if we use something which does cache line alignment we may have to increase this.
#define DIST_ALIGN  16
// Number of distinct (alignment, alignment) pairs tracked in dist_align[].
#define MAX_ALIGN_DIST (DIST_ALIGN*DIST_ALIGN)
// Upper bound (exclusive) for random copy lengths; also the size of dist_len[].
#define MAX_LENGTH 1040
#define INDEX_ALIGN(i) ((i) % DIST_ALIGN)
// Flattens a pair of alignments into an index in [0, MAX_ALIGN_DIST).
#define dist_of_aligns(a1, a2) ((INDEX_ALIGN(a1) * DIST_ALIGN) + INDEX_ALIGN(a2))

// Source buffer for the plain copy tests; in the overlapping memmove tests it
// holds the reference copy of the data that buf2 is reset from.
char *buf1 = NULL;
// Destination buffer (and also the source in the overlapping memmove tests).
char *buf2 = NULL;
// Expected contents of buf2 outside of the copied range.
char *buf3 = NULL;

// Capacity of every per-function perf_data_t array and the number of rounds
// the test drivers run.
const size_t iterations = 60000;
// Pointer alignments are recorded modulo TRACK_ALIGN.
#define TRACK_ALIGN  32
typedef uint64_t pico_secs_t;
// One timing sample for a single (dst, src, len) copy.
typedef struct perf_data_t {
    uint8_t     src_align;          // source address modulo TRACK_ALIGN
    uint8_t     dst_align;          // destination address modulo TRACK_ALIGN
    uint8_t     summary_index;      // bucket from summary_from_size_and_align()
    uint32_t    len;                // number of bytes copied
    pico_secs_t pico_secs;          // measured duration in picoseconds
    uint64_t    bytes_per_second;   // throughput derived from len and pico_secs
} perf_data_t;

// Forward declarations for helpers defined later in this file.
static void verify_memcpy_output(int line, const char *func, char *dst, char* src, size_t len);
static void compute_copy_overhead();

pico_secs_t gettime_in_picoseconds(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_REALTIME, &ts);

    return (pico_secs_t) (ts.tv_sec * 1000000000000) + ts.tv_nsec * 1000;
}

/*
 * Picoseconds elapsed since `start` (a value previously returned by
 * gettime_in_picoseconds()).  A zero difference is reported as 500 so that
 * callers never see a zero duration.
 */
static pico_secs_t delta_picoseconds(pico_secs_t start)
{
    const pico_secs_t elapsed = gettime_in_picoseconds() - start;

    return (elapsed <= 0) ? 500 : elapsed;
}

/*
 * Print a duration given in nanoseconds, scaled to the largest fitting unit.
 *
 * t      - duration in nanoseconds
 * format - printf format that consumes a double followed by a string
 *          (the scaled value and its unit suffix)
 */
static void time_human_readable_format(double t, char *format)
{
    /* ordered from largest to smallest threshold; first match wins */
    static const struct {
        double      above;
        double      scale;
        const char *unit;
    } units[] = {
        { 500000000.0, 0.000000001, "s " },  /* seconds if > .5 second */
        { 100000.0,    0.000001,    "ms" },  /* milli-seconds if > 100 micro-seconds */
        { 100.0,       0.001,       "us" },  /* micro-seconds if > 100 nano-seconds */
    };
    const char *unit = "ns";
    double scale = 1.0;
    size_t k;

    for (k = 0; k < sizeof(units) / sizeof(units[0]); k++) {
        if (t > units[k].above) {
            scale = units[k].scale;
            unit = units[k].unit;
            break;
        }
    }
    printf(format, (double) t * scale, unit);
}

/*
 * Print the average of a picosecond total over `count` samples in
 * human-readable units.  A count of 0 is treated as 1.
 */
static void time_human_readable_ave(pico_secs_t t, int count)
{
    const int samples = (count == 0) ? 1 : count;
    const double nanos = (double) (t / 1000.0);

    time_human_readable_format(nanos / (double) samples, "%8.3f %s");
}

/* Print a picosecond duration in human-readable units. */
static void time_human_readable(pico_secs_t t)
{
    const double nanos = (double) (t / 1000.0);

    time_human_readable_format(nanos, "%8.3f %s");
}

/* Print a byte count as a fraction of 2^30 bytes, followed by " G". */
static void print_gigabyte(int64_t l)
{
    const double bytes_per_giga = (double) (1024*1024*1024);

    printf("%6.3f G", (double) l / bytes_per_giga);
}

/*
 * Print a byte count scaled to G, M, K or plain bytes.
 *
 * l      - byte count
 * format - printf format that consumes a double followed by a string
 *          (the scaled value and its unit suffix)
 *
 * A unit is used once the value exceeds 990 of the next smaller unit.
 */
static void print_human_readable_format(int64_t l, char *format)
{
    double divisor = 1.0;
    const char *unit = " ";

    if (l > 990*1024*1024) {
        divisor = (double) (1024*1024*1024);
        unit = "G";
    } else if (l > 990*1024) {
        divisor = (double) (1024*1024);
        unit = "M";
    } else if (l > 990) {
        divisor = (double) (1024);
        unit = "K";
    }
    printf(format, (double) l / divisor, unit);
}

/* Print a byte count with the default "%8.2f %s" layout. */
static void print_human_readable(int64_t l)
{
    char *layout = "%8.2f %s";

    print_human_readable_format(l, layout);
}

/*
 * Reference copy routine: copies n bytes from src to dest one byte at a
 * time and returns dest.  Kept out of line, and the loop is marked so that
 * clang does not vectorize, interleave or unroll it.
 */
__attribute__((noinline)) static void *byte_memcpy(void *restrict dest, const void *restrict src, size_t n)
{
    const unsigned char *in = src;
    unsigned char *out = dest;

#pragma clang loop vectorize(disable) interleave(disable) unroll(disable)
    while (n != 0) {
        *out++ = *in++;
        n--;
    }
    return dest;
}

// Aggregated statistics for one summary bucket of one copy function.
typedef struct perf_summary_t {
    int     iterations;     // number of non-zero-length samples in the bucket
    int64_t bytes;          // total bytes copied by those samples
    int64_t pico_secs;      // total measured time of those samples
    size_t  len_min;        // smallest non-zero length seen
    size_t  len_max;        // largest length seen
    int     summary_start;  // first index of the bucket in the sorted sample array
    int     summary_end;    // last index of the bucket in the sorted sample array
} perf_summary_t;

// Number of summary buckets; must match the number of entries in align_titles[].
#define N_SUMMARIES 7

typedef struct copy_func_info_t *copy_func_info_ptr;
// Timing wrapper signature: (dst, src, len, perf_data out, repeat count) -> copy result.
typedef char * (*copy_test_func_ptr)(char *, const char *, size_t, perf_data_t *, int);
// Everything the harness tracks for one copy function under test.
typedef struct copy_func_info_t {
    const char *        name;                   // function name used in reports
    copy_test_func_ptr  func;                   // timing wrapper generated by DECLARE_COPY_TIME_FUNCTION
    perf_data_t         *array;                 // sample storage, allocated in test_init()
    int                 index;                  // number of samples stored in array
    perf_summary_t      summary[N_SUMMARIES];   // filled in by compute_dist_info()
} copy_func_info_t;

// Initializer for a copy_func_info_t entry: the stringified function name, the
// matching do_<copyfunc> timing wrapper, no sample array yet and an index of 0.
#define COPY_FUNC_INFO(copyfunc) { #copyfunc, &do_ ## copyfunc, NULL, 0, {0, 0, 0, 0, 0} }
// Copies of at most this many bytes are reported in the "small" summary.
#define SUMMARY_SMALL_DATA_SIZE 128
/*
 * Map a copy to the summary bucket it is reported under.
 *
 *   0      len <= SUMMARY_SMALL_DATA_SIZE
 *   1      larger copies whose length is not PERF_TEST_SIZE
 *   2      PERF_TEST_SIZE copies whose alignment difference is a multiple of 8
 *   3..6   remaining PERF_TEST_SIZE copies: 3 + (alignment difference % 4)
 *
 * The alignment difference is the absolute difference of src_align and
 * dst_align.
 */
uint8_t summary_from_size_and_align(uint32_t len, uint8_t src_align, uint8_t dst_align)
{
    if (len <= SUMMARY_SMALL_DATA_SIZE) {
        return 0;
    }
    if (len != PERF_TEST_SIZE) {
        return 1;
    }

    const int gap = (src_align > dst_align) ? (src_align - dst_align)
                                            : (dst_align - src_align);
    if ((gap % 8) == 0) {
        return 2;
    }
    return (uint8_t) (3 + (gap % 4));
}

// Generates a timing wrapper named do_<copyfunc> for the copy routine
// `copyfunc`.  The wrapper calls copyfunc(dst, src, len) `repeats` times,
// stores the total elapsed picoseconds in perf_data->pico_secs and returns the
// value returned by the last call.
// `repeats` must be at least 1: the loop body runs before the counter is
// decremented and tested.
#define DECLARE_COPY_TIME_FUNCTION(copyfunc) \
static char *do_ ## copyfunc(char *dst, const char *src, size_t len, perf_data_t *perf_data, int repeats) \
{ \
    char *ndest; \
    pico_secs_t start = gettime_in_picoseconds(); \
    do { \
        ndest = copyfunc(dst, src, len); \
    } while (--repeats); \
    perf_data->pico_secs = delta_picoseconds(start); \
    return ndest; \
}

// Wrappers that are always available.
DECLARE_COPY_TIME_FUNCTION(byte_memcpy)
DECLARE_COPY_TIME_FUNCTION(memcpy)
DECLARE_COPY_TIME_FUNCTION(memmove)

// Wrappers for optional implementations, selected by the __NNMUSL_HAS_* macros.
#if __NNMUSL_HAS_MEMCPY_MUSL
DECLARE_COPY_TIME_FUNCTION(__memcpy_musl)
#endif
#if __NNMUSL_HAS_MEMCPY_FAST
DECLARE_COPY_TIME_FUNCTION(__memcpy_fast)
#endif
#if __NNMUSL_HAS_MEMCPY_DEVICE
DECLARE_COPY_TIME_FUNCTION(__memcpy_device)
#endif
#if __NNMUSL_HAS_MEMCPY_SIMD
DECLARE_COPY_TIME_FUNCTION(__memcpy_simd)
#endif
#if __NNMUSL_HAS_MEMMOVE_FAST
DECLARE_COPY_TIME_FUNCTION(__memmove_fast)
#endif
#if __NNMUSL_HAS_MEMMOVE_DEVICE
DECLARE_COPY_TIME_FUNCTION(__memmove_device)
#endif
// memcpy-style functions under test.
// byte_memcpy must stay the LAST entry: compute_copy_overhead() uses the last
// element of this array as its baseline copy routine.
copy_func_info_t    memcpy_funcs[] = {
    COPY_FUNC_INFO(memcpy),
#if __NNMUSL_HAS_MEMCPY_MUSL
    COPY_FUNC_INFO(__memcpy_musl),
#endif
#if __NNMUSL_HAS_MEMCPY_DEVICE
    COPY_FUNC_INFO(__memcpy_device),
#endif
#if __NNMUSL_HAS_MEMCPY_FAST
    COPY_FUNC_INFO(__memcpy_fast),
#endif
#if __NNMUSL_HAS_MEMCPY_SIMD
    COPY_FUNC_INFO(__memcpy_simd),
#endif
    COPY_FUNC_INFO(byte_memcpy)
};

// memmove-style functions under test; these are run both on the
// non-overlapping memcpy test cases and on the overlapping memmove test cases.
copy_func_info_t    memmove_funcs[] = {
    COPY_FUNC_INFO(memmove),

#if __NNMUSL_HAS_MEMMOVE_DEVICE
    COPY_FUNC_INFO(__memmove_device),
#endif
#if __NNMUSL_HAS_MEMMOVE_FAST
    COPY_FUNC_INFO(__memmove_fast),
#endif
};

/*
 * Allocate and pattern-fill the three test buffers, allocate one sample
 * array per copy function and calibrate the copy overhead.
 *
 * Returns false if any allocation fails.  Nothing is released here on
 * failure; the caller is expected to call test_fini() in either case.
 */
static bool test_init(void)
{
    size_t i;

    // each buffer is checked right after its own allocation
    buf1 = (char*)MALLOC_TEST(BUF_SIZE);
    if (!buf1) return false;
    buf2 = (char*)MALLOC_TEST(BUF_SIZE);
    if (!buf2) return false;
    buf3 = (char*)MALLOC_TEST(BUF_SIZE);
    if (!buf3) return false;
    memset(buf1, 0x5a, BUF_SIZE);
    memset(buf2, 0xa5, BUF_SIZE);
    // buf3 is a reference for buf2's original value
    memset(buf3, 0xa5, BUF_SIZE);

    for (i = 0; i < ARRAY_COUNT(memcpy_funcs); i++) {
        memcpy_funcs[i].array = CALLOC_TEST(iterations, sizeof(perf_data_t));
        memcpy_funcs[i].index = 0;
        if (!memcpy_funcs[i].array) return false;
    }

    for (i = 0; i < ARRAY_COUNT(memmove_funcs); i++) {
        memmove_funcs[i].array = CALLOC_TEST(iterations, sizeof(perf_data_t));
        memmove_funcs[i].index = 0;
        if (!memmove_funcs[i].array) return false;
    }

    // needs buf1/buf2, so it has to run after the buffers exist
    compute_copy_overhead();

    return true;
}

/*
 * Release everything test_init() allocated.  Every pointer is cleared after
 * being freed, so calling this again (or after a partially failed
 * test_init()) cannot free the same block twice.
 */
static void test_fini(void)
{
    size_t i;

    free(buf1);
    free(buf2);
    free(buf3);
    buf1 = NULL;
    buf2 = NULL;
    buf3 = NULL;

    for (i = 0; i < ARRAY_COUNT(memcpy_funcs); i++) {
        free(memcpy_funcs[i].array);
        memcpy_funcs[i].array = NULL;
        memcpy_funcs[i].index = 0;
    }

    for (i = 0; i < ARRAY_COUNT(memmove_funcs); i++) {
        free(memmove_funcs[i].array);
        memmove_funcs[i].array = NULL;
        memmove_funcs[i].index = 0;
    }
}

/*
 * Print one line of hex bytes from `buf` for the index window [start, end).
 *
 * name         - label printed in front of the line
 * buf          - buffer to dump (BUF_SIZE bytes)
 * start, end   - window, expressed in destination-buffer indexes
 * len          - length of the copy being diagnosed
 * delta        - offset added to a window index to get the index into `buf`
 * target_start - index in `buf` where the copied range starts
 * bad_index    - window index of the first mismatching byte
 *
 * Markers: '<' / '>' at the start / end of the copied range, '*' in front of
 * the bad byte, '|' in front of every window index that is a multiple of 16,
 * and "->" / "<-" when the copied range lies or extends outside the window.
 * Indexes outside of the buffer are printed as blanks.
 *
 * All range arithmetic is done in int so that a negative `end + delta` is
 * compared correctly.
 */
static void dump_buffer(const char *name, const char *buf, int start, int end, size_t len, int delta, int target_start, int bad_index)
{
    const int target_end = target_start + (int) len;
    int i;

    printf("%s[%6d]:", name, start + delta);
    if (target_end < start + delta) {
        printf("->");
    } else if (target_start < start + delta) {
        printf("<-");
    }
    for (i = start; i < end; i++) {
        const int buf_index = i + delta;

        if (buf_index == target_start) {
            printf("<");
        }
        if (buf_index == target_end) {
            printf(">");
        }
        if (i == bad_index) {
            printf("*");
        }
        if ((i % 16) == 0) {
            printf("|");
        }
        if ((buf_index < 0) || (buf_index >= BUF_SIZE)) {
            printf("  ");
        } else {
            printf("%02x", (uint8_t) buf[buf_index]);
        }
    }
    if (target_end > end + delta) {
        printf("->");
    }
    printf("\n");
}

// Coverage histograms filled in by compute_dist_info():
// number of samples per copy length (lengths >= MAX_LENGTH share the last slot)
unsigned short dist_len[MAX_LENGTH] = {0};
// number of samples per (dst alignment, src alignment) pair, see dist_of_aligns()
unsigned short dist_align[MAX_ALIGN_DIST];

/*
 * Dump a DUMP_WINDOW wide slice of buf1, buf2 and buf3.  The slice starts
 * DUMP_WINDOW / 2 indexes before `i`, or at index 0 when `i` is too close to
 * the start of the buffer.  buf1 is shifted by `delta` (the src/dst offset)
 * so that corresponding bytes line up.
 */
static void dump_range(int dst_start, int src_start, int i, int bad_index, size_t len, int delta)
{
    const int half_window = DUMP_WINDOW / 2;
    const int first = (i > half_window) ? (i - half_window) : 0;
    const int last = first + DUMP_WINDOW;

    dump_buffer("buf1", buf1, first, last, len, delta, src_start, bad_index);
    dump_buffer("buf2", buf2, first, last, len, 0,     dst_start, bad_index);
    dump_buffer("buf3", buf3, first, last, len, 0,     dst_start, bad_index);
}

/*
 * Record a failed test case for a copy whose output did not verify and dump
 * the surrounding buffer contents.
 *
 * line       - source line of the test case, forwarded to the test framework
 * func       - name of the copy function that was tested
 * dst_start  - index in buf2 where the copy was written
 * src_start  - index in the source buffer where the copy was read
 * len        - length of the copy
 * bad_index  - index in buf2 of the first mismatching byte
 * is_memmove - true when the source was buf2 itself (overlapping test)
 * delta      - src_start - dst_start
 *
 * `len` is converted to int for every "%d" conversion.
 */
static void report_error(
    int line,
    const char *func,
    int dst_start,
    int src_start,
    size_t len,
    int bad_index,
    bool is_memmove,
    int delta)
{
    const char *src_name = is_memmove ? "buf2" : "buf1";
    // dump windows until the end of the copied range has been shown
    const int dump_limit = dst_start + (int) len + (DUMP_WINDOW / 2);
    int i = bad_index;

    NTD_TESTCASE_MESSAGE(0, __FILE__, line,
            "verify_copy_output()",
            "validation failed %s(&buf2[%d], &%s[%d], %d)",
            func, dst_start, src_name, src_start, (int) len);
    printf("failed in %s(&buf2[%d], &%s[%d], %d) at index = %d delta = %d\n",
        func,
        (int) dst_start,
        src_name,
        (int) src_start,
        (int) len, (int) i, (int) delta);
    do {
        dump_range(dst_start, src_start, i, bad_index, len, delta);
        i = i + DUMP_WINDOW;
    } while (i < dump_limit);
}

static void verify_memcpy_output(int line, const char *func, char *dst, char* src, size_t len) {
    int     i;
    int     src_start = 0;
    int     dst_start = dst - buf2;
    char    *buf = buf2;
    bool    is_memmove = false;
    int     delta = 0;
    size_t  buffer_limit = BUF_SIZE;

    if ((src >= buf2) && (src < (buf2 + BUF_SIZE))) {
        // in memmove overlapping case src might be from buf2
        // but we compare with buf1 which retains the original value to be copied
        is_memmove = true;
        src_start = (src - buf2);
        dst_start = (dst - buf2);
    } else {
        src_start = (src - buf1);
        dst_start = (dst - buf2);
    }
    delta = src_start - dst_start;

    int dst_end = dst_start + len;

    // limit validation to begining of buffer + len + DUMP_WINDOW
    buffer_limit = min(BUF_SIZE, dst_start + len + DUMP_WINDOW);

    for (i = 0; i < buffer_limit; i++) {
        if (i >= dst_start && i < dst_end) {
            // check for garbage in
            if (buf2[i] != buf1[i + delta]) {
                report_error(line, func, dst_start, src_start, len, i, is_memmove, delta);
                return;
            }
        } else {
            // check for garbage before or after
            if (buf2[i] != buf3[i]) {
                report_error(line, func, dst_start, src_start, len, i, is_memmove, delta);
                return;
            }
        }
        buf++;
    }
}

/*
 * qsort comparator for perf_data_t: orders by summary_index first and by
 * ascending bytes_per_second within the same summary.
 */
static int compare_perf_data(const void *ptr1, const void *ptr2)
{
    const perf_data_t *lhs = (const perf_data_t *)ptr1;
    const perf_data_t *rhs = (const perf_data_t *)ptr2;

    if (lhs->summary_index != rhs->summary_index) {
        return (int) (lhs->summary_index - rhs->summary_index);
    }
    if (lhs->bytes_per_second < rhs->bytes_per_second) {
        return -1;
    }
    if (lhs->bytes_per_second > rhs->bytes_per_second) {
        return 1;
    }
    return 0;
}

/*
 * qsort comparator for perf_data_t: orders by ascending len first and by
 * ascending pico_secs within the same length.
 *
 * The lengths are compared explicitly; returning the unsigned difference
 * converted to int would rely on implementation-defined conversion and give
 * the wrong sign for differences of 2^31 or more.
 */
static int compare_by_size_time(const void *ptr1, const void *ptr2)
{
    const perf_data_t *p1 = (const perf_data_t *)ptr1;
    const perf_data_t *p2 = (const perf_data_t *)ptr2;

    if (p1->len != p2->len) {
        return (p1->len < p2->len) ? -1 : 1;
    }
    if (p1->pico_secs != p2->pico_secs) {
        return (p1->pico_secs < p2->pico_secs) ? -1 : 1;
    }
    return 0;
}

/*
 * Sort the samples collected for one copy function and derive its summaries.
 *
 * - Every summary is reset first, including the accumulated iteration, byte
 *   and time totals, so calling this function again for the same `info`
 *   does not add the new samples on top of the previous totals.
 * - The samples are sorted with compare_perf_data(), which groups them by
 *   summary_index; summary_start / summary_end record each group's index
 *   range.  A summary without samples keeps summary_start == info->index.
 * - dist_len[] and dist_align[] are rebuilt and every length or alignment
 *   pair that was never exercised is printed.
 */
static void compute_dist_info(copy_func_info_t *info)
{
    int i;
    int n;
    perf_data_t *perf_array = info->array;
    const int   array_count = info->index;

    for (i = 0; i < N_SUMMARIES; i++) {
        perf_summary_t *ps = &info->summary[i];
        ps->iterations = 0;
        ps->bytes = 0;
        ps->pico_secs = 0;
        ps->len_min = BUF_SIZE;
        ps->len_max = 0;
        ps->summary_start = array_count;
        ps->summary_end = 0;
    }

    memset(dist_len, 0, sizeof(dist_len));
    memset(dist_align, 0, sizeof(dist_align));

    // qsort requires a valid pointer even for an empty array
    if (perf_array && array_count > 0) {
        qsort(perf_array, (size_t) array_count, sizeof(perf_data_t), compare_perf_data);
    }

    for (i = 0; i < array_count; i++) {
        perf_summary_t *ps = &info->summary[perf_array[i].summary_index];
        if (ps->summary_start == array_count) {
            // first sample of this summary
            ps->summary_start = i;
            ps->summary_end = i;
        } else if (ps->summary_end < i) {
            // after sorting, the samples of a summary must be contiguous
            if (ps->summary_end != i - 1) {
                printf("summary[%d] start = %d, end was %d, found %d\n",
                    perf_array[i].summary_index, ps->summary_start, ps->summary_end, i);
            }
            ps->summary_end = i;
        }
        // zero-length copies do not contribute to the totals
        if (perf_array[i].len > 0) {
            ps->iterations++;
            ps->bytes += perf_array[i].len;
            ps->pico_secs += perf_array[i].pico_secs;
            if (perf_array[i].len < ps->len_min) {
                ps->len_min = perf_array[i].len;
            }
            if (perf_array[i].len > ps->len_max) {
                ps->len_max = perf_array[i].len;
            }
        }
        int dxs_align = dist_of_aligns(perf_array[i].dst_align, perf_array[i].src_align);
        dist_align[dxs_align]++;
        // all lengths >= MAX_LENGTH share the last histogram slot
        int len = perf_array[i].len;
        if (len >= MAX_LENGTH) {
            len = MAX_LENGTH-1;
        }
        dist_len[len]++;
    }

    for (n = 0; n < MAX_ALIGN_DIST; n++) {
        if (!dist_align[n]) {
            printf("    %s: no tests for byte align of %03d_%03d\n", info->name, n / DIST_ALIGN, n % DIST_ALIGN);
        }
    }

    // report untested lengths as single values or as ranges
    int prev_missed = -1;
    for (n = 0; n < MAX_LENGTH; n++) {
        if (!dist_len[n]) {
            if (prev_missed < 0) prev_missed = n;
        } else {
            if (prev_missed >= 0) {
                if (prev_missed == (n - 1)) {
                    printf("    %s: LENGTH %4d       MISSING\n", info->name, prev_missed);
                } else {
                    printf("    %s: LENGTH %4d..%4d MISSING\n", info->name, prev_missed, n - 1);
                }
            }
            prev_missed = -1;
        }
    }
    if (prev_missed >= 0) {
        if (prev_missed == MAX_LENGTH - 1) {
            printf("    %s: LENGTH %4d       MISSING\n", info->name, prev_missed);
        } else {
            printf("    %s: LENGTH %4d..%4d MISSING\n", info->name, prev_missed, MAX_LENGTH - 1);
        }
    }
}

// Column titles of the summary buckets, indexed by the value returned from
// summary_from_size_and_align(); N_SUMMARIES entries are read from this table.
const char *align_titles[] = {
    "small",
    "random",
    "8-aligned",
    "4-aligned",
    "3-aligned",
    "2-aligned",
    "1-aligned"
};

/*
 * Print the per-summary workload description taken from `info`, followed by
 * the two header rows of the results table.
 *
 * len_min / len_max are size_t and are converted to int for the "%5d"
 * conversions.
 */
void print_perf_title(copy_func_info_t *info)
{
    int i;
    // note all tests use the same sizes and alignments
    for (i = 0; i < N_SUMMARIES; i++) {
        const perf_summary_t *ps = &info->summary[i];

        printf("    %-11s: lengths [%5d..%5d] %5d copies of ",
            align_titles[i],
            (int) ps->len_min,
            (int) ps->len_max,
            ps->iterations);
        print_human_readable(ps->bytes);
        printf(" bytes\n");
    }

    printf("\n");
    printf("||copy function                ||");
    for (i = 0; i < N_SUMMARIES; i++) {
        printf(" %9.9s ||", align_titles[i]);
    }
    printf("\n||                             ||");
    printf("    <= %3d || > %3d   ||", SUMMARY_SMALL_DATA_SIZE, SUMMARY_SMALL_DATA_SIZE);
    // summaries 2.. all use PERF_TEST_SIZE copies
    for (i = 2; i < N_SUMMARIES; i++) {
        printf("    %5d  ||", PERF_TEST_SIZE);
    }
    printf("\n");
}

/*
 * Print one row of the results table: for every summary, the throughput of
 * the median sample (by bytes per second) of `info`.
 *
 * compute_dist_info() must have been run on `info` first; it sorts the
 * samples and marks a summary without samples with
 * summary_start == info->index, which is what the "bad summary" check tests.
 */
void print_perf_summaries(copy_func_info_t *info)
{
    int i;
    printf("|%-30s|", info->name);
    for (i = 0; i < N_SUMMARIES; i++) {
        perf_summary_t *perf = &info->summary[i];
        if (perf->summary_start >= info->index) {
            printf("bad summary indexes for %d\n", i);
        } else {
            int start = perf->summary_start;
            int end = perf->summary_end;
            // ignore the zero length copies, but never walk past the summary
            while (start < end && info->array[start].len == 0) start++;
            int median_index = start + ((end - start) / 2);
            // samples are sorted by throughput, so the median cannot beat the last one
            if (info->array[median_index].bytes_per_second > info->array[end].bytes_per_second) {
                printf("\n\nBAD data for summary %d of %s (%d, %d, %d)\n", i, info->name, start, median_index, end);
            }
            print_gigabyte(info->array[median_index].bytes_per_second);
        }
        printf("/s |");
    }
    printf("\n");

#if 0
    printf("\n all data for <= %d\n", SUMMARY_SMALL_DATA_SIZE);
    perf_summary_t *perf = &info->summary[0];

    qsort(info->array, perf->summary_end + 1, sizeof(perf_data_t), compare_by_size_time);

    uint32_t    last_len = 10000;

    for (i = perf->summary_start; i <= perf->summary_end; i++) {
        if (last_len != info->array[i].len) {
            last_len = info->array[i].len;
            printf("\nsize_%d", last_len);
        }
        printf("\t%" PRId64 , info->array[i].pico_secs);
    }
    printf("\n\n");
#endif
}

// Minimum measured time, in picoseconds: 10 * 1000 * 1000 ps = 10 microseconds.
// do_test_func() re-measures with a repeat loop when a single copy is faster.
// NOTE: the expansion is not parenthesized; take care when using it in expressions.
#define MIN_PS_TIME 10 * 1000 * 1000
// Maximum number of re-measurements do_test_func() attempts per test.
#define RETRY_COUNT 20

/* qsort comparator that orders pico_secs_t values ascending. */
static int compare_pico_secs_t(const void *ptr1, const void *ptr2)
{
    const pico_secs_t lhs = *(const pico_secs_t *)ptr1;
    const pico_secs_t rhs = *(const pico_secs_t *)ptr2;

    // yields -1, 0 or 1
    return (lhs > rhs) - (lhs < rhs);
}

// Calibration values set by compute_copy_overhead() and used by
// do_test_func() to decide whether a measurement is plausibly slow:
// median time of a zero-length baseline copy, in picoseconds
static pico_secs_t overhead_of_zero_copy = 0;
// estimated additional time per copied byte of the baseline copy, in picoseconds
static pico_secs_t overhead_per_byte_copied = 0;

/*
 * Calibrate overhead_of_zero_copy and overhead_per_byte_copied.
 *
 * The last entry of memcpy_funcs[] is used as the baseline copy routine.
 * It is timed `retries` times for a zero-length copy and `retries` times for
 * a 15 byte copy, each measurement averaging `repeats` calls, and the median
 * of each series is taken.  The per-byte overhead is the difference of the
 * two medians divided by 15.
 *
 * The whole calibration is repeated until the 15 byte median is larger than
 * the zero-length median.  Requires buf1 and buf2 to be allocated.
 */
static void compute_copy_overhead()
{
    const int repeats = 1000;
    const int retries = 20;
    const int byte_copy_index = ARRAY_COUNT(memcpy_funcs) - 1;
    copy_func_info_t *test_bytecpy = &memcpy_funcs[byte_copy_index];
    int i;
    pico_secs_t times_zero[retries];
    pico_secs_t times_15byte[retries];
    bool good_times = true;
    perf_data_t perf_data = {
        0,
        0,
        (uint8_t) 0,
        (uint32_t) 0,
        0,
        0
    };
    do {
        // series 1: zero-length copies
        for (i = 0; i < retries; i++) {
            void *ndest = test_bytecpy->func(buf2, buf1, 0, &perf_data, repeats);
            // average time of a single call
            pico_secs_t picosec2 = perf_data.pico_secs / repeats;
#if 0
            printf("%d: %s(buf2, buf1, 0) repeated %d times duration ",
                i, test_bytecpy->name, repeats);
            time_human_readable(picosec2);
            printf("\n");
#endif
            times_zero[i] = picosec2;
        }

        qsort(times_zero, retries, sizeof(pico_secs_t), compare_pico_secs_t);
        // take the median value
        overhead_of_zero_copy = times_zero[retries / 2];
        printf("overhead of zero copy = ");
        time_human_readable(overhead_of_zero_copy);
        printf("\n");

        // series 2: 15 byte copies to a destination offset by one byte
        for (i = 0; i < retries; i++) {
            void *ndest = test_bytecpy->func(buf2 + 1, buf1, 15, &perf_data, repeats);
            pico_secs_t picosec2 = perf_data.pico_secs / repeats;
#if 0
            printf("%d: %s(buf2, buf1, 15) repeated %d times duration ",
                i, test_bytecpy->name, repeats);
            time_human_readable(picosec2);
            printf("\n");
#endif
            times_15byte[i] = picosec2;
        }

        qsort(times_15byte, retries, sizeof(pico_secs_t), compare_pico_secs_t);
        // take the median value
        pico_secs_t picosec = times_15byte[retries / 2];
        if (picosec <= overhead_of_zero_copy) {
            // implausible measurement: report it and calibrate again
            good_times = false;
            printf("zero copy time ");
            time_human_readable(overhead_of_zero_copy);
            printf(" is greater then 15 byte copy time ");
            time_human_readable(picosec);
            printf("\n");
#if 0
            for (i = 0; i < retries; i++) {
                printf("times_zero[%d] = ", i);
                time_human_readable(times_zero[i]);
                printf("\n");
            }
            for (i = 0; i < retries; i++) {
                printf("times_15byte[%d] = ", i);
                time_human_readable(times_zero[i]);
                printf("\n");
            }
#endif
        } else {
            good_times = true;
            overhead_per_byte_copied = (picosec - overhead_of_zero_copy) / 15;
            if (overhead_per_byte_copied == 0) {
                // difference was below 15 ps: fall back to a fixed 500 ps per byte
                printf("overhead_per_byte_copied was 0, based on %d repeats taking ", repeats);
                time_human_readable(overhead_per_byte_copied);
                printf("\n");
                overhead_per_byte_copied = 500;
            }
            printf("overhead per byte copied = ");
            time_human_readable(overhead_per_byte_copied);
            printf("\n");
        }
    } while (!good_times);

}

/*
 * Run one copy function on one test case, verify the result and record a
 * timing sample.
 *
 * line - source line of the test case, used in failure reports
 * dst  - destination pointer (inside buf2)
 * src  - source pointer (inside buf1, or inside buf2 for overlapping tests)
 * len  - number of bytes to copy
 * data - copy function under test; the sample is appended to data->array
 *        while there is room (at most `iterations` samples)
 *
 * The first call is both verified and timed.  If that time is below
 * MIN_PS_TIME or above the expected maximum (20x the calibrated overhead),
 * the copy is re-timed in a repeat loop, up to RETRY_COUNT times, and the
 * smallest per-copy time seen is recorded.
 */
static void do_test_func(int line, char *dst, const char *src, size_t len, copy_func_info_ptr data)
{
    pico_secs_t max_expected_time = (overhead_of_zero_copy + len * overhead_per_byte_copied) * 20;
    pico_secs_t orig_ps = 1;
    perf_data_t perf_data = {
        (uint8_t) ((uintptr_t) src % TRACK_ALIGN),
        (uint8_t) ((uintptr_t) dst % TRACK_ALIGN),
        (uint8_t) 0,
        (uint32_t) len,
        0,
        0
    };

    char *ndest = data->func(dst, src, len, &perf_data, 1);

    // "%p" expects void pointers and "%d" expects int
    NTD_TESTCASE_MESSAGE(ndest == dst, __FILE__, line, "ndest == dst",
            "%s(%p, %p, %d) returned %p expected %p",
            data->name, (void *) dst, (const void *) src, (int) len, (void *) ndest, (void *) dst);
    verify_memcpy_output(line, data->name, dst, (char *) src, len);

    if (perf_data.pico_secs > 0) {
        orig_ps = perf_data.pico_secs;
    }
    if (orig_ps > max_expected_time || orig_ps < MIN_PS_TIME) {
        int retry = RETRY_COUNT;
        pico_secs_t picosec = orig_ps;
        // Size the repeat loop from the smaller of the measured and the
        // expected time.  max_expected_time can be 0 (zero-length copy with a
        // zero calibrated overhead); in that case the measured time, which is
        // at least 1, is used so that the division is always defined.
        pico_secs_t per_copy = picosec;
        if (picosec > max_expected_time && max_expected_time > 0) {
            per_copy = max_expected_time;
        }
        int repeats = 1 + (int) (MIN_PS_TIME / per_copy);
        do {
            data->func(dst, src, len, &perf_data, repeats);
            pico_secs_t picosec2 = perf_data.pico_secs / repeats;
            if (picosec2 == 0) continue;
            if (picosec2 < picosec) {
                picosec = picosec2;
            }
            // try to avoid some large values
            if (picosec2 < max_expected_time) {
                break;
            }
            if (retry < 4) {
                // doing this print seems to fix what ever is stealing time.
                printf("%d:%d retry %s(%p, %p, %d) repeated %d times duration ",
                    data->index, retry, data->name, (void *) dst, (const void *) src, (int) len, repeats);
                time_human_readable(picosec2);
                printf(" limit ");
                time_human_readable(max_expected_time);
                printf("\n");
            }
            repeats = repeats + (repeats / 2);
        } while (--retry);
        if (retry == 0) {
            printf("retry failure on %s(%p, %p, %d) repeated %d times\n",
                data->name, (void *) dst, (const void *) src, (int) len, repeats);
            printf("\tsmallest time was ");
            time_human_readable(picosec);
            printf(", last time was ");
            time_human_readable(perf_data.pico_secs);
            printf(" limit was ");
            time_human_readable(max_expected_time);
            printf("\n");
        }
        perf_data.pico_secs = picosec;
    }

    perf_data.summary_index = summary_from_size_and_align(len, perf_data.src_align, perf_data.dst_align);
    // a copy onto itself is accounted as a single byte
    if (dst == src) len = 1;
    perf_data.bytes_per_second = (len * 1000000000000) / perf_data.pico_secs;
    perf_data_t *perf_array = data->array;
    if (perf_array && (size_t) data->index < iterations) {
        perf_array[data->index++] = perf_data;
    }
}

/*
 * Run every memcpy and memmove implementation on one non-overlapping case:
 * copy `len` bytes from buf1 + in_index to buf2 + out_index.
 *
 * The source range is filled with random bytes, and the checked prefix of
 * buf2 and buf3 is filled with one random byte value; buf2 is refilled
 * before each implementation runs.
 */
static void do_test_case(int in_index, int out_index, int len)
{
    char *src = buf1 + in_index;
    char *dst = buf2 + out_index;
    int k;

    for (k = 0; k < len; ++k) {
        src[k] = random () & 255;
    }

    const char fill = (char) random () & 255;
    // only the prefix that verify_memcpy_output() inspects is refreshed
    const size_t fill_len = min(BUF_SIZE, out_index + len + DUMP_WINDOW);

    memset(buf3, fill, fill_len);

    for (k = 0; k < ARRAY_COUNT(memcpy_funcs); k++) {
        memset(buf2, fill, fill_len);
        do_test_func(__LINE__, dst, src, len, &memcpy_funcs[k]);
    }

    // memmove can do everything memcpy does
    for (k = 0; k < ARRAY_COUNT(memmove_funcs); k++) {
        memset(buf2, fill, fill_len);
        do_test_func(__LINE__, dst, src, len, &memmove_funcs[k]);
    }
}

static void reset_test_indexes(void)
{
    int i;

    for (i = 0; i < ARRAY_COUNT(memcpy_funcs); i++) {
        memcpy_funcs[i].index = 0;
    }


    for (i = 0; i < ARRAY_COUNT(memmove_funcs); i++) {
        memmove_funcs[i].index = 0;
    }
}

/*
 * Non-overlapping copy tests for every memcpy and memmove implementation:
 *
 *  1. exhaustive: source and destination offsets 0..15, lengths 0..128
 *  2. warm-up: 5 rounds of PERF_TEST_SIZE / 2 copies at several source offsets
 *  3. 200 rounds of PERF_TEST_SIZE copies at several source/destination offsets
 *  4. random offsets and lengths until `iterations` rounds have been run
 *
 * Afterwards the per-function statistics are computed and printed.
 *
 * `iterations` is a size_t and is converted to int for the "%d" conversions
 * in the test messages.
 */
static void test_memory_copy (void)
{
    int i;
    int n;
    int len;
    int in_offset;
    int out_offset;
    int in_index;
    int out_index;

    n = 0;

    reset_test_indexes();

    // complete test for starting offsets 0..15, and lengths of 0..128
    // runs 33,024 tests

    for (in_offset = 0; in_offset < 16; in_offset++) {
        for (out_offset = 0; out_offset < 16; out_offset++) {
            for (len = 0; len <= 128; len++) {
                do_test_case(64+in_offset, 64+out_offset, len);
                n++;
            }
        }
    }

    TESTCASE_MESSAGE((size_t) n < iterations, "%d < %d", n, (int) iterations);
    for (i = 0; i < 5; i++) {
        // warm up large copies
        in_index  = 64;
        len       = PERF_TEST_SIZE / 2;
        out_index = in_index;
        do_test_case(in_index, out_index, len);  // aligned
        do_test_case(in_index + 4, out_index, len);  // 4 byte -aligned
        do_test_case(in_index + 3, out_index, len);  // not-aligned
        do_test_case(in_index + 2, out_index, len);  // not-aligned
        do_test_case(in_index + 1, out_index, len);  // not-aligned
        n++;
    }

    TESTCASE_MESSAGE((size_t) n < iterations, "%d < %d", n, (int) iterations);
    for (i = 0; i < 200; i++) {
        // make some large copies
        in_index  = 64;
        len       = PERF_TEST_SIZE;
        out_index = in_index;
        do_test_case(in_index, out_index, len);  // aligned
        do_test_case(in_index + 4, out_index    , len);  // 4 byte -aligned
        do_test_case(in_index + 3, out_index    , len);  // not-aligned
        do_test_case(in_index + 2, out_index    , len);  // not-aligned
        do_test_case(in_index + 1, out_index    , len);  // not-aligned
        do_test_case(in_index    , out_index + 4, len);  // 4 byte -aligned
        do_test_case(in_index    , out_index + 3, len);  // not-aligned
        do_test_case(in_index    , out_index + 2, len);  // not-aligned
        do_test_case(in_index    , out_index + 1, len);  // not-aligned
        n++;
    }

    TESTCASE_MESSAGE((size_t) n < iterations, "%d < %d", n, (int) iterations);

    // fill the remaining rounds with random offsets and lengths of 128..MAX_LENGTH-1
    for (; (size_t) n < iterations; n++)
    {
        in_index  = random() % INDEX_LIMIT;
        len       = 128 + (random() % (MAX_LENGTH-128));
        out_index = random() % INDEX_LIMIT;
        do_test_case(in_index, out_index, len);
    }


    for (i = 0; i < ARRAY_COUNT(memcpy_funcs); i++) {
        compute_dist_info(&memcpy_funcs[i]);
    }

    for (i = 0; i < ARRAY_COUNT(memmove_funcs); i++) {
        compute_dist_info(&memmove_funcs[i]);
    }

    print_perf_title(&memcpy_funcs[0]);

    for (i = 0; i < ARRAY_COUNT(memcpy_funcs); i++) {
        print_perf_summaries(&memcpy_funcs[i]);
    }

    for (i = 0; i < ARRAY_COUNT(memmove_funcs); i++) {
        print_perf_summaries(&memmove_funcs[i]);
    }

}

/*
 * Overlapping-copy tests for every memmove implementation.
 *
 * buf1, buf2 and buf3 are filled with identical random data; source and
 * destination both point into buf2, which is restored from buf1 before each
 * call.  The first 10 rounds copy 15 bytes onto themselves, the last 6000
 * rounds move PERF_TEST_SIZE bytes by 32..36 bytes forwards or backwards,
 * and all other rounds use random lengths and offsets.
 * Statistics are computed and printed at the end.
 */
void do_rand_memmove_tests (void)
{
    int i;
    int n;

    printf("   memmove overlapping tests:\n");
    for (i = 0; i < ARRAY_COUNT(memmove_funcs); i++) {
        memmove_funcs[i].index = 0;
    }

    // setup all buffers the same
    for (i = 0; i < BUF_SIZE; ++i) {
        const char value = random () & 255;

        buf3[i] = value;
        buf2[i] = value;
        buf1[i] = value;
    }

    for (n = 0; n < iterations; n++)
    {
        int in_index = 64;
        int len = PERF_TEST_SIZE;
        int out_index;

        if (n < (iterations - 6000)) {
            in_index  = random() % INDEX_LIMIT;
            len       = random() % min(MAX_LENGTH, BUF_SIZE - in_index);
        }

        if (n < 10) {
            // first 10 times to a full overlapping
            out_index = in_index;
            len = 15;
        } else if (len == PERF_TEST_SIZE) {
            // alternate between moving forwards and backwards
            const int shift = 32 + ((n / 2) % 5);

            out_index = (n % 2) ? (in_index + shift) : (in_index - shift);
        } else {
            const int min_start = max(0, (in_index - len) + 1);
            const int max_start = min(MAX_LENGTH - len, (in_index + len));
            const int start_range = max_start - min_start;

            out_index = min_start + (random() % start_range);
        }

        char *const src = buf2 + in_index;
        char *const dst = buf2 + out_index;

        for (i = 0; i < ARRAY_COUNT(memmove_funcs); i++) {
            // reset buf2 to buf1
            memcpy(buf2, buf1, BUF_SIZE);
            do_test_func(__LINE__, dst, src, len, &memmove_funcs[i]);
        }
    }

    for (i = 0; i < ARRAY_COUNT(memmove_funcs); i++) {
        compute_dist_info(&memmove_funcs[i]);
    }

    print_perf_title(&memmove_funcs[0]);

    for (i = 0; i < ARRAY_COUNT(memmove_funcs); i++) {
        print_perf_summaries(&memmove_funcs[i]);
    }
}

// State array handed to initstate() for the duration of the test group.
static char state[128];

/*
 * Entry point of the "memcpy" test group.
 *
 * Installs a private random() state seeded with 1, runs the non-overlapping
 * and the overlapping copy tests if initialization succeeds, always releases
 * the test resources, and restores the previous random() state at the end.
 */
void ntd_test_memcpy( void )
{
    NTD_TEST_GROUP_START("memcpy", 3);
    size_t i;
    size_t j;
    // remember the caller's random() state so it can be restored below
    void * orig = initstate(1, state, sizeof(state));

    if (test_init()) {
        test_memory_copy();
        do_rand_memmove_tests();
    }
    // runs even when test_init() failed part-way
    test_fini();
    setstate(orig);
    NTD_TEST_GROUP_END("memcpy", 3);
}
