/*
 * pfmon_util_x86_64.c  - X86-64 specific set of helper functions
 *
 * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P.
 * Contributed by Stephane Eranian <eranian@hpl.hp.com>
 *
 * This file is part of pfmon, a sample tool to measure performance 
 * of applications on Linux.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
 * 02111-1307 USA
 */
#include <sys/types.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdarg.h>
#include <stddef.h>
#include <pthread.h>
#include <errno.h>
#include <unistd.h>
#include <fcntl.h>
#include <ctype.h>
#include <sys/ptrace.h>
#include <sys/ucontext.h>
#include <sys/user.h>
#include <asm/unistd.h>


#include "pfmon.h"

/*
 * The debug registers are shared between code and data breakpoints.
 * The public set/clear helpers in this file add one of these bases to the
 * caller's breakpoint index to select the debug register actually used:
 * code breakpoints start at IBRS_BASE, data breakpoints at DBRS_BASE.
 */
#define IBRS_BASE	0
#define DBRS_BASE	2


/*
 * Cache description query: not implemented in this file.
 *
 * Both arguments are ignored, *info is left untouched and the function
 * always returns 0.
 */
int
pfmon_get_cache_info(int cpuid, pfmon_cache_info_t *info)
{
	(void)cpuid;
	(void)info;

	return 0;
}

/*
 * Thread identification depends on the thread library:
 *
 *   NPTL:         getpid() is the same for every thread of a process,
 *                 gettid() is unique per thread.
 *   LinuxThreads: getpid() is unique per thread,
 *                 gettid() is not provided by the library (could be getpid()).
 *
 * The kernel view does not depend on the thread package:
 *   sys_gettid() returns task->pid
 *   sys_getpid() returns task->tgid
 *   the first task of a group has task->tgid == task->pid
 *
 * To behave the same with both libraries, gettid() is defined here as a
 * direct invocation of the kernel system call.
 */
#ifndef __NR_gettid
#define __NR_gettid 186
#endif
pid_t
gettid(void)
{
	long tid = syscall(__NR_gettid);

	return (pid_t)tid;
}

/*
 * Bit-field view of the debug control register (DR7), i.e. the word this
 * file reads and writes through ptrace at u_debugreg[7].
 *
 * The field widths add up to 64 bits. The view only matches the hardware
 * register if the compiler allocates bit-fields starting from the least
 * significant bit -- NOTE(review): true for GCC on x86-64, confirm for
 * other compilers.
 */
typedef struct {
	unsigned long l0:1;	/* lX: per-breakpoint enable bit toggled by this file */
	unsigned long g0:1;	/* gX: presumably the global enable bits; never written here */
	unsigned long l1:1;
	unsigned long g1:1;
	unsigned long l2:1;
	unsigned long g2:1;
	unsigned long l3:1;
	unsigned long g3:1;
	unsigned long le:1;	/* cleared when a breakpoint is programmed */
	unsigned long ge:1;	/* cleared when a breakpoint is programmed */
	unsigned long res1:3; /* must be 1 */
	unsigned long gd:1;
	unsigned long res2:2; /* must be 0 */
	unsigned long rw0:2;	/* rwX: access type; 0 is treated as instruction execution */
	unsigned long len0:2;	/* lenX: breakpoint length; this file always programs 0 */
	unsigned long rw1:2;
	unsigned long len1:2;
	unsigned long rw2:2;
	unsigned long len2:2;
	unsigned long rw3:2;
	unsigned long len3:2;
	unsigned long res:32;	/* upper half of the 64-bit word */
} dr7_reg_t;

/*
 * Overlay giving access to the same DR7 word either field by field (reg)
 * or as the raw value exchanged with ptrace (val).
 */
typedef union {
	dr7_reg_t reg;
	unsigned long val;
} dr7_t;

/*
 * Program one hardware debug register of a ptrace-attached task.
 *
 * pid:     task to operate on
 * dbreg:   physical debug register index (0-3); any other value is fatal
 * address: linear address to watch
 * rw:      requested access type: 0 = instruction execution,
 *          bit 0 = trigger on write, bit 1 = trigger on read
 *
 * The access type and length fields of DR7 are updated first, then the
 * address is written into the selected debug register. The enable (lX)
 * bits are NOT touched here; pfmon_enable_all_breakpoints() sets them.
 *
 * Returns -1 if DR7 cannot be read or written, otherwise the result of the
 * ptrace call that writes the address register.
 */
static int
pfmon_x86_64_set_breakpoint(pid_t pid, int dbreg, unsigned long address, int rw)
{
	dr7_t dr7;
	unsigned long rw_mode = 0;
	unsigned long offset;
	long r;

	offset = offsetof(struct user, u_debugreg[7]);
	dr7.val = ptrace(PTRACE_PEEKUSER, (long)pid, offset, 0);
	if (dr7.val == (unsigned long)-1) {
		warning("cannot peek %d\n", errno);
		return -1;
	}
	DPRINT(("dr7=0x%lx\n", dr7.val));

	dr7.reg.res1 = 0; /* reserved bits */
	dr7.reg.le   = 0; /* ignored for AMD64 */
	dr7.reg.ge   = 0; /* ignored for AMD64 */

	/*
	 * Translate the request into the hardware RW encoding:
	 *   0 = instruction execution (lenXX must then be 0 as well)
	 *   1 = data write
	 *   3 = data read or write
	 * The hardware has no read-only mode, so a read-only request is
	 * widened to read-write; mapping it to 1 would make it write-only
	 * and read accesses would never trigger.
	 */
	switch(rw) {
		case 0:
			rw_mode = 0;
			break;
		case 1:
			rw_mode = 1;
			break;
		case 2: /* read-only is not available: use read-write */
		case 3:
			rw_mode = 3;
			break;
	}

	switch(dbreg) {
		case	0:
			dr7.reg.len0 = 0;
			dr7.reg.rw0  = rw_mode;
			break;
		case	1:
			dr7.reg.len1 = 0;
			dr7.reg.rw1  = rw_mode;
			break;
		case	2:
			dr7.reg.len2 = 0;
			dr7.reg.rw2  = rw_mode;
			break;
		case	3:
			dr7.reg.len3 = 0;
			dr7.reg.rw3  = rw_mode;
			break;
		default:
			fatal_error("unexpected debug register %d\n", dbreg);
	}
	DPRINT(("dr7: poke=0x%lx offs=%ld\n", dr7.val, offset));

	r = ptrace(PTRACE_POKEUSER, pid, offset, dr7.val);
	if (r == -1) {
		warning("cannot poke dr7\n");
		return -1;
	}
	/* read back only to show in the debug trace what was accepted */
	dr7.val = ptrace(PTRACE_PEEKUSER, (long)pid, offset, 0);
	DPRINT(("dr7: peek=0x%lx offs=%ld\n", dr7.val, offset));

	DPRINT(("dr%d: poke=0x%lx\n", dbreg, address));

	/* finally install the address in the selected debug register */
	offset = offsetof(struct user, u_debugreg[dbreg]);
	return ptrace(PTRACE_POKEUSER, pid, offset, address);
}

/*
 * Common helper to disarm one breakpoint.
 *
 * Reads DR7 of task pid, clears the lX enable bit that belongs to debug
 * register dbreg (0-3, anything else is fatal) and writes DR7 back.
 * The address register itself is left as is.
 *
 * Returns -1 if DR7 cannot be read, otherwise the result of the ptrace
 * call that writes DR7.
 */
static int
pfmon_x86_64_clear_breakpoint(pid_t pid, int dbreg)
{
	const unsigned long dr7_off = offsetof(struct user, u_debugreg[7]);
	dr7_t ctrl;

	ctrl.val = ptrace(PTRACE_PEEKUSER, pid, dr7_off, 0);
	if (ctrl.val == (unsigned long)-1)
		return -1;

	DPRINT(("dr7=0x%lx\n", ctrl.val));

	/* drop the enable bit of the selected register only */
	if (dbreg == 0)
		ctrl.reg.l0 = 0;
	else if (dbreg == 1)
		ctrl.reg.l1 = 0;
	else if (dbreg == 2)
		ctrl.reg.l2 = 0;
	else if (dbreg == 3)
		ctrl.reg.l3 = 0;
	else
		fatal_error("unexpected debug register %d\n", dbreg);

	DPRINT(("dr7: poke=0x%lx offs=%ld\n", ctrl.val, dr7_off));

	return ptrace(PTRACE_POKEUSER, pid, dr7_off, ctrl.val);
}

/*
 * Install a code (instruction execution) breakpoint at address in task pid.
 *
 * Only user level code can be covered; kernel level breakpoints are not
 * possible in this context.
 *
 * dbreg is a logical index in the range [0, options.nibrs); the physical
 * debug register is dbreg + IBRS_BASE. The configuration register is not
 * directly visible to the caller.
 *
 * Returns -1 when dbreg is out of range, otherwise the result of
 * pfmon_x86_64_set_breakpoint().
 */
int
pfmon_set_code_breakpoint(pid_t pid, int dbreg, uintptr_t address)
{
	if (dbreg >= 0 && dbreg < options.nibrs)
		return pfmon_x86_64_set_breakpoint(pid, dbreg + IBRS_BASE, address, 0);

	return -1;
}

/*
 * Disarm the code breakpoint with logical index dbreg in task pid.
 *
 * dbreg must be in [0, options.nibrs), otherwise -1 is returned.
 * The address argument is not used.
 */
int
pfmon_clear_code_breakpoint(pid_t pid, int dbreg, uintptr_t address)
{
	if (dbreg >= 0 && dbreg < options.nibrs)
		return pfmon_x86_64_clear_breakpoint(pid, dbreg + IBRS_BASE);

	return -1;
}

/*
 * Install a data breakpoint at address in task pid.
 *
 * Only user level accesses can be covered; kernel level breakpoints are
 * not possible in this context.
 *
 * dbreg is a logical index in the range [0, options.ndbrs). The debug
 * registers are split into two sets (XXX: hack), so the physical register
 * is dbreg + DBRS_BASE. The configuration register is not directly visible
 * to the caller.
 *
 * The rw argument is forwarded to pfmon_x86_64_set_breakpoint():
 * 	bit 0 = w : 1 means trigger on write access
 * 	bit 1 = r : 1 means trigger on read access
 *
 * Returns -1 when dbreg is out of range, otherwise the result of
 * pfmon_x86_64_set_breakpoint().
 */
int
pfmon_set_data_breakpoint(pid_t pid, int dbreg, uintptr_t address, int rw)
{
	if (dbreg >= 0 && dbreg < options.ndbrs)
		return pfmon_x86_64_set_breakpoint(pid, dbreg + DBRS_BASE, address, rw);

	return -1;
}

/*
 * Disarm the data breakpoint with logical index dbreg in task pid.
 *
 * dbreg must be in [0, options.ndbrs), otherwise -1 is returned.
 * The address argument is not used.
 */
int
pfmon_clear_data_breakpoint(pid_t pid, int dbreg, uintptr_t address)
{
	if (dbreg >= 0 && dbreg < options.ndbrs)
		return pfmon_x86_64_clear_breakpoint(pid, dbreg + DBRS_BASE);

	return -1;
}

/*
 * Prepare task pid to be resumed after it stopped on a breakpoint.
 *
 * The RF bit (X86_EFLAGS_RF) is set in the saved eflags of the task and
 * the register is read back to check that the kernel kept the bit; when
 * the value read back differs, this is treated as a fatal error.
 *
 * Returns 0 on success, -1 if eflags cannot be read.
 */
static int
__pfmon_resume_after_breakpoint(pid_t pid)
{
#define X86_EFLAGS_RF 0x00010000
	const unsigned long flags_off = offsetof(struct user, regs.eflags);
	unsigned long flags, readback;

	flags = (unsigned long)ptrace(PTRACE_PEEKUSER, pid, flags_off, 0);
	if (flags == (unsigned long)-1) {
		warning("cannot retrieve eflags: %s\n", strerror(errno));
		return -1;
	}
	DPRINT((">>>>>eflags=0x%lx\n", flags));

	flags |= X86_EFLAGS_RF;
	DPRINT((">>>>>eflags=0x%lx\n", flags));
	/* the result of the write is judged by the read-back below */
	ptrace(PTRACE_POKEUSER, pid, flags_off, flags);

	readback = (unsigned long)ptrace(PTRACE_PEEKUSER, pid, flags_off, 0);
	if (readback == flags)
		return 0;

	fatal_error("your kernel does not have the ptrace EFLAGS.RF fix\n");
	return 0;
}

/*
 * Called before resuming task pid after a code breakpoint:
 * delegates to __pfmon_resume_after_breakpoint() and returns its result.
 */
int
pfmon_resume_after_code_breakpoint(pid_t pid)
{
	int status = __pfmon_resume_after_breakpoint(pid);

	return status;
}

/*
 * Called before resuming task pid after a data breakpoint:
 * nothing is done here, the function always returns 0.
 */
int
pfmon_resume_after_data_breakpoint(pid_t pid)
{
	(void)pid;

	return 0;
}

/*
 * Architecture specific initialization of the global options.
 */
void
pfmon_arch_initialize(void)
{
	/*
	 * XXX: temporary hack. pfmon allows code and data triggers to be
	 * active at the same time, yet there is only one set of debug
	 * registers. A real allocator would be better; for now the
	 * registers are split into two sets of two and a fixed base is
	 * hardcoded for each set.
	 */
	options.ndbrs = 2;
	options.nibrs = 2;

	options.libpfm_generic  = PFMLIB_AMD_X86_64_PMU;
	options.opt_support_gen = 1;
}

/*
 * Arm all four breakpoints of task pid by setting every lX bit in DR7.
 *
 * Returns 0 on success, -1 if DR7 cannot be read or written.
 */
int
pfmon_enable_all_breakpoints(pid_t pid)
{
	const unsigned long dr7_off = offsetof(struct user, u_debugreg[7]);
	dr7_t ctrl;

	ctrl.val = ptrace(PTRACE_PEEKUSER, pid, dr7_off, 0);
	if (ctrl.val == (unsigned long)-1)
		return -1;

	DPRINT(("dr7: peek=0x%lx\n", ctrl.val));

	/* turn on l0..l3 */
	ctrl.reg.l0 = 1;
	ctrl.reg.l1 = 1;
	ctrl.reg.l2 = 1;
	ctrl.reg.l3 = 1;

	DPRINT(("dr7: poke=0x%lx offs=%ld\n", ctrl.val, dr7_off));

	if (ptrace(PTRACE_POKEUSER, pid, dr7_off, (void *)ctrl.val) == -1) {
		warning("cannot poke dr7\n");
		return -1;
	}
	return 0;
}

/*
 * Disarm all four breakpoints of task pid by clearing every lX bit in DR7.
 *
 * Returns 0 on success, -1 if DR7 cannot be read or written.
 */
int
pfmon_disable_all_breakpoints(pid_t pid)
{
	const unsigned long dr7_off = offsetof(struct user, u_debugreg[7]);
	dr7_t ctrl;

	ctrl.val = ptrace(PTRACE_PEEKUSER, pid, dr7_off, 0);
	if (ctrl.val == (unsigned long)-1)
		return -1;

	DPRINT(("dr7=0x%lx\n", ctrl.val));

	/* turn off l0..l3 */
	ctrl.reg.l0 = 0;
	ctrl.reg.l1 = 0;
	ctrl.reg.l2 = 0;
	ctrl.reg.l3 = 0;

	DPRINT(("dr7: poke=0x%lx offs=%ld\n", ctrl.val, dr7_off));

	if (ptrace(PTRACE_POKEUSER, pid, dr7_off, (void *)ctrl.val) == -1) {
		warning("cannot poke dr7\n");
		return -1;
	}
	return 0;
}

/*
 * Check a code trigger address: no constraint is enforced here,
 * every address is accepted (returns 0).
 */
int
pfmon_validate_code_trigger_address(uintptr_t addr)
{
	(void)addr;

	return 0;
}
	
/*
 * Check a data trigger address: no constraint is enforced here,
 * every address is accepted (returns 0).
 */
int
pfmon_validate_data_trigger_address(uintptr_t addr)
{
	(void)addr;

	return 0;
}

void
pfmon_segv_handler_info(struct siginfo *si, void *sc)
{
	struct ucontext *uc;
	unsigned long ip;
	uc = (struct ucontext *)sc;
	ip = uc->uc_mcontext.gregs[REG_RIP];
	printf("<pfmon fatal error @ [%d:%d] ip=0x%lx>\n", getpid(), gettid(), ip);
}

/*
 * Find out which breakpoint stopped task pid.
 *
 * pid:     task stopped on a debug trap
 * addr:    receives the content of the debug register that fired
 * is_data: receives 0 when the rw field of that register is 0
 *          (instruction breakpoint), 1 otherwise
 *
 * The low four bits of DR6 identify the register; they are cleared in the
 * task before returning. Exactly one bit is expected to be set, any other
 * combination is fatal.
 *
 * Returns 0 on success, -1 if a register cannot be accessed or if no
 * breakpoint bit is set in DR6.
 *
 * XXX: this takes several ptrace() calls, there ought to be something
 * faster than this.
 */
int
pfmon_get_breakpoint_addr(pid_t pid, uintptr_t *addr, int *is_data)
{
	unsigned long off, word, hit;
	dr7_t ctrl;

	/* DR6: which breakpoint condition was detected */
	off  = offsetof(struct user, u_debugreg[6]);
	word = ptrace(PTRACE_PEEKUSER, (long)pid, off, 0);
	if (word == (unsigned long)-1) {
		warning("cannot peek %d\n", errno);
		return -1;
	}

	/* XXX: assume only one bit set */
	hit = word & 0xf;
	if (hit == 0) {
		warning("not a breakpoint\n");
		return -1;
	}
	DPRINT(("dr6=0x%lx which_reg=0x%lx\n", word, hit));

	/* write DR6 back with the four hit bits cleared */
	word &= ~0xf;
	word = ptrace(PTRACE_POKEUSER, pid, off, word);
	if (word == (unsigned long)-1) {
		warning("cannot clear dr6 %d\n", errno);
		return -1;
	}
	/* read back for the debug trace only */
	word = ptrace(PTRACE_PEEKUSER, (long)pid, off, 0);
	DPRINT(("dr6=0x%lx\n", word));

	/* DR7: tells whether the register was set up for code or data */
	off = offsetof(struct user, u_debugreg[7]);
	ctrl.val = ptrace(PTRACE_PEEKUSER, (long)pid, off, 0);
	if (ctrl.val == (unsigned long)-1) {
		warning("cannot peek %d\n", errno);
		return -1;
	}
	DPRINT(("dr7=0x%lx\n", ctrl.val));

	/*
	 * XXX: handle only one breakpoint at a time.
	 * Turn the DR6 bit mask into a register index.
	 */
	switch (hit) {
	case 1:
		*is_data = ctrl.reg.rw0 != 0;
		hit = 0;
		break;
	case 2:
		*is_data = ctrl.reg.rw1 != 0;
		hit = 1;
		break;
	case 4:
		*is_data = ctrl.reg.rw2 != 0;
		hit = 2;
		break;
	case 8:
		*is_data = ctrl.reg.rw3 != 0;
		hit = 3;
		break;
	default:
		fatal_error("cannot get breakpoint addr which_reg=0x%lx\n", hit);
	}

	/* finally fetch the address held by that debug register */
	off  = offsetof(struct user, u_debugreg[hit]);
	word = ptrace(PTRACE_PEEKUSER, (long)pid, off, 0);
	if (word == (unsigned long)-1) {
		warning("cannot peek %d\n", errno);
		return -1;
	}
	DPRINT(("is_data=%d addr=0x%lx\n", *is_data, word));

	*addr = word;
	return 0;
}

/*
 * Fetch the word located at the top of the stack of task pid.
 *
 * pid: ptrace-attached task
 * rp:  receives the word read at the address held in rsp
 *      (NOTE(review): this is the return address only if the task is
 *      stopped on the first instruction of a function -- confirm callers)
 *
 * PTRACE_PEEKUSER/PTRACE_PEEKDATA return the data itself, so a result of
 * -1 is ambiguous: errno is cleared before each call and checked
 * afterwards to tell a real failure from a word whose bits are all set.
 *
 * Returns 0 on success, -1 if the stack pointer or the stack word cannot
 * be read.
 */
int
pfmon_get_return_pointer(pid_t pid, uintptr_t *rp)
{
	unsigned long tmp;
	unsigned long offs;

	offs = offsetof(struct user, regs.rsp);
	errno = 0;
	tmp  = (unsigned long)ptrace(PTRACE_PEEKUSER, pid, offs, 0);
	if (tmp == (unsigned long)-1 && errno != 0) {
		warning("cannot retrieve return: %s\n", strerror(errno));
		return -1;
	}
	DPRINT((">>>>>stack pointer=0x%lx\n", tmp));

	errno = 0;
	tmp  = (unsigned long)ptrace(PTRACE_PEEKDATA, pid, tmp, 0);
	if (tmp == (unsigned long)-1 && errno != 0) {
		warning("cannot retrieve return: %s\n", strerror(errno));
		return -1;
	}
	DPRINT((">>>>>return pointer=0x%lx\n", tmp));

	*rp = tmp;
	return 0;
}

/*
 * The affinity calls are issued directly as system calls: depending on
 * the glibc version the library wrappers come with a different API,
 * whereas the kernel interface they map onto has been stable for quite
 * some time.
 *
 * Set the CPU affinity of pid from mask, whose size in bytes is size.
 * Returns the result of the system call.
 */
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#endif
int
__pfmon_set_affinity(pid_t pid, size_t size, pfmon_cpumask_t mask)
{
	long rc = syscall(__NR_sched_setaffinity, pid, size, mask);

	return (int)rc;
}

/*
 * Retrieve the CPU affinity of pid into mask, whose size in bytes is size.
 * The system call is invoked directly; its result is returned.
 */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#endif
int
__pfmon_get_affinity(pid_t pid, size_t size, pfmon_cpumask_t mask)
{
	long rc = syscall(__NR_sched_getaffinity, pid, size, mask);

	return (int)rc;
}

/*
 * Read the processor time stamp counter.
 *
 * t: receives the full 64-bit counter value
 *
 * rdtsc delivers the counter in two halves: the low 32 bits in EAX and
 * the high 32 bits in EDX. Both halves are captured and combined; keeping
 * only EAX would make the value wrap every 2^32 cycles.
 *
 * Always returns 0.
 */
int
pfmon_get_timestamp(uint64_t *t)
{
	uint32_t lo, hi;

	__asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));

	*t = ((uint64_t)hi << 32) | lo;
	return 0;
}
	
/*
 * Print a one-line summary of the host processors on fp.
 *
 * fp:  output stream
 * msg: optional prefix printed first (NULL prints nothing)
 *
 * The cache size, model name and stepping are looked up with
 * find_in_cpuinfo(). The strings it returns are released with free();
 * when a lookup fails a constant fallback text is printed instead, and
 * that constant is never passed to free().
 */
void
pfmon_print_simple_cpuinfo(FILE *fp, const char *msg)
{
	char *cache_buf = NULL, *name_buf = NULL, *step_buf = NULL;
	const char *name, *stepping;
	size_t cache_size = 0;

	/* size in KB; stays 0 when missing or not a number */
	if (find_in_cpuinfo("cache size", &cache_buf) == -1)
		cache_buf = NULL;
	if (cache_buf != NULL && sscanf(cache_buf, "%zu", &cache_size) != 1)
		cache_size = 0;
	free(cache_buf);

	if (find_in_cpuinfo("model name", &name_buf) == -1)
		name_buf = NULL;
	name = name_buf != NULL ? name_buf : "unknown";

	/*
	 * skip leading spaces; name_buf keeps the pointer to release
	 */
	while (*name == ' ') name++;

	if (find_in_cpuinfo("stepping", &step_buf) == -1)
		step_buf = NULL;
	stepping = step_buf != NULL ? step_buf : "??";

	fprintf(fp, "%s %lu-way %luMHz/%.1fMB -- %s (stepping %s)\n", 
		msg ? msg : "", 
		options.online_cpus, 
		options.cpu_mhz,
		(1.0*(double)cache_size)/1024,
		name, stepping);

	/* free(NULL) is a no-op, so failed lookups need no special case */
	free(name_buf);
	free(step_buf);
}

/*
 * Print the host CPU summary line on fp, prefixed with "# host CPUs: ".
 */
void
pfmon_print_cpuinfo(FILE *fp)
{
	static const char prefix[] = "# host CPUs: ";

	/* all CPUs are assumed to be identical: one line describes them all */
	pfmon_print_simple_cpuinfo(fp, prefix);
}
