diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c70684f..deb77ae 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -8,6 +8,7 @@ config 64BIT
 
 config X86_32
 	def_bool !64BIT
+	select IPIPE_WANT_CLOCKSOURCE if IPIPE
 	select CLKSRC_I8253
 
 config X86_64
@@ -17,6 +18,8 @@ config X86_64
 ### Arch settings
 config X86
 	def_bool y
+	select IPIPE_HAVE_HOSTRT if IPIPE
+	select IPIPE_HAVE_VM_NOTIFIER if IPIPE
 	select HAVE_AOUT if X86_32
 	select HAVE_UNSTABLE_SCHED_CLOCK
 	select HAVE_IDE
@@ -91,7 +94,7 @@ config X86
 	select GENERIC_CLOCKEVENTS
 	select ARCH_CLOCKSOURCE_DATA if X86_64
 	select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC)
-	select GENERIC_TIME_VSYSCALL if X86_64
+	select GENERIC_TIME_VSYSCALL if X86_64 || IPIPE
 	select KTIME_SCALAR if X86_32
 	select GENERIC_STRNCPY_FROM_USER
 	select GENERIC_STRNLEN_USER
@@ -549,6 +552,7 @@ config SCHED_OMIT_FRAME_POINTER
 
 menuconfig PARAVIRT_GUEST
 	bool "Paravirtualized guest support"
+	depends on !IPIPE
 	---help---
 	  Say Y here to get to see options related to running Linux under
 	  various hypervisors. This option alone does not add any kernel code.
@@ -593,6 +597,7 @@ source "arch/x86/lguest/Kconfig"
 
 config PARAVIRT
 	bool "Enable paravirtualization code"
+	depends on !IPIPE
 	---help---
 	  This changes the kernel so it can modify itself when it is run
 	  under a hypervisor, potentially improving performance significantly
@@ -632,10 +637,10 @@ config MEMTEST
 	---help---
 	  This option adds a kernel parameter 'memtest', which allows memtest
 	  to be set.
-	  memtest=0, mean disabled; -- default
-	  memtest=1, mean do 1 test pattern;
-	  ...
-	  memtest=4, mean do 4 test patterns.
+	  memtest=0, mean disabled; -- default
+	  memtest=1, mean do 1 test pattern;
+	  ...
+	  memtest=4, mean do 4 test patterns.
 	  If you are unsure how to answer this question, answer N.
 
 config X86_SUMMIT_NUMA
@@ -677,11 +682,11 @@ config APB_TIMER
 	select DW_APB_TIMER
 	depends on X86_INTEL_MID && SFI
 	help
-	  APB timer is the replacement for 8254, HPET on X86 MID platforms.
-	  The APBT provides a stable time base on SMP
-	  systems, unlike the TSC, but it is more expensive to access,
-	  as it is off-chip. APB timers are always running regardless of CPU
-	  C states, they are used as per CPU clockevent device when possible.
+	  APB timer is the replacement for 8254, HPET on X86 MID platforms.
+	  The APBT provides a stable time base on SMP
+	  systems, unlike the TSC, but it is more expensive to access,
+	  as it is off-chip. APB timers are always running regardless of CPU
+	  C states, they are used as per CPU clockevent device when possible.
 
 # Mark as expert because too many people got it wrong.
 # The code disables itself when not needed.
@@ -807,6 +812,8 @@ config IRQ_TIME_ACCOUNTING
 
 source "kernel/Kconfig.preempt"
 
+source "kernel/ipipe/Kconfig"
+
 config X86_UP_APIC
 	bool "Local APIC support on uniprocessors"
 	depends on X86_32 && !SMP && !X86_32_NON_STANDARD
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index e46c214..004a5eb 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -61,7 +61,7 @@ config EARLY_PRINTK_DBGP
 
 config DEBUG_STACKOVERFLOW
 	bool "Check for stack overflows"
-	depends on DEBUG_KERNEL
+	depends on DEBUG_KERNEL && !IPIPE
 	---help---
 	  Say Y here if you want to check the overflows of kernel, IRQ
 	  and exception stacks.
This option will cause messages of the diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index eaff479..664ca8a 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -476,7 +476,13 @@ static inline u32 safe_apic_wait_icr_idle(void) { return 0; } #endif /* CONFIG_X86_LOCAL_APIC */ +#ifdef CONFIG_IPIPE +#define ack_APIC_irq() do { } while(0) +static inline void __ack_APIC_irq(void) +#else /* !CONFIG_IPIPE */ +#define __ack_APIC_irq() ack_APIC_irq() static inline void ack_APIC_irq(void) +#endif /* CONFIG_IPIPE */ { /* * ack_APIC_irq() actually gets compiled as a single instruction diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index c46bb99..569f594 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -155,6 +155,7 @@ # define MAX_LOCAL_APIC 32768 #endif +#ifndef __ASSEMBLY__ /* * All x86-64 systems are xAPIC compatible. * In the following, "apicid" is a physical APIC ID. @@ -442,4 +443,6 @@ enum ioapic_irq_destination_types { dest_ExtINT = 7 }; +#endif /* !__ASSEMBLY__ */ + #endif /* _ASM_X86_APICDEF_H */ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 8bf1c06..449bf12 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -352,6 +353,13 @@ extern unsigned long used_vectors[]; static inline void alloc_system_vector(int vector) { if (!test_bit(vector, used_vectors)) { +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_IPIPE) + unsigned cpu; + + for_each_possible_cpu(cpu) + per_cpu(vector_irq, cpu)[vector] = + ipipe_apic_vector_irq(vector); +#endif set_bit(vector, used_vectors); if (first_system_vector > vector) first_system_vector = vector; diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 75f4c6d..605fc5e 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -352,7 +352,11 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta { fpu_switch_t fpu; +#ifndef CONFIG_IPIPE fpu.preload = tsk_used_math(new) && new->fpu_counter > 5; +#else + fpu.preload = 0; +#endif if (__thread_has_fpu(old)) { if (!__save_init_fpu(old)) cpu = ~0; @@ -360,15 +364,18 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta old->thread.fpu.has_fpu = 0; /* But leave fpu_owner_task! */ /* Don't change CR0.TS if we just switch! 
*/ +#ifndef CONFIG_IPIPE if (fpu.preload) { new->fpu_counter++; __thread_set_has_fpu(new); prefetch(new->thread.fpu.state); } else +#endif stts(); } else { old->fpu_counter = 0; old->thread.fpu.last_cpu = ~0; +#ifndef CONFIG_IPIPE if (fpu.preload) { new->fpu_counter++; if (fpu_lazy_restore(new, cpu)) @@ -377,6 +384,7 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta prefetch(new->thread.fpu.state); __thread_fpu_begin(new); } +#endif } return fpu; } @@ -389,10 +397,12 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta */ static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu) { +#ifndef CONFIG_IPIPE if (fpu.preload) { if (unlikely(restore_fpu_checking(new))) __thread_fpu_end(new); } +#endif } /* diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index eb92a6e..74ca8ee 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -35,6 +35,14 @@ extern void spurious_interrupt(void); extern void thermal_interrupt(void); extern void reschedule_interrupt(void); +#ifdef CONFIG_IPIPE +void ipipe_ipi0(void); +void ipipe_ipi1(void); +void ipipe_ipi2(void); +void ipipe_ipi3(void); +void ipipe_ipiX(void); +#endif + extern void invalidate_interrupt(void); extern void invalidate_interrupt0(void); extern void invalidate_interrupt1(void); @@ -147,6 +155,7 @@ extern void smp_apic_timer_interrupt(struct pt_regs *); extern void smp_spurious_interrupt(struct pt_regs *); extern void smp_x86_platform_ipi(struct pt_regs *); extern void smp_error_interrupt(struct pt_regs *); +extern void smp_irq_work_interrupt(struct pt_regs *); #ifdef CONFIG_X86_IO_APIC extern asmlinkage void smp_irq_move_cleanup_interrupt(void); #endif @@ -159,6 +168,7 @@ extern void smp_invalidate_interrupt(struct pt_regs *); #else extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *); #endif +extern asmlinkage void smp_reboot_interrupt(void); #endif extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void); diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index a203659..637db3a 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -24,7 +24,7 @@ extern unsigned int cached_irq_mask; #define SLAVE_ICW4_DEFAULT 0x01 #define PIC_ICW4_AEOI 2 -extern raw_spinlock_t i8259A_lock; +IPIPE_DECLARE_RAW_SPINLOCK(i8259A_lock); /* the PIC may need a careful delay on some platforms, hence specific calls */ static inline unsigned char inb_pic(unsigned int port) diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h index 615fa90..f60651a 100644 --- a/arch/x86/include/asm/ipi.h +++ b/arch/x86/include/asm/ipi.h @@ -61,13 +61,16 @@ static inline void __default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest) { /* - * Subtle. In the case of the 'never do double writes' workaround - * we have to lock out interrupts to be safe. As we don't care - * of the value read we use an atomic rmw access to avoid costly - * cli/sti. Otherwise we use an even cheaper single atomic write - * to the APIC. + * Subtle. In the case of the 'never do double writes' + * workaround we have to lock out interrupts to be safe. As + * we don't care of the value read we use an atomic rmw access + * to avoid costly cli/sti (except if running the interrupt + * pipeline). Otherwise we use an even cheaper single atomic + * write to the APIC. 
*/ - unsigned int cfg; + unsigned int cfg, flags; + + flags = hard_cond_local_irq_save(); /* * Wait for idle. @@ -83,6 +86,8 @@ __default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest * Send the IPI. The write to APIC_ICR fires this off. */ native_apic_mem_write(APIC_ICR, cfg); + + hard_cond_local_irq_restore(flags); } /* @@ -92,7 +97,9 @@ __default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest static inline void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest) { - unsigned long cfg; + unsigned long cfg, flags; + + flags = hard_cond_local_irq_save(); /* * Wait for idle. @@ -117,6 +124,8 @@ static inline void * Send the IPI. The write to APIC_ICR fires this off. */ native_apic_mem_write(APIC_ICR, cfg); + + hard_cond_local_irq_restore(flags); } extern void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, diff --git a/arch/x86/include/asm/ipipe.h b/arch/x86/include/asm/ipipe.h new file mode 100644 index 0000000..09f250e --- /dev/null +++ b/arch/x86/include/asm/ipipe.h @@ -0,0 +1,104 @@ +/* -*- linux-c -*- + * arch/x86/include/asm/ipipe.h + * + * Copyright (C) 2007 Philippe Gerum. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#ifndef __X86_IPIPE_H +#define __X86_IPIPE_H + +#ifdef CONFIG_IPIPE + +#define IPIPE_CORE_RELEASE 3 + +struct ipipe_domain; + +struct ipipe_arch_sysinfo { +}; + +/* + * The logical processor id and the current Linux task are read from the PDA, + * so this is always safe, regardless of the underlying stack. + */ +#define ipipe_processor_id() raw_smp_processor_id() +#define ipipe_safe_current() current + +#define ipipe_mm_switch_protect(flags) \ + do { (flags) = hard_cond_local_irq_save(); } while (0) +#define ipipe_mm_switch_unprotect(flags) \ + hard_cond_local_irq_restore(flags) + +/* Private interface -- Internal use only */ + +#define __ipipe_early_core_setup() do { } while(0) + +#define __ipipe_enable_irq(irq) irq_to_desc(irq)->chip->enable(irq) +#define __ipipe_disable_irq(irq) irq_to_desc(irq)->chip->disable(irq) +#define __ipipe_enable_irqdesc(ipd, irq) do { } while(0) +#define __ipipe_disable_irqdesc(ipd, irq) do { } while(0) + +#ifdef CONFIG_SMP +void __ipipe_hook_critical_ipi(struct ipipe_domain *ipd); +#else +#define __ipipe_hook_critical_ipi(ipd) do { } while(0) +#endif + +void __ipipe_enable_pipeline(void); + +#ifdef CONFIG_IPIPE_DEBUG +void __ipipe_serial_debug(const char *fmt, ...); +#else +#define __ipipe_serial_debug(fmt, args...) 
do { } while (0) +#endif + +#define __ipipe_syscall_watched_p(p, sc) \ + (ipipe_notifier_enabled_p(p) || (unsigned long)sc >= NR_syscalls) + +#define __ipipe_root_tick_p(regs) ((regs)->flags & X86_EFLAGS_IF) + +static inline void ipipe_mute_pic(void) { } + +static inline void ipipe_unmute_pic(void) { } + +static inline void ipipe_notify_root_preemption(void) +{ + __ipipe_notify_vm_preemption(); +} + +#else /* !CONFIG_IPIPE */ + +#define ipipe_mm_switch_protect(flags) do { (void)(flags); } while(0) +#define ipipe_mm_switch_unprotect(flags) do { (void)(flags); } while(0) + +#endif /* CONFIG_IPIPE */ + +#if defined(CONFIG_SMP) && defined(CONFIG_IPIPE) +#define __ipipe_move_root_irq(irq) \ + do { \ + if (irq < NR_IRQS) { \ + struct irq_desc *desc = irq_to_desc(irq); \ + struct irq_chip *chip = desc->irq_data.chip; \ + if (chip->irq_move) \ + chip->irq_move(&desc->irq_data); \ + } \ + } while (0) +#else /* !(CONFIG_SMP && CONFIG_IPIPE) */ +#define __ipipe_move_root_irq(irq) do { } while (0) +#endif /* !(CONFIG_SMP && CONFIG_IPIPE) */ + +#endif /* !__X86_IPIPE_H */ diff --git a/arch/x86/include/asm/ipipe_32.h b/arch/x86/include/asm/ipipe_32.h new file mode 100644 index 0000000..0410da9 --- /dev/null +++ b/arch/x86/include/asm/ipipe_32.h @@ -0,0 +1,96 @@ +/* -*- linux-c -*- + * arch/x86/include/asm/ipipe_32.h + * + * Copyright (C) 2002-2012 Philippe Gerum. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#ifndef __X86_IPIPE_32_H +#define __X86_IPIPE_32_H + +#include + +#define ipipe_read_tsc(t) \ + __asm__ __volatile__(ALTERNATIVE("call __ipipe_get_cs_tsc", \ + "rdtsc", \ + X86_FEATURE_TSC) : "=A"(t)) + +#define ipipe_tsc2ns(t) \ +({ \ + unsigned long long delta = (t) * 1000000ULL; \ + unsigned long long freq = __ipipe_hrclock_freq; \ + do_div(freq, 1000); \ + do_div(delta, (unsigned)freq + 1); \ + (unsigned long)delta; \ +}) + +#define ipipe_tsc2us(t) \ +({ \ + unsigned long long delta = (t) * 1000ULL; \ + unsigned long long freq = __ipipe_hrclock_freq; \ + do_div(freq, 1000); \ + do_div(delta, (unsigned)freq + 1); \ + (unsigned long)delta; \ +}) + +/* Private interface -- Internal use only */ + +extern unsigned int cpu_khz; +#define __ipipe_cpu_freq ({ unsigned long long __freq = 1000ULL * cpu_khz; __freq; }) + +#define ipipe_clock_name() \ + (cpu_has_tsc ? "tsc" : __ipipe_cs->name) + +#define __ipipe_hrclock_freq \ + (cpu_has_tsc ? 
__ipipe_cpu_freq : __ipipe_cs_freq) + +static inline unsigned long __ipipe_ffnz(unsigned long ul) +{ + __asm__("bsrl %1, %0":"=r"(ul) : "r"(ul)); + return ul; +} + +struct irq_desc; + +#define __root_irq_trampoline(__handler__, __regs__) \ + do { \ + __asm__ __volatile__("pushfl\n\t" \ + "orl %[x86if],(%%esp)\n\t" \ + "pushl %%cs\n\t" \ + "pushl $1f\n\t" \ + "pushl %%eax\n\t" \ + "pushl %%gs\n\t" \ + "pushl %%fs\n\t" \ + "pushl %%es\n\t" \ + "pushl %%ds\n\t" \ + "pushl %%eax\n\t" \ + "pushl %%ebp\n\t" \ + "pushl %%edi\n\t" \ + "pushl %%esi\n\t" \ + "pushl %%edx\n\t" \ + "pushl %%ecx\n\t" \ + "pushl %%ebx\n\t" \ + "call *%1\n\t" \ + "jmp ret_from_intr\n\t" \ + "1: cli\n" \ + : /* no output */ \ + : "a" (__regs__), \ + "r" (__handler__), \ + [x86if] "i" (X86_EFLAGS_IF)); \ + } while (0) + +#endif /* !__X86_IPIPE_32_H */ diff --git a/arch/x86/include/asm/ipipe_64.h b/arch/x86/include/asm/ipipe_64.h new file mode 100644 index 0000000..d000d7e --- /dev/null +++ b/arch/x86/include/asm/ipipe_64.h @@ -0,0 +1,95 @@ +/* -*- linux-c -*- + * arch/x86/include/asm/ipipe_64.h + * + * Copyright (C) 2007-2012 Philippe Gerum. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ */ + +#ifndef __X86_IPIPE_64_H +#define __X86_IPIPE_64_H + +#define ipipe_read_tsc(t) do { \ + unsigned int __a,__d; \ + asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \ + (t) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \ +} while(0) + +extern unsigned int cpu_khz; +#define __ipipe_cpu_freq ({ unsigned long long __freq = (1000ULL * cpu_khz); __freq; }) +#define __ipipe_hrclock_freq __ipipe_cpu_freq + +#define ipipe_tsc2ns(t) (((t) * 1000UL) / (__ipipe_hrclock_freq / 1000000UL)) +#define ipipe_tsc2us(t) ((t) / (__ipipe_hrclock_freq / 1000000UL)) + +static inline const char *ipipe_clock_name(void) +{ + return "tsc"; +} + +/* Private interface -- Internal use only */ + +static inline unsigned long __ipipe_ffnz(unsigned long ul) +{ + __asm__("bsrq %1, %0":"=r"(ul) + : "rm"(ul)); + return ul; +} + +struct irq_desc; + +#define __root_irq_trampoline(__handler__, __regs__) \ + do { \ + __asm__ __volatile__("movq %%rsp, %%rax\n\t" \ + "pushq $0\n\t" \ + "pushq %%rax\n\t" \ + "pushfq \n\t" \ + "orq %[x86if],(%%rsp)\n\t" \ + "pushq %[kernel_cs]\n\t" \ + "pushq $1f\n\t" \ + "pushq %[vector]\n\t" \ + "subq $9*8,%%rsp\n\t" \ + "movq %%rdi,8*8(%%rsp)\n\t" \ + "movq %%rsi,7*8(%%rsp)\n\t" \ + "movq %%rdx,6*8(%%rsp)\n\t" \ + "movq %%rcx,5*8(%%rsp)\n\t" \ + "movq %%rax,4*8(%%rsp)\n\t" \ + "movq %%r8,3*8(%%rsp)\n\t" \ + "movq %%r9,2*8(%%rsp)\n\t" \ + "movq %%r10,1*8(%%rsp)\n\t" \ + "movq %%r11,(%%rsp)\n\t" \ + "call *%[handler]\n\t" \ + "cli\n\t" \ + "jmp exit_intr\n\t" \ + "1: cli\n" \ + : /* no output */ \ + : [kernel_cs] "i" (__KERNEL_CS), \ + [vector] "rm" (__regs__->orig_ax), \ + [handler] "r" (__handler__), \ + "D" (__regs__), \ + [x86if] "i" (X86_EFLAGS_IF) \ + : "rax"); \ + } while (0) + +#ifdef CONFIG_PREEMPT +#define __ipipe_check_root_resched() \ + (preempt_count() == 0 && need_resched() && \ + per_cpu(irq_count, ipipe_processor_id()) < 0) +#else +#define __ipipe_check_root_resched() 0 +#endif + +#endif /* !__X86_IPIPE_64_H */ diff --git a/arch/x86/include/asm/ipipe_base.h b/arch/x86/include/asm/ipipe_base.h new file mode 100644 index 0000000..4af5264 --- /dev/null +++ b/arch/x86/include/asm/ipipe_base.h @@ -0,0 +1,226 @@ +/* -*- linux-c -*- + * arch/x86/include/asm/ipipe_base.h + * + * Copyright (C) 2007-2012 Philippe Gerum. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ */ + +#ifndef __X86_IPIPE_BASE_H +#define __X86_IPIPE_BASE_H + +#include +#include + +#ifdef CONFIG_X86_32 +/* 32 from IDT + iret_error + mayday trap */ +#define IPIPE_TRAP_MAYDAY 33 /* Internal recovery trap */ +#define IPIPE_NR_FAULTS 34 +#else +/* 32 from IDT + mayday trap */ +#define IPIPE_TRAP_MAYDAY 32 /* Internal recovery trap */ +#define IPIPE_NR_FAULTS 33 +#endif + +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) +/* + * Special APIC interrupts are mapped above the last defined external + * IRQ number. + */ +#define nr_apic_vectors (NR_VECTORS - FIRST_SYSTEM_VECTOR) +#define IPIPE_FIRST_APIC_IRQ NR_IRQS +#define IPIPE_HRTIMER_IPI ipipe_apic_vector_irq(IPIPE_HRTIMER_VECTOR) +#ifdef CONFIG_SMP +#define IPIPE_RESCHEDULE_IPI ipipe_apic_vector_irq(IPIPE_RESCHEDULE_VECTOR) +#define IPIPE_CRITICAL_IPI ipipe_apic_vector_irq(IPIPE_CRITICAL_VECTOR) +#endif /* CONFIG_SMP */ +#define IPIPE_NR_XIRQS (NR_IRQS + nr_apic_vectors) +#define ipipe_apic_irq_vector(irq) ((irq) - IPIPE_FIRST_APIC_IRQ + FIRST_SYSTEM_VECTOR) +#define ipipe_apic_vector_irq(vec) ((vec) - FIRST_SYSTEM_VECTOR + IPIPE_FIRST_APIC_IRQ) +#else /* !(CONFIG_X86_64 || CONFIG_X86_LOCAL_APIC) */ +#define IPIPE_NR_XIRQS NR_IRQS +#endif /* !(CONFIG_X86_64 || CONFIG_X86_LOCAL_APIC) */ + +#define ex_do_divide_error 0 +#define ex_do_debug 1 +/* NMI not pipelined. */ +#define ex_do_int3 3 +#define ex_do_overflow 4 +#define ex_do_bounds 5 +#define ex_do_invalid_op 6 +#define ex_do_device_not_available 7 +/* Double fault not pipelined. */ +#define ex_do_coprocessor_segment_overrun 9 +#define ex_do_invalid_TSS 10 +#define ex_do_segment_not_present 11 +#define ex_do_stack_segment 12 +#define ex_do_general_protection 13 +#define ex_do_page_fault 14 +#define ex_do_spurious_interrupt_bug 15 +#define ex_do_coprocessor_error 16 +#define ex_do_alignment_check 17 +#define ex_machine_check_vector 18 +#define ex_reserved ex_machine_check_vector +#define ex_do_simd_coprocessor_error 19 +#define ex_do_iret_error 32 + +#ifndef __ASSEMBLY__ + +#include + +#ifdef CONFIG_X86_32 +# include "ipipe_32.h" +#else +# include "ipipe_64.h" +#endif + +struct pt_regs; +struct irq_desc; +struct ipipe_vm_notifier; + +static inline unsigned __ipipe_get_irq_vector(int irq) +{ +#ifdef CONFIG_X86_IO_APIC + unsigned int __ipipe_get_ioapic_irq_vector(int irq); + return __ipipe_get_ioapic_irq_vector(irq); +#elif defined(CONFIG_X86_LOCAL_APIC) + return irq >= IPIPE_FIRST_APIC_IRQ && irq < IPIPE_NR_XIRQS ? 
+ ipipe_apic_irq_vector(irq) : irq + IRQ0_VECTOR; +#else + return irq + IRQ0_VECTOR; +#endif +} + +void __ipipe_halt_root(void); + +void ipipe_hrtimer_interrupt(void); + +void ipipe_reschedule_interrupt(void); + +void ipipe_critical_interrupt(void); + +int __ipipe_handle_irq(struct pt_regs *regs); + +void __ipipe_handle_vm_preemption(struct ipipe_vm_notifier *nfy); + +extern int __ipipe_hrtimer_irq; + +#ifdef CONFIG_SMP + +#include + +#ifdef CONFIG_X86_32 +#define GET_ROOT_STATUS_ADDR \ + "pushfl; cli;" \ + "movl %%fs:this_cpu_off, %%eax;" \ + "lea ipipe_percpu(%%eax), %%eax;" +#define PUT_ROOT_STATUS_ADDR "popfl;" +#define TEST_AND_SET_ROOT_STATUS \ + "btsl $0,(%%eax);" +#define TEST_ROOT_STATUS \ + "btl $0,(%%eax);" +#define ROOT_TEST_CLOBBER_LIST "eax" +#else /* CONFIG_X86_64 */ +#define GET_ROOT_STATUS_ADDR \ + "pushfq; cli;" \ + "movq %%gs:this_cpu_off, %%rax;" \ + "lea ipipe_percpu(%%rax), %%rax;" +#define PUT_ROOT_STATUS_ADDR "popfq;" +#define TEST_AND_SET_ROOT_STATUS \ + "btsl $0,(%%rax);" +#define TEST_ROOT_STATUS \ + "btl $0,(%%rax);" +#define ROOT_TEST_CLOBBER_LIST "rax" +#endif /* CONFIG_X86_64 */ + +static inline void ipipe_stall_root(void) +{ + __asm__ __volatile__(GET_ROOT_STATUS_ADDR + TEST_AND_SET_ROOT_STATUS + PUT_ROOT_STATUS_ADDR + : : : ROOT_TEST_CLOBBER_LIST, "memory"); +} + +static inline unsigned long ipipe_test_and_stall_root(void) +{ + int oldbit; + + __asm__ __volatile__(GET_ROOT_STATUS_ADDR + TEST_AND_SET_ROOT_STATUS + "sbbl %0,%0;" + PUT_ROOT_STATUS_ADDR + :"=r" (oldbit) + : : ROOT_TEST_CLOBBER_LIST, "memory"); + return oldbit; +} + +static inline unsigned long ipipe_test_root(void) +{ + int oldbit; + + __asm__ __volatile__(GET_ROOT_STATUS_ADDR + TEST_ROOT_STATUS + "sbbl %0,%0;" + PUT_ROOT_STATUS_ADDR + :"=r" (oldbit) + : : ROOT_TEST_CLOBBER_LIST); + return oldbit; +} + +#else /* !CONFIG_SMP */ + +extern unsigned long __ipipe_root_status; + +static inline void ipipe_stall_root(void) +{ + volatile unsigned long *p = &__ipipe_root_status; + __asm__ __volatile__("btsl $0,%0;" + :"+m" (*p) : : "memory"); +} + +static inline unsigned long ipipe_test_and_stall_root(void) +{ + volatile unsigned long *p = &__ipipe_root_status; + int oldbit; + + __asm__ __volatile__("btsl $0,%1;" + "sbbl %0,%0;" + :"=r" (oldbit), "+m" (*p) + : : "memory"); + return oldbit; +} + +static inline unsigned long ipipe_test_root(void) +{ + volatile unsigned long *p = &__ipipe_root_status; + int oldbit; + + __asm__ __volatile__("btl $0,%1;" + "sbbl %0,%0;" + :"=r" (oldbit) + :"m" (*p)); + return oldbit; +} + +#endif /* !CONFIG_SMP */ + +#ifdef CONFIG_IPIPE_LEGACY +#define __ipipe_tick_irq __ipipe_hrtimer_irq +#endif + +#endif /* !__ASSEMBLY__ */ + +#endif /* !__X86_IPIPE_BASE_H */ diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 4b44487..947e822 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -112,6 +112,11 @@ /* Xen vector callback to receive events in a HVM domain */ #define XEN_HVM_EVTCHN_CALLBACK 0xf3 +/* Interrupt pipeline IPIs */ +#define IPIPE_CRITICAL_VECTOR 0xf2 +#define IPIPE_HRTIMER_VECTOR 0xf1 +#define IPIPE_RESCHEDULE_VECTOR 0xf0 + /* * Local APIC timer IRQ vector is on a different priority level, * to work around the 'lost local interrupt if more than 2 IRQ @@ -130,6 +135,12 @@ #define INVALIDATE_TLB_VECTOR_START \ (INVALIDATE_TLB_VECTOR_END-NUM_INVALIDATE_TLB_VECTORS+1) +/* + * I-pipe: Lowest vector number which may be assigned to a special + * APIC IRQ. We must know this at build time. 
+ */ +#define FIRST_SYSTEM_VECTOR INVALIDATE_TLB_VECTOR_START + #define NR_VECTORS 256 #define FPU_IRQ 13 diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index bba3cf8..65567b6 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -4,6 +4,11 @@ #include #ifndef __ASSEMBLY__ + +#include +#include +#include + /* * Interrupt control: */ @@ -54,6 +59,13 @@ static inline void native_halt(void) asm volatile("hlt": : :"memory"); } +static inline int native_irqs_disabled(void) +{ + unsigned long flags = native_save_fl(); + + return !(flags & X86_EFLAGS_IF); +} + #endif #ifdef CONFIG_PARAVIRT @@ -64,22 +76,45 @@ static inline void native_halt(void) static inline notrace unsigned long arch_local_save_flags(void) { +#ifdef CONFIG_IPIPE + unsigned long flags; + + flags = (!ipipe_test_root()) << 9; + barrier(); + return flags; +#else return native_save_fl(); +#endif } static inline notrace void arch_local_irq_restore(unsigned long flags) { +#ifdef CONFIG_IPIPE + barrier(); + ipipe_restore_root(!(flags & X86_EFLAGS_IF)); +#else native_restore_fl(flags); +#endif } static inline notrace void arch_local_irq_disable(void) { +#ifdef CONFIG_IPIPE + ipipe_stall_root(); + barrier(); +#else native_irq_disable(); +#endif } static inline notrace void arch_local_irq_enable(void) { +#ifdef CONFIG_IPIPE + barrier(); + ipipe_unstall_root(); +#else native_irq_enable(); +#endif } /* @@ -88,7 +123,12 @@ static inline notrace void arch_local_irq_enable(void) */ static inline void arch_safe_halt(void) { +#ifdef CONFIG_IPIPE + barrier(); + __ipipe_halt_root(); +#else native_safe_halt(); +#endif } /* @@ -100,6 +140,20 @@ static inline void halt(void) native_halt(); } +/* Merge virtual+real interrupt mask bits into a single word. 
*/ +static inline unsigned long arch_mangle_irq_bits(int virt, unsigned long real) +{ + return (real & ~(1L << 31)) | ((unsigned long)(virt != 0) << 31); +} + +/* Converse operation of arch_mangle_irq_bits() */ +static inline int arch_demangle_irq_bits(unsigned long *x) +{ + int virt = (*x & (1L << 31)) != 0; + *x &= ~(1L << 31); + return virt; +} + /* * For spinlocks, etc: */ @@ -114,6 +168,14 @@ static inline notrace unsigned long arch_local_irq_save(void) #define ENABLE_INTERRUPTS(x) sti #define DISABLE_INTERRUPTS(x) cli +#ifdef CONFIG_IPIPE +#define HARD_COND_ENABLE_INTERRUPTS sti +#define HARD_COND_DISABLE_INTERRUPTS cli +#else /* !CONFIG_IPIPE */ +#define HARD_COND_ENABLE_INTERRUPTS +#define HARD_COND_DISABLE_INTERRUPTS +#endif /* !CONFIG_IPIPE */ + #ifdef CONFIG_X86_64 #define SWAPGS swapgs /* @@ -164,6 +226,121 @@ static inline int arch_irqs_disabled(void) return arch_irqs_disabled_flags(flags); } +static inline unsigned long hard_local_irq_save_notrace(void) +{ + unsigned long flags; + + flags = native_save_fl(); + native_irq_disable(); + + return flags; +} + +static inline void hard_local_irq_restore_notrace(unsigned long flags) +{ + native_restore_fl(flags); +} + +static inline void hard_local_irq_disable_notrace(void) +{ + native_irq_disable(); +} + +static inline void hard_local_irq_enable_notrace(void) +{ + native_irq_enable(); +} + +static inline int hard_irqs_disabled(void) +{ + return native_irqs_disabled(); +} + +#define hard_irqs_disabled_flags(flags) arch_irqs_disabled_flags(flags) + +#ifdef CONFIG_IPIPE_TRACE_IRQSOFF + +static inline void hard_local_irq_disable(void) +{ + if (!native_irqs_disabled()) { + native_irq_disable(); + ipipe_trace_begin(0x80000000); + } +} + +static inline void hard_local_irq_enable(void) +{ + if (native_irqs_disabled()) { + ipipe_trace_end(0x80000000); + native_irq_enable(); + } +} + +static inline unsigned long hard_local_irq_save(void) +{ + unsigned long flags; + + flags = native_save_fl(); + if (flags & X86_EFLAGS_IF) { + native_irq_disable(); + ipipe_trace_begin(0x80000001); + } + + return flags; +} + +static inline void hard_local_irq_restore(unsigned long flags) +{ + if (flags & X86_EFLAGS_IF) + ipipe_trace_end(0x80000001); + + native_restore_fl(flags); +} + +#else /* !CONFIG_IPIPE_TRACE_IRQSOFF */ + +static inline unsigned long hard_local_irq_save(void) +{ + return hard_local_irq_save_notrace(); +} + +static inline void hard_local_irq_restore(unsigned long flags) +{ + hard_local_irq_restore_notrace(flags); +} + +static inline void hard_local_irq_enable(void) +{ + hard_local_irq_enable_notrace(); +} + +static inline void hard_local_irq_disable(void) +{ + hard_local_irq_disable_notrace(); +} + +#endif /* CONFIG_IPIPE_TRACE_IRQSOFF */ + +static inline unsigned long hard_local_save_flags(void) +{ + return native_save_fl(); +} + +#ifndef CONFIG_IPIPE +#define hard_cond_local_irq_enable() do { } while(0) +#define hard_cond_local_irq_disable() do { } while(0) +#define hard_cond_local_irq_save() 0 +#define hard_cond_local_irq_restore(flags) do { (void)(flags); } while(0) +#endif + +#if defined(CONFIG_SMP) && defined(CONFIG_IPIPE) +#define hard_smp_local_irq_save() hard_local_irq_save() +#define hard_smp_local_irq_restore(flags) hard_local_irq_restore(flags) +#else /* !CONFIG_SMP */ +#define hard_smp_local_irq_save() 0 +#define hard_smp_local_irq_restore(flags) do { (void)(flags); } while(0) +#endif /* CONFIG_SMP */ + #else #ifdef CONFIG_X86_64 @@ -182,7 +359,10 @@ static inline int arch_irqs_disabled(void) pushl %eax; \ pushl %ecx; \ pushl 
%edx; \ + pushfl; \ + sti; \ call lockdep_sys_exit; \ + popfl; \ popl %edx; \ popl %ecx; \ popl %eax; @@ -191,8 +371,38 @@ static inline int arch_irqs_disabled(void) #endif #ifdef CONFIG_TRACE_IRQFLAGS +# ifdef CONFIG_IPIPE +# ifdef CONFIG_X86_64 +# define TRACE_IRQS_ON \ + call trace_hardirqs_on_thunk; \ + pushq %rax; \ + PER_CPU(ipipe_percpu, %rax); \ + btrl $0,(%rax); \ + popq %rax +# define TRACE_IRQS_OFF \ + pushq %rax; \ + PER_CPU(ipipe_percpu, %rax); \ + btsl $0,(%rax); \ + popq %rax; \ + call trace_hardirqs_off_thunk +# else /* CONFIG_X86_32 */ +# define TRACE_IRQS_ON \ + call trace_hardirqs_on_thunk; \ + pushl %eax; \ + PER_CPU(ipipe_percpu, %eax); \ + btrl $0,(%eax); \ + popl %eax +# define TRACE_IRQS_OFF \ + pushl %eax; \ + PER_CPU(ipipe_percpu, %eax); \ + btsl $0,(%eax); \ + popl %eax; \ + call trace_hardirqs_off_thunk +# endif /* CONFIG_X86_32 */ +# else /* !CONFIG_IPIPE */ # define TRACE_IRQS_ON call trace_hardirqs_on_thunk; # define TRACE_IRQS_OFF call trace_hardirqs_off_thunk; +# endif /* !CONFIG_IPIPE */ #else # define TRACE_IRQS_ON # define TRACE_IRQS_OFF diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index cdbf367..b1fe0d6 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -30,11 +30,14 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) #endif } -static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, - struct task_struct *tsk) +static inline void __switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) { unsigned cpu = smp_processor_id(); +#ifdef CONFIG_IPIPE_DEBUG_INTERNAL + WARN_ON_ONCE(!hard_irqs_disabled()); +#endif if (likely(prev != next)) { #ifdef CONFIG_SMP this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); @@ -71,10 +74,22 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, #endif } +static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + unsigned long flags; + flags = hard_cond_local_irq_save(); + __switch_mm(prev, next, tsk); + hard_cond_local_irq_restore(flags); +} + +#define ipipe_switch_mm_head(prev, next, tsk) \ + __switch_mm(prev, next, tsk) + #define activate_mm(prev, next) \ do { \ paravirt_activate_mm((prev), (next)); \ - switch_mm((prev), (next), NULL); \ + __switch_mm((prev), (next), NULL); \ } while (0); #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 320f7bb..e969fdd 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -1,7 +1,11 @@ #ifndef _ASM_X86_PAGE_64_DEFS_H #define _ASM_X86_PAGE_64_DEFS_H +#ifdef CONFIG_IPIPE +#define THREAD_SIZE_ORDER 2 +#else #define THREAD_SIZE_ORDER 1 +#endif #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) #define CURRENT_MASK (~(THREAD_SIZE - 1)) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 39bc577..59e4265 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -447,6 +447,7 @@ struct thread_struct { unsigned short ds; unsigned short fsindex; unsigned short gsindex; + unsigned long rip; #endif #ifdef CONFIG_X86_32 unsigned long ip; diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 41fc93a..6b9ab77 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -4,6 +4,10 @@ #ifdef __KERNEL__ +#include + 
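/*
 * Annotation, not part of the patch: a minimal usage sketch of the two
 * interrupt-masking levels set up by the asm/irqflags.h and
 * asm/mmu_context.h changes above. local_irq_save()/restore() and
 * hard_local_irq_save()/restore() are the helpers this patch provides;
 * the demo function itself is a made-up illustration.
 */
#include <linux/irqflags.h>

static void demo_two_level_masking(void)
{
	unsigned long vflags, hflags;

	/* Virtual masking: with CONFIG_IPIPE this only stalls the root
	 * (Linux) stage; hardware IRQs can still be taken by the pipeline
	 * and are replayed once the stall bit is cleared. */
	local_irq_save(vflags);
	/* ... Linux-only critical section ... */
	local_irq_restore(vflags);

	/* Hard masking: really toggles EFLAGS.IF, like the hard_* helpers
	 * introduced above, so the section is protected against all
	 * pipeline stages. */
	hflags = hard_local_irq_save();
	/* ... pipeline-safe critical section ... */
	hard_local_irq_restore(hflags);
}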
+DECLARE_PER_CPU(unsigned long, __ipipe_cr2); + static inline void native_clts(void) { asm volatile("clts"); @@ -117,15 +121,23 @@ static inline void write_cr0(unsigned long x) native_write_cr0(x); } +#ifdef CONFIG_IPIPE +#define read_cr2() __this_cpu_read(__ipipe_cr2) +#else static inline unsigned long read_cr2(void) { return native_read_cr2(); } +#endif +#ifdef CONFIG_IPIPE +#define write_cr2(x) __this_cpu_write(__ipipe_cr2, x) +#else static inline void write_cr2(unsigned long x) { native_write_cr2(x); } +#endif static inline unsigned long read_cr3(void) { diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 4ec45b3..b95b6d4 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -105,7 +105,11 @@ do { \ asm volatile(SAVE_CONTEXT \ "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ - "call __switch_to\n\t" \ + "movq $thread_return,%P[threadrip](%[prev])\n\t" /* save RIP */ \ + "pushq %P[threadrip](%[next])\n\t" /* restore RIP */ \ + "jmp __switch_to\n\t" \ + ".globl thread_return\n\t" \ + "thread_return:\n\t" \ "movq "__percpu_arg([current_task])",%%rsi\n\t" \ __switch_canary \ "movq %P[thread_info](%%rsi),%%r8\n\t" \ @@ -117,6 +121,7 @@ do { \ __switch_canary_oparam \ : [next] "S" (next), [prev] "D" (prev), \ [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \ + [threadrip] "i" (offsetof(struct task_struct, thread.rip)), \ [ti_flags] "i" (offsetof(struct thread_info, flags)), \ [_tif_fork] "i" (_TIF_FORK), \ [thread_info] "i" (offsetof(struct task_struct, stack)), \ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 89f794f..1167fbe 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -22,6 +22,7 @@ struct exec_domain; #include #include #include +#include struct thread_info { struct task_struct *task; /* main task structure */ @@ -42,6 +43,7 @@ struct thread_info { #endif unsigned int sig_on_uaccess_error:1; unsigned int uaccess_err:1; /* uaccess failed */ + struct ipipe_threadinfo ipipe_data; }; #define INIT_THREAD_INFO(tsk) \ diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 88eae2a..70cb836 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -85,9 +85,9 @@ extern int panic_on_unrecovered_nmi; void math_error(struct pt_regs *, int, int); void math_emulate(struct math_emu_info *); #ifndef CONFIG_X86_32 -asmlinkage void smp_thermal_interrupt(void); asmlinkage void mce_threshold_interrupt(void); #endif +asmlinkage void smp_thermal_interrupt(void); /* Interrupts/Exceptions */ enum { diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index c91e8b9..1637088 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -14,6 +14,7 @@ */ typedef unsigned long long cycles_t; +extern struct clocksource clocksource_tsc; extern unsigned int cpu_khz; extern unsigned int tsc_khz; diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 8215e56..06d515f 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -50,6 +50,7 @@ obj-y += reboot.o obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_PCI) += early-quirks.o +obj-$(CONFIG_IPIPE) += ipipe.o apm-y := apm_32.o obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_SMP) += smp.o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 39a222e..12758ee 100644 --- 
a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -466,7 +467,7 @@ static void lapic_timer_setup(enum clock_event_mode mode, if (evt->features & CLOCK_EVT_FEAT_DUMMY) return; - local_irq_save(flags); + flags = hard_local_irq_save(); switch (mode) { case CLOCK_EVT_MODE_PERIODIC: @@ -486,7 +487,7 @@ static void lapic_timer_setup(enum clock_event_mode mode, break; } - local_irq_restore(flags); + hard_local_irq_restore(flags); } /* @@ -499,6 +500,17 @@ static void lapic_timer_broadcast(const struct cpumask *mask) #endif } +#ifdef CONFIG_IPIPE +static void lapic_itimer_ack(void) +{ + __ack_APIC_irq(); +} + +static DEFINE_PER_CPU(struct ipipe_timer, lapic_itimer) = { + .irq = ipipe_apic_vector_irq(LOCAL_TIMER_VECTOR), + .ack = lapic_itimer_ack, +}; +#endif /* CONFIG_IPIPE */ /* * The local apic timer can be used for any function which is CPU local. @@ -532,6 +544,16 @@ static void __cpuinit setup_APIC_timer(void) memcpy(levt, &lapic_clockevent, sizeof(*levt)); levt->cpumask = cpumask_of(smp_processor_id()); +#ifdef CONFIG_IPIPE + if (!(lapic_clockevent.features & CLOCK_EVT_FEAT_DUMMY)) + levt->ipipe_timer = &__get_cpu_var(lapic_itimer); + else { + static atomic_t once = ATOMIC_INIT(-1); + if (atomic_inc_and_test(&once)) + printk(KERN_INFO + "I-pipe: cannot use LAPIC as a tick device\n"); + } +#endif /* CONFIG_IPIPE */ clockevents_register_device(levt); } @@ -1034,7 +1056,7 @@ void lapic_shutdown(void) if (!cpu_has_apic && !apic_from_smp_config()) return; - local_irq_save(flags); + flags = hard_local_irq_save(); #ifdef CONFIG_X86_32 if (!enabled_via_apicbase) @@ -1044,7 +1066,7 @@ void lapic_shutdown(void) disable_local_APIC(); - local_irq_restore(flags); + hard_local_irq_restore(flags); } /* @@ -1316,7 +1338,7 @@ void __cpuinit setup_local_APIC(void) value = apic_read(APIC_ISR + i*0x10); for (j = 31; j >= 0; j--) { if (value & (1<> 1)); if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) - ack_APIC_irq(); + __ack_APIC_irq(); inc_irq_stat(irq_spurious_count); @@ -2198,7 +2220,7 @@ static void lapic_resume(void) if (!apic_pm_state.active) return; - local_irq_save(flags); + flags = hard_local_irq_save(); if (irq_remapping_enabled) { /* * IO-APIC and PIC have their own resume routines. 
@@ -2254,7 +2276,7 @@ static void lapic_resume(void) if (irq_remapping_enabled) irq_remapping_reenable(x2apic_mode); - local_irq_restore(flags); + hard_local_irq_restore(flags); } /* diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 0e881c4..ecba51e 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -79,9 +79,9 @@ static inline void _flat_send_IPI_mask(unsigned long mask, int vector) { unsigned long flags; - local_irq_save(flags); + flags = hard_local_irq_save(); __default_send_IPI_dest_field(mask, vector, apic->dest_logical); - local_irq_restore(flags); + hard_local_irq_restore(flags); } static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5f0ff59..08e5ad4 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -90,8 +90,8 @@ static inline void irq_remap_modify_chip_defaults(struct irq_chip *chip) */ int sis_apic_bug = -1; -static DEFINE_RAW_SPINLOCK(ioapic_lock); -static DEFINE_RAW_SPINLOCK(vector_lock); +static IPIPE_DEFINE_RAW_SPINLOCK(ioapic_lock); +static IPIPE_DEFINE_RAW_SPINLOCK(vector_lock); static struct ioapic { /* @@ -400,6 +400,7 @@ static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e union entry_union eu = {{0, 0}}; eu.entry = e; + io_apic_write(apic, 0x11 + 2*pin, eu.w2); io_apic_write(apic, 0x10 + 2*pin, eu.w1); } @@ -524,18 +525,24 @@ static void io_apic_sync(struct irq_pin_list *entry) readl(&io_apic->data); } -static void mask_ioapic(struct irq_cfg *cfg) +static inline void __mask_ioapic(struct irq_cfg *cfg) +{ + io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); +} + +static void mask_ioapic(unsigned int irq, struct irq_cfg *cfg) { unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); + ipipe_lock_irq(irq); + __mask_ioapic(cfg); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void mask_ioapic_irq(struct irq_data *data) { - mask_ioapic(data->chip_data); + mask_ioapic(data->irq, data->chip_data); } static void __unmask_ioapic(struct irq_cfg *cfg) @@ -543,18 +550,19 @@ static void __unmask_ioapic(struct irq_cfg *cfg) io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL); } -static void unmask_ioapic(struct irq_cfg *cfg) +static void unmask_ioapic(unsigned int irq, struct irq_cfg *cfg) { unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); __unmask_ioapic(cfg); + ipipe_unlock_irq(irq); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void unmask_ioapic_irq(struct irq_data *data) { - unmask_ioapic(data->chip_data); + unmask_ioapic(data->irq, data->chip_data); } /* @@ -606,17 +614,27 @@ static void __eoi_ioapic_pin(int apic, int pin, int vector, struct irq_cfg *cfg) } } -static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) +static void __eoi_ioapic_irq(struct irq_cfg *cfg) { struct irq_pin_list *entry; - unsigned long flags; - raw_spin_lock_irqsave(&ioapic_lock, flags); for_each_irq_pin(entry, cfg->irq_2_pin) __eoi_ioapic_pin(entry->apic, entry->pin, cfg->vector, cfg); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); } +#if !defined(CONFIG_IPIPE) || defined(CONFIG_SMP) + +static inline void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + __eoi_ioapic_irq(cfg); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + 
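/*
 * Annotation, not part of the patch: the DEFINE_RAW_SPINLOCK ->
 * IPIPE_DEFINE_RAW_SPINLOCK conversion above makes ioapic_lock and
 * vector_lock pipeline-aware, so the usual raw_spin_lock_irqsave()
 * section runs with hard IRQs off and cannot be preempted by a
 * head-domain handler. A hedged sketch of the same pattern for some
 * other lock (demo_lock and demo_update are made-up names):
 */
static IPIPE_DEFINE_RAW_SPINLOCK(demo_lock);

static void demo_update(void)
{
	unsigned long flags;

	/* With CONFIG_IPIPE, irqsave on an I-pipe lock hard-disables IRQs. */
	raw_spin_lock_irqsave(&demo_lock, flags);
	/* ... touch state shared with out-of-band (head domain) code ... */
	raw_spin_unlock_irqrestore(&demo_lock, flags);
}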
+#endif /* !CONFIG_IPIPE || CONFIG_SMP */ + static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) { struct IO_APIC_route_entry entry; @@ -1246,6 +1264,9 @@ void __setup_vector_irq(int cpu) } /* Mark the free vectors */ for (vector = 0; vector < NR_VECTORS; ++vector) { + /* I-pipe requires initialized vector_irq for system vectors */ + if (test_bit(vector, used_vectors)) + continue; irq = per_cpu(vector_irq, cpu)[vector]; if (irq < 0) continue; @@ -1272,8 +1293,8 @@ static inline int IO_APIC_irq_trigger(int irq) } } /* - * nonexistent IRQs are edge default - */ + * nonexistent IRQs are edge default + */ return 0; } #else @@ -1338,6 +1359,19 @@ static int setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry, return 0; } +#ifdef CONFIG_IPIPE +static void mask_legacy_irq(unsigned irq) +{ + unsigned long flags; + legacy_pic->mask(irq); + flags = hard_local_irq_save(); + __ipipe_unlock_irq(irq); + hard_local_irq_restore(flags); +} +#else /* !CONFIG_IPIPE */ +#define mask_legacy_irq(irq) legacy_pic->mask(irq) +#endif /* !CONFIG_IPIPE */ + static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, struct io_apic_irq_attr *attr) { @@ -1375,7 +1409,7 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, ioapic_register_intr(irq, cfg, attr->trigger); if (irq < legacy_pic->nr_legacy_irqs) - legacy_pic->mask(irq); + mask_legacy_irq(irq); ioapic_write_entry(attr->ioapic, attr->ioapic_pin, entry); } @@ -2167,6 +2201,7 @@ static unsigned int startup_ioapic_irq(struct irq_data *data) was_pending = 1; } __unmask_ioapic(data->chip_data); + ipipe_unlock_irq(irq); raw_spin_unlock_irqrestore(&ioapic_lock, flags); return was_pending; @@ -2284,7 +2319,8 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) exit_idle(); me = smp_processor_id(); - for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { + for (vector = FIRST_EXTERNAL_VECTOR + 1; vector < NR_VECTORS; + vector++) { unsigned int irq; unsigned int irr; struct irq_desc *desc; @@ -2364,14 +2400,16 @@ static inline void irq_complete_move(struct irq_cfg *cfg) { } static void ack_apic_edge(struct irq_data *data) { +#ifndef CONFIG_IPIPE irq_complete_move(data->chip_data); irq_move_irq(data); - ack_APIC_irq(); +#endif /* CONFIG_IPIPE */ + __ack_APIC_irq(); } atomic_t irq_mis_count; -#ifdef CONFIG_GENERIC_PENDING_IRQ +#if defined(CONFIG_GENERIC_PENDING_IRQ) || (defined(CONFIG_IPIPE) && defined(CONFIG_SMP)) static bool io_apic_level_ack_pending(struct irq_cfg *cfg) { struct irq_pin_list *entry; @@ -2399,7 +2437,7 @@ static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg) { /* If we are moving the irq we need to mask it */ if (unlikely(irqd_is_setaffinity_pending(data))) { - mask_ioapic(cfg); + mask_ioapic(data->irq, cfg); return true; } return false; @@ -2437,7 +2475,7 @@ static inline void ioapic_irqd_unmask(struct irq_data *data, */ if (!io_apic_level_ack_pending(cfg)) irq_move_masked_irq(data); - unmask_ioapic(cfg); + unmask_ioapic(data->irq, cfg); } } #else @@ -2451,12 +2489,44 @@ static inline void ioapic_irqd_unmask(struct irq_data *data, } #endif +#if defined(CONFIG_IPIPE) && defined(CONFIG_SMP) + +static void move_xxapic_irq(struct irq_data *data) +{ + unsigned int irq = data->irq; + struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg = data->chip_data; + + if (desc->handle_irq == &handle_edge_irq) { + raw_spin_lock(&desc->lock); + irq_complete_move(cfg); + irq_move_irq(data); + raw_spin_unlock(&desc->lock); + } else if (desc->handle_irq == 
&handle_fasteoi_irq) { + raw_spin_lock(&desc->lock); + irq_complete_move(cfg); + if (irq_remapped(cfg)) + eoi_ioapic_irq(irq, cfg); + if (unlikely(irqd_is_setaffinity_pending(data))) { + if (!io_apic_level_ack_pending(cfg)) + irq_move_masked_irq(data); + unmask_ioapic(irq, cfg); + } + raw_spin_unlock(&desc->lock); + } else + WARN_ON_ONCE(1); +} + +#endif /* CONFIG_IPIPE && CONFIG_SMP */ + static void ack_apic_level(struct irq_data *data) { struct irq_cfg *cfg = data->chip_data; - int i, irq = data->irq; unsigned long v; + int i; +#ifndef CONFIG_IPIPE bool masked; + int irq = data->irq; irq_complete_move(cfg); masked = ioapic_irqd_mask(data, cfg); @@ -2516,17 +2586,59 @@ static void ack_apic_level(struct irq_data *data) } ioapic_irqd_unmask(data, cfg, masked); +#else /* CONFIG_IPIPE */ + /* + * Prevent low priority IRQs grabbed by high priority domains + * from being delayed, waiting for a high priority interrupt + * handler running in a low priority domain to complete. + * This code assumes hw interrupts off. + */ + i = cfg->vector; + v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); + if (unlikely(!(v & (1 << (i & 0x1f))))) { + /* IO-APIC erratum: see comment above. */ + atomic_inc(&irq_mis_count); + raw_spin_lock(&ioapic_lock); + __eoi_ioapic_irq(cfg); + raw_spin_unlock(&ioapic_lock); + } + __ack_APIC_irq(); +#endif /* CONFIG_IPIPE */ +} + +#ifdef CONFIG_IPIPE + +static void hold_ioapic_irq(struct irq_data *data) +{ + struct irq_cfg *cfg = data->chip_data; + + raw_spin_lock(&ioapic_lock); + __mask_ioapic(cfg); + raw_spin_unlock(&ioapic_lock); + ack_apic_level(data); +} + +static void release_ioapic_irq(struct irq_data *data) +{ + struct irq_cfg *cfg = data->chip_data; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + __unmask_ioapic(cfg); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); } +#endif /* CONFIG_IPIPE */ + #ifdef CONFIG_IRQ_REMAP static void ir_ack_apic_edge(struct irq_data *data) { - ack_APIC_irq(); + __ack_APIC_irq(); } static void ir_ack_apic_level(struct irq_data *data) { - ack_APIC_irq(); + __ack_APIC_irq(); eoi_ioapic_irq(data->irq, data->chip_data); } @@ -2543,6 +2655,13 @@ static void irq_remap_modify_chip_defaults(struct irq_chip *chip) #ifdef CONFIG_SMP chip->irq_set_affinity = set_remapped_irq_affinity; +#ifdef CONFIG_IPIPE + chip->irq_move = move_xxapic_irq; +#endif +#endif +#ifdef CONFIG_IPIPE + chip->irq_hold = hold_ioapic_irq; + chip->irq_release = release_ioapic_irq; #endif } #endif /* CONFIG_IRQ_REMAP */ @@ -2556,6 +2675,13 @@ static struct irq_chip ioapic_chip __read_mostly = { .irq_eoi = ack_apic_level, #ifdef CONFIG_SMP .irq_set_affinity = ioapic_set_affinity, +#ifdef CONFIG_IPIPE + .irq_move = move_xxapic_irq, +#endif +#endif +#ifdef CONFIG_IPIPE + .irq_hold = hold_ioapic_irq, + .irq_release = release_ioapic_irq, #endif .irq_retrigger = ioapic_retrigger_irq, }; @@ -2599,23 +2725,29 @@ static inline void init_IO_APIC_traps(void) static void mask_lapic_irq(struct irq_data *data) { - unsigned long v; + unsigned long v, flags; + flags = hard_cond_local_irq_save(); + ipipe_lock_irq(data->irq); v = apic_read(APIC_LVT0); apic_write(APIC_LVT0, v | APIC_LVT_MASKED); + hard_cond_local_irq_restore(flags); } static void unmask_lapic_irq(struct irq_data *data) { - unsigned long v; + unsigned long v, flags; + flags = hard_cond_local_irq_save(); v = apic_read(APIC_LVT0); apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); + ipipe_unlock_irq(data->irq); + hard_cond_local_irq_restore(flags); } static void ack_lapic_irq(struct irq_data *data) { - 
ack_APIC_irq(); + __ack_APIC_irq(); } static struct irq_chip lapic_chip __read_mostly = { @@ -2623,6 +2755,9 @@ static struct irq_chip lapic_chip __read_mostly = { .irq_mask = mask_lapic_irq, .irq_unmask = unmask_lapic_irq, .irq_ack = ack_lapic_irq, +#if defined(CONFIG_IPIPE) && defined(CONFIG_SMP) + .irq_move = move_xxapic_irq, +#endif }; static void lapic_register_intr(int irq) @@ -2723,7 +2858,7 @@ static inline void __init check_timer(void) /* * get/set the timer IRQ vector: */ - legacy_pic->mask(0); + mask_legacy_irq(0); assign_irq_vector(0, cfg, apic->target_cpus()); /* @@ -2781,7 +2916,7 @@ static inline void __init check_timer(void) int idx; idx = find_irq_entry(apic1, pin1, mp_INT); if (idx != -1 && irq_trigger(idx)) - unmask_ioapic(cfg); + unmask_ioapic(0, cfg); } if (timer_irq_works()) { if (disable_timer_pin_1 > 0) @@ -2815,7 +2950,7 @@ static inline void __init check_timer(void) * Cleanup, just in case ... */ local_irq_disable(); - legacy_pic->mask(0); + mask_legacy_irq(0); clear_IO_APIC_pin(apic2, pin2); apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); } @@ -2824,6 +2959,10 @@ static inline void __init check_timer(void) "...trying to set up timer as Virtual Wire IRQ...\n"); lapic_register_intr(0); +#if defined(CONFIG_IPIPE) && defined(CONFIG_X86_64) + irq_to_desc(0)->ipipe_ack = __ipipe_ack_edge_irq; + irq_to_desc(0)->ipipe_end = __ipipe_nop_irq; +#endif apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ legacy_pic->unmask(0); @@ -2832,7 +2971,7 @@ static inline void __init check_timer(void) goto out; } local_irq_disable(); - legacy_pic->mask(0); + mask_legacy_irq(0); apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); @@ -2890,8 +3029,8 @@ void __init setup_IO_APIC(void) apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); /* - * Set up IO-APIC IRQ routing. - */ + * Set up IO-APIC IRQ routing. + */ x86_init.mpparse.setup_ioapic_ids(); sync_Arb_IDs(); @@ -3107,6 +3246,9 @@ static struct irq_chip msi_chip = { .irq_ack = ack_apic_edge, #ifdef CONFIG_SMP .irq_set_affinity = msi_set_affinity, +#ifdef CONFIG_IPIPE + .irq_move = move_xxapic_irq, +#endif #endif .irq_retrigger = ioapic_retrigger_irq, }; @@ -3352,6 +3494,9 @@ static struct irq_chip ht_irq_chip = { .irq_ack = ack_apic_edge, #ifdef CONFIG_SMP .irq_set_affinity = ht_set_affinity, +#ifdef CONFIG_IPIPE + .irq_move = move_xxapic_irq, +#endif #endif .irq_retrigger = ioapic_retrigger_irq, }; @@ -3634,6 +3779,18 @@ int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity) return 0; } +#ifdef CONFIG_IPIPE +unsigned __ipipe_get_ioapic_irq_vector(int irq) +{ + if (irq >= IPIPE_FIRST_APIC_IRQ && irq < IPIPE_NR_XIRQS) + return ipipe_apic_irq_vector(irq); + else if (irq == IRQ_MOVE_CLEANUP_VECTOR) + return irq; + else + return irq_cfg(irq)->vector; +} +#endif /* CONFIG_IPIPE */ + /* * This function currently is only a helper for the i386 smp boot process where * we need to reprogram the ioredtbls to cater for the cpus which have come online diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index cce91bf..83d88a8 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -29,12 +29,12 @@ void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector) * to an arbitrary mask, so I do a unicast to each CPU instead. 
* - mbligh */ - local_irq_save(flags); + flags = hard_local_irq_save(); for_each_cpu(query_cpu, mask) { __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, query_cpu), vector, APIC_DEST_PHYSICAL); } - local_irq_restore(flags); + hard_local_irq_restore(flags); } void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, @@ -46,14 +46,14 @@ void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, /* See Hack comment above */ - local_irq_save(flags); + flags = hard_local_irq_save(); for_each_cpu(query_cpu, mask) { if (query_cpu == this_cpu) continue; __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, query_cpu), vector, APIC_DEST_PHYSICAL); } - local_irq_restore(flags); + hard_local_irq_restore(flags); } #ifdef CONFIG_X86_32 @@ -70,12 +70,12 @@ void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, * should be modified to do 1 message per cluster ID - mbligh */ - local_irq_save(flags); + flags = hard_local_irq_save(); for_each_cpu(query_cpu, mask) __default_send_IPI_dest_field( early_per_cpu(x86_cpu_to_logical_apicid, query_cpu), vector, apic->dest_logical); - local_irq_restore(flags); + hard_local_irq_restore(flags); } void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, @@ -87,7 +87,7 @@ void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, /* See Hack comment above */ - local_irq_save(flags); + flags = hard_local_irq_save(); for_each_cpu(query_cpu, mask) { if (query_cpu == this_cpu) continue; @@ -95,7 +95,7 @@ void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, early_per_cpu(x86_cpu_to_logical_apicid, query_cpu), vector, apic->dest_logical); } - local_irq_restore(flags); + hard_local_irq_restore(flags); } /* @@ -109,10 +109,10 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) if (WARN_ONCE(!mask, "empty IPI mask")) return; - local_irq_save(flags); + flags = hard_local_irq_save(); WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]); __default_send_IPI_dest_field(mask, vector, apic->dest_logical); - local_irq_restore(flags); + hard_local_irq_restore(flags); } void default_send_IPI_allbutself(int vector) diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index ff35cff..48f1919 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -35,7 +35,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) x2apic_wrmsr_fence(); - local_irq_save(flags); + flags = hard_local_irq_save(); this_cpu = smp_processor_id(); @@ -72,7 +72,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) cpumask_andnot(ipi_mask_ptr, ipi_mask_ptr, cpus_in_cluster_ptr); } - local_irq_restore(flags); + hard_local_irq_restore(flags); } static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index c17e982..fb9aeb9 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -43,7 +43,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) x2apic_wrmsr_fence(); - local_irq_save(flags); + flags = hard_local_irq_save(); this_cpu = smp_processor_id(); for_each_cpu(query_cpu, mask) { @@ -52,7 +52,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), vector, APIC_DEST_PHYSICAL); } - local_irq_restore(flags); + 
hard_local_irq_restore(flags); } static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index 68a3343..057710f 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -18,7 +18,7 @@ cyrix_get_arr(unsigned int reg, unsigned long *base, arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */ - local_irq_save(flags); + flags = hard_local_irq_save(); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ @@ -28,7 +28,7 @@ cyrix_get_arr(unsigned int reg, unsigned long *base, rcr = getCx86(CX86_RCR_BASE + reg); setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ - local_irq_restore(flags); + hard_local_irq_restore(flags); shift = ((unsigned char *) base)[1] & 0x0f; *base >>= PAGE_SHIFT; @@ -178,6 +178,7 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) { unsigned char arr, arr_type, arr_size; + unsigned long flags; arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */ @@ -221,6 +222,8 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base, } } + flags = hard_local_irq_save(); + prepare_set(); base <<= PAGE_SHIFT; @@ -230,6 +233,8 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base, setCx86(CX86_RCR_BASE + reg, arr_type); post_set(); + + hard_local_irq_restore(flags); } typedef struct { @@ -247,8 +252,10 @@ static unsigned char ccr_state[7] = { 0, 0, 0, 0, 0, 0, 0 }; static void cyrix_set_all(void) { + unsigned long flags; int i; + flags = hard_local_irq_save(); prepare_set(); /* the CCRs are not contiguous */ @@ -263,6 +270,7 @@ static void cyrix_set_all(void) } post_set(); + hard_local_irq_restore(flags); } static const struct mtrr_ops cyrix_mtrr_ops = { diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 75772ae..98b3d0f 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -718,7 +718,7 @@ static void generic_set_all(void) unsigned long mask, count; unsigned long flags; - local_irq_save(flags); + flags = hard_local_irq_save(); prepare_set(); /* Actually set the state */ @@ -728,7 +728,7 @@ static void generic_set_all(void) pat_init(); post_set(); - local_irq_restore(flags); + hard_local_irq_restore(flags); /* Use the atomic bitops to update the global mask */ for (count = 0; count < sizeof mask * 8; ++count) { @@ -752,12 +752,13 @@ static void generic_set_all(void) static void generic_set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) { - unsigned long flags; + unsigned long rflags, vflags; struct mtrr_var_range *vr; vr = &mtrr_state.var_ranges[reg]; - local_irq_save(flags); + local_irq_save(vflags); + rflags = hard_local_irq_save(); prepare_set(); if (size == 0) { @@ -778,7 +779,8 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base, } post_set(); - local_irq_restore(flags); + hard_local_irq_restore(rflags); + local_irq_restore(vflags); } int generic_validate_add_page(unsigned long base, unsigned long size, diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index e0b1d78..b4f0bcc 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -92,6 +92,9 @@ void show_regs(struct pt_regs *regs) printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n", TASK_COMM_LEN, current->comm, task_pid_nr(current), current_thread_info(), current, 
task_thread_info(current)); +#ifdef CONFIG_IPIPE + printk(KERN_EMERG "I-pipe domain %s\n", ipipe_current_domain->name); +#endif /* CONFIG_IPIPE */ /* * When in-kernel, we also print out the stack and code at the * time of the fault.. diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 791b761..0a1626f 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -256,6 +256,11 @@ void show_regs(struct pt_regs *regs) printk("CPU %d ", cpu); print_modules(); __show_regs(regs, 1); +#ifdef CONFIG_IPIPE + if (ipipe_current_domain != ipipe_root_domain) + printk("I-pipe domain %s\n", ipipe_current_domain->name); + else +#endif /* CONFIG_IPIPE */ printk("Process %s (pid: %d, threadinfo %p, task %p)\n", cur->comm, cur->pid, task_thread_info(cur), cur); diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 623f288..593b754 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -83,6 +84,64 @@ * enough to patch inline, increasing performance. */ +#ifdef CONFIG_IPIPE +#define CATCH_ROOT_SYSCALL(bypass_check,bypass_nocheck) \ + movl %esp,%eax ; \ + call __ipipe_syscall_root ; \ + testl %eax,%eax ; \ + movl PT_EAX(%esp),%eax ; \ + js bypass_check ; \ + jne bypass_nocheck ; \ + movl PT_ORIG_EAX(%esp),%eax +#define PUSH_XCODE(v) pushl $ ex_ ## v +#define PUSH_XCODE_CFI(v) pushl $ ex_ ## v ; CFI_ADJUST_CFA_OFFSET 4 +#define PUSH_XVEC(v) pushl $ ex_ ## v +#define PUSH_XVEC_CFI(v) pushl $ ex_ ## v ; CFI_ADJUST_CFA_OFFSET 4 +#define HANDLE_EXCEPTION(code) movl %code,%ecx ; \ + call __ipipe_handle_exception ; \ + testl %eax,%eax ; \ + jnz restore_nocheck +#define DIVERT_EXCEPTION(code) movl $(__USER_DS), %ecx ; \ + movl %ecx, %ds ; \ + movl %ecx, %es ; \ + movl %esp, %eax ; \ + movl $ex_ ## code,%edx ; \ + call __ipipe_divert_exception ; \ + testl %eax,%eax ; \ + jnz restore_nocheck +#define PREEMPT_SCHEDULE_IRQ call __ipipe_preempt_schedule_irq + +#ifdef CONFIG_IPIPE_TRACE_IRQSOFF +# define IPIPE_TRACE_IRQ_ENTER \ + movl PT_ORIG_EAX(%esp), %eax; \ + lea PT_EIP-4(%esp), %ebp; \ + cmp $0, %eax; \ + jge 9998f; \ + not %eax; \ +9998: call ipipe_trace_begin +# define IPIPE_TRACE_IRQ_EXIT \ + pushl %eax; \ + movl PT_ORIG_EAX+4(%esp), %eax; \ + cmp $0, %eax; \ + jge 9999f; \ + not %eax; \ +9999: call ipipe_trace_end; \ + popl %eax +#else /* !CONFIG_IPIPE_TRACE_IRQSOFF */ +#define IPIPE_TRACE_IRQ_ENTER +#define IPIPE_TRACE_IRQ_EXIT +#endif /* CONFIG_IPIPE_TRACE_IRQSOFF */ +#else /* !CONFIG_IPIPE */ +#define CATCH_ROOT_SYSCALL(bypass_check,bypass_nocheck) +#define PUSH_XCODE(v) pushl $v +#define PUSH_XCODE_CFI(v) pushl $v ; CFI_ADJUST_CFA_OFFSET 4 +#define PUSH_XVEC(v) pushl v +#define PUSH_XVEC_CFI(v) pushl v ; CFI_ADJUST_CFA_OFFSET 4 +#define HANDLE_EXCEPTION(code) call *%code +#define DIVERT_EXCEPTION(code) +#define PREEMPT_SCHEDULE_IRQ call preempt_schedule_irq +#endif /* CONFIG_IPIPE */ + #ifdef CONFIG_PREEMPT #define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF #else @@ -287,6 +346,7 @@ .endm ENTRY(ret_from_fork) + HARD_COND_ENABLE_INTERRUPTS CFI_STARTPROC pushl_cfi %eax call schedule_tail @@ -314,7 +374,7 @@ END(ret_from_fork) RING0_PTREGS_FRAME ret_from_exception: preempt_stop(CLBR_ANY) -ret_from_intr: +ENTRY(ret_from_intr) GET_THREAD_INFO(%ebp) #ifdef CONFIG_VM86 movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS @@ -355,7 +415,7 @@ need_resched: jz restore_all testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off 
(exception path) ? jz restore_all - call preempt_schedule_irq + PREEMPT_SCHEDULE_IRQ jmp need_resched END(resume_kernel) #endif @@ -412,6 +472,7 @@ sysenter_past_esp: _ASM_EXTABLE(1b,syscall_fault) GET_THREAD_INFO(%ebp) + CATCH_ROOT_SYSCALL(sysenter_tail,sysenter_exit) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) jnz sysenter_audit @@ -420,6 +481,7 @@ sysenter_do_call: jae syscall_badsys call *sys_call_table(,%eax,4) movl %eax,PT_EAX(%esp) +sysenter_tail: LOCKDEP_SYS_EXIT DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF @@ -491,6 +553,7 @@ ENTRY(system_call) pushl_cfi %eax # save orig_eax SAVE_ALL GET_THREAD_INFO(%ebp) + CATCH_ROOT_SYSCALL(syscall_exit,restore_nocheck) # system call tracing in operation / emulation testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) jnz syscall_trace_entry @@ -529,7 +592,7 @@ irq_return: .section .fixup,"ax" ENTRY(iret_exc) pushl $0 # no error code - pushl $do_iret_error + PUSH_XCODE(do_iret_error) jmp error_code .previous _ASM_EXTABLE(irq_return,iret_exc) @@ -592,6 +655,7 @@ work_pending: testb $_TIF_NEED_RESCHED, %cl jz work_notifysig work_resched: + HARD_COND_ENABLE_INTERRUPTS call schedule LOCKDEP_SYS_EXIT DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt @@ -607,6 +671,7 @@ work_resched: work_notifysig: # deal with pending signals and # notify-resume requests + HARD_COND_ENABLE_INTERRUPTS #ifdef CONFIG_VM86 testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) movl %esp, %eax @@ -819,6 +884,44 @@ END(irq_entries_start) END(interrupt) .previous +#ifdef CONFIG_IPIPE + .p2align CONFIG_X86_L1_CACHE_SHIFT +common_interrupt: + addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */ + SAVE_ALL + IPIPE_TRACE_IRQ_ENTER + movl %esp, %eax + call __ipipe_handle_irq + IPIPE_TRACE_IRQ_EXIT + testl %eax,%eax + jnz ret_from_intr + jmp restore_nocheck + CFI_ENDPROC + + .pushsection .kprobes.text, "ax" +#define BUILD_INTERRUPT3(name, nr, fn) \ +ENTRY(name) \ + RING0_INT_FRAME; \ + pushl_cfi $~(nr); \ + SAVE_ALL; \ + IPIPE_TRACE_IRQ_ENTER; \ + movl %esp, %eax; \ + call __ipipe_handle_irq; \ + IPIPE_TRACE_IRQ_EXIT; \ + testl %eax,%eax; \ + jnz ret_from_intr; \ + jmp restore_nocheck; \ + CFI_ENDPROC + +#define BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(name, nr, smp_##name) + +#ifdef CONFIG_X86_LOCAL_APIC + BUILD_INTERRUPT(ipipe_hrtimer_interrupt, IPIPE_HRTIMER_VECTOR) + BUILD_INTERRUPT(ipipe_reschedule_interrupt, IPIPE_RESCHEDULE_VECTOR) + BUILD_INTERRUPT(ipipe_critical_interrupt, IPIPE_CRITICAL_VECTOR) +#endif + +#else /* !CONFIG_IPIPE */ /* * the CPU automatically disables interrupts when executing an IRQ vector, * so IRQ-flags tracing has to follow that: @@ -852,13 +955,15 @@ ENDPROC(name) #define BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(name, nr, smp_##name) +#endif /* !CONFIG_IPIPE */ + /* The include is where all of the SMP etc. 
interrupts come from */ #include ENTRY(coprocessor_error) RING0_INT_FRAME pushl_cfi $0 - pushl_cfi $do_coprocessor_error + PUSH_XCODE_CFI(do_coprocessor_error) jmp error_code CFI_ENDPROC END(coprocessor_error) @@ -868,17 +973,17 @@ ENTRY(simd_coprocessor_error) pushl_cfi $0 #ifdef CONFIG_X86_INVD_BUG /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ -661: pushl_cfi $do_general_protection +661: PUSH_XCODE_CFI(do_general_protection) 662: .section .altinstructions,"a" altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f .previous .section .altinstr_replacement,"ax" -663: pushl $do_simd_coprocessor_error +663: PUSH_XCODE(do_simd_coprocessor_error) 664: .previous #else - pushl_cfi $do_simd_coprocessor_error + PUSH_XCODE_CFI(do_simd_coprocessor_error) #endif jmp error_code CFI_ENDPROC @@ -887,7 +992,7 @@ END(simd_coprocessor_error) ENTRY(device_not_available) RING0_INT_FRAME pushl_cfi $-1 # mark this as an int - pushl_cfi $do_device_not_available + PUSH_XCODE_CFI(do_device_not_available) jmp error_code CFI_ENDPROC END(device_not_available) @@ -907,7 +1012,7 @@ END(native_irq_enable_sysexit) ENTRY(overflow) RING0_INT_FRAME pushl_cfi $0 - pushl_cfi $do_overflow + PUSH_XCODE_CFI(do_overflow) jmp error_code CFI_ENDPROC END(overflow) @@ -915,7 +1020,7 @@ END(overflow) ENTRY(bounds) RING0_INT_FRAME pushl_cfi $0 - pushl_cfi $do_bounds + PUSH_XCODE_CFI(do_bounds) jmp error_code CFI_ENDPROC END(bounds) @@ -923,7 +1028,7 @@ END(bounds) ENTRY(invalid_op) RING0_INT_FRAME pushl_cfi $0 - pushl_cfi $do_invalid_op + PUSH_XCODE_CFI(do_invalid_op) jmp error_code CFI_ENDPROC END(invalid_op) @@ -931,35 +1036,35 @@ END(invalid_op) ENTRY(coprocessor_segment_overrun) RING0_INT_FRAME pushl_cfi $0 - pushl_cfi $do_coprocessor_segment_overrun + PUSH_XCODE_CFI(do_coprocessor_segment_overrun) jmp error_code CFI_ENDPROC END(coprocessor_segment_overrun) ENTRY(invalid_TSS) RING0_EC_FRAME - pushl_cfi $do_invalid_TSS + PUSH_XCODE_CFI(do_invalid_TSS) jmp error_code CFI_ENDPROC END(invalid_TSS) ENTRY(segment_not_present) RING0_EC_FRAME - pushl_cfi $do_segment_not_present + PUSH_XCODE_CFI(do_segment_not_present) jmp error_code CFI_ENDPROC END(segment_not_present) ENTRY(stack_segment) RING0_EC_FRAME - pushl_cfi $do_stack_segment + PUSH_XCODE_CFI(do_stack_segment) jmp error_code CFI_ENDPROC END(stack_segment) ENTRY(alignment_check) RING0_EC_FRAME - pushl_cfi $do_alignment_check + PUSH_XCODE_CFI(do_alignment_check) jmp error_code CFI_ENDPROC END(alignment_check) @@ -967,7 +1072,7 @@ END(alignment_check) ENTRY(divide_error) RING0_INT_FRAME pushl_cfi $0 # no error code - pushl_cfi $do_divide_error + PUSH_XCODE_CFI(do_divide_error) jmp error_code CFI_ENDPROC END(divide_error) @@ -976,7 +1081,7 @@ END(divide_error) ENTRY(machine_check) RING0_INT_FRAME pushl_cfi $0 - pushl_cfi machine_check_vector + PUSH_XVEC_CFI(machine_check_vector) jmp error_code CFI_ENDPROC END(machine_check) @@ -985,7 +1090,7 @@ END(machine_check) ENTRY(spurious_interrupt_bug) RING0_INT_FRAME pushl_cfi $0 - pushl_cfi $do_spurious_interrupt_bug + PUSH_XCODE_CFI(do_spurious_interrupt_bug) jmp error_code CFI_ENDPROC END(spurious_interrupt_bug) @@ -1207,7 +1312,7 @@ return_to_handler: ENTRY(page_fault) RING0_EC_FRAME - pushl_cfi $do_page_fault + PUSH_XCODE_CFI(do_page_fault) ALIGN error_code: /* the function address is in %gs's slot on the stack */ @@ -1244,9 +1349,11 @@ error_code: movl $(__USER_DS), %ecx movl %ecx, %ds movl %ecx, %es +#ifndef CONFIG_IPIPE TRACE_IRQS_OFF +#endif movl %esp,%eax # pt_regs pointer - call *%edi + 
HANDLE_EXCEPTION(edi) jmp ret_from_exception CFI_ENDPROC END(page_fault) @@ -1286,6 +1393,7 @@ debug_stack_correct: pushl_cfi $-1 # mark this as an int SAVE_ALL TRACE_IRQS_OFF + DIVERT_EXCEPTION(do_debug) xorl %edx,%edx # error code 0 movl %esp,%eax # pt_regs pointer call do_debug @@ -1376,6 +1484,7 @@ ENTRY(int3) pushl_cfi $-1 # mark this as an int SAVE_ALL TRACE_IRQS_OFF + DIVERT_EXCEPTION(do_int3) xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_int3 @@ -1385,7 +1494,7 @@ END(int3) ENTRY(general_protection) RING0_EC_FRAME - pushl_cfi $do_general_protection + PUSH_XCODE_CFI(do_general_protection) jmp error_code CFI_ENDPROC END(general_protection) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 7d65133..08a5120 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -50,6 +50,7 @@ #include #include #include +#include #include #include #include @@ -64,6 +65,12 @@ #define __AUDIT_ARCH_64BIT 0x80000000 #define __AUDIT_ARCH_LE 0x40000000 +#ifdef CONFIG_IPIPE +#define PREEMPT_SCHEDULE_IRQ call __ipipe_preempt_schedule_irq +#else /* !CONFIG_IPIPE */ +#define PREEMPT_SCHEDULE_IRQ call preempt_schedule_irq +#endif /* !CONFIG_IPIPE */ + .code64 .section .entry.text, "ax" @@ -369,8 +376,10 @@ ENDPROC(native_usergs_sysret64) * moving irq_enter into assembly, which would be too much work) */ 1: incl PER_CPU_VAR(irq_count) +#ifndef CONFIG_IPIPE cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp CFI_DEF_CFA_REGISTER rsi +#endif /* Store previous stack value */ pushq %rsi @@ -380,7 +389,9 @@ ENDPROC(native_usergs_sysret64) 0x08 /* DW_OP_const1u */, SS+8-RBP, \ 0x22 /* DW_OP_plus */ /* We entered an interrupt context - irqs are off: */ +#ifndef CONFIG_IPIPE TRACE_IRQS_OFF +#endif .endm ENTRY(save_rest) @@ -442,6 +453,7 @@ ENTRY(ret_from_fork) pushq_cfi kernel_eflags(%rip) popfq_cfi # reset kernel eflags + HARD_COND_ENABLE_INTERRUPTS call schedule_tail # rdi: 'prev' task parameter @@ -517,6 +529,17 @@ GLOBAL(system_call_after_swapgs) movq %rax,ORIG_RAX-ARGOFFSET(%rsp) movq %rcx,RIP-ARGOFFSET(%rsp) CFI_REL_OFFSET rip,RIP-ARGOFFSET +#ifdef CONFIG_IPIPE + pushq %rdi + pushq %rax + leaq -(ARGOFFSET-16)(%rsp),%rdi # regs for handler + call __ipipe_syscall_root_thunk + testl %eax, %eax + popq %rax + popq %rdi + js ret_from_sys_call + jnz sysret_fastexit +#endif testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) jnz tracesys system_call_fastpath: @@ -549,6 +572,7 @@ sysret_check: * sysretq will re-enable interrupts: */ TRACE_IRQS_ON +sysret_fastexit: movq RIP-ARGOFFSET(%rsp),%rcx CFI_REGISTER rip,rcx RESTORE_ARGS 1,-ARG_SKIP,0 @@ -560,6 +584,8 @@ sysret_check: /* Handle reschedules */ /* edx: work, edi: workmask */ sysret_careful: + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),%edx + jnz ret_from_sys_call_trace bt $TIF_NEED_RESCHED,%edx jnc sysret_signal TRACE_IRQS_ON @@ -569,6 +595,16 @@ sysret_careful: popq_cfi %rdi jmp sysret_check +ret_from_sys_call_trace: + TRACE_IRQS_ON + sti + SAVE_REST + FIXUP_TOP_OF_STACK %rdi + movq %rsp,%rdi + LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ + RESTORE_REST + jmp int_ret_from_sys_call + /* Handle a signal */ sysret_signal: TRACE_IRQS_ON @@ -867,11 +903,35 @@ END(interrupt) /* 0(%rsp): ~(interrupt number) */ .macro interrupt func - /* reserve pt_regs for scratch regs and rbp */ + /* Reserve pt_regs for scratch regs and rbp */ subq $ORIG_RAX-RBP, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP SAVE_ARGS_IRQ +#ifdef CONFIG_IPIPE_TRACE_IRQSOFF + /* pt_regs is 
%rdi (clobbered in the following) or %rsp + 8-RBP */ + movq ORIG_RAX+8-RBP(%rsp), %rdi # IRQ number... + notq %rdi # ...is inverted, fix up + leaq RIP-8+8-RBP(%rsp), %rbp # Show interrupted address in trace + call ipipe_trace_begin + + /* Restore original content of %rdi and %rbp for the interrupt handler */ + leaq 8-RBP(%rsp), %rdi + movq RBP+8-RBP(%rsp), %rbp + + call \func + + /* Show IRQ number and interrupted address in the trace, as before */ + movq ORIG_RAX+8-RBP(%rsp), %rdi + notq %rdi + leaq RIP-8+8-RBP(%rsp), %rbp + + pushq %rax + call ipipe_trace_end + popq %rax + movq RBP+8-RBP(%rsp), %rbp +#else call \func +#endif .endm /* @@ -884,9 +944,25 @@ END(interrupt) */ .p2align CONFIG_X86_L1_CACHE_SHIFT common_interrupt: +#ifdef CONFIG_IPIPE + XCPT_FRAME + addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ + interrupt __ipipe_handle_irq + testl %eax, %eax + jnz ret_from_intr + decl PER_CPU_VAR(irq_count) + popq %rsi + leaq 16(%rsi), %rsp + CFI_DEF_CFA_REGISTER rsp + CFI_ADJUST_CFA_OFFSET -16 + testl $3,CS-ARGOFFSET(%rsp) + jz restore_args + jmp retint_swapgs_notrace +#else /* !CONFIG_IPIPE */ XCPT_FRAME addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ interrupt do_IRQ +#endif /* !CONFIG_IPIPE */ /* 0(%rsp): old_rsp-ARGOFFSET */ ret_from_intr: DISABLE_INTERRUPTS(CLBR_NONE) @@ -900,7 +976,7 @@ ret_from_intr: CFI_DEF_CFA_REGISTER rsp CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET -exit_intr: +ENTRY(exit_intr) GET_THREAD_INFO(%rcx) testl $3,CS-ARGOFFSET(%rsp) je retint_kernel @@ -920,20 +996,20 @@ retint_check: jnz retint_careful retint_swapgs: /* return to user-space */ + TRACE_IRQS_IRETQ /* * The iretq could re-enable interrupts: */ - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_IRETQ +retint_swapgs_notrace: SWAPGS +retint_noswapgs: jmp restore_args retint_restore_args: /* return to kernel space */ - DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_IRETQ /* * The iretq could re-enable interrupts: */ - TRACE_IRQS_IRETQ restore_args: RESTORE_ARGS 1,8,1 @@ -1007,7 +1083,15 @@ ENTRY(retint_kernel) jnc retint_restore_args bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ jnc retint_restore_args - call preempt_schedule_irq +#ifdef CONFIG_IPIPE + /* + * We may have preempted call_softirq before __do_softirq raised or + * after it lowered the preemption counter. 
+ */ + cmpl $0,PER_CPU_VAR(irq_count) + jge retint_restore_args +#endif + PREEMPT_SCHEDULE_IRQ jmp exit_intr #endif @@ -1026,11 +1110,27 @@ ENTRY(\sym) INTR_FRAME pushq_cfi $~(\num) .Lcommon_\sym: +#ifdef CONFIG_IPIPE + interrupt __ipipe_handle_irq + testl %eax, %eax + jnz ret_from_intr + decl PER_CPU_VAR(irq_count) + popq %rsi + leaq 16(%rsi), %rsp + CFI_DEF_CFA_REGISTER rsp + CFI_ADJUST_CFA_OFFSET -16 + testl $3,CS-ARGOFFSET(%rsp) + jz restore_args + jmp retint_swapgs_notrace + CFI_ENDPROC + .endm +#else /* !CONFIG_IPIPE */ interrupt \do_sym jmp ret_from_intr CFI_ENDPROC END(\sym) .endm +#endif /* !CONFIG_IPIPE */ #ifdef CONFIG_SMP apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \ @@ -1078,6 +1178,14 @@ apicinterrupt CALL_FUNCTION_VECTOR \ call_function_interrupt smp_call_function_interrupt apicinterrupt RESCHEDULE_VECTOR \ reschedule_interrupt smp_reschedule_interrupt +#ifdef CONFIG_IPIPE +apicinterrupt IPIPE_RESCHEDULE_VECTOR ipipe_reschedule_interrupt smp_spurious_interrupt +apicinterrupt IPIPE_CRITICAL_VECTOR ipipe_critical_interrupt smp_spurious_interrupt +#endif +#endif + +#ifdef CONFIG_IPIPE +apicinterrupt IPIPE_HRTIMER_VECTOR ipipe_hrtimer_interrupt smp_spurious_interrupt #endif apicinterrupt ERROR_APIC_VECTOR \ @@ -1093,7 +1201,7 @@ apicinterrupt IRQ_WORK_VECTOR \ /* * Exception entry points. */ -.macro zeroentry sym do_sym +.macro zeroentry sym do_sym ex_code ENTRY(\sym) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME @@ -1104,13 +1212,28 @@ ENTRY(\sym) DEFAULT_FRAME 0 movq %rsp,%rdi /* pt_regs pointer */ xorl %esi,%esi /* no error code */ +#ifdef CONFIG_IPIPE + movq $\ex_code,%rdx + call __ipipe_handle_exception /* handle(regs, error_code, ex_code) */ + TRACE_IRQS_OFF + testl %eax, %eax + jz error_exit + movl %ebx,%eax + RESTORE_REST + DISABLE_INTERRUPTS(CLBR_NONE) + testl %eax,%eax + jne retint_noswapgs + jmp retint_swapgs_notrace +#else /* !CONFIG_IPIPE */ + TRACE_IRQS_OFF call \do_sym +#endif /* !CONFIG_IPIPE */ jmp error_exit /* %ebx: no swapgs flag */ CFI_ENDPROC END(\sym) .endm -.macro paranoidzeroentry sym do_sym +.macro paranoidzeroentry sym do_sym ex_code=0 ENTRY(\sym) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME @@ -1118,17 +1241,32 @@ ENTRY(\sym) subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 call save_paranoid - TRACE_IRQS_OFF movq %rsp,%rdi /* pt_regs pointer */ +#ifdef CONFIG_IPIPE + .if \ex_code + movq $\ex_code,%rsi + call __ipipe_divert_exception /* handle(regs, ex_code) */ + TRACE_IRQS_OFF + testl %eax,%eax + jnz 1f + movq %rsp,%rdi + .endif +#else + TRACE_IRQS_OFF +#endif xorl %esi,%esi /* no error code */ call \do_sym +#ifdef CONFIG_IPIPE + xorl %eax,%eax /* tell paranoid_exit to propagate the exception */ +1: +#endif jmp paranoid_exit /* %ebx: no swapgs flag */ CFI_ENDPROC END(\sym) .endm #define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) -.macro paranoidzeroentry_ist sym do_sym ist +.macro paranoidzeroentry_ist sym do_sym ist ex_code=0 ENTRY(\sym) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME @@ -1138,16 +1276,29 @@ ENTRY(\sym) call save_paranoid TRACE_IRQS_OFF_DEBUG movq %rsp,%rdi /* pt_regs pointer */ +#ifdef CONFIG_IPIPE + .if \ex_code + movq $\ex_code,%rsi + call __ipipe_divert_exception /* handle(regs, ex_code) */ + testl %eax,%eax + jnz 1f + movq %rsp,%rdi + .endif +#endif xorl %esi,%esi /* no error code */ subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) call \do_sym addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) +#ifdef CONFIG_IPIPE + xorl %eax,%eax /* tell paranoid_exit to propagate the exception */ +1: +#endif jmp paranoid_exit /* %ebx: no 
swapgs flag */ CFI_ENDPROC END(\sym) .endm -.macro errorentry sym do_sym +.macro errorentry sym do_sym ex_code ENTRY(\sym) XCPT_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME @@ -1158,14 +1309,29 @@ ENTRY(\sym) movq %rsp,%rdi /* pt_regs pointer */ movq ORIG_RAX(%rsp),%rsi /* get error code */ movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ +#ifdef CONFIG_IPIPE + movq $\ex_code,%rdx + call __ipipe_handle_exception /* handle(regs, error_code, ex_code) */ + TRACE_IRQS_OFF + testl %eax, %eax + jz error_exit + movl %ebx,%eax + RESTORE_REST + DISABLE_INTERRUPTS(CLBR_NONE) + testl %eax,%eax + jne retint_noswapgs + jmp retint_swapgs_notrace +#else /* !CONFIG_IPIPE */ + TRACE_IRQS_OFF call \do_sym +#endif /* !CONFIG_IPIPE */ jmp error_exit /* %ebx: no swapgs flag */ CFI_ENDPROC END(\sym) .endm /* error code is on the stack already */ -.macro paranoiderrorentry sym do_sym +.macro paranoiderrorentry sym do_sym ex_code=0 ENTRY(\sym) XCPT_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME @@ -1175,27 +1341,40 @@ ENTRY(\sym) DEFAULT_FRAME 0 TRACE_IRQS_OFF movq %rsp,%rdi /* pt_regs pointer */ +#ifdef CONFIG_IPIPE + .if \ex_code + movq $\ex_code,%rsi + call __ipipe_divert_exception /* handle(regs, ex_code) */ + testl %eax,%eax + jnz 1f + movq %rsp,%rdi + .endif +#endif movq ORIG_RAX(%rsp),%rsi /* get error code */ movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ call \do_sym +#ifdef CONFIG_IPIPE + xorl %eax,%eax /* tell paranoid_exit to propagate the exception */ +1: +#endif jmp paranoid_exit /* %ebx: no swapgs flag */ CFI_ENDPROC END(\sym) .endm -zeroentry divide_error do_divide_error -zeroentry overflow do_overflow -zeroentry bounds do_bounds -zeroentry invalid_op do_invalid_op -zeroentry device_not_available do_device_not_available +zeroentry divide_error do_divide_error ex_do_divide_error +zeroentry overflow do_overflow ex_do_overflow +zeroentry bounds do_bounds ex_do_bounds +zeroentry invalid_op do_invalid_op ex_do_invalid_op +zeroentry device_not_available do_device_not_available ex_do_device_not_available paranoiderrorentry double_fault do_double_fault -zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun -errorentry invalid_TSS do_invalid_TSS -errorentry segment_not_present do_segment_not_present -zeroentry spurious_interrupt_bug do_spurious_interrupt_bug -zeroentry coprocessor_error do_coprocessor_error -errorentry alignment_check do_alignment_check -zeroentry simd_coprocessor_error do_simd_coprocessor_error +zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun ex_do_coprocessor_segment_overrun +errorentry invalid_TSS do_invalid_TSS ex_do_invalid_TSS +errorentry segment_not_present do_segment_not_present ex_do_segment_not_present +zeroentry spurious_interrupt_bug do_spurious_interrupt_bug ex_do_spurious_interrupt_bug +zeroentry coprocessor_error do_coprocessor_error ex_do_coprocessor_error +errorentry alignment_check do_alignment_check ex_do_alignment_check +zeroentry simd_coprocessor_error do_simd_coprocessor_error ex_do_simd_coprocessor_error /* Reload gs selector with exception handling */ @@ -1277,15 +1456,19 @@ ENTRY(call_softirq) CFI_REL_OFFSET rbp,0 mov %rsp,%rbp CFI_DEF_CFA_REGISTER rbp + HARD_COND_DISABLE_INTERRUPTS incl PER_CPU_VAR(irq_count) cmove PER_CPU_VAR(irq_stack_ptr),%rsp + HARD_COND_ENABLE_INTERRUPTS push %rbp # backlink for old unwinder call __do_softirq + HARD_COND_DISABLE_INTERRUPTS leaveq CFI_RESTORE rbp CFI_DEF_CFA_REGISTER rsp CFI_ADJUST_CFA_OFFSET -8 decl PER_CPU_VAR(irq_count) + HARD_COND_ENABLE_INTERRUPTS ret CFI_ENDPROC END(call_softirq) @@ -1397,16 
+1580,16 @@ apicinterrupt XEN_HVM_EVTCHN_CALLBACK \ */ .pushsection .kprobes.text, "ax" -paranoidzeroentry_ist debug do_debug DEBUG_STACK -paranoidzeroentry_ist int3 do_int3 DEBUG_STACK +paranoidzeroentry_ist debug do_debug DEBUG_STACK ex_do_debug +paranoidzeroentry_ist int3 do_int3 DEBUG_STACK ex_do_int3 paranoiderrorentry stack_segment do_stack_segment #ifdef CONFIG_XEN zeroentry xen_debug do_debug zeroentry xen_int3 do_int3 errorentry xen_stack_segment do_stack_segment #endif -errorentry general_protection do_general_protection -errorentry page_fault do_page_fault +errorentry general_protection do_general_protection ex_do_general_protection +errorentry page_fault do_page_fault ex_do_page_fault #ifdef CONFIG_KVM_GUEST errorentry async_page_fault do_async_page_fault #endif @@ -1432,8 +1615,13 @@ ENTRY(paranoid_exit) DEFAULT_FRAME DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF_DEBUG +paranoid_notrace: testl %ebx,%ebx /* swapgs needed? */ jnz paranoid_restore +#ifdef CONFIG_IPIPE + testl %eax,%eax + jnz paranoid_swapgs +#endif testl $3,CS(%rsp) jnz paranoid_userspace paranoid_swapgs: @@ -1504,7 +1692,6 @@ ENTRY(error_entry) error_swapgs: SWAPGS error_sti: - TRACE_IRQS_OFF ret /* diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 1460a5d..d6fb3ec 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -49,6 +50,9 @@ struct hpet_dev { int cpu; unsigned int irq; unsigned int flags; +#ifdef CONFIG_IPIPE + struct ipipe_timer itimer; +#endif /* CONFIG_IPIPE */ char name[10]; }; @@ -233,6 +237,12 @@ static void hpet_legacy_set_mode(enum clock_event_mode mode, static int hpet_legacy_next_event(unsigned long delta, struct clock_event_device *evt); +#ifdef CONFIG_IPIPE +static struct ipipe_timer hpet_itimer = { + .irq = 0, +}; +#endif /* CONFIG_IPIPE */ + /* * The hpet clock event device */ @@ -243,6 +253,9 @@ static struct clock_event_device hpet_clockevent = { .set_next_event = hpet_legacy_next_event, .irq = 0, .rating = 50, +#ifdef CONFIG_IPIPE + .ipipe_timer = &hpet_itimer, +#endif /* CONFIG_IPIPE */ }; static void hpet_stop_counter(void) @@ -617,8 +630,20 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) hdev->flags |= HPET_DEV_FSB_CAP; hdev->flags |= HPET_DEV_VALID; num_timers_used++; - if (num_timers_used == num_possible_cpus()) + if (num_timers_used == num_possible_cpus()) { +#ifdef CONFIG_IPIPE + /* + * Only register ipipe_timers if there is one + * for each cpu + */ + for (i = 0; i < num_timers_used; i++) { + hdev = &hpet_devs[i]; + hdev->evt.ipipe_timer = &hdev->itimer; + hdev->itimer.irq = hdev->irq; + } +#endif /* CONFIG_IPIPE */ break; + } } printk(KERN_INFO "HPET: %d timers in total, %d timers will be used for per-cpu timer\n", diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index f250431..a505498 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -80,9 +80,11 @@ EXPORT_SYMBOL(irq_fpu_usable); void kernel_fpu_begin(void) { struct task_struct *me = current; + unsigned long flags; WARN_ON_ONCE(!irq_fpu_usable()); preempt_disable(); + flags = hard_cond_local_irq_save(); if (__thread_has_fpu(me)) { __save_init_fpu(me); __thread_clear_has_fpu(me); @@ -91,6 +93,7 @@ void kernel_fpu_begin(void) this_cpu_write(fpu_owner_task, NULL); clts(); } + hard_cond_local_irq_restore(flags); } EXPORT_SYMBOL(kernel_fpu_begin); diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index f2b96de..51e9d08 100644 --- a/arch/x86/kernel/i8253.c +++ 
b/arch/x86/kernel/i8253.c @@ -26,6 +26,10 @@ void __init setup_pit_timer(void) #ifndef CONFIG_X86_64 static int __init init_pit_clocksource(void) { +#ifdef CONFIG_IPIPE + if (cpu_has_tsc == 0 && is_hpet_enabled() == 0) + return clocksource_i8253_init(); +#endif /* CONFIG_IPIPE */ /* * Several reasons not to register PIT as a clocksource: * diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 36d1853..52099a5 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -31,7 +31,7 @@ static void init_8259A(int auto_eoi); static int i8259A_auto_eoi; -DEFINE_RAW_SPINLOCK(i8259A_lock); +IPIPE_DEFINE_RAW_SPINLOCK(i8259A_lock); /* * 8259A PIC functions to handle ISA devices: @@ -59,6 +59,7 @@ static void mask_8259A_irq(unsigned int irq) unsigned long flags; raw_spin_lock_irqsave(&i8259A_lock, flags); + ipipe_lock_irq(irq); cached_irq_mask |= mask; if (irq & 8) outb(cached_slave_mask, PIC_SLAVE_IMR); @@ -74,15 +75,18 @@ static void disable_8259A_irq(struct irq_data *data) static void unmask_8259A_irq(unsigned int irq) { - unsigned int mask = ~(1 << irq); + unsigned int mask = (1 << irq); unsigned long flags; raw_spin_lock_irqsave(&i8259A_lock, flags); - cached_irq_mask &= mask; - if (irq & 8) - outb(cached_slave_mask, PIC_SLAVE_IMR); - else - outb(cached_master_mask, PIC_MASTER_IMR); + if (cached_irq_mask & mask) { + cached_irq_mask &= ~mask; + if (irq & 8) + outb(cached_slave_mask, PIC_SLAVE_IMR); + else + outb(cached_master_mask, PIC_MASTER_IMR); + ipipe_unlock_irq(irq); + } raw_spin_unlock_irqrestore(&i8259A_lock, flags); } @@ -169,6 +173,18 @@ static void mask_and_ack_8259A(struct irq_data *data) */ if (cached_irq_mask & irqmask) goto spurious_8259A_irq; +#ifdef CONFIG_IPIPE + if (irq == 0) { + /* + * Fast timer ack -- don't mask (unless supposedly + * spurious). We trace outb's in order to detect + * broken hardware inducing large delays. + */ + outb(0x60, PIC_MASTER_CMD); /* Specific EOI to master. */ + raw_spin_unlock_irqrestore(&i8259A_lock, flags); + return; + } +#endif /* CONFIG_IPIPE */ cached_irq_mask |= irqmask; handle_real_irq: diff --git a/arch/x86/kernel/ipipe.c b/arch/x86/kernel/ipipe.c new file mode 100644 index 0000000..f833663 --- /dev/null +++ b/arch/x86/kernel/ipipe.c @@ -0,0 +1,675 @@ +/* -*- linux-c -*- + * linux/arch/x86/kernel/ipipe.c + * + * Copyright (C) 2002-2012 Philippe Gerum. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Architecture-dependent I-PIPE support for x86. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_X86_LOCAL_APIC +#include +#include +#include +#include +#ifdef CONFIG_X86_IO_APIC +#include +#endif /* CONFIG_X86_IO_APIC */ +#include +#endif /* CONFIG_X86_LOCAL_APIC */ +#include +#include +#include +#include + +EXPORT_SYMBOL(io_apic_irqs); +EXPORT_SYMBOL(find_task_by_pid_ns); +extern void *sys_call_table; +EXPORT_SYMBOL(sys_call_table); +#ifdef CONFIG_X86_32 +extern spinlock_t i8259A_lock; +EXPORT_SYMBOL(i8259A_lock); +#endif + +DEFINE_PER_CPU(unsigned long, __ipipe_cr2); +EXPORT_PER_CPU_SYMBOL_GPL(__ipipe_cr2); + +void ipipe_raise_irq(unsigned int irq) +{ + struct pt_regs regs; + unsigned long flags; + + flags = hard_local_irq_save(); + regs.flags = flags; + regs.orig_ax = irq; /* >= 0, IRQ won't be acked */ + regs.cs = __KERNEL_CS; + __ipipe_handle_irq(®s); + hard_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(ipipe_raise_irq); + +int ipipe_get_sysinfo(struct ipipe_sysinfo *info) +{ + info->sys_nr_cpus = num_online_cpus(); + info->sys_cpu_freq = __ipipe_cpu_freq; + info->sys_hrtimer_irq = per_cpu(ipipe_percpu.hrtimer_irq, 0); + info->sys_hrtimer_freq = __ipipe_hrtimer_freq; + info->sys_hrclock_freq = __ipipe_hrclock_freq; + + return 0; +} +EXPORT_SYMBOL_GPL(ipipe_get_sysinfo); + +#ifdef CONFIG_X86_UV +asmlinkage void uv_bau_message_interrupt(struct pt_regs *regs); +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD +asmlinkage void smp_threshold_interrupt(void); +#endif + +static void __ipipe_ack_irq(unsigned irq, struct irq_desc *desc) +{ + desc->ipipe_ack(irq, desc); +} + +void __ipipe_do_IRQ(unsigned int irq, void *cookie) +{ + void (*handler)(struct pt_regs *regs); + struct pt_regs *regs; + + regs = __this_cpu_ptr(&ipipe_percpu.tick_regs); + regs->orig_ax = ~__ipipe_get_irq_vector(irq); + handler = (typeof(handler))cookie; + __root_irq_trampoline(handler, regs); +} + +#ifdef CONFIG_X86_LOCAL_APIC + +static void __ipipe_noack_apic(unsigned irq, struct irq_desc *desc) +{ +} + +static void __ipipe_ack_apic(unsigned irq, struct irq_desc *desc) +{ + __ack_APIC_irq(); +} + +#endif /* CONFIG_X86_LOCAL_APIC */ + +/* + * __ipipe_enable_pipeline() -- We are running on the boot CPU, hw + * interrupts are off, and secondary CPUs are still lost in space. + */ +void __init __ipipe_enable_pipeline(void) +{ + unsigned int vector, irq; + +#ifdef CONFIG_X86_LOCAL_APIC + + /* Map the APIC system vectors. 
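ipipe_raise_irq() above injects an interrupt into the pipeline from software by synthesizing a minimal pt_regs frame whose orig_ax carries the plain IRQ number — non-negative, so it will not be acknowledged as a hardware vector later on. The sketch below is a hedged usage example only: the IRQ number, the wrapper function and the <linux/ipipe.h> include are assumptions, only the ipipe_raise_irq(unsigned int irq) call itself comes from the code above, and a handler is assumed to have been attached to that IRQ beforehand with ipipe_request_irq().

```c
/*
 * Hypothetical usage sketch for ipipe_raise_irq() as exported above.
 * DEMO_VIRQ and the wrapper are invented for the example.
 */
#include <linux/ipipe.h>	/* assumed to declare ipipe_raise_irq() */

#define DEMO_VIRQ 200		/* assumed free IRQ number, purely illustrative */

static void demo_kick_virtual_irq(void)
{
	/*
	 * Queues DEMO_VIRQ into the pipeline.  Because the synthesized
	 * frame carries orig_ax >= 0, __ipipe_handle_irq() later treats
	 * it as software-generated and skips the hardware acknowledge.
	 */
	ipipe_raise_irq(DEMO_VIRQ);
}
```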
*/ + + ipipe_request_irq(ipipe_root_domain, + ipipe_apic_vector_irq(LOCAL_TIMER_VECTOR), + __ipipe_do_IRQ, smp_apic_timer_interrupt, + __ipipe_ack_apic); + + ipipe_request_irq(ipipe_root_domain, + ipipe_apic_vector_irq(SPURIOUS_APIC_VECTOR), + __ipipe_do_IRQ, smp_spurious_interrupt, + __ipipe_noack_apic); + + ipipe_request_irq(ipipe_root_domain, + ipipe_apic_vector_irq(ERROR_APIC_VECTOR), + __ipipe_do_IRQ, smp_error_interrupt, + __ipipe_ack_apic); + +#ifdef CONFIG_X86_THERMAL_VECTOR + ipipe_request_irq(ipipe_root_domain, + ipipe_apic_vector_irq(THERMAL_APIC_VECTOR), + __ipipe_do_IRQ, smp_thermal_interrupt, + __ipipe_ack_apic); +#endif /* CONFIG_X86_THERMAL_VECTOR */ + +#ifdef CONFIG_X86_MCE_THRESHOLD + ipipe_request_irq(ipipe_root_domain, + ipipe_apic_vector_irq(THRESHOLD_APIC_VECTOR), + __ipipe_do_IRQ, smp_threshold_interrupt, + __ipipe_ack_apic); +#endif /* CONFIG_X86_MCE_THRESHOLD */ + +#ifdef CONFIG_X86_UV + ipipe_request_irq(ipipe_root_domain, + ipipe_apic_vector_irq(UV_BAU_MESSAGE), + __ipipe_do_IRQ, uv_bau_message_interrupt, + __ipipe_ack_apic); +#endif /* CONFIG_X86_UV */ + + ipipe_request_irq(ipipe_root_domain, + ipipe_apic_vector_irq(X86_PLATFORM_IPI_VECTOR), + __ipipe_do_IRQ, smp_x86_platform_ipi, + __ipipe_ack_apic); + + /* + * We expose two high priority APIC vectors the head domain + * may use respectively for hires timing and SMP rescheduling. + * We should never receive them in the root domain. + */ + ipipe_request_irq(ipipe_root_domain, + ipipe_apic_vector_irq(IPIPE_HRTIMER_VECTOR), + __ipipe_do_IRQ, smp_spurious_interrupt, + __ipipe_ack_apic); + + ipipe_request_irq(ipipe_root_domain, + ipipe_apic_vector_irq(IPIPE_RESCHEDULE_VECTOR), + __ipipe_do_IRQ, smp_spurious_interrupt, + __ipipe_ack_apic); + +#ifdef CONFIG_IRQ_WORK + ipipe_request_irq(ipipe_root_domain, + ipipe_apic_vector_irq(IRQ_WORK_VECTOR), + __ipipe_do_IRQ, smp_irq_work_interrupt, + __ipipe_ack_apic); +#endif /* CONFIG_IRQ_WORK */ + +#endif /* CONFIG_X86_LOCAL_APIC */ + +#ifdef CONFIG_SMP + ipipe_request_irq(ipipe_root_domain, + ipipe_apic_vector_irq(RESCHEDULE_VECTOR), + __ipipe_do_IRQ, smp_reschedule_interrupt, + __ipipe_ack_apic); + + for (vector = INVALIDATE_TLB_VECTOR_START; + vector <= INVALIDATE_TLB_VECTOR_END; ++vector) + ipipe_request_irq(ipipe_root_domain, + ipipe_apic_vector_irq(vector), + __ipipe_do_IRQ, smp_invalidate_interrupt, + __ipipe_ack_apic); + + ipipe_request_irq(ipipe_root_domain, + ipipe_apic_vector_irq(CALL_FUNCTION_VECTOR), + __ipipe_do_IRQ, smp_call_function_interrupt, + __ipipe_ack_apic); + + ipipe_request_irq(ipipe_root_domain, + ipipe_apic_vector_irq(CALL_FUNCTION_SINGLE_VECTOR), + __ipipe_do_IRQ, smp_call_function_single_interrupt, + __ipipe_ack_apic); + + ipipe_request_irq(ipipe_root_domain, + IRQ_MOVE_CLEANUP_VECTOR, + __ipipe_do_IRQ, smp_irq_move_cleanup_interrupt, + __ipipe_ack_apic); + + ipipe_request_irq(ipipe_root_domain, + ipipe_apic_vector_irq(REBOOT_VECTOR), + __ipipe_do_IRQ, smp_reboot_interrupt, + __ipipe_ack_apic); +#else + (void)vector; +#endif /* CONFIG_SMP */ + + /* + * Finally, request the remaining ISA and IO-APIC + * interrupts. Interrupts which have already been requested + * will just beget a silent -EBUSY error, that's ok. 
+ */ + for (irq = 0; irq < NR_IRQS; irq++) + ipipe_request_irq(ipipe_root_domain, irq, + __ipipe_do_IRQ, do_IRQ, + __ipipe_ack_irq); +} + +#ifdef CONFIG_SMP + +void ipipe_set_irq_affinity(unsigned int irq, cpumask_t cpumask) +{ + if (WARN_ON_ONCE(irq_get_chip(irq)->irq_set_affinity == NULL)) + return; + + cpus_and(cpumask, cpumask, *cpu_online_mask); + if (WARN_ON_ONCE(cpus_empty(cpumask))) + return; + + irq_get_chip(irq)->irq_set_affinity(irq_get_irq_data(irq), &cpumask, true); +} +EXPORT_SYMBOL_GPL(ipipe_set_irq_affinity); + +void ipipe_send_ipi(unsigned int ipi, cpumask_t cpumask) +{ + unsigned long flags; + + flags = hard_local_irq_save(); + + cpu_clear(ipipe_processor_id(), cpumask); + if (likely(!cpus_empty(cpumask))) + apic->send_IPI_mask(&cpumask, ipipe_apic_irq_vector(ipi)); + + hard_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(ipipe_send_ipi); + +void __ipipe_hook_critical_ipi(struct ipipe_domain *ipd) +{ + unsigned int ipi = IPIPE_CRITICAL_IPI; + + ipd->irqs[ipi].ackfn = __ipipe_ack_apic; + ipd->irqs[ipi].handler = __ipipe_do_critical_sync; + ipd->irqs[ipi].cookie = NULL; + ipd->irqs[ipi].control = IPIPE_HANDLE_MASK|IPIPE_STICKY_MASK; +} + +#endif /* CONFIG_SMP */ + +static inline void __fixup_if(int s, struct pt_regs *regs) +{ + /* + * Have the saved hw state look like the domain stall bit, so + * that __ipipe_unstall_iret_root() restores the proper + * pipeline state for the root stage upon exit. + */ + if (s) + regs->flags &= ~X86_EFLAGS_IF; + else + regs->flags |= X86_EFLAGS_IF; +} + +void __ipipe_halt_root(void) +{ + struct ipipe_percpu_domain_data *p; + + /* Emulate sti+hlt sequence over the root domain. */ + + hard_local_irq_disable(); + + p = ipipe_this_cpu_root_context(); + + trace_hardirqs_on(); + __clear_bit(IPIPE_STALL_FLAG, &p->status); + + if (unlikely(__ipipe_ipending_p(p))) { + __ipipe_sync_stage(); + hard_local_irq_enable(); + } else { +#ifdef CONFIG_IPIPE_TRACE_IRQSOFF + ipipe_trace_end(0x8000000E); +#endif /* CONFIG_IPIPE_TRACE_IRQSOFF */ + asm volatile("sti; hlt": : :"memory"); + } +} +EXPORT_SYMBOL_GPL(__ipipe_halt_root); + +static void do_machine_check_vector(struct pt_regs *regs, long error_code) +{ +#ifdef CONFIG_X86_MCE +#ifdef CONFIG_X86_32 + extern void (*machine_check_vector)(struct pt_regs *, long error_code); + machine_check_vector(regs, error_code); +#else + do_machine_check(regs, error_code); +#endif +#endif /* CONFIG_X86_MCE */ +} + +/* Work around genksyms's issue with over-qualification in decls. 
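ipipe_send_ipi() above takes the target cpumask by value, strips the calling CPU from its local copy, and maps the pipeline IPI number to an APIC vector before handing it to apic->send_IPI_mask(). The sketch below only illustrates that calling convention: IPIPE_CRITICAL_IPI is borrowed from __ipipe_hook_critical_ipi() just above, while the wrapper and its use case are invented — in practice this IPI is driven by the pipeline core rather than by drivers.

```c
/*
 * Hypothetical sketch of the ipipe_send_ipi() calling convention shown
 * above.  The wrapper is invented; IPIPE_CRITICAL_IPI is taken from
 * __ipipe_hook_critical_ipi() in this file.
 */
#include <linux/cpumask.h>
#include <linux/ipipe.h>	/* assumed to declare ipipe_send_ipi() */

static void demo_broadcast_ipi(void)
{
	cpumask_t targets;

	cpumask_copy(&targets, cpu_online_mask);
	/* ipipe_send_ipi() clears the calling CPU from its own copy. */
	ipipe_send_ipi(IPIPE_CRITICAL_IPI, targets);
}
```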
*/ + +typedef void dotraplinkage __ipipe_exhandler(struct pt_regs *, long); + +typedef __ipipe_exhandler *__ipipe_exptr; + +static __ipipe_exptr __ipipe_std_extable[] = { + + [ex_do_divide_error] = do_divide_error, + [ex_do_overflow] = do_overflow, + [ex_do_bounds] = do_bounds, + [ex_do_invalid_op] = do_invalid_op, + [ex_do_coprocessor_segment_overrun] = do_coprocessor_segment_overrun, + [ex_do_invalid_TSS] = do_invalid_TSS, + [ex_do_segment_not_present] = do_segment_not_present, + [ex_do_stack_segment] = do_stack_segment, + [ex_do_general_protection] = do_general_protection, + [ex_do_page_fault] = (__ipipe_exptr)do_page_fault, + [ex_do_spurious_interrupt_bug] = do_spurious_interrupt_bug, + [ex_do_coprocessor_error] = do_coprocessor_error, + [ex_do_alignment_check] = do_alignment_check, + [ex_machine_check_vector] = do_machine_check_vector, + [ex_do_simd_coprocessor_error] = do_simd_coprocessor_error, + [ex_do_device_not_available] = do_device_not_available, +#ifdef CONFIG_X86_32 + [ex_do_iret_error] = do_iret_error, +#endif +}; + +#ifdef CONFIG_KGDB +#include + +static int __ipipe_xlate_signo[] = { + + [ex_do_divide_error] = SIGFPE, + [ex_do_debug] = SIGTRAP, + [2] = -1, + [ex_do_int3] = SIGTRAP, + [ex_do_overflow] = SIGSEGV, + [ex_do_bounds] = SIGSEGV, + [ex_do_invalid_op] = SIGILL, + [ex_do_device_not_available] = -1, + [8] = -1, + [ex_do_coprocessor_segment_overrun] = SIGFPE, + [ex_do_invalid_TSS] = SIGSEGV, + [ex_do_segment_not_present] = SIGBUS, + [ex_do_stack_segment] = SIGBUS, + [ex_do_general_protection] = SIGSEGV, + [ex_do_page_fault] = SIGSEGV, + [ex_do_spurious_interrupt_bug] = -1, + [ex_do_coprocessor_error] = -1, + [ex_do_alignment_check] = SIGBUS, + [ex_machine_check_vector] = -1, + [ex_do_simd_coprocessor_error] = -1, + [20 ... 31] = -1, +#ifdef CONFIG_X86_32 + [ex_do_iret_error] = SIGSEGV, +#endif +}; +#endif /* CONFIG_KGDB */ + +int __ipipe_handle_exception(struct pt_regs *regs, long error_code, int vector) +{ + bool root_entry = false; + unsigned long flags = 0; + unsigned long cr2 = 0; + + if (ipipe_root_p) { + root_entry = true; + + local_save_flags(flags); + /* + * Replicate hw interrupt state into the virtual mask + * before calling the I-pipe event handler over the + * root domain. Also required later when calling the + * Linux exception handler. + */ + if (hard_irqs_disabled()) + local_irq_disable(); + } +#ifdef CONFIG_KGDB + /* catch exception KGDB is interested in over non-root domains */ + else if (__ipipe_xlate_signo[vector] >= 0 && + !kgdb_handle_exception(vector, __ipipe_xlate_signo[vector], + error_code, regs)) + return 1; +#endif /* CONFIG_KGDB */ + + if (vector == ex_do_page_fault) + cr2 = native_read_cr2(); + + if (unlikely(__ipipe_report_trap(vector, regs))) { + if (root_entry) + ipipe_restore_root_nosync(flags); + return 1; + } + + if (likely(ipipe_root_p)) { + /* + * If root is not the topmost domain or in case we faulted in + * the iret path of x86-32, regs.flags does not match the root + * domain state. The fault handler or the low-level return + * code may evaluate it. So fix this up, either by the root + * state sampled on entry or, if we migrated to root, with the + * current state. + */ + __fixup_if(root_entry ? raw_irqs_disabled_flags(flags) : + raw_irqs_disabled(), regs); + } else { + /* Detect unhandled faults over the head domain. */ + struct ipipe_domain *ipd = ipipe_current_domain; + + /* Switch to root so that Linux can handle the fault cleanly. 
*/ + hard_local_irq_disable(); + __ipipe_set_current_domain(ipipe_root_domain); + + ipipe_trace_panic_freeze(); + + /* Always warn about user land and unfixable faults. */ + if (user_mode_vm(regs) || + !search_exception_tables(instruction_pointer(regs))) { + printk(KERN_ERR "BUG: Unhandled exception over domain" + " %s at 0x%lx - switching to ROOT\n", + ipd->name, instruction_pointer(regs)); + dump_stack(); + ipipe_trace_panic_dump(); +#ifdef CONFIG_IPIPE_DEBUG + /* Also report fixable ones when debugging is enabled. */ + } else { + printk(KERN_WARNING "WARNING: Fixable exception over " + "domain %s at 0x%lx - switching to ROOT\n", + ipd->name, instruction_pointer(regs)); + dump_stack(); + ipipe_trace_panic_dump(); +#endif /* CONFIG_IPIPE_DEBUG */ + } + } + + if (vector == ex_do_page_fault) + write_cr2(cr2); + + __ipipe_std_extable[vector](regs, error_code); + + /* + * Relevant for 64-bit: Restore root domain state as the low-level + * return code will not align it to regs.flags. + */ + if (root_entry) + ipipe_restore_root_nosync(flags); + + return 0; +} + +int __ipipe_divert_exception(struct pt_regs *regs, int vector) +{ + bool root_entry = false; + unsigned long flags = 0; + + if (ipipe_root_p) { + root_entry = true; + local_save_flags(flags); + if (hard_irqs_disabled()) { + /* + * Same root state handling as in + * __ipipe_handle_exception. + */ + local_irq_disable(); + } + } +#ifdef CONFIG_KGDB + /* catch int1 and int3 over non-root domains */ + else { +#ifdef CONFIG_X86_32 + if (vector != ex_do_device_not_available) +#endif + { + unsigned int condition = 0; + if (vector == 1) + get_debugreg(condition, 6); + if (!kgdb_handle_exception(vector, SIGTRAP, condition, regs)) + return 1; + } + } +#endif /* CONFIG_KGDB */ + + if (unlikely(__ipipe_report_trap(vector, regs))) { + if (root_entry) + ipipe_restore_root_nosync(flags); + return 1; + } + + /* see __ipipe_handle_exception */ + if (likely(ipipe_root_p)) + __fixup_if(root_entry ? raw_irqs_disabled_flags(flags) : + raw_irqs_disabled(), regs); + /* + * No need to restore root state in the 64-bit case, the Linux + * handler and the return code will take care of it. + */ + + return 0; +} + +int __ipipe_syscall_root(struct pt_regs *regs) +{ + struct ipipe_percpu_domain_data *p; + unsigned long flags; + int ret; + + /* + * This routine either returns: + * 0 -- if the syscall is to be passed to Linux; + * >0 -- if the syscall should not be passed to Linux, and no + * tail work should be performed; + * <0 -- if the syscall should not be passed to Linux but the + * tail work has to be performed (for handling signals etc). + */ + + if (!__ipipe_syscall_watched_p(current, regs->orig_ax)) + return 0; + + ret = __ipipe_notify_kevent(IPIPE_EVENT_SYSCALL - IPIPE_FIRST_EVENT, regs); // __ipipe_notify_syscall(regs); + + flags = hard_local_irq_save(); + + if (current->ipipe.flags & PF_MAYDAY) { + current->ipipe.flags &= ~PF_MAYDAY; + __ipipe_notify_trap(IPIPE_TRAP_MAYDAY, regs); + } + + if (!ipipe_root_p) + return 1; + + p = ipipe_this_cpu_root_context(); + if (__ipipe_ipending_p(p)) + __ipipe_sync_stage(); + + if (ret == 0) + hard_local_irq_restore(flags); + + return -ret; +} + +int __ipipe_handle_irq(struct pt_regs *regs) +{ + struct ipipe_percpu_data *p = __ipipe_this_cpu_ptr(&ipipe_percpu); + int irq, vector = regs->orig_ax, flags = 0; + struct pt_regs *tick_regs; + + if (likely(vector < 0)) { + irq = __this_cpu_read(vector_irq[~vector]); + BUG_ON(irq < 0); + } else { /* Software-generated. 
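The comment inside __ipipe_syscall_root() above spells out a three-way return convention, and it is exactly what the assembly fast paths test with js/jnz (CATCH_ROOT_SYSCALL on x86-32, the system_call stub on x86-64). The self-contained model below restates that contract; the demo_* names are invented for the illustration.

```c
/*
 * Conceptual model of the return convention documented in
 * __ipipe_syscall_root() above, as consumed by the syscall entry stubs:
 *   ret == 0  -> run the Linux syscall as usual,
 *   ret  > 0  -> syscall already handled, skip the tail work,
 *   ret  < 0  -> syscall already handled, but signals/reschedule must run.
 */
enum demo_syscall_route {
	DEMO_RUN_LINUX_SYSCALL,		/* fall through to sys_call_table */
	DEMO_SKIP_SYSCALL_FAST,		/* bypass, no tail work */
	DEMO_SKIP_SYSCALL_SLOW,		/* bypass, run the exit-work path */
};

static enum demo_syscall_route demo_route_syscall(int ret)
{
	if (ret == 0)
		return DEMO_RUN_LINUX_SYSCALL;
	if (ret > 0)
		return DEMO_SKIP_SYSCALL_FAST;
	return DEMO_SKIP_SYSCALL_SLOW;
}
```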
*/ + irq = vector; + flags = IPIPE_IRQF_NOACK; + } + + /* + * Given our deferred dispatching model for regular IRQs, we + * only record CPU regs for the last timer interrupt, so that + * the timer handler charges CPU times properly. It is assumed + * that no other interrupt handler cares for such information. + */ + if (irq == p->hrtimer_irq || p->hrtimer_irq == -1) { + tick_regs = &p->tick_regs; + tick_regs->flags = regs->flags; + tick_regs->cs = regs->cs; + tick_regs->ip = regs->ip; + tick_regs->bp = regs->bp; +#ifdef CONFIG_X86_64 + tick_regs->ss = regs->ss; + tick_regs->sp = regs->sp; +#endif + if (!__ipipe_root_p) + tick_regs->flags &= ~X86_EFLAGS_IF; + } + + __ipipe_dispatch_irq(irq, flags); + + if (user_mode(regs) && (current->ipipe.flags & PF_MAYDAY)) { + current->ipipe.flags &= ~PF_MAYDAY; + __ipipe_notify_trap(IPIPE_TRAP_MAYDAY, regs); + } + + if (!__ipipe_root_p || + test_bit(IPIPE_STALL_FLAG, &__ipipe_root_status)) + return 0; + + return 1; +} + +#ifdef CONFIG_X86_32 +void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, + struct clocksource *clock, u32 mult) +{ + if (clock == &clocksource_tsc) + ipipe_update_hostrt(wall_time, wtm, clock, mult); +} + +void update_vsyscall_tz(void) +{ +} + +#ifdef CONFIG_IPIPE_WANT_CLOCKSOURCE +u64 __ipipe_get_cs_tsc(void); +EXPORT_SYMBOL_GPL(__ipipe_get_cs_tsc); +#endif + +#endif /* CONFIG_X86_32 */ + +struct task_struct *__switch_to(struct task_struct *prev_p, + struct task_struct *next_p); +EXPORT_SYMBOL_GPL(do_munmap); +EXPORT_SYMBOL_GPL(__switch_to); +EXPORT_SYMBOL_GPL(show_stack); +EXPORT_PER_CPU_SYMBOL_GPL(fpu_owner_task); + +EXPORT_PER_CPU_SYMBOL_GPL(init_tss); +#ifdef CONFIG_SMP +EXPORT_PER_CPU_SYMBOL_GPL(cpu_tlbstate); +#endif /* CONFIG_SMP */ + +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) +EXPORT_SYMBOL(tasklist_lock); +#endif /* CONFIG_SMP || CONFIG_DEBUG_SPINLOCK */ + +#if defined(CONFIG_CC_STACKPROTECTOR) && defined(CONFIG_X86_64) +EXPORT_PER_CPU_SYMBOL_GPL(irq_stack_union); +#endif diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 3dafc60..7936d3c 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -41,7 +41,7 @@ void ack_bad_irq(unsigned int irq) * completely. * But only ack when the APIC is enabled -AK */ - ack_APIC_irq(); + __ack_APIC_irq(); } #define irq_stats(x) (&per_cpu(irq_stat, x)) @@ -186,11 +186,12 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs) unsigned vector = ~regs->orig_ax; unsigned irq; + irq = __this_cpu_read(vector_irq[vector]); + __ipipe_move_root_irq(irq); + irq_enter(); exit_idle(); - irq = __this_cpu_read(vector_irq[vector]); - if (!handle_irq(irq, regs)) { ack_APIC_irq(); diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 252981a..dd348e1 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -165,6 +165,8 @@ static void __init smp_intr_init(void) { #ifdef CONFIG_SMP #if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) + unsigned cpu; + /* * The reschedule interrupt is a CPU-to-CPU reschedule-helper * IPI, driven by wakeup. 
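__ipipe_handle_irq() above tells hardware and software entries apart by the sign of orig_ax: the interrupt stubs push the one's complement of the vector (always negative), whereas ipipe_raise_irq() stores the IRQ number directly and the hardware acknowledge is skipped. Below is a self-contained model of that decode; the demo_* names and the flat lookup table are illustrative stand-ins for the per-CPU vector_irq[] array.

```c
/*
 * Conceptual model of the orig_ax decoding done by __ipipe_handle_irq()
 * above.  The real code reads the per-CPU vector_irq[] table; the fixed
 * array here is only a stand-in.
 */
#define DEMO_NR_VECTORS 256

static int demo_vector_irq[DEMO_NR_VECTORS];	/* vector -> Linux IRQ number */

struct demo_decoded_irq {
	int irq;
	int needs_hw_ack;	/* only hardware-delivered vectors get acked */
};

static struct demo_decoded_irq demo_decode(long orig_ax)
{
	struct demo_decoded_irq d;

	if (orig_ax < 0) {
		/* Hardware entry: the stub pushed ~vector, undo the complement. */
		d.irq = demo_vector_irq[~orig_ax];
		d.needs_hw_ack = 1;
	} else {
		/* Software entry (ipipe_raise_irq): orig_ax is the IRQ itself. */
		d.irq = (int)orig_ax;
		d.needs_hw_ack = 0;	/* IPIPE_IRQF_NOACK in the real code */
	}
	return d;
}
```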
@@ -254,9 +256,16 @@ static void __init smp_intr_init(void) /* Low priority IPI to cleanup after moving an irq */ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); + for_each_possible_cpu(cpu) + per_cpu(vector_irq, cpu)[IRQ_MOVE_CLEANUP_VECTOR] = + IRQ_MOVE_CLEANUP_VECTOR; /* IPI used for rebooting/stopping */ alloc_intr_gate(REBOOT_VECTOR, reboot_interrupt); +#ifdef CONFIG_IPIPE + alloc_intr_gate(IPIPE_RESCHEDULE_VECTOR, ipipe_reschedule_interrupt); + alloc_intr_gate(IPIPE_CRITICAL_VECTOR, ipipe_critical_interrupt); +#endif #endif #endif /* CONFIG_SMP */ } @@ -282,6 +291,9 @@ static void __init apic_intr_init(void) /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); +#ifdef CONFIG_IPIPE + alloc_intr_gate(IPIPE_HRTIMER_VECTOR, ipipe_hrtimer_interrupt); +#endif /* IRQ work interrupts: */ # ifdef CONFIG_IRQ_WORK diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c index a311ffc..482b42f 100644 --- a/arch/x86/kernel/pcspeaker.c +++ b/arch/x86/kernel/pcspeaker.c @@ -6,6 +6,13 @@ static __init int add_pcspkr(void) { struct platform_device *pd; +#ifdef CONFIG_IPIPE + if (cpu_has_tsc == 0) { + printk("I-pipe: disabling PC speaker for TSC emulation.\n"); + return -EBUSY; + } +#endif /* CONFIG_IPIPE */ + pd = platform_device_register_simple("pcspkr", -1, NULL, 0); return IS_ERR(pd) ? PTR_ERR(pd) : 0; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 735279e..2afee62 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -73,6 +73,14 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) if (ret) return ret; fpu_copy(&dst->thread.fpu, &src->thread.fpu); + } else { +#ifdef CONFIG_IPIPE + /* unconditionally allocate, RT domain may need it */ + memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu)); + ret = fpu_alloc(&dst->thread.fpu); + if (ret) + return ret; +#endif } return 0; } @@ -93,6 +101,10 @@ void arch_task_cache_init(void) kmem_cache_create("task_xstate", xstate_size, __alignof__(union thread_xstate), SLAB_PANIC | SLAB_NOTRACK, NULL); +#ifdef CONFIG_IPIPE + memset(¤t->thread.fpu, 0, sizeof(current->thread.fpu)); + fpu_alloc(¤t->thread.fpu); +#endif } static inline void drop_fpu(struct task_struct *tsk) @@ -518,7 +530,7 @@ bool set_pm_idle_to_default(void) } void stop_this_cpu(void *dummy) { - local_irq_disable(); + hard_local_irq_disable(); /* * Remove this CPU: */ @@ -646,6 +658,11 @@ static void amd_e400_idle(void) if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) mark_tsc_unstable("TSC halt in AMD C1E"); printk(KERN_INFO "System has AMD C1E enabled\n"); +#ifdef CONFIG_IPIPE + printk(KERN_INFO + "I-pipe: will not be able to use LAPIC as a tick device\n" + "I-pipe: disable C1E power state in your BIOS\n"); +#endif } } @@ -679,6 +696,11 @@ static void amd_e400_idle(void) void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) { +#ifdef CONFIG_IPIPE +#define default_to_mwait (boot_option_idle_override == IDLE_FORCE_MWAIT) +#else +#define default_to_mwait 1 +#endif #ifdef CONFIG_SMP if (pm_idle == poll_idle && smp_num_siblings > 1) { printk_once(KERN_WARNING "WARNING: polling idle and HT enabled," @@ -688,7 +710,7 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) if (pm_idle) return; - if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) { + if (default_to_mwait && cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) { 
/* * One CPU supports mwait => All CPUs supports mwait */ diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 516fa18..59856da 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -190,10 +190,12 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) regs->cs = __USER_CS; regs->ip = new_ip; regs->sp = new_sp; +#ifndef CONFIG_IPIPE /* Lazily handled, init_fpu() will reset the state. */ /* * Free the old FP and other extended state */ free_thread_xstate(current); +#endif } EXPORT_SYMBOL_GPL(start_thread); @@ -230,7 +232,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; - int cpu = smp_processor_id(); + int cpu = raw_smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); fpu_switch_t fpu; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 61cdf7f..12aae51 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -53,6 +53,7 @@ asmlinkage extern void ret_from_fork(void); DEFINE_PER_CPU(unsigned long, old_rsp); +asmlinkage extern void thread_return(void); /* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs *regs, int all) @@ -166,6 +167,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, p->thread.sp = (unsigned long) childregs; p->thread.sp0 = (unsigned long) (childregs+1); p->thread.usersp = me->thread.usersp; + p->thread.rip = (unsigned long) thread_return; set_tsk_thread_flag(p, TIF_FORK); @@ -232,10 +234,15 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, regs->cs = _cs; regs->ss = _ss; regs->flags = X86_EFLAGS_IF; +#ifndef CONFIG_IPIPE /* * Free the old FP and other extended state + * + * Lazily handled when the pipeline is enabled; init_fpu() + * will reset the state. */ free_thread_xstate(current); +#endif } void @@ -270,7 +277,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread; struct thread_struct *next = &next_p->thread; - int cpu = smp_processor_id(); + int cpu = raw_smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); unsigned fsindex, gsindex; fpu_switch_t fpu; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index c4c6a5c..7a14db5 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -1511,6 +1512,10 @@ void syscall_trace_leave(struct pt_regs *regs) { bool step; +#ifdef CONFIG_IPIPE + if (syscall_get_nr(current, regs) >= NR_syscalls) + return; +#endif audit_syscall_exit(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 48d2b7d..c7597e5 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -241,9 +241,9 @@ static void native_stop_other_cpus(int wait) } finish: - local_irq_save(flags); + flags = hard_local_irq_save(); disable_local_APIC(); - local_irq_restore(flags); + hard_local_irq_restore(flags); } /* diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7bd8a08..41196d2 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -229,7 +229,7 @@ static void __cpuinit smp_callin(void) /* * Activate a secondary processor. 
*/ -notrace static void __cpuinit start_secondary(void *unused) +static void __cpuinit start_secondary(void *unused) { /* * Don't put *anything* before cpu_init(), SMP booting is too @@ -796,7 +796,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle) { int apicid = apic->cpu_present_to_apicid(cpu); - unsigned long flags; + unsigned long vflags, rflags; int err; WARN_ON(irqs_disabled()); @@ -836,9 +836,11 @@ int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle) * Check TSC synchronization with the AP (keep irqs disabled * while doing so): */ - local_irq_save(flags); + local_irq_save(vflags); + rflags = hard_local_irq_save(); check_tsc_sync_source(cpu); - local_irq_restore(flags); + hard_local_irq_restore(rflags); + local_irq_restore(vflags); while (!cpu_online(cpu)) { cpu_relax(); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 05b31d9..4a78c15 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -595,6 +595,7 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) void math_state_restore(void) { struct task_struct *tsk = current; + unsigned long flags; if (!tsk_used_math(tsk)) { local_irq_enable(); @@ -611,17 +612,20 @@ void math_state_restore(void) local_irq_disable(); } + flags = hard_cond_local_irq_save(); __thread_fpu_begin(tsk); /* * Paranoid restore. send a SIGSEGV if we fail to restore the state. */ if (unlikely(restore_fpu_checking(tsk))) { __thread_fpu_end(tsk); + hard_cond_local_irq_enable(); force_sig(SIGSEGV, tsk); return; } tsk->fpu_counter++; + hard_cond_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(math_state_restore); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index fc0a147..1b545f2 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -406,9 +406,9 @@ unsigned long native_calibrate_tsc(void) unsigned long flags, latch, ms, fast_calibrate; int hpet = is_hpet_enabled(), i, loopmin; - local_irq_save(flags); + flags = hard_local_irq_save(); fast_calibrate = quick_pit_calibrate(); - local_irq_restore(flags); + hard_local_irq_restore(flags); if (fast_calibrate) return fast_calibrate; @@ -451,11 +451,11 @@ unsigned long native_calibrate_tsc(void) * calibration, which will take at least 50ms, and * read the end value. 
*/ - local_irq_save(flags); + flags = hard_local_irq_save(); tsc1 = tsc_read_refs(&ref1, hpet); tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin); tsc2 = tsc_read_refs(&ref2, hpet); - local_irq_restore(flags); + hard_local_irq_restore(flags); /* Pick the lowest PIT TSC calibration so far */ tsc_pit_min = min(tsc_pit_min, tsc_pit_khz); @@ -739,7 +739,7 @@ core_initcall(cpufreq_tsc); /* clocksource code */ -static struct clocksource clocksource_tsc; +struct clocksource clocksource_tsc; /* * We compare the TSC to the cycle_last value in the clocksource @@ -766,7 +766,7 @@ static void resume_tsc(struct clocksource *cs) clocksource_tsc.cycle_last = 0; } -static struct clocksource clocksource_tsc = { +struct clocksource clocksource_tsc = { .name = "tsc", .rating = 300, .read = read_tsc, diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 255f58a..3c7ea3e 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -148,12 +148,14 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) do_exit(SIGSEGV); } + hard_cond_local_irq_disable(); tss = &per_cpu(init_tss, get_cpu()); current->thread.sp0 = current->thread.saved_sp0; current->thread.sysenter_cs = __KERNEL_CS; load_sp0(tss, &current->thread); current->thread.saved_sp0 = 0; put_cpu(); + hard_cond_local_irq_enable(); ret = KVM86->regs32; @@ -326,12 +328,14 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk tsk->thread.saved_fs = info->regs32->fs; tsk->thread.saved_gs = get_user_gs(info->regs32); + hard_cond_local_irq_disable(); tss = &per_cpu(init_tss, get_cpu()); tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; if (cpu_has_sep) tsk->thread.sysenter_cs = 0; load_sp0(tss, &tsk->thread); put_cpu(); + hard_cond_local_irq_enable(); tsk->thread.screen_bitmap = info->screen_bitmap; if (info->flags & VM86_SCREEN_BITMAP) diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 5db36ca..710a57b 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -106,6 +107,9 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, timespec_add(vsyscall_gtod_data.wall_time_coarse, *wtm); write_seqcount_end(&vsyscall_gtod_data.seq); + + if (clock == &clocksource_tsc) + ipipe_update_hostrt(wall_time, wtm, clock, mult); } static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f75af40..8175c93 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3812,7 +3812,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) clgi(); - local_irq_enable(); + hard_local_irq_enable(); asm volatile ( "push %%"R"bp; \n\t" @@ -3896,7 +3896,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) reload_tss(vcpu); - local_irq_disable(); + hard_local_irq_disable(); vcpu->arch.cr2 = svm->vmcb->save.cr2; vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 61593fd..ef95ed0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1482,9 +1482,11 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) static void vmx_load_host_state(struct vcpu_vmx *vmx) { - preempt_disable(); + unsigned long flags; + + flags = hard_preempt_disable(); __vmx_load_host_state(vmx); - preempt_enable(); + hard_preempt_enable(flags); } /* @@ -1754,6 +1756,7 @@ static void setup_msrs(struct vcpu_vmx *vmx) int save_nmsrs, index; unsigned long *msr_bitmap;
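/*
 * [Editor's note: illustration only, not part of the patch] The KVM
 * hunks around here guard per-cpu host state with pipeline-aware
 * primitives: vmx_load_host_state() above now brackets
 * __vmx_load_host_state() with hard_preempt_disable()/enable(), and
 * setup_msrs() below wraps the MSR bookkeeping in
 * hard_cond_local_irq_disable()/enable(), so the head domain cannot
 * preempt a vCPU while that state is half-updated. Going by the
 * definitions added later in this patch (ipipe_base.h),
 * hard_preempt_disable() hard-masks interrupts and, when running over
 * the root domain, also raises the normal preemption count; the
 * hard_cond_* forms map to plain hard masking when CONFIG_IPIPE is
 * enabled (the !IPIPE fallbacks are not shown here). A sketch of the
 * pattern, with an invented per-cpu variable standing in for the real
 * host state:
 *
 *	unsigned long flags;
 *
 *	flags = hard_preempt_disable();    (IF cleared, then
 *	                                    preempt_disable() if over
 *	                                    the root domain)
 *	__this_cpu_write(example_state, 1);
 *	hard_preempt_enable(flags);        (reverse order; reschedule
 *	                                    check only for the root
 *	                                    domain)
 *
 * example_state is a hypothetical DEFINE_PER_CPU variable used only
 * for this illustration.
 */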
+ hard_cond_local_irq_disable(); save_nmsrs = 0; #ifdef CONFIG_X86_64 if (is_long_mode(&vmx->vcpu)) { @@ -1783,6 +1786,7 @@ static void setup_msrs(struct vcpu_vmx *vmx) move_msr_up(vmx, index, save_nmsrs++); vmx->save_nmsrs = save_nmsrs; + hard_cond_local_irq_enable(); if (cpu_has_vmx_msr_bitmap()) { if (is_long_mode(&vmx->vcpu)) @@ -6360,7 +6364,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx_vcpu_load(&vmx->vcpu, cpu); vmx->vcpu.cpu = cpu; err = vmx_vcpu_setup(vmx); + hard_cond_local_irq_disable(); vmx_vcpu_put(&vmx->vcpu); + hard_cond_local_irq_enable(); put_cpu(); if (err) goto free_vmcs; @@ -6382,6 +6388,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->nested.current_vmptr = -1ull; vmx->nested.current_vmcs12 = NULL; +#ifdef CONFIG_IPIPE + vmx->vcpu.ipipe_notifier.handler = __ipipe_handle_vm_preemption; +#endif return &vmx->vcpu; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 14c290d..d347f3c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -111,6 +112,7 @@ struct kvm_shared_msrs_global { struct kvm_shared_msrs { struct user_return_notifier urn; bool registered; + bool dirty; struct kvm_shared_msr_values { u64 host; u64 curr; @@ -167,22 +169,36 @@ static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) vcpu->arch.apf.gfns[i] = ~0; } +static void kvm_restore_shared_msrs(struct kvm_shared_msrs *locals) +{ + struct kvm_shared_msr_values *values; + unsigned long flags; + unsigned int slot; + + flags = hard_cond_local_irq_save(); + if (locals->dirty) { + for (slot = 0; slot < shared_msrs_global.nr; ++slot) { + values = &locals->values[slot]; + if (values->host != values->curr) { + wrmsrl(shared_msrs_global.msrs[slot], + values->host); + values->curr = values->host; + } + } + locals->dirty = false; + } + hard_cond_local_irq_restore(flags); +} + static void kvm_on_user_return(struct user_return_notifier *urn) { - unsigned slot; struct kvm_shared_msrs *locals = container_of(urn, struct kvm_shared_msrs, urn); - struct kvm_shared_msr_values *values; - for (slot = 0; slot < shared_msrs_global.nr; ++slot) { - values = &locals->values[slot]; - if (values->host != values->curr) { - wrmsrl(shared_msrs_global.msrs[slot], values->host); - values->curr = values->host; - } - } + kvm_restore_shared_msrs(locals); locals->registered = false; user_return_notifier_unregister(urn); + __ipipe_exit_vm(); } static void shared_msr_update(unsigned slot, u32 msr) @@ -228,6 +244,7 @@ void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) return; smsr->values[slot].curr = value; wrmsrl(shared_msrs_global.msrs[slot], value); + smsr->dirty = true; if (!smsr->registered) { smsr->urn.on_user_return = kvm_on_user_return; user_return_notifier_register(&smsr->urn); @@ -2317,10 +2334,35 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { + struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); + unsigned long flags; + + flags = hard_cond_local_irq_save(); + kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); vcpu->arch.last_host_tsc = native_read_tsc(); + + if (!smsr->dirty) + __ipipe_exit_vm(); + + hard_cond_local_irq_restore(flags); +} + +#ifdef CONFIG_IPIPE + +void __ipipe_handle_vm_preemption(struct ipipe_vm_notifier *nfy) +{ + struct kvm_vcpu *vcpu; + + vcpu = container_of(nfy, struct kvm_vcpu, ipipe_notifier); + kvm_arch_vcpu_put(vcpu); + 
kvm_restore_shared_msrs(__this_cpu_ptr(&shared_msrs)); + __ipipe_exit_vm(); } +EXPORT_SYMBOL_GPL(__ipipe_handle_vm_preemption); + +#endif static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) @@ -5305,6 +5347,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } preempt_disable(); + local_irq_disable(); + hard_cond_local_irq_disable(); + + __ipipe_enter_vm(&vcpu->ipipe_notifier); kvm_x86_ops->prepare_guest_switch(vcpu); if (vcpu->fpu_active) @@ -5318,13 +5364,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) */ smp_mb(); - local_irq_disable(); - if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests || need_resched() || signal_pending(current)) { vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); - local_irq_enable(); + hard_cond_local_irq_enable(); preempt_enable(); kvm_x86_ops->cancel_injection(vcpu); r = 1; @@ -5363,6 +5407,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); + hard_cond_local_irq_enable(); local_irq_enable(); ++vcpu->stat.exits; diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c index c9f2d9b..ccf6144 100644 --- a/arch/x86/lib/mmx_32.c +++ b/arch/x86/lib/mmx_32.c @@ -30,7 +30,7 @@ void *_mmx_memcpy(void *to, const void *from, size_t len) void *p; int i; - if (unlikely(in_interrupt())) + if (unlikely(!ipipe_root_p || in_interrupt())) return __memcpy(to, from, len); p = to; diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S index a63efd6..45f2129 100644 --- a/arch/x86/lib/thunk_64.S +++ b/arch/x86/lib/thunk_64.S @@ -43,3 +43,24 @@ restore: RESTORE_ARGS ret CFI_ENDPROC + +#ifdef CONFIG_IPIPE + /* rdi: arg1 ... normal C conventions. rax is passed from C. */ + .macro THUNK_RETRAX name,func + .globl \name +\name: + CFI_STARTPROC + SAVE_ARGS + call \func + jmp restore_norax + CFI_ENDPROC + .endm + THUNK_RETRAX __ipipe_syscall_root_thunk,__ipipe_syscall_root + + CFI_STARTPROC + SAVE_ARGS +restore_norax: + RESTORE_ARGS 0 + ret + CFI_ENDPROC +#endif diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 76dcd9d..718c79c 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -354,9 +354,9 @@ void vmalloc_sync_all(void) * * This assumes no large pages in there. */ -static noinline __kprobes int vmalloc_fault(unsigned long address) +static inline int vmalloc_sync_one(pgd_t *pgd, unsigned long address) { - pgd_t *pgd, *pgd_ref; + pgd_t *pgd_ref; pud_t *pud, *pud_ref; pmd_t *pmd, *pmd_ref; pte_t *pte, *pte_ref; @@ -372,7 +372,6 @@ static noinline __kprobes int vmalloc_fault(unsigned long address) * happen within a race in page table update. In the later * case just flush: */ - pgd = pgd_offset(current->active_mm, address); pgd_ref = pgd_offset_k(address); if (pgd_none(*pgd_ref)) return -1; @@ -420,6 +419,12 @@ static noinline __kprobes int vmalloc_fault(unsigned long address) return 0; } +static noinline __kprobes int vmalloc_fault(unsigned long address) +{ + pgd_t *pgd = pgd_offset(current->active_mm, address); + return vmalloc_sync_one(pgd, address); +} + #ifdef CONFIG_CPU_SUP_AMD static const char errata93_warning[] = KERN_ERR @@ -1018,6 +1023,11 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) /* Get the faulting address: */ address = read_cr2(); +#ifdef CONFIG_IPIPE + if (ipipe_root_domain != ipipe_head_domain) + hard_cond_local_irq_enable(); +#endif + /* * Detect and handle instructions that would cause a page fault for * both a tracked kernel page and a userspace page. 
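/*
 * [Editor's note: illustration only, not part of the patch] The hunk
 * above factors the page-table synchronization out of vmalloc_fault()
 * into vmalloc_sync_one(), and the hunk below reuses it to implement
 * __ipipe_pin_range_globally(): the helper walks every pgd on
 * pgd_list and propagates the given kernel virtual range, so a later
 * access from the head domain cannot raise a lazy vmalloc fault that
 * only the root domain's fault handler could service. A hypothetical
 * caller (RT_HEAP_SIZE is an invented constant) could pin a freshly
 * vmalloc'ed region like this:
 *
 *	void *area = vmalloc(RT_HEAP_SIZE);
 *
 *	if (area)
 *		__ipipe_pin_range_globally((unsigned long)area,
 *					   (unsigned long)area + RT_HEAP_SIZE);
 *
 * After that, touching the region over the head domain does not
 * depend on the lazy vmalloc fault path on any CPU.
 */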
@@ -1209,3 +1219,43 @@ good_area: up_read(&mm->mmap_sem); } + +#ifdef CONFIG_IPIPE +void __ipipe_pin_range_globally(unsigned long start, unsigned long end) +{ +#ifdef CONFIG_X86_32 + unsigned long next, addr = start; + + do { + unsigned long flags; + struct page *page; + + next = pgd_addr_end(addr, end); + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) + vmalloc_sync_one(page_address(page), addr); + spin_unlock_irqrestore(&pgd_lock, flags); + + } while (addr = next, addr != end); +#else + unsigned long next, addr = start; + int ret = 0; + + do { + struct page *page; + + next = pgd_addr_end(addr, end); + spin_lock(&pgd_lock); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pgd = (pgd_t *)page_address(page) + pgd_index(addr); + ret = vmalloc_sync_one(pgd, addr); + if (ret) + break; + } + spin_unlock(&pgd_lock); + addr = next; + } while (!ret && addr != end); +#endif +} +#endif /* CONFIG_IPIPE */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 5e57e11..77837c2 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -61,13 +61,17 @@ static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset); */ void leave_mm(int cpu) { + unsigned long flags; + struct mm_struct *active_mm = this_cpu_read(cpu_tlbstate.active_mm); if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) BUG(); + flags = hard_cond_local_irq_save(); if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) { cpumask_clear_cpu(cpu, mm_cpumask(active_mm)); load_cr3(swapper_pg_dir); } + hard_cond_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(leave_mm); @@ -194,6 +198,9 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, apic->send_IPI_mask(to_cpumask(f->flush_cpumask), INVALIDATE_TLB_VECTOR_START + sender); +#ifdef CONFIG_IPIPE + WARN_ON_ONCE(hard_irqs_disabled()); +#endif while (!cpumask_empty(to_cpumask(f->flush_cpumask))) cpu_relax(); } diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 59880af..72605be 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -2101,6 +2101,11 @@ static int __init uv_bau_init(void) init_uvhub(uvhub, vector, uv_base_pnode); alloc_intr_gate(vector, uv_bau_message_intr1); +#ifdef CONFIG_IPIPE + for_each_possible_cpu(cur_cpu) + per_cpu(vector_irq, cur_cpu)[vector] = + ipipe_apic_vector_irq(vector); +#endif for_each_possible_blade(uvhub) { if (uv_blade_nr_possible_cpus(uvhub)) { diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile index dd3e661..585b983 100644 --- a/drivers/clocksource/Makefile +++ b/drivers/clocksource/Makefile @@ -8,6 +8,9 @@ obj-$(CONFIG_SH_TIMER_MTU2) += sh_mtu2.o obj-$(CONFIG_SH_TIMER_TMU) += sh_tmu.o obj-$(CONFIG_EM_TIMER_STI) += em_sti.o obj-$(CONFIG_CLKBLD_I8253) += i8253.o +ifdef CONFIG_X86_32 +obj-$(CONFIG_IPIPE_WANT_CLOCKSOURCE) += ipipe_i486_tsc_emu.o +endif obj-$(CONFIG_CLKSRC_MMIO) += mmio.o obj-$(CONFIG_DW_APB_TIMER) += dw_apb_timer.o obj-$(CONFIG_CLKSRC_DBX500_PRCMU) += clksrc-dbx500-prcmu.o \ No newline at end of file diff --git a/drivers/clocksource/i8253.c b/drivers/clocksource/i8253.c index e7cab2d..cd39ba2 100644 --- a/drivers/clocksource/i8253.c +++ b/drivers/clocksource/i8253.c @@ -9,6 +9,8 @@ #include #include #include +#include +#include /* * Protects access to I/O ports @@ -16,8 +18,9 @@ * 0040-0043 : timer0, i8253 / i8254 * 0061-0061 : NMI Control Register which contains two speaker control bits. 
*/ -DEFINE_RAW_SPINLOCK(i8253_lock); +IPIPE_DEFINE_RAW_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); +static unsigned periodic_pit_ch0; #ifdef CONFIG_CLKSRC_I8253 /* @@ -33,6 +36,10 @@ static cycle_t i8253_read(struct clocksource *cs) int count; u32 jifs; + if (periodic_pit_ch0 == 0) + /* The PIT is not running in periodic mode. */ + return jiffies * PIT_LATCH + (PIT_LATCH - 1) - old_count; + raw_spin_lock_irqsave(&i8253_lock, flags); /* * Although our caller may have the read side of xtime_lock, @@ -93,8 +100,37 @@ static struct clocksource i8253_cs = { .mask = CLOCKSOURCE_MASK(32), }; +#ifdef CONFIG_IPIPE + +#define IPIPE_PIT_COUNT2LATCH 0xfffe + +extern cycle_t __ipipe_get_8253_tsc(struct clocksource *cs); + +int __ipipe_last_8253_counter2; + +void ipipe_setup_8253_tsc(void) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&i8253_lock, flags); + outb_p(0xb4, PIT_MODE); + outb_p(IPIPE_PIT_COUNT2LATCH & 0xff, PIT_CH2); + outb_p(IPIPE_PIT_COUNT2LATCH >> 8, PIT_CH2); + /* Gate high, disable speaker */ + outb_p((inb_p(0x61) & ~0x2) | 1, 0x61); + + raw_spin_unlock_irqrestore(&i8253_lock, flags); + + i8253_cs.ipipe_read = __ipipe_get_8253_tsc; +} +#else /* !CONFIG_IPIPE */ +#define ipipe_setup_8253_tsc() do { } while(0) +#endif /* !CONFIG_IPIPE */ + int __init clocksource_i8253_init(void) { + if (cpu_has_tsc == 0) + ipipe_setup_8253_tsc(); return clocksource_register_hz(&i8253_cs, PIT_TICK_RATE); } #endif @@ -110,8 +146,10 @@ static void init_pit_timer(enum clock_event_mode mode, { raw_spin_lock(&i8253_lock); + periodic_pit_ch0 = 0; switch (mode) { case CLOCK_EVT_MODE_PERIODIC: + periodic_pit_ch0 = 1; /* binary, mode 2, LSB/MSB, ch 0 */ outb_p(0x34, PIT_MODE); outb_p(PIT_LATCH & 0xff , PIT_CH0); /* LSB */ @@ -148,13 +186,25 @@ static void init_pit_timer(enum clock_event_mode mode, static int pit_next_event(unsigned long delta, struct clock_event_device *evt) { raw_spin_lock(&i8253_lock); +#ifndef CONFIG_IPIPE outb_p(delta & 0xff , PIT_CH0); /* LSB */ outb_p(delta >> 8 , PIT_CH0); /* MSB */ +#else /* CONFIG_IPIPE */ + outb(delta & 0xff , PIT_CH0); /* LSB */ + outb(delta >> 8 , PIT_CH0); /* MSB */ +#endif /* CONFIG_IPIPE */ raw_spin_unlock(&i8253_lock); return 0; } +#ifdef CONFIG_IPIPE +static struct ipipe_timer i8253_itimer = { + .irq = 0, + .min_delay_ticks = 1, +}; +#endif /* CONFIG_IPIPE */ + /* * On UP the PIT can serve all of the possible timer functions. On SMP systems * it can be solely used for the global tick. 
@@ -164,6 +214,9 @@ struct clock_event_device i8253_clockevent = { .features = CLOCK_EVT_FEAT_PERIODIC, .set_mode = init_pit_timer, .set_next_event = pit_next_event, +#ifdef CONFIG_IPIPE + .ipipe_timer = &i8253_itimer, +#endif /* CONFIG_IPIPE */ }; /* diff --git a/drivers/clocksource/ipipe_i486_tsc_emu.S b/drivers/clocksource/ipipe_i486_tsc_emu.S new file mode 100644 index 0000000..3fe423c --- /dev/null +++ b/drivers/clocksource/ipipe_i486_tsc_emu.S @@ -0,0 +1,106 @@ +#include +#include + +#define PIT_MODE 0x43 +#define PIT_CH2 0x42 +#define PIT_COUNT2LATCH 0xfffe + +.macro SAVE reg + pushl_cfi %\reg + CFI_REL_OFFSET \reg, 0 +.endm + +.macro RESTORE reg + popl_cfi %\reg + CFI_RESTORE \reg +.endm + +ENTRY(__ipipe_get_8253_tsc) + CFI_STARTPROC + + mov $0xd8, %al + out %al, $(PIT_MODE) + in $(PIT_CH2), %al + xor %ecx, %ecx + mov %al, %cl + in $(PIT_CH2), %al + mov %al, %ch + + mov __ipipe_last_8253_counter2, %eax + mov __ipipe_cs_last_tsc + 4, %edx + sub %ecx, %eax + mov %ecx, __ipipe_last_8253_counter2 + test %eax, %eax + mov __ipipe_cs_last_tsc, %ecx + jg 1f + add $(PIT_COUNT2LATCH), %eax +1: add %ecx, %eax + adc $0, %edx + mov %eax, __ipipe_cs_last_tsc + mov %edx, __ipipe_cs_last_tsc + 4 + + ret + + CFI_ENDPROC +ENDPROC(__ipipe_get_8253_tsc) + +ENTRY(__ipipe_get_cs_tsc) + CFI_STARTPROC + + SAVE ecx + + pushfl_cfi + cli + + mov __ipipe_cs_mask + 4, %ecx + mov __ipipe_cs_mask, %edx + cmp $0xffffffff, %ecx + mov __ipipe_cs, %eax + jne 1f + + /* 64 bits clocksource */ + call *__ipipe_cs_read + jmp 4f + +1: cmp $0xffffffff, %edx + jne 2f + + /* 32 bits clocksource */ + call *__ipipe_cs_read + + mov __ipipe_cs_last_tsc + 4, %edx + cmp __ipipe_cs_last_tsc, %eax + adc $0, %edx + + jmp 4f + + /* n bits (< 32) clocksource */ +2: SAVE ebx + + mov %edx, %ebx + call *__ipipe_cs_read + + mov __ipipe_cs_last_tsc, %ecx + and %ebx, %eax + mov %ebx, %edx + and %ecx, %ebx + not %edx + cmp %ebx, %eax + jae 3f + sub %edx, %eax +3: and %edx, %ecx + mov __ipipe_cs_last_tsc + 4, %edx + add %ecx, %eax + adc $0, %edx + + RESTORE ebx + +4: mov %eax, __ipipe_cs_last_tsc + mov %edx, __ipipe_cs_last_tsc + 4 + popfl_cfi + RESTORE ecx + ret + + /* n bits clocksource with 32 < n < 64, not supported. */ + CFI_ENDPROC +ENDPROC(__ipipe_get_cs_tsc) diff --git a/drivers/pci/htirq.c b/drivers/pci/htirq.c index 6e373ea..f32da7b 100644 --- a/drivers/pci/htirq.c +++ b/drivers/pci/htirq.c @@ -21,7 +21,7 @@ * With multiple simultaneous hypertransport irq devices it might pay * to make this more fine grained. But start with simple, stupid, and correct. */ -static DEFINE_SPINLOCK(ht_irq_lock); +static IPIPE_DEFINE_SPINLOCK(ht_irq_lock); struct ht_irq_cfg { struct pci_dev *dev; diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig index 78a666d..20e9852 100644 --- a/drivers/cpuidle/Kconfig +++ b/drivers/cpuidle/Kconfig @@ -2,6 +2,7 @@ config CPU_IDLE bool "CPU idle PM support" default y if ACPI || PPC_PSERIES + depends on !(ARCH_OMAP4 && IPIPE) help CPU idle is a generic framework for supporting software-controlled idle processor power management. 
It includes modular cross-platform diff --git a/drivers/tty/serial/8250/8250.c b/drivers/tty/serial/8250/8250.c index 6e1958a..88bc148 100644 --- a/drivers/tty/serial/8250/8250.c +++ b/drivers/tty/serial/8250/8250.c @@ -3055,6 +3055,84 @@ static int serial8250_resume(struct platform_device *dev) return 0; } +#if defined(CONFIG_IPIPE_DEBUG) && defined(CONFIG_SERIAL_8250_CONSOLE) + +static IPIPE_DEFINE_SPINLOCK(ipipe_8250_lock); + +#include + +static void wait_for_xmitr_nodelay(struct uart_8250_port *up, int bits) +{ + unsigned int status, tmout = 10000; + + for (;;) { + status = serial_in(up, UART_LSR); + + up->lsr_saved_flags |= status & LSR_SAVE_FLAGS; + + if ((status & bits) == bits) + break; + if (--tmout == 0) + break; + cpu_relax(); + } +} + +static void serial8250_console_putchar_nodelay(struct uart_port *port, int ch) +{ + struct uart_8250_port *up = + container_of(port, struct uart_8250_port, port); + + wait_for_xmitr_nodelay(up, UART_LSR_THRE); + serial_port_out(port, UART_TX, ch); +} + +void __weak __ipipe_serial_debug(const char *fmt, ...) +{ + struct uart_8250_port *up = &serial8250_ports[0]; + unsigned int ier, count; + unsigned long flags; + char buf[128]; + va_list ap; + + if (up->port.membase == NULL + && up->port.iobase == 0 + && up->port.mapbase == 0) + return; + + va_start(ap, fmt); + vsprintf(buf, fmt, ap); + va_end(ap); + count = strlen(buf); + + touch_nmi_watchdog(); + + spin_lock_irqsave(&ipipe_8250_lock, flags); + + /* + * First save the IER then disable the interrupts + */ + ier = serial_in(up, UART_IER); + + if (up->capabilities & UART_CAP_UUE) + serial_out(up, UART_IER, UART_IER_UUE); + else + serial_out(up, UART_IER, 0); + + uart_console_write(&up->port, buf, count, serial8250_console_putchar_nodelay); + + /* + * Finally, wait for transmitter to become empty + * and restore the IER + */ + wait_for_xmitr_nodelay(up, BOTH_EMPTY); + serial_out(up, UART_IER, ier); + + spin_unlock_irqrestore(&ipipe_8250_lock, flags); +} + +#endif + static struct platform_driver serial8250_isa_driver = { .probe = serial8250_probe, .remove = __devexit_p(serial8250_remove), diff --git a/fs/exec.c b/fs/exec.c index e95aeed..5dbf818 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -815,6 +815,7 @@ static int exec_mmap(struct mm_struct *mm) { struct task_struct *tsk; struct mm_struct * old_mm, *active_mm; + unsigned long flags; /* Notify parent that we're no longer interested in the old VM */ tsk = current; @@ -838,8 +839,10 @@ static int exec_mmap(struct mm_struct *mm) task_lock(tsk); active_mm = tsk->active_mm; tsk->mm = mm; + ipipe_mm_switch_protect(flags); tsk->active_mm = mm; activate_mm(active_mm, mm); + ipipe_mm_switch_unprotect(flags); task_unlock(tsk); arch_pick_mmap_layout(mm); if (old_mm) { diff --git a/fs/proc/array.c b/fs/proc/array.c index c1c207c..95e4092 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -143,6 +143,10 @@ static const char * const task_state_array[] = { "x (dead)", /* 64 */ "K (wakekill)", /* 128 */ "W (waking)", /* 256 */ +#ifdef CONFIG_IPIPE + "A (atomic switch)", /* 512 */ + "N (wakeup disabled)", /* 1024 */ +#endif }; static inline const char *get_task_state(struct task_struct *tsk) diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h index 1ced641..f41c3d2 100644 --- a/include/asm-generic/atomic.h +++ b/include/asm-generic/atomic.h @@ -68,11 +68,11 @@ static inline int atomic_add_return(int i, atomic_t *v) unsigned long flags; int temp; - raw_local_irq_save(flags); /* Don't trace it in an irqsoff handler */ + flags = 
hard_local_irq_save(); /* Don't trace it in an irqsoff handler */ temp = v->counter; temp += i; v->counter = temp; - raw_local_irq_restore(flags); + hard_local_irq_restore(flags); return temp; } @@ -91,11 +91,11 @@ static inline int atomic_sub_return(int i, atomic_t *v) unsigned long flags; int temp; - raw_local_irq_save(flags); /* Don't trace it in an irqsoff handler */ + flags = hard_local_irq_save(); temp = v->counter; temp -= i; v->counter = temp; - raw_local_irq_restore(flags); + hard_local_irq_restore(flags); return temp; } @@ -164,9 +164,9 @@ static inline void atomic_clear_mask(unsigned long mask, atomic_t *v) unsigned long flags; mask = ~mask; - raw_local_irq_save(flags); /* Don't trace it in a irqsoff handler */ + flags = hard_local_irq_save(); v->counter &= mask; - raw_local_irq_restore(flags); + hard_local_irq_restore(flags); } #endif diff --git a/include/asm-generic/bitops/atomic.h b/include/asm-generic/bitops/atomic.h index 9ae6c34..3d4535a 100644 --- a/include/asm-generic/bitops/atomic.h +++ b/include/asm-generic/bitops/atomic.h @@ -21,20 +21,20 @@ extern arch_spinlock_t __atomic_hash[ATOMIC_HASH_SIZE] __lock_aligned; * this is the substitute */ #define _atomic_spin_lock_irqsave(l,f) do { \ arch_spinlock_t *s = ATOMIC_HASH(l); \ - local_irq_save(f); \ + (f) = hard_local_irq_save(); \ arch_spin_lock(s); \ } while(0) #define _atomic_spin_unlock_irqrestore(l,f) do { \ arch_spinlock_t *s = ATOMIC_HASH(l); \ arch_spin_unlock(s); \ - local_irq_restore(f); \ + hard_local_irq_restore(f); \ } while(0) #else -# define _atomic_spin_lock_irqsave(l,f) do { local_irq_save(f); } while (0) -# define _atomic_spin_unlock_irqrestore(l,f) do { local_irq_restore(f); } while (0) +# define _atomic_spin_lock_irqsave(l,f) do { (f) = hard_local_irq_save(); } while (0) +# define _atomic_spin_unlock_irqrestore(l,f) do { hard_local_irq_restore(f); } while (0) #endif /* diff --git a/include/asm-generic/cmpxchg-local.h b/include/asm-generic/cmpxchg-local.h index 2533fdd..6559e2d 100644 --- a/include/asm-generic/cmpxchg-local.h +++ b/include/asm-generic/cmpxchg-local.h @@ -21,7 +21,7 @@ static inline unsigned long __cmpxchg_local_generic(volatile void *ptr, if (size == 8 && sizeof(unsigned long) != 8) wrong_size_cmpxchg(ptr); - local_irq_save(flags); + flags = hard_local_irq_save(); switch (size) { case 1: prev = *(u8 *)ptr; if (prev == old) @@ -42,7 +42,7 @@ static inline unsigned long __cmpxchg_local_generic(volatile void *ptr, default: wrong_size_cmpxchg(ptr); } - local_irq_restore(flags); + hard_local_irq_restore(flags); return prev; } @@ -55,11 +55,11 @@ static inline u64 __cmpxchg64_local_generic(volatile void *ptr, u64 prev; unsigned long flags; - local_irq_save(flags); + flags = hard_local_irq_save(); prev = *(u64 *)ptr; if (prev == old) *(u64 *)ptr = new; - local_irq_restore(flags); + hard_local_irq_restore(flags); return prev; } diff --git a/include/asm-generic/mman-common.h b/include/asm-generic/mman-common.h index d030d2c..5612229 100644 --- a/include/asm-generic/mman-common.h +++ b/include/asm-generic/mman-common.h @@ -19,6 +19,9 @@ #define MAP_TYPE 0x0f /* Mask for type of mapping */ #define MAP_FIXED 0x10 /* Interpret addr exactly */ #define MAP_ANONYMOUS 0x20 /* don't use a file */ +#ifndef MAP_BRK +# define MAP_BRK 0 +#endif #ifdef CONFIG_MMAP_ALLOW_UNINITIALIZED # define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be uninitialized */ #else diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index d17784e..d8e2912 100644 --- 
a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -67,6 +67,22 @@ extern unsigned long __per_cpu_offset[NR_CPUS]; #define __get_cpu_var(var) (*this_cpu_ptr(&(var))) #define __raw_get_cpu_var(var) (*__this_cpu_ptr(&(var))) +#ifdef CONFIG_IPIPE +#if defined(CONFIG_IPIPE_DEBUG_INTERNAL) && defined(CONFIG_SMP) +extern int __ipipe_check_percpu_access(void); +#define __ipipe_cpu_offset \ + ({ \ + WARN_ON_ONCE(__ipipe_check_percpu_access()); \ + __my_cpu_offset; \ + }) +#else +#define __ipipe_cpu_offset __my_cpu_offset +#endif +#ifndef __ipipe_this_cpu_ptr +#define __ipipe_this_cpu_ptr(ptr) SHIFT_PERCPU_PTR(ptr, __ipipe_cpu_offset) +#endif +#define __ipipe_this_cpu_read(var) (*__ipipe_this_cpu_ptr(&(var))) +#endif /* CONFIG_IPIPE */ #ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA extern void setup_per_cpu_areas(void); @@ -82,6 +98,8 @@ extern void setup_per_cpu_areas(void); #define per_cpu(var, cpu) (*((void)(cpu), VERIFY_PERCPU_PTR(&(var)))) #define __get_cpu_var(var) (*VERIFY_PERCPU_PTR(&(var))) #define __raw_get_cpu_var(var) (*VERIFY_PERCPU_PTR(&(var))) +#define __ipipe_this_cpu_ptr(ptr) VERIFY_PERCPU_PTR(ptr) +#define __ipipe_this_cpu_read(var) (*__ipipe_this_cpu_ptr(&(var))) #define this_cpu_ptr(ptr) per_cpu_ptr(ptr, 0) #define __this_cpu_ptr(ptr) this_cpu_ptr(ptr) diff --git a/include/asm-generic/resource.h b/include/asm-generic/resource.h index 61fa862..d36ceb9 100644 --- a/include/asm-generic/resource.h +++ b/include/asm-generic/resource.h @@ -58,6 +58,14 @@ #endif /* + * Limit the stack by to some sane default: root can always + * increase this limit if needed.. 8MB seems reasonable. + */ +#ifndef _STK_LIM +# define _STK_LIM (8*1024*1024) +#endif + +/* * RLIMIT_STACK default maximum - some architectures override it: */ #ifndef _STK_LIM_MAX diff --git a/include/ipipe/setup.h b/include/ipipe/setup.h new file mode 100644 index 0000000..c2bc521 --- /dev/null +++ b/include/ipipe/setup.h @@ -0,0 +1,10 @@ +#ifndef _IPIPE_SETUP_H +#define _IPIPE_SETUP_H + +/* + * Placeholders for setup hooks defined by client domains. + */ + +static inline void __ipipe_early_client_setup(void) { } + +#endif /* !_IPIPE_SETUP_H */ diff --git a/include/ipipe/thread_info.h b/include/ipipe/thread_info.h new file mode 100644 index 0000000..1f6e9c3 --- /dev/null +++ b/include/ipipe/thread_info.h @@ -0,0 +1,14 @@ +#ifndef _IPIPE_THREAD_INFO_H +#define _IPIPE_THREAD_INFO_H + +/* + * Placeholder for private thread information defined by client + * domains. 
+ */ + +struct ipipe_threadinfo { +}; + +static inline void __ipipe_init_threadinfo(struct ipipe_threadinfo *p) { } + +#endif /* !_IPIPE_THREAD_INFO_H */ diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index acba8943..2d3f39e 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -105,6 +105,15 @@ struct clock_event_device { int irq; const struct cpumask *cpumask; struct list_head list; + +#ifdef CONFIG_IPIPE + struct ipipe_timer *ipipe_timer; + unsigned ipipe_stolen; + +#define clockevent_ipipe_stolen(evt) ((evt)->ipipe_stolen) +#else +#define clockevent_ipipe_stolen(evt) (0) +#endif /* !CONFIG_IPIPE */ } ____cacheline_aligned; /* diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index fbe89e1..33f6a2f 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -195,6 +195,10 @@ struct clocksource { cycle_t cs_last; cycle_t wd_last; #endif +#ifdef CONFIG_IPIPE_WANT_CLOCKSOURCE + cycle_t (*ipipe_read)(struct clocksource *cs); +#endif /* CONFIG_IPIPE_WANT_CLOCKSOURCE */ + } ____cacheline_aligned; /* diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index bb7f309..249dec1 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -189,6 +189,7 @@ extern void irq_exit(void); #define nmi_enter() \ do { \ + __ipipe_nmi_enter(); \ ftrace_nmi_enter(); \ BUG_ON(in_nmi()); \ add_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \ @@ -205,6 +206,7 @@ extern void irq_exit(void); BUG_ON(!in_nmi()); \ sub_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \ ftrace_nmi_exit(); \ + __ipipe_nmi_exit(); \ } while (0) #endif /* LINUX_HARDIRQ_H */ diff --git a/include/linux/i8253.h b/include/linux/i8253.h index e6bb36a..898a91a 100644 --- a/include/linux/i8253.h +++ b/include/linux/i8253.h @@ -12,6 +12,7 @@ #include #include #include +#include /* i8253A PIT registers */ #define PIT_MODE 0x43 @@ -20,7 +21,7 @@ #define PIT_LATCH ((PIT_TICK_RATE + HZ/2) / HZ) -extern raw_spinlock_t i8253_lock; +IPIPE_DECLARE_RAW_SPINLOCK(i8253_lock); extern struct clock_event_device i8253_clockevent; extern void clockevent_i8253_init(bool oneshot); diff --git a/include/linux/ipipe.h b/include/linux/ipipe.h new file mode 100644 index 0000000..a04ec49 --- /dev/null +++ b/include/linux/ipipe.h @@ -0,0 +1,431 @@ +/* -*- linux-c -*- + * include/linux/ipipe.h + * + * Copyright (C) 2002-2007 Philippe Gerum. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ */ + +#ifndef __LINUX_IPIPE_H +#define __LINUX_IPIPE_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_IPIPE + +#include + +/* ipipe_set_hooks(..., enables) */ +#define IPIPE_SYSCALL __IPIPE_SYSCALL_E +#define IPIPE_TRAP __IPIPE_TRAP_E +#define IPIPE_KEVENT __IPIPE_KEVENT_E + +struct ipipe_sysinfo { + int sys_nr_cpus; /* Number of CPUs on board */ + int sys_hrtimer_irq; /* hrtimer device IRQ */ + u64 sys_hrtimer_freq; /* hrtimer device frequency */ + u64 sys_hrclock_freq; /* hrclock device frequency */ + u64 sys_cpu_freq; /* CPU frequency (Hz) */ + struct ipipe_arch_sysinfo arch; +}; + +struct ipipe_work_header { + size_t size; + void (*handler)(struct ipipe_work_header *work); +}; + +extern unsigned int __ipipe_printk_virq; + +void __ipipe_set_irq_pending(struct ipipe_domain *ipd, unsigned int irq); + +void __ipipe_complete_domain_migration(void); + +int __ipipe_switch_tail(void); + +int __ipipe_migrate_head(void); + +void __ipipe_reenter_root(void); + +int __ipipe_disable_ondemand_mappings(struct task_struct *p); + +int __ipipe_pin_vma(struct mm_struct *mm, struct vm_area_struct *vma); + +#ifdef CONFIG_IPIPE_WANT_PREEMPTIBLE_SWITCH + +#define prepare_arch_switch(next) \ + do { \ + hard_local_irq_enable(); \ + __ipipe_report_schedule(current, next); \ + } while(0) + +#ifndef ipipe_get_active_mm +static inline struct mm_struct *ipipe_get_active_mm(void) +{ + return __this_cpu_read(ipipe_percpu.active_mm); +} +#define ipipe_get_active_mm ipipe_get_active_mm +#endif + +#else /* !CONFIG_IPIPE_WANT_PREEMPTIBLE_SWITCH */ + +#define prepare_arch_switch(next) \ + do { \ + __ipipe_report_schedule(current, next); \ + hard_local_irq_disable(); \ + } while(0) + +#ifndef ipipe_get_active_mm +#define ipipe_get_active_mm() (current->active_mm) +#endif + +#endif /* !CONFIG_IPIPE_WANT_PREEMPTIBLE_SWITCH */ + +#ifdef CONFIG_IPIPE_WANT_CLOCKSOURCE + +extern unsigned long long __ipipe_cs_freq; + +extern struct clocksource *__ipipe_cs; + +#endif /* CONFIG_IPIPE_WANT_CLOCKSOURCE */ + +static inline void __ipipe_nmi_enter(void) +{ + __this_cpu_write(ipipe_percpu.nmi_state, __ipipe_root_status); + __set_bit(IPIPE_STALL_FLAG, &__ipipe_root_status); + ipipe_save_context_nmi(); +} + +static inline void __ipipe_nmi_exit(void) +{ + ipipe_restore_context_nmi(); + if (!test_bit(IPIPE_STALL_FLAG, __this_cpu_ptr(&ipipe_percpu.nmi_state))) + __clear_bit(IPIPE_STALL_FLAG, &__ipipe_root_status); +} + +/* KVM-side calls, hw IRQs off. */ +static inline void __ipipe_enter_vm(struct ipipe_vm_notifier *vmf) +{ + struct ipipe_percpu_data *p; + + p = __ipipe_this_cpu_ptr(&ipipe_percpu); + p->vm_notifier = vmf; + barrier(); +} + +static inline void __ipipe_exit_vm(void) +{ + struct ipipe_percpu_data *p; + + p = __ipipe_this_cpu_ptr(&ipipe_percpu); + p->vm_notifier = NULL; + barrier(); +} + +/* Client-side call, hw IRQs off. 
*/ +void __ipipe_notify_vm_preemption(void); + +static inline void __ipipe_sync_pipeline(struct ipipe_domain *top) +{ + if (__ipipe_current_domain != top) { + __ipipe_do_sync_pipeline(top); + return; + } + if (!test_bit(IPIPE_STALL_FLAG, &ipipe_this_cpu_context(top)->status)) + __ipipe_sync_stage(); +} + +void ipipe_register_head(struct ipipe_domain *ipd, + const char *name); + +void ipipe_unregister_head(struct ipipe_domain *ipd); + +int ipipe_request_irq(struct ipipe_domain *ipd, + unsigned int irq, + ipipe_irq_handler_t handler, + void *cookie, + ipipe_irq_ackfn_t ackfn); + +void ipipe_free_irq(struct ipipe_domain *ipd, + unsigned int irq); + +void ipipe_raise_irq(unsigned int irq); + +void ipipe_set_hooks(struct ipipe_domain *ipd, + int enables); + +unsigned int ipipe_alloc_virq(void); + +void ipipe_free_virq(unsigned int virq); + +static inline void ipipe_post_irq_head(unsigned int irq) +{ + __ipipe_set_irq_pending(ipipe_head_domain, irq); +} + +static inline void ipipe_post_irq_root(unsigned int irq) +{ + __ipipe_set_irq_pending(&ipipe_root, irq); +} + +static inline void ipipe_stall_head(void) +{ + hard_local_irq_disable(); + __set_bit(IPIPE_STALL_FLAG, &__ipipe_head_status); +} + +static inline unsigned long ipipe_test_and_stall_head(void) +{ + hard_local_irq_disable(); + return __test_and_set_bit(IPIPE_STALL_FLAG, &__ipipe_head_status); +} + +static inline unsigned long ipipe_test_head(void) +{ + unsigned long flags, ret; + + flags = hard_smp_local_irq_save(); + ret = test_bit(IPIPE_STALL_FLAG, &__ipipe_head_status); + hard_smp_local_irq_restore(flags); + + return ret; +} + +void ipipe_unstall_head(void); + +void __ipipe_restore_head(unsigned long x); + +static inline void ipipe_restore_head(unsigned long x) +{ + ipipe_check_irqoff(); + if ((x ^ test_bit(IPIPE_STALL_FLAG, &__ipipe_head_status)) & 1) + __ipipe_restore_head(x); +} + +void __ipipe_post_work_root(struct ipipe_work_header *work); + +#define ipipe_post_work_root(p, header) \ + do { \ + void header_not_at_start(void); \ + if (offsetof(typeof(*(p)), header)) { \ + header_not_at_start(); \ + } \ + __ipipe_post_work_root(&(p)->header); \ + } while (0) + +int ipipe_get_sysinfo(struct ipipe_sysinfo *sysinfo); + +unsigned long ipipe_critical_enter(void (*syncfn)(void)); + +void ipipe_critical_exit(unsigned long flags); + +void ipipe_prepare_panic(void); + +static inline void ipipe_set_foreign_stack(struct ipipe_domain *ipd) +{ + /* Must be called hw interrupts off. */ + __set_bit(IPIPE_NOSTACK_FLAG, &ipipe_this_cpu_context(ipd)->status); +} + +static inline void ipipe_clear_foreign_stack(struct ipipe_domain *ipd) +{ + /* Must be called hw interrupts off. */ + __clear_bit(IPIPE_NOSTACK_FLAG, &ipipe_this_cpu_context(ipd)->status); +} + +static inline int ipipe_test_foreign_stack(void) +{ + /* Must be called hw interrupts off. */ + return test_bit(IPIPE_NOSTACK_FLAG, &__ipipe_current_context->status); +} + +#ifndef ipipe_safe_current +#define ipipe_safe_current() \ + ({ \ + struct task_struct *__p__; \ + unsigned long __flags__; \ + __flags__ = hard_smp_local_irq_save(); \ + __p__ = ipipe_test_foreign_stack() ? 
&init_task : current; \ + hard_smp_local_irq_restore(__flags__); \ + __p__; \ + }) +#endif + +#ifdef CONFIG_SMP +void ipipe_set_irq_affinity(unsigned int irq, cpumask_t cpumask); +void ipipe_send_ipi(unsigned int ipi, cpumask_t cpumask); +#else /* !CONFIG_SMP */ +static inline +void ipipe_set_irq_affinity(unsigned int irq, cpumask_t cpumask) { } +static inline void ipipe_send_ipi(unsigned int ipi, cpumask_t cpumask) { } +#endif /* CONFIG_SMP */ + +static inline void ipipe_restore_root_nosync(unsigned long x) +{ + unsigned long flags; + + flags = hard_smp_local_irq_save(); + __ipipe_restore_root_nosync(x); + hard_smp_local_irq_restore(flags); +} + +/* Must be called hw IRQs off. */ +static inline void ipipe_lock_irq(unsigned int irq) +{ + struct ipipe_domain *ipd = __ipipe_current_domain; + if (ipd == ipipe_root_domain) + __ipipe_lock_irq(irq); +} + +/* Must be called hw IRQs off. */ +static inline void ipipe_unlock_irq(unsigned int irq) +{ + struct ipipe_domain *ipd = __ipipe_current_domain; + if (ipd == ipipe_root_domain) + __ipipe_unlock_irq(irq); +} + +static inline struct ipipe_threadinfo *ipipe_current_threadinfo(void) +{ + return &current_thread_info()->ipipe_data; +} + +#define ipipe_task_threadinfo(p) (&task_thread_info(p)->ipipe_data) + +static inline void ipipe_enable_irq(unsigned int irq) +{ + struct irq_desc *desc; + struct irq_chip *chip; + + desc = irq_to_desc(irq); + if (desc == NULL) + return; + + chip = irq_desc_get_chip(desc); + + if (WARN_ON_ONCE(chip->irq_enable == NULL && chip->irq_unmask == NULL)) + return; + + if (chip->irq_enable) + chip->irq_enable(&desc->irq_data); + else + chip->irq_unmask(&desc->irq_data); +} + +static inline void ipipe_disable_irq(unsigned int irq) +{ + struct irq_desc *desc; + struct irq_chip *chip; + + desc = irq_to_desc(irq); + if (desc == NULL) + return; + + chip = irq_desc_get_chip(desc); + + if (WARN_ON_ONCE(chip->irq_disable == NULL && chip->irq_mask == NULL)) + return; + + if (chip->irq_disable) + chip->irq_disable(&desc->irq_data); + else + chip->irq_mask(&desc->irq_data); +} + +static inline void ipipe_end_irq(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (desc) + desc->ipipe_end(irq, desc); +} + +static inline int ipipe_chained_irq_p(struct irq_desc *desc) +{ + void __ipipe_chained_irq(unsigned irq, struct irq_desc *desc); + + return desc->handle_irq == __ipipe_chained_irq; +} + +static inline void ipipe_handle_demuxed_irq(unsigned int cascade_irq) +{ + ipipe_trace_irq_entry(cascade_irq); + __ipipe_dispatch_irq(cascade_irq, IPIPE_IRQF_NOSYNC); + ipipe_trace_irq_exit(cascade_irq); +} + +#define ipipe_enable_notifier(p) \ + do { \ + barrier(); \ + (p)->ipipe.flags |= PF_EVNOTIFY; \ + } while (0) + +#define ipipe_disable_notifier(p) \ + do { \ + barrier(); \ + (p)->ipipe.flags &= ~(PF_EVNOTIFY|PF_MAYDAY); \ + } while (0) + +#define ipipe_notifier_enabled_p(p) \ + (((p)->ipipe.flags) & PF_EVNOTIFY) + +#define ipipe_raise_mayday(p) \ + do { \ + ipipe_check_irqoff(); \ + if (ipipe_notifier_enabled_p(p)) \ + (p)->ipipe.flags |= PF_MAYDAY; \ + } while (0) + +#include + +#else /* !CONFIG_IPIPE */ + +#define __ipipe_root_p 1 +#define ipipe_root_p 1 + +static inline void __ipipe_complete_domain_migration(void) { } + +static inline int __ipipe_switch_tail(void) +{ + return 0; +} + +static inline void __ipipe_nmi_enter(void) { } + +static inline void __ipipe_nmi_exit(void) { } + +#define ipipe_safe_current() current +#define ipipe_processor_id() smp_processor_id() + +static inline int ipipe_test_foreign_stack(void) +{ + return
0; +} + +static inline void ipipe_lock_irq(unsigned int irq) { } + +static inline void ipipe_unlock_irq(unsigned int irq) { } + +#endif /* !CONFIG_IPIPE */ + +#endif /* !__LINUX_IPIPE_H */ diff --git a/include/linux/ipipe_base.h b/include/linux/ipipe_base.h new file mode 100644 index 0000000..1ae32a1 --- /dev/null +++ b/include/linux/ipipe_base.h @@ -0,0 +1,359 @@ +/* -*- linux-c -*- + * include/linux/ipipe_base.h + * + * Copyright (C) 2002-2012 Philippe Gerum. + * 2007 Jan Kiszka. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#ifndef __LINUX_IPIPE_BASE_H +#define __LINUX_IPIPE_BASE_H + +struct kvm_vcpu; +struct ipipe_vm_notifier; +struct irq_desc; + +#ifdef CONFIG_IPIPE + +#define IPIPE_CORE_APIREV CONFIG_IPIPE_CORE_APIREV + +#ifdef CONFIG_IPIPE_DEBUG_CONTEXT +void ipipe_root_only(void); +#else /* !CONFIG_IPIPE_DEBUG_CONTEXT */ +static inline void ipipe_root_only(void) { } +#endif /* !CONFIG_IPIPE_DEBUG_CONTEXT */ + +typedef void (*ipipe_irq_handler_t)(unsigned int irq, + void *cookie); + +void ipipe_unstall_root(void); + +void ipipe_restore_root(unsigned long x); + +#include +#include + +#define __bpl_up(x) (((x)+(BITS_PER_LONG-1)) & ~(BITS_PER_LONG-1)) +/* Number of virtual IRQs (must be a multiple of BITS_PER_LONG) */ +#define IPIPE_NR_VIRQS BITS_PER_LONG +/* First virtual IRQ # (must be aligned on BITS_PER_LONG) */ +#define IPIPE_VIRQ_BASE __bpl_up(IPIPE_NR_XIRQS) +/* Total number of IRQ slots */ +#define IPIPE_NR_IRQS (IPIPE_VIRQ_BASE+IPIPE_NR_VIRQS) + +static inline int ipipe_virtual_irq_p(unsigned int irq) +{ + return irq >= IPIPE_VIRQ_BASE && irq < IPIPE_NR_IRQS; +} + +#define IPIPE_IRQ_LOMAPSZ (IPIPE_NR_IRQS / BITS_PER_LONG) +#if IPIPE_IRQ_LOMAPSZ > BITS_PER_LONG +/* + * We need a 3-level mapping. This allows us to handle up to 32k IRQ + * vectors on 32bit machines, 256k on 64bit ones. + */ +#define __IPIPE_3LEVEL_IRQMAP 1 +#define IPIPE_IRQ_MDMAPSZ (__bpl_up(IPIPE_IRQ_LOMAPSZ) / BITS_PER_LONG) +#else +/* + * 2-level mapping is enough. This allows us to handle up to 1024 IRQ + * vectors on 32bit machines, 4096 on 64bit ones. + */ +#define __IPIPE_2LEVEL_IRQMAP 1 +#endif + +/* Per-cpu pipeline status */ +#define IPIPE_STALL_FLAG 0 /* interrupts (virtually) disabled. */ +#define IPIPE_NOSTACK_FLAG 1 /* running on foreign stack. 
*/ +#define IPIPE_STALL_MASK (1L << IPIPE_STALL_FLAG) +#define IPIPE_NOSTACK_MASK (1L << IPIPE_NOSTACK_FLAG) + +/* Interrupt control bits */ +#define IPIPE_HANDLE_FLAG 0 +#define IPIPE_STICKY_FLAG 1 +#define IPIPE_LOCK_FLAG 2 +#define IPIPE_HANDLE_MASK (1 << IPIPE_HANDLE_FLAG) +#define IPIPE_STICKY_MASK (1 << IPIPE_STICKY_FLAG) +#define IPIPE_LOCK_MASK (1 << IPIPE_LOCK_FLAG) + +struct pt_regs; +struct ipipe_domain; + +struct ipipe_trap_data { + int exception; + struct pt_regs *regs; +}; + +#define IPIPE_KEVT_SCHEDULE 0 +#define IPIPE_KEVT_SIGWAKE 1 +#define IPIPE_KEVT_SETSCHED 2 +#define IPIPE_KEVT_EXIT 3 +#define IPIPE_KEVT_CLEANUP 4 +#define IPIPE_KEVT_HOSTRT 5 + +struct ipipe_vm_notifier { + void (*handler)(struct ipipe_vm_notifier *nfy); +}; + +void __ipipe_init_early(void); + +void __ipipe_init(void); + +#ifdef CONFIG_PROC_FS +void __ipipe_init_proc(void); +#ifdef CONFIG_IPIPE_TRACE +void __ipipe_init_tracer(void); +#else /* !CONFIG_IPIPE_TRACE */ +static inline void __ipipe_init_tracer(void) { } +#endif /* CONFIG_IPIPE_TRACE */ +#else /* !CONFIG_PROC_FS */ +static inline void __ipipe_init_proc(void) { } +#endif /* CONFIG_PROC_FS */ + +void __ipipe_restore_root_nosync(unsigned long x); + +#define IPIPE_IRQF_NOACK 0x1 +#define IPIPE_IRQF_NOSYNC 0x2 + +void __ipipe_dispatch_irq(unsigned int irq, int flags); + +void __ipipe_do_sync_stage(void); + +void __ipipe_do_sync_pipeline(struct ipipe_domain *top); + +void __ipipe_lock_irq(unsigned int irq); + +void __ipipe_unlock_irq(unsigned int irq); + +void __ipipe_do_critical_sync(unsigned int irq, void *cookie); + +void __ipipe_ack_edge_irq(unsigned int irq, struct irq_desc *desc); + +void __ipipe_nop_irq(unsigned int irq, struct irq_desc *desc); + +static inline void __ipipe_idle(void) +{ + ipipe_unstall_root(); +} + +#ifndef __ipipe_sync_check +#define __ipipe_sync_check 1 +#endif + +static inline void __ipipe_sync_stage(void) +{ + if (likely(__ipipe_sync_check)) + __ipipe_do_sync_stage(); +} + +#ifndef __ipipe_check_root_resched +#ifdef CONFIG_PREEMPT +#define __ipipe_check_root_resched() \ + (preempt_count() == 0 && need_resched()) +#else +#define __ipipe_check_root_resched() 0 +#endif +#endif + +#ifndef __ipipe_run_irqtail +#define __ipipe_run_irqtail(irq) do { } while(0) +#endif + +void __ipipe_flush_printk(unsigned int irq, void *cookie); + +void __ipipe_pin_range_globally(unsigned long start, + unsigned long end); + +#define hard_preempt_disable() \ + ({ \ + unsigned long __flags__; \ + __flags__ = hard_local_irq_save(); \ + if (__ipipe_root_p) \ + preempt_disable(); \ + __flags__; \ + }) + +#define hard_preempt_enable(flags) \ + do { \ + if (__ipipe_root_p) { \ + preempt_enable_no_resched(); \ + hard_local_irq_restore(flags); \ + preempt_check_resched(); \ + } else \ + hard_local_irq_restore(flags); \ + } while (0) + +#define __ipipe_get_cpu(flags) ({ (flags) = hard_preempt_disable(); ipipe_processor_id(); }) +#define __ipipe_put_cpu(flags) hard_preempt_enable(flags) + +int __ipipe_notify_syscall(struct pt_regs *regs); + +int __ipipe_notify_trap(int exception, struct pt_regs *regs); + +int __ipipe_notify_kevent(int event, void *data); + +#define __ipipe_report_trap(exception, regs) \ + __ipipe_notify_trap(exception, regs) + +#define __ipipe_report_sigwake(p) \ + do { \ + if (ipipe_notifier_enabled_p(p)) \ + __ipipe_notify_kevent(IPIPE_KEVT_SIGWAKE, p); \ + } while (0) + +#define __ipipe_report_exit(p) \ + do { \ + if (ipipe_notifier_enabled_p(p)) \ + __ipipe_notify_kevent(IPIPE_KEVT_EXIT, p); \ + } while (0) + +#define 
__ipipe_report_setsched(p) \ + do { \ + if (ipipe_notifier_enabled_p(p)) \ + __ipipe_notify_kevent(IPIPE_KEVT_SETSCHED, p); \ + } while (0) + +#define __ipipe_report_schedule(prev, next) \ +do { \ + if ((ipipe_notifier_enabled_p(next) || \ + ipipe_notifier_enabled_p(prev))) { \ + __this_cpu_write(ipipe_percpu.rqlock_owner, prev); \ + __ipipe_notify_kevent(IPIPE_KEVT_SCHEDULE, next); \ + } \ +} while (0) + +#define __ipipe_report_cleanup(mm) \ + __ipipe_notify_kevent(IPIPE_KEVT_CLEANUP, mm) + +void __ipipe_notify_vm_preemption(void); + +#define hard_cond_local_irq_enable() hard_local_irq_enable() +#define hard_cond_local_irq_disable() hard_local_irq_disable() +#define hard_cond_local_irq_save() hard_local_irq_save() +#define hard_cond_local_irq_restore(flags) hard_local_irq_restore(flags) + +struct ipipe_task_info { + unsigned long flags; +}; + +#ifdef CONFIG_IPIPE_LEGACY + +#define IPIPE_FIRST_EVENT IPIPE_NR_FAULTS +#define IPIPE_EVENT_SCHEDULE IPIPE_FIRST_EVENT +#define IPIPE_EVENT_SIGWAKE (IPIPE_FIRST_EVENT + 1) +#define IPIPE_EVENT_SETSCHED (IPIPE_FIRST_EVENT + 2) +#define IPIPE_EVENT_EXIT (IPIPE_FIRST_EVENT + 3) +#define IPIPE_EVENT_CLEANUP (IPIPE_FIRST_EVENT + 4) +#define IPIPE_EVENT_HOSTRT (IPIPE_FIRST_EVENT + 5) +#define IPIPE_EVENT_SYSCALL (IPIPE_FIRST_EVENT + 6) +#define IPIPE_LAST_EVENT IPIPE_EVENT_SYSCALL +#define IPIPE_NR_EVENTS (IPIPE_LAST_EVENT + 1) + +typedef int (*ipipe_event_handler_t)(unsigned int event, + struct ipipe_domain *from, + void *data); +struct ipipe_legacy_context { + unsigned int domid; + int priority; + void *pdd; + ipipe_event_handler_t handlers[IPIPE_NR_EVENTS]; +}; + +#define __ipipe_init_taskinfo(p) \ + do { \ + __ipipe_clear_taskflags(p); \ + memset(p->ptd, 0, sizeof(p->ptd)); \ + } while (0) + +#else /* !CONFIG_IPIPE_LEGACY */ + +struct ipipe_legacy_context { +}; + +#define __ipipe_init_taskinfo(p) \ + do { \ + __ipipe_clear_taskflags(p); \ + } while (0) + +#endif /* !CONFIG_IPIPE_LEGACY */ + +#define __ipipe_clear_taskflags(p) \ + do { \ + (p)->ipipe.flags = 0; \ + } while (0) + +#else /* !CONFIG_IPIPE */ + +struct task_struct; +struct mm_struct; + +struct ipipe_task_info { +}; + +static inline void __ipipe_init_early(void) { } + +static inline void __ipipe_init(void) { } + +static inline void __ipipe_init_proc(void) { } + +static inline void __ipipe_idle(void) { } + +static inline void __ipipe_report_sigwake(struct task_struct *p) { } + +static inline void __ipipe_report_setsched(struct task_struct *p) { } + +static inline void __ipipe_report_exit(struct task_struct *p) { } + +static inline void __ipipe_report_cleanup(struct mm_struct *mm) { } + +#define __ipipe_report_trap(exception, regs) 0 + +static inline void __ipipe_init_taskinfo(struct task_struct *p) { } + +static inline void __ipipe_clear_taskflags(struct task_struct *p) { } + +static inline void __ipipe_pin_range_globally(unsigned long start, + unsigned long end) +{ } + +#define hard_preempt_disable() ({ preempt_disable(); 0; }) +#define hard_preempt_enable(flags) ({ preempt_enable(); (void)(flags); }) + +#define __ipipe_get_cpu(flags) ({ (void)(flags); get_cpu(); }) +#define __ipipe_put_cpu(flags) \ + do { \ + (void)(flags); \ + put_cpu(); \ + } while (0) + +#define __ipipe_root_tick_p(regs) 1 + +#define ipipe_handle_demuxed_irq(irq) generic_handle_irq(irq) + +#define __ipipe_serial_debug(fmt, args...) 
do { } while (0) + +#define __ipipe_enter_vm(vmf) do { } while (0) + +static inline void __ipipe_exit_vm(void) { } + +static inline void __ipipe_notify_vm_preemption(void) { } + +static inline void ipipe_root_only(void) { } + +#endif /* !CONFIG_IPIPE */ + +#endif /* !__LINUX_IPIPE_BASE_H */ diff --git a/include/linux/ipipe_compat.h b/include/linux/ipipe_compat.h new file mode 100644 index 0000000..fab8f45 --- /dev/null +++ b/include/linux/ipipe_compat.h @@ -0,0 +1,284 @@ +/* -*- linux-c -*- + * include/linux/ipipe_compat.h + * + * Copyright (C) 2012 Philippe Gerum. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#ifndef __LINUX_IPIPE_COMPAT_H +#define __LINUX_IPIPE_COMPAT_H + +#ifndef __LINUX_IPIPE_H +#error "Do not include this file directly, use linux/ipipe.h instead" +#endif + +#ifdef CONFIG_IPIPE_LEGACY + +#define IPIPE_HEAD_PRIORITY (-1) +#define IPIPE_ROOT_PRIO 100 +#define IPIPE_ROOT_ID 0 +#define IPIPE_ROOT_NPTDKEYS 4 + +#define IPIPE_DUMMY_FLAG 31 +#define IPIPE_WIRED_FLAG IPIPE_HANDLE_FLAG +#define IPIPE_WIRED_MASK (1 << IPIPE_WIRED_FLAG) +#define IPIPE_PASS_FLAG IPIPE_DUMMY_FLAG +#define IPIPE_PASS_MASK (1 << IPIPE_PASS_FLAG) +#define IPIPE_DYNAMIC_FLAG IPIPE_HANDLE_FLAG +#define IPIPE_DYNAMIC_MASK (1 << IPIPE_DYNAMIC_FLAG) +#define IPIPE_SYSTEM_FLAG IPIPE_DUMMY_FLAG +#define IPIPE_SYSTEM_MASK (1 << IPIPE_SYSTEM_FLAG) +#define IPIPE_EXCLUSIVE_FLAG IPIPE_DUMMY_FLAG +#define IPIPE_EXCLUSIVE_MASK (1 << IPIPE_EXCLUSIVE_FLAG) + +#define IPIPE_NR_CPUS NR_CPUS + +#define IPIPE_EVENT_SELF 0x80000000 +#define IPIPE_EVENT_RETURN IPIPE_TRAP_MAYDAY + +#define TASK_ATOMICSWITCH TASK_HARDENING + +struct ipipe_domain_attr { + unsigned int domid; + const char *name; + int priority; + void (*entry) (void); + void *pdd; +}; + +void ipipe_init_attr(struct ipipe_domain_attr *attr); + +int ipipe_register_domain(struct ipipe_domain *ipd, + struct ipipe_domain_attr *attr); + +int ipipe_unregister_domain(struct ipipe_domain *ipd); + +int ipipe_alloc_ptdkey(void); + +int ipipe_free_ptdkey(int key); + +int ipipe_set_ptd(int key, void *value); + +void *ipipe_get_ptd(int key); + +int ipipe_virtualize_irq(struct ipipe_domain *ipd, + unsigned int irq, + ipipe_irq_handler_t handler, + void *cookie, + ipipe_irq_ackfn_t ackfn, + unsigned int modemask); + +ipipe_event_handler_t ipipe_catch_event(struct ipipe_domain *ipd, + unsigned int event, + ipipe_event_handler_t handler); + +int ipipe_setscheduler_root(struct task_struct *p, + int policy, + int prio); + +static inline void ipipe_check_context(struct ipipe_domain *border_ipd) +{ + ipipe_root_only(); +} + +static inline void ipipe_set_printk_sync(struct ipipe_domain *ipd) +{ + ipipe_prepare_panic(); +} + +static inline void __ipipe_propagate_irq(unsigned int irq) +{ + ipipe_post_irq_root(irq); +} + +static inline void 
__ipipe_schedule_irq_head(unsigned int irq) +{ + ipipe_post_irq_head(irq); +} + +static inline void __ipipe_schedule_irq_root(unsigned int irq) +{ + ipipe_post_irq_root(irq); +} + +static inline int ipipe_trigger_irq(unsigned int irq) +{ + ipipe_raise_irq(irq); + return 1; +} + +static inline void ipipe_stall_pipeline_from(struct ipipe_domain *ipd) +{ + if (ipd != ipipe_root_domain) + ipipe_stall_head(); + else + ipipe_stall_root(); +} + +static inline +unsigned long ipipe_test_and_stall_pipeline_from(struct ipipe_domain *ipd) +{ + if (ipd != ipipe_root_domain) + return ipipe_test_and_stall_head(); + + return ipipe_test_and_stall_root(); +} + +static inline +void ipipe_unstall_pipeline_from(struct ipipe_domain *ipd) +{ + if (ipd != ipipe_root_domain) + ipipe_unstall_head(); + else + ipipe_unstall_root(); +} + +static inline +void ipipe_restore_pipeline_from(struct ipipe_domain *ipd, + unsigned long x) +{ + if (ipd != ipipe_root_domain) + ipipe_restore_head(x); + else + ipipe_restore_root(x); +} + +static inline +unsigned long ipipe_test_pipeline_from(struct ipipe_domain *ipd) +{ + return test_bit(IPIPE_STALL_FLAG, &ipipe_this_cpu_context(ipd)->status); +} + +static inline void ipipe_stall_pipeline_head(void) +{ + ipipe_stall_head(); +} + +static inline unsigned long ipipe_test_and_stall_pipeline_head(void) +{ + return ipipe_test_and_stall_head(); +} + +static inline void ipipe_unstall_pipeline_head(void) +{ + ipipe_unstall_head(); +} + +static inline void ipipe_restore_pipeline_head(unsigned long x) +{ + ipipe_restore_head(x); +} + +static inline int ipipe_disable_ondemand_mappings(struct task_struct *p) +{ + return __ipipe_disable_ondemand_mappings(p); +} + +static inline int ipipe_reenter_root(struct task_struct *prev, + int policy, + int prio) +{ + __ipipe_reenter_root(); + return 0; +} + +static inline void ipipe_root_preempt_notify(void) +{ + ipipe_notify_root_preemption(); +} + +#define ipipe_return_notify(p) ipipe_raise_mayday(p) + +/* + * Keep the following as a macro, so that client code could check for + * the support of the invariant pipeline head optimization. + */ +#define __ipipe_pipeline_head() ipipe_head_domain + +static inline int irqs_disabled_hw(void) +{ + return hard_irqs_disabled(); +} + +static inline void local_irq_disable_hw(void) +{ + hard_local_irq_disable(); +} + +static inline void local_irq_enable_hw(void) +{ + hard_local_irq_enable(); +} + +#define local_irq_save_hw(flags) \ + do { \ + (flags) = hard_local_irq_save(); \ + } while (0) + +static inline void local_irq_restore_hw(unsigned long flags) +{ + hard_local_irq_restore(flags); +} + +#define local_save_flags_hw(flags) \ + do { \ + (flags) = hard_local_save_flags(); \ + } while (0) + +#define local_irq_save_hw_smp(flags) \ + do { \ + (flags) = hard_smp_local_irq_save(); \ + } while (0) +#define local_irq_restore_hw_smp(flags) hard_smp_local_irq_restore(flags) + +#define local_irq_save_hw_cond(flags) \ + do { \ + (flags) = hard_cond_local_irq_save(); \ + } while (0) +#define local_irq_restore_hw_cond(flags) hard_cond_local_irq_restore(flags) + +void __ipipe_legacy_init_stage(struct ipipe_domain *ipd); + +/* + * These values have no real meaning from a versioning POV, however + * they are guaranteed to look more recent than any legacy patch + * release ever published in the past. 
+ */ +#define IPIPE_MAJOR_NUMBER 3 +#define IPIPE_MINOR_NUMBER 0 +#define IPIPE_PATCH_NUMBER 0 + +#define __IPIPE_FEATURE_REQUEST_TICKDEV 1 +#define __IPIPE_FEATURE_FASTPEND_IRQ 1 +#define __IPIPE_FEATURE_TRACE_EVENT 1 +#define __IPIPE_FEATURE_ENABLE_NOTIFIER 1 +#define __IPIPE_FEATURE_PREPARE_PANIC 1 +#define __IPIPE_FEATURE_SYSINFO_V2 1 +#define __IPIPE_FEATURE_PIC_MUTE 1 +#ifdef CONFIG_IPIPE_HAVE_VM_NOTIFIER +#define __IPIPE_FEATURE_ROOTPREEMPT_NOTIFIER 1 +#endif + +#else /* !CONFIG_IPIPE_LEGACY */ + +static inline void __ipipe_legacy_init_stage(struct ipipe_domain *ipd) +{ +} + +#endif /* !CONFIG_IPIPE_LEGACY */ + +#endif /* !__LINUX_IPIPE_COMPAT_H */ diff --git a/include/linux/ipipe_debug.h b/include/linux/ipipe_debug.h new file mode 100644 index 0000000..9b5a4a3 --- /dev/null +++ b/include/linux/ipipe_debug.h @@ -0,0 +1,98 @@ +/* -*- linux-c -*- + * include/linux/ipipe_debug.h + * + * Copyright (C) 2012 Philippe Gerum . + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#ifndef __LINUX_IPIPE_DEBUG_H +#define __LINUX_IPIPE_DEBUG_H + +#include + +#ifdef CONFIG_IPIPE_DEBUG_CONTEXT + +#include + +static inline int ipipe_disable_context_check(void) +{ + return xchg(__this_cpu_ptr(&ipipe_percpu.context_check), 0); +} + +static inline void ipipe_restore_context_check(int old_state) +{ + __this_cpu_write(ipipe_percpu.context_check, old_state); +} + +static inline void ipipe_context_check_off(void) +{ + int cpu; + for_each_online_cpu(cpu) + per_cpu(ipipe_percpu, cpu).context_check = 0; +} + +static inline void ipipe_save_context_nmi(void) +{ + int state = ipipe_disable_context_check(); + __this_cpu_write(ipipe_percpu.context_check_saved, state); +} + +static inline void ipipe_restore_context_nmi(void) +{ + ipipe_restore_context_check(__this_cpu_read(ipipe_percpu.context_check_saved)); +} + +#else /* !CONFIG_IPIPE_DEBUG_CONTEXT */ + +static inline int ipipe_disable_context_check(void) +{ + return 0; +} + +static inline void ipipe_restore_context_check(int old_state) { } + +static inline void ipipe_context_check_off(void) { } + +static inline void ipipe_save_context_nmi(void) { } + +static inline void ipipe_restore_context_nmi(void) { } + +#endif /* !CONFIG_IPIPE_DEBUG_CONTEXT */ + +#ifdef CONFIG_IPIPE_DEBUG_INTERNAL +#define IPIPE_WARN(c) WARN_ON(c) +#define IPIPE_WARN_ONCE(c) WARN_ON_ONCE(c) +#else +#define IPIPE_WARN(c) do { (void)(c); } while (0) +#define IPIPE_WARN_ONCE(c) do { (void)(c); } while (0) +#endif + +#ifdef CONFIG_IPIPE_DEBUG + +static inline void ipipe_check_irqoff(void) +{ + if (WARN_ON_ONCE(!hard_irqs_disabled())) + hard_local_irq_disable(); +} + +#else /* !CONFIG_IPIPE_DEBUG */ + +static inline void ipipe_check_irqoff(void) { } + +#endif /* !CONFIG_IPIPE_DEBUG */ + +#endif /* !__LINUX_IPIPE_DEBUG_H */ diff --git a/include/linux/ipipe_domain.h 
b/include/linux/ipipe_domain.h new file mode 100644 index 0000000..d0bde26 --- /dev/null +++ b/include/linux/ipipe_domain.h @@ -0,0 +1,311 @@ +/* -*- linux-c -*- + * include/linux/ipipe_domain.h + * + * Copyright (C) 2007-2012 Philippe Gerum. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#ifndef __LINUX_IPIPE_DOMAIN_H +#define __LINUX_IPIPE_DOMAIN_H + +#ifdef CONFIG_IPIPE + +#include +#include +#include + +struct task_struct; +struct mm_struct; +struct irq_desc; +struct ipipe_vm_notifier; + +#define __IPIPE_SYSCALL_P 0 +#define __IPIPE_TRAP_P 1 +#define __IPIPE_KEVENT_P 2 +#define __IPIPE_SYSCALL_E (1 << __IPIPE_SYSCALL_P) +#define __IPIPE_TRAP_E (1 << __IPIPE_TRAP_P) +#define __IPIPE_KEVENT_E (1 << __IPIPE_KEVENT_P) +#define __IPIPE_ALL_E 0x7 +#define __IPIPE_SYSCALL_R (8 << __IPIPE_SYSCALL_P) +#define __IPIPE_TRAP_R (8 << __IPIPE_TRAP_P) +#define __IPIPE_KEVENT_R (8 << __IPIPE_KEVENT_P) +#define __IPIPE_SHIFT_R 3 +#define __IPIPE_ALL_R (__IPIPE_ALL_E << __IPIPE_SHIFT_R) + +typedef void (*ipipe_irq_ackfn_t)(unsigned int irq, struct irq_desc *desc); + +struct ipipe_domain { + int context_offset; + struct ipipe_irqdesc { + unsigned long control; + ipipe_irq_ackfn_t ackfn; + ipipe_irq_handler_t handler; + void *cookie; + } ____cacheline_aligned irqs[IPIPE_NR_IRQS]; + const char *name; + struct mutex mutex; + struct ipipe_legacy_context legacy; +}; + +static inline void * +__ipipe_irq_cookie(struct ipipe_domain *ipd, unsigned int irq) +{ + return ipd->irqs[irq].cookie; +} + +static inline ipipe_irq_handler_t +__ipipe_irq_handler(struct ipipe_domain *ipd, unsigned int irq) +{ + return ipd->irqs[irq].handler; +} + +extern struct ipipe_domain ipipe_root; + +#define ipipe_root_domain (&ipipe_root) + +extern struct ipipe_domain *ipipe_head_domain; + +struct ipipe_percpu_domain_data { + unsigned long status; /* <= Must be first in struct. 
*/ + unsigned long irqpend_himap; +#ifdef __IPIPE_3LEVEL_IRQMAP + unsigned long irqpend_mdmap[IPIPE_IRQ_MDMAPSZ]; +#endif + unsigned long irqpend_lomap[IPIPE_IRQ_LOMAPSZ]; + unsigned long irqheld_map[IPIPE_IRQ_LOMAPSZ]; + unsigned long irqall[IPIPE_NR_IRQS]; + struct ipipe_domain *domain; + int coflags; +}; + +struct ipipe_percpu_data { + struct ipipe_percpu_domain_data root; + struct ipipe_percpu_domain_data head; + struct ipipe_percpu_domain_data *curr; + struct pt_regs tick_regs; + int hrtimer_irq; + struct task_struct *task_hijacked; + struct task_struct *rqlock_owner; + struct ipipe_vm_notifier *vm_notifier; + unsigned long nmi_state; +#ifdef CONFIG_IPIPE_WANT_ACTIVE_MM + struct mm_struct *active_mm; +#endif +#ifdef CONFIG_IPIPE_DEBUG_CONTEXT + int context_check; + int context_check_saved; +#endif +}; + +/* + * CAREFUL: all accessors based on __ipipe_this_cpu_ptr() you may find + * in this file should be used only while hw interrupts are off, to + * prevent from CPU migration regardless of the running domain. + */ +DECLARE_PER_CPU(struct ipipe_percpu_data, ipipe_percpu); + +static inline struct ipipe_percpu_domain_data * +__context_of(struct ipipe_percpu_data *p, struct ipipe_domain *ipd) +{ + return (void *)p + ipd->context_offset; +} + +/** + * ipipe_percpu_context - return the address of the pipeline context + * data for a domain on a given CPU. + * + * NOTE: this is the slowest accessor, use it carefully. Prefer + * ipipe_this_cpu_context() for requests targeted at the current + * CPU. Additionally, if the target domain is known at build time, + * consider ipipe_this_cpu_{root, head}_context(). + */ +static inline struct ipipe_percpu_domain_data * +ipipe_percpu_context(struct ipipe_domain *ipd, int cpu) +{ + return __context_of(&per_cpu(ipipe_percpu, cpu), ipd); +} + +/** + * ipipe_this_cpu_context - return the address of the pipeline context + * data for a domain on the current CPU. hw IRQs must be off. + * + * NOTE: this accessor is a bit faster, but since we don't know which + * one of "root" or "head" ipd refers to, we still need to compute the + * context address from its offset. + */ +static inline struct ipipe_percpu_domain_data * +ipipe_this_cpu_context(struct ipipe_domain *ipd) +{ + return __context_of(__ipipe_this_cpu_ptr(&ipipe_percpu), ipd); +} + +/** + * ipipe_this_cpu_root_context - return the address of the pipeline + * context data for the root domain on the current CPU. hw IRQs must + * be off. + * + * NOTE: this accessor is recommended when the domain we refer to is + * known at build time to be the root one. + */ +static inline struct ipipe_percpu_domain_data * +ipipe_this_cpu_root_context(void) +{ + return __ipipe_this_cpu_ptr(&ipipe_percpu.root); +} + +/** + * ipipe_this_cpu_head_context - return the address of the pipeline + * context data for the registered head domain on the current CPU. hw + * IRQs must be off. + * + * NOTE: this accessor is recommended when the domain we refer to is + * known at build time to be the registered head domain. This address + * is always different from the context data of the root domain in + * absence of registered head domain. To get the address of the + * context data for the domain leading the pipeline at the time of the + * call (which may be root in absence of registered head domain), use + * ipipe_this_cpu_leading_context() instead. 
+ */ +static inline struct ipipe_percpu_domain_data * +ipipe_this_cpu_head_context(void) +{ + return __ipipe_this_cpu_ptr(&ipipe_percpu.head); +} + +/** + * ipipe_this_cpu_leading_context - return the address of the pipeline + * context data for the domain leading the pipeline on the current + * CPU. hw IRQs must be off. + * + * NOTE: this accessor is required when either root or a registered + * head domain may be the final target of this call, depending on + * whether the high priority domain was installed via + * ipipe_register_head(). + */ +static inline struct ipipe_percpu_domain_data * +ipipe_this_cpu_leading_context(void) +{ + return ipipe_this_cpu_context(ipipe_head_domain); +} + +/** + * __ipipe_get_current_context() - return the address of the pipeline + * context data of the domain running on the current CPU. hw IRQs must + * be off. + */ +static inline struct ipipe_percpu_domain_data *__ipipe_get_current_context(void) +{ + return __ipipe_this_cpu_read(ipipe_percpu.curr); +} + +#define __ipipe_current_context __ipipe_get_current_context() + +/** + * __ipipe_set_current_context() - switch the current CPU to the + * specified domain context. hw IRQs must be off. + * + * NOTE: this is the only way to change the current domain for the + * current CPU. Don't bypass. + */ +static inline +void __ipipe_set_current_context(struct ipipe_percpu_domain_data *pd) +{ + struct ipipe_percpu_data *p; + p = __ipipe_this_cpu_ptr(&ipipe_percpu); + p->curr = pd; +} + +/** + * __ipipe_set_current_domain() - switch the current CPU to the + * specified domain. This is equivalent to calling + * __ipipe_set_current_context() with the context data of that + * domain. hw IRQs must be off. + */ +static inline void __ipipe_set_current_domain(struct ipipe_domain *ipd) +{ + struct ipipe_percpu_data *p; + p = __ipipe_this_cpu_ptr(&ipipe_percpu); + p->curr = __context_of(p, ipd); +} + +static inline struct ipipe_percpu_domain_data *ipipe_current_context(void) +{ + struct ipipe_percpu_domain_data *pd; + unsigned long flags; + + flags = hard_smp_local_irq_save(); + pd = __ipipe_get_current_context(); + hard_smp_local_irq_restore(flags); + + return pd; +} + +static inline struct ipipe_domain *__ipipe_get_current_domain(void) +{ + return __ipipe_get_current_context()->domain; +} + +#define __ipipe_current_domain __ipipe_get_current_domain() + +/** + * __ipipe_get_current_domain() - return the address of the pipeline + * domain running on the current CPU. hw IRQs must be off. + */ +static inline struct ipipe_domain *ipipe_get_current_domain(void) +{ + struct ipipe_domain *ipd; + unsigned long flags; + + flags = hard_smp_local_irq_save(); + ipd = __ipipe_get_current_domain(); + hard_smp_local_irq_restore(flags); + + return ipd; +} + +#define ipipe_current_domain ipipe_get_current_domain() + +#define __ipipe_root_p (__ipipe_current_domain == ipipe_root_domain) +#define ipipe_root_p (ipipe_current_domain == ipipe_root_domain) + +#ifdef CONFIG_SMP +#define __ipipe_root_status (ipipe_this_cpu_root_context()->status) +#else +extern unsigned long __ipipe_root_status; +#endif + +#define __ipipe_head_status (ipipe_this_cpu_head_context()->status) + +/** + * __ipipe_ipending_p() - Whether we have interrupts pending + * (i.e. logged) for the given domain context on the current CPU. hw + * IRQs must be off. 
+ */ +static inline int __ipipe_ipending_p(struct ipipe_percpu_domain_data *pd) +{ + return pd->irqpend_himap != 0; +} + +static inline unsigned long +__ipipe_cpudata_irq_hits(struct ipipe_domain *ipd, int cpu, unsigned int irq) +{ + return ipipe_percpu_context(ipd, cpu)->irqall[irq]; +} + +#endif /* CONFIG_IPIPE */ + +#endif /* !__LINUX_IPIPE_DOMAIN_H */ diff --git a/include/linux/ipipe_lock.h b/include/linux/ipipe_lock.h new file mode 100644 index 0000000..032c734 --- /dev/null +++ b/include/linux/ipipe_lock.h @@ -0,0 +1,256 @@ +/* -*- linux-c -*- + * include/linux/ipipe_lock.h + * + * Copyright (C) 2009 Philippe Gerum. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#ifndef __LINUX_IPIPE_LOCK_H +#define __LINUX_IPIPE_LOCK_H + +typedef struct { + arch_spinlock_t arch_lock; +} __ipipe_spinlock_t; + +#define ipipe_spinlock_p(lock) \ + __builtin_types_compatible_p(typeof(lock), __ipipe_spinlock_t *) + +#define std_spinlock_raw_p(lock) \ + __builtin_types_compatible_p(typeof(lock), raw_spinlock_t *) + +#define std_spinlock_p(lock) \ + __builtin_types_compatible_p(typeof(lock), spinlock_t *) + +#define ipipe_spinlock(lock) ((__ipipe_spinlock_t *)(lock)) +#define std_spinlock_raw(lock) ((raw_spinlock_t *)(lock)) +#define std_spinlock(lock) ((spinlock_t *)(lock)) + +#define PICK_SPINLOCK_IRQSAVE(lock, flags) \ + do { \ + if (ipipe_spinlock_p(lock)) \ + (flags) = __ipipe_spin_lock_irqsave(ipipe_spinlock(lock)); \ + else if (std_spinlock_raw_p(lock)) \ + __real_raw_spin_lock_irqsave(std_spinlock_raw(lock), flags); \ + else if (std_spinlock_p(lock)) \ + __real_raw_spin_lock_irqsave(&std_spinlock(lock)->rlock, flags); \ + else __bad_lock_type(); \ + } while (0) + +#define PICK_SPINTRYLOCK_IRQSAVE(lock, flags) \ + ({ \ + int __ret__; \ + if (ipipe_spinlock_p(lock)) \ + __ret__ = __ipipe_spin_trylock_irqsave(ipipe_spinlock(lock), &(flags)); \ + else if (std_spinlock_raw_p(lock)) \ + __ret__ = __real_raw_spin_trylock_irqsave(std_spinlock_raw(lock), flags); \ + else if (std_spinlock_p(lock)) \ + __ret__ = __real_raw_spin_trylock_irqsave(&std_spinlock(lock)->rlock, flags); \ + else __bad_lock_type(); \ + __ret__; \ + }) + +#define PICK_SPINTRYLOCK_IRQ(lock) \ + ({ \ + int __ret__; \ + if (ipipe_spinlock_p(lock)) \ + __ret__ = __ipipe_spin_trylock_irq(ipipe_spinlock(lock)); \ + else if (std_spinlock_raw_p(lock)) \ + __ret__ = __real_raw_spin_trylock_irq(std_spinlock_raw(lock)); \ + else if (std_spinlock_p(lock)) \ + __ret__ = __real_raw_spin_trylock_irq(&std_spinlock(lock)->rlock); \ + else __bad_lock_type(); \ + __ret__; \ + }) + +#define PICK_SPINUNLOCK_IRQRESTORE(lock, flags) \ + do { \ + if (ipipe_spinlock_p(lock)) \ + __ipipe_spin_unlock_irqrestore(ipipe_spinlock(lock), flags); \ + else { \ + __ipipe_spin_unlock_debug(flags); \ + if (std_spinlock_raw_p(lock)) \ + 
__real_raw_spin_unlock_irqrestore(std_spinlock_raw(lock), flags); \ + else if (std_spinlock_p(lock)) \ + __real_raw_spin_unlock_irqrestore(&std_spinlock(lock)->rlock, flags); \ + } \ + } while (0) + +#define PICK_SPINOP(op, lock) \ + do { \ + if (ipipe_spinlock_p(lock)) \ + arch_spin##op(&ipipe_spinlock(lock)->arch_lock); \ + else if (std_spinlock_raw_p(lock)) \ + __real_raw_spin##op(std_spinlock_raw(lock)); \ + else if (std_spinlock_p(lock)) \ + __real_raw_spin##op(&std_spinlock(lock)->rlock); \ + else __bad_lock_type(); \ + } while (0) + +#define PICK_SPINOP_RET(op, lock, type) \ + ({ \ + type __ret__; \ + if (ipipe_spinlock_p(lock)) \ + __ret__ = arch_spin##op(&ipipe_spinlock(lock)->arch_lock); \ + else if (std_spinlock_raw_p(lock)) \ + __ret__ = __real_raw_spin##op(std_spinlock_raw(lock)); \ + else if (std_spinlock_p(lock)) \ + __ret__ = __real_raw_spin##op(&std_spinlock(lock)->rlock); \ + else { __ret__ = -1; __bad_lock_type(); } \ + __ret__; \ + }) + +#define arch_spin_lock_init(lock) \ + do { \ + IPIPE_DEFINE_SPINLOCK(__lock__); \ + *((ipipe_spinlock_t *)lock) = __lock__; \ + } while (0) + +#define arch_spin_lock_irq(lock) \ + do { \ + hard_local_irq_disable(); \ + arch_spin_lock(lock); \ + } while (0) + +#define arch_spin_unlock_irq(lock) \ + do { \ + arch_spin_unlock(lock); \ + hard_local_irq_enable(); \ + } while (0) + +typedef struct { + arch_rwlock_t arch_lock; +} __ipipe_rwlock_t; + +#define ipipe_rwlock_p(lock) \ + __builtin_types_compatible_p(typeof(lock), __ipipe_rwlock_t *) + +#define std_rwlock_p(lock) \ + __builtin_types_compatible_p(typeof(lock), rwlock_t *) + +#define ipipe_rwlock(lock) ((__ipipe_rwlock_t *)(lock)) +#define std_rwlock(lock) ((rwlock_t *)(lock)) + +#define PICK_RWOP(op, lock) \ + do { \ + if (ipipe_rwlock_p(lock)) \ + arch##op(&ipipe_rwlock(lock)->arch_lock); \ + else if (std_rwlock_p(lock)) \ + _raw##op(std_rwlock(lock)); \ + else __bad_lock_type(); \ + } while (0) + +extern int __bad_lock_type(void); + +#ifdef CONFIG_IPIPE + +#define ipipe_spinlock_t __ipipe_spinlock_t +#define IPIPE_DEFINE_RAW_SPINLOCK(x) ipipe_spinlock_t x = IPIPE_SPIN_LOCK_UNLOCKED +#define IPIPE_DECLARE_RAW_SPINLOCK(x) extern ipipe_spinlock_t x +#define IPIPE_DEFINE_SPINLOCK(x) IPIPE_DEFINE_RAW_SPINLOCK(x) +#define IPIPE_DECLARE_SPINLOCK(x) IPIPE_DECLARE_RAW_SPINLOCK(x) + +#define IPIPE_SPIN_LOCK_UNLOCKED \ + (__ipipe_spinlock_t) { .arch_lock = __ARCH_SPIN_LOCK_UNLOCKED } + +#define spin_lock_irqsave_cond(lock, flags) \ + spin_lock_irqsave(lock, flags) + +#define spin_unlock_irqrestore_cond(lock, flags) \ + spin_unlock_irqrestore(lock, flags) + +#define raw_spin_lock_irqsave_cond(lock, flags) \ + raw_spin_lock_irqsave(lock, flags) + +#define raw_spin_unlock_irqrestore_cond(lock, flags) \ + raw_spin_unlock_irqrestore(lock, flags) + +void __ipipe_spin_lock_irq(ipipe_spinlock_t *lock); + +int __ipipe_spin_trylock_irq(ipipe_spinlock_t *lock); + +void __ipipe_spin_unlock_irq(ipipe_spinlock_t *lock); + +unsigned long __ipipe_spin_lock_irqsave(ipipe_spinlock_t *lock); + +int __ipipe_spin_trylock_irqsave(ipipe_spinlock_t *lock, + unsigned long *x); + +void __ipipe_spin_unlock_irqrestore(ipipe_spinlock_t *lock, + unsigned long x); + +void __ipipe_spin_unlock_irqbegin(ipipe_spinlock_t *lock); + +void __ipipe_spin_unlock_irqcomplete(unsigned long x); + +#if defined(CONFIG_IPIPE_DEBUG_INTERNAL) && defined(CONFIG_SMP) +void __ipipe_spin_unlock_debug(unsigned long flags); +#else +#define __ipipe_spin_unlock_debug(flags) do { } while (0) +#endif + +#define ipipe_rwlock_t __ipipe_rwlock_t 
+#define IPIPE_DEFINE_RWLOCK(x) ipipe_rwlock_t x = IPIPE_RW_LOCK_UNLOCKED +#define IPIPE_DECLARE_RWLOCK(x) extern ipipe_rwlock_t x + +#define IPIPE_RW_LOCK_UNLOCKED \ + (__ipipe_rwlock_t) { .arch_lock = __ARCH_RW_LOCK_UNLOCKED } + +#else /* !CONFIG_IPIPE */ + +#define ipipe_spinlock_t spinlock_t +#define IPIPE_DEFINE_SPINLOCK(x) DEFINE_SPINLOCK(x) +#define IPIPE_DECLARE_SPINLOCK(x) extern spinlock_t x +#define IPIPE_SPIN_LOCK_UNLOCKED __SPIN_LOCK_UNLOCKED(unknown) +#define IPIPE_DEFINE_RAW_SPINLOCK(x) DEFINE_RAW_SPINLOCK(x) +#define IPIPE_DECLARE_RAW_SPINLOCK(x) extern raw_spinlock_t x + +#define spin_lock_irqsave_cond(lock, flags) \ + do { \ + (void)(flags); \ + spin_lock(lock); \ + } while(0) + +#define spin_unlock_irqrestore_cond(lock, flags) \ + spin_unlock(lock) + +#define raw_spin_lock_irqsave_cond(lock, flags) \ + do { \ + (void)(flags); \ + raw_spin_lock(lock); \ + } while(0) + +#define raw_spin_unlock_irqrestore_cond(lock, flags) \ + raw_spin_unlock(lock) + +#define __ipipe_spin_lock_irq(lock) do { } while (0) +#define __ipipe_spin_unlock_irq(lock) do { } while (0) +#define __ipipe_spin_lock_irqsave(lock) 0 +#define __ipipe_spin_trylock_irq(lock) 1 +#define __ipipe_spin_trylock_irqsave(lock, x) ({ (void)(x); 1; }) +#define __ipipe_spin_unlock_irqrestore(lock, x) do { (void)(x); } while (0) +#define __ipipe_spin_unlock_irqbegin(lock) do { } while (0) +#define __ipipe_spin_unlock_irqcomplete(x) do { (void)(x); } while (0) +#define __ipipe_spin_unlock_debug(flags) do { } while (0) + +#define ipipe_rwlock_t rwlock_t +#define IPIPE_DEFINE_RWLOCK(x) DEFINE_RWLOCK(x) +#define IPIPE_DECLARE_RWLOCK(x) extern rwlock_t x +#define IPIPE_RW_LOCK_UNLOCKED RW_LOCK_UNLOCKED + +#endif /* !CONFIG_IPIPE */ + +#endif /* !__LINUX_IPIPE_LOCK_H */ diff --git a/include/linux/ipipe_tickdev.h b/include/linux/ipipe_tickdev.h new file mode 100644 index 0000000..d76647c --- /dev/null +++ b/include/linux/ipipe_tickdev.h @@ -0,0 +1,146 @@ +/* -*- linux-c -*- + * include/linux/ipipe_tickdev.h + * + * Copyright (C) 2007 Philippe Gerum. + * Copyright (C) 2012 Gilles Chanteperdrix + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ */ + +#ifndef __LINUX_IPIPE_TICKDEV_H +#define __LINUX_IPIPE_TICKDEV_H + +#include +#include +#include +#include +#include + +#ifdef CONFIG_IPIPE + +enum clock_event_mode; +struct clock_event_device; + +struct ipipe_hostrt_data { + short live; + seqcount_t seqcount; + time_t wall_time_sec; + u32 wall_time_nsec; + struct timespec wall_to_monotonic; + cycle_t cycle_last; + cycle_t mask; + u32 mult; + u32 shift; +}; + +struct ipipe_timer { + int irq; + void (*request)(struct ipipe_timer *timer, int steal); + int (*set)(unsigned long ticks, void *timer); + void (*ack)(void); + void (*release)(struct ipipe_timer *timer); + + /* Only if registering a timer directly */ + const char *name; + unsigned rating; + unsigned long freq; + unsigned min_delay_ticks; + const struct cpumask *cpumask; + + /* For internal use */ + void *timer_set; /* pointer passed to ->set() callback */ + struct clock_event_device *host_timer; + struct list_head link; + + /* Conversions between clock frequency and timer frequency */ + unsigned c2t_integ; + unsigned c2t_frac; + + /* For clockevent interception */ + u32 real_mult; + u32 real_shift; + void (*real_set_mode)(enum clock_event_mode mode, + struct clock_event_device *cdev); + int (*real_set_next_event)(unsigned long evt, + struct clock_event_device *cdev); +}; + +#define __ipipe_hrtimer_irq __ipipe_this_cpu_read(ipipe_percpu.hrtimer_irq) + +extern unsigned long __ipipe_hrtimer_freq; + +/* + * Called by clockevents_register_device, to register a piggybacked + * ipipe timer, if there is one + */ +void ipipe_host_timer_register(struct clock_event_device *clkevt); + +/* + * Register a standalone ipipe timer + */ +void ipipe_timer_register(struct ipipe_timer *timer); + +/* + * Chooses the best timer for each cpu. Take over its handling. + */ +int ipipe_select_timers(const struct cpumask *mask); + +/* + * Release the per-cpu timers + */ +void ipipe_timers_release(void); + +/* + * Start handling the per-cpu timer irq, and intercepting the linux clockevent + * device callbacks. + */ +int ipipe_timer_start(void (*tick_handler)(void), + void (*emumode)(enum clock_event_mode mode, + struct clock_event_device *cdev), + int (*emutick)(unsigned long evt, + struct clock_event_device *cdev), + unsigned cpu); + +/* + * Stop handling a per-cpu timer + */ +void ipipe_timer_stop(unsigned cpu); + +/* + * Program the timer + */ +void ipipe_timer_set(unsigned long delay); + +const char *ipipe_timer_name(void); + +unsigned ipipe_timer_ns2ticks(struct ipipe_timer *timer, unsigned ns); + +#else /* !CONFIG_IPIPE */ + +#define ipipe_host_timer_register(clkevt) do { } while (0) + +#endif /* !CONFIG_IPIPE */ + +#ifdef CONFIG_IPIPE_HAVE_HOSTRT +void ipipe_update_hostrt(struct timespec *wall_time, struct timespec *wtm, + struct clocksource *clock, u32 mult); +#else +static inline void +ipipe_update_hostrt(struct timespec *wall_time, struct timespec *wtm, + struct clocksource *clock, u32 mult) {} +#endif + +#endif /* __LINUX_IPIPE_TICKDEV_H */ diff --git a/include/linux/ipipe_trace.h b/include/linux/ipipe_trace.h new file mode 100644 index 0000000..deb0a47 --- /dev/null +++ b/include/linux/ipipe_trace.h @@ -0,0 +1,77 @@ +/* -*- linux-c -*- + * include/linux/ipipe_trace.h + * + * Copyright (C) 2005 Luotao Fu. + * 2005-2007 Jan Kiszka. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#ifndef _LINUX_IPIPE_TRACE_H +#define _LINUX_IPIPE_TRACE_H + +#ifdef CONFIG_IPIPE_TRACE + +#include + +#ifndef BROKEN_BUILTIN_RETURN_ADDRESS +#define __BUILTIN_RETURN_ADDRESS0 ((unsigned long)__builtin_return_address(0)) +#define __BUILTIN_RETURN_ADDRESS1 ((unsigned long)__builtin_return_address(1)) +#endif /* !BUILTIN_RETURN_ADDRESS */ + +void ipipe_trace_begin(unsigned long v); +void ipipe_trace_end(unsigned long v); +void ipipe_trace_freeze(unsigned long v); +void ipipe_trace_special(unsigned char special_id, unsigned long v); +void ipipe_trace_pid(pid_t pid, short prio); +void ipipe_trace_event(unsigned char id, unsigned long delay_tsc); +int ipipe_trace_max_reset(void); +int ipipe_trace_frozen_reset(void); + +#else /* !CONFIG_IPIPE_TRACE */ + +#define ipipe_trace_begin(v) do { (void)(v); } while(0) +#define ipipe_trace_end(v) do { (void)(v); } while(0) +#define ipipe_trace_freeze(v) do { (void)(v); } while(0) +#define ipipe_trace_special(id, v) do { (void)(id); (void)(v); } while(0) +#define ipipe_trace_pid(pid, prio) do { (void)(pid); (void)(prio); } while(0) +#define ipipe_trace_event(id, delay_tsc) do { (void)(id); (void)(delay_tsc); } while(0) +#define ipipe_trace_max_reset() ({ 0; }) +#define ipipe_trace_frozen_reset() ({ 0; }) + +#endif /* !CONFIG_IPIPE_TRACE */ + +#ifdef CONFIG_IPIPE_TRACE_PANIC +void ipipe_trace_panic_freeze(void); +void ipipe_trace_panic_dump(void); +#else +static inline void ipipe_trace_panic_freeze(void) { } +static inline void ipipe_trace_panic_dump(void) { } +#endif + +#ifdef CONFIG_IPIPE_TRACE_IRQSOFF +#define ipipe_trace_irq_entry(irq) ipipe_trace_begin(irq) +#define ipipe_trace_irq_exit(irq) ipipe_trace_end(irq) +#define ipipe_trace_irqsoff() ipipe_trace_begin(0x80000000UL) +#define ipipe_trace_irqson() ipipe_trace_end(0x80000000UL) +#else +#define ipipe_trace_irq_entry(irq) do { (void)(irq);} while(0) +#define ipipe_trace_irq_exit(irq) do { (void)(irq);} while(0) +#define ipipe_trace_irqsoff() do { } while(0) +#define ipipe_trace_irqson() do { } while(0) +#endif + +#endif /* !__LINUX_IPIPE_TRACE_H */ diff --git a/include/linux/irq.h b/include/linux/irq.h index a5261e3..4be97f0 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -322,6 +322,11 @@ struct irq_chip { void (*irq_bus_lock)(struct irq_data *data); void (*irq_bus_sync_unlock)(struct irq_data *data); +#ifdef CONFIG_IPIPE + void (*irq_move)(struct irq_data *data); + void (*irq_hold)(struct irq_data *data); + void (*irq_release)(struct irq_data *data); +#endif /* CONFIG_IPIPE */ void (*irq_cpu_online)(struct irq_data *data); void (*irq_cpu_offline)(struct irq_data *data); @@ -523,7 +528,7 @@ extern int irq_set_handler_data(unsigned int irq, void *data); extern int irq_set_chip_data(unsigned int irq, void *data); 
extern int irq_set_irq_type(unsigned int irq, unsigned int type); extern int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry); -extern struct irq_data *irq_get_irq_data(unsigned int irq); +extern struct irq_data *irq_get_irq_data(unsigned int irq) __attribute__((const)); static inline struct irq_chip *irq_get_chip(unsigned int irq) { @@ -666,7 +671,11 @@ struct irq_chip_type { * different flow mechanisms (level/edge) for it. */ struct irq_chip_generic { +#ifdef CONFIG_IPIPE + ipipe_spinlock_t lock; +#else raw_spinlock_t lock; +#endif void __iomem *reg_base; unsigned int irq_base; unsigned int irq_cnt; @@ -724,18 +733,28 @@ static inline struct irq_chip_type *irq_data_get_chip_type(struct irq_data *d) #define IRQ_MSK(n) (u32)((n) < 32 ? ((1 << (n)) - 1) : UINT_MAX) #ifdef CONFIG_SMP -static inline void irq_gc_lock(struct irq_chip_generic *gc) +static inline unsigned long irq_gc_lock(struct irq_chip_generic *gc) { - raw_spin_lock(&gc->lock); + unsigned long flags = 0; + raw_spin_lock_irqsave_cond(&gc->lock, flags); + return flags; } -static inline void irq_gc_unlock(struct irq_chip_generic *gc) +static inline void +irq_gc_unlock(struct irq_chip_generic *gc, unsigned long flags) { - raw_spin_unlock(&gc->lock); + raw_spin_unlock_irqrestore_cond(&gc->lock, flags); } #else -static inline void irq_gc_lock(struct irq_chip_generic *gc) { } -static inline void irq_gc_unlock(struct irq_chip_generic *gc) { } +static inline unsigned long irq_gc_lock(struct irq_chip_generic *gc) +{ + return hard_cond_local_irq_save(); +} +static inline void +irq_gc_unlock(struct irq_chip_generic *gc, unsigned long flags) +{ + hard_cond_local_irq_restore(flags); +} #endif #endif /* CONFIG_GENERIC_HARDIRQS */ diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 9a323d1..3b2b7b5 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -40,6 +40,12 @@ struct module; struct irq_desc { struct irq_data irq_data; unsigned int __percpu *kstat_irqs; +#ifdef CONFIG_IPIPE + void (*ipipe_ack)(unsigned int irq, + struct irq_desc *desc); + void (*ipipe_end)(unsigned int irq, + struct irq_desc *desc); +#endif /* CONFIG_IPIPE */ irq_flow_handler_t handle_irq; #ifdef CONFIG_IRQ_PREFLOW_FASTEOI irq_preflow_handler_t preflow_handler; @@ -122,6 +128,10 @@ static inline int irq_has_action(unsigned int irq) return desc->action != NULL; } +irq_flow_handler_t +__fixup_irq_handler(struct irq_desc *desc, irq_flow_handler_t handle, + int is_chained); + /* caller has locked the irq_desc and both params are valid */ static inline void __irq_set_handler_locked(unsigned int irq, irq_flow_handler_t handler) @@ -129,6 +139,7 @@ static inline void __irq_set_handler_locked(unsigned int irq, struct irq_desc *desc; desc = irq_to_desc(irq); + handler = __fixup_irq_handler(desc, handler, 0); desc->handle_irq = handler; } diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h index 3bc4dca..fb1f848 100644 --- a/include/linux/irqnr.h +++ b/include/linux/irqnr.h @@ -24,7 +24,11 @@ #else /* CONFIG_GENERIC_HARDIRQS */ extern int nr_irqs; +#if !defined(CONFIG_IPIPE) || defined(CONFIG_SPARSE_IRQ) extern struct irq_desc *irq_to_desc(unsigned int irq); +#else +#define irq_to_desc(irq) (&irq_desc[irq]) +#endif unsigned int irq_get_next_irq(unsigned int offset); # define for_each_irq_desc(irq, desc) \ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 6043821..f00db79 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include 
@@ -141,9 +142,12 @@ struct user; #ifdef CONFIG_PREEMPT_VOLUNTARY extern int _cond_resched(void); -# define might_resched() _cond_resched() +# define might_resched() do { \ + ipipe_root_only(); \ + _cond_resched(); \ + } while (0) #else -# define might_resched() do { } while (0) +# define might_resched() ipipe_root_only() #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 96c158a..07bbfb3 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -147,6 +147,9 @@ struct kvm_vcpu { #ifdef CONFIG_PREEMPT_NOTIFIERS struct preempt_notifier preempt_notifier; #endif +#ifdef CONFIG_IPIPE + struct ipipe_vm_notifier ipipe_notifier; +#endif int cpu; int vcpu_id; int srcu_idx; diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 5a710b9..f2ac9ea 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -9,13 +9,20 @@ #include #include #include +#include #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) extern void add_preempt_count(int val); extern void sub_preempt_count(int val); #else -# define add_preempt_count(val) do { preempt_count() += (val); } while (0) -# define sub_preempt_count(val) do { preempt_count() -= (val); } while (0) +# define add_preempt_count(val) do { \ + ipipe_root_only(); \ + preempt_count() += (val); \ + } while (0) +# define sub_preempt_count(val) do { \ + ipipe_root_only(); \ + preempt_count() -= (val); \ + } while (0) #endif #define inc_preempt_count() add_preempt_count(1) diff --git a/include/linux/resource.h b/include/linux/resource.h index d01c96c..9e4eb7c 100644 --- a/include/linux/resource.h +++ b/include/linux/resource.h @@ -59,12 +59,6 @@ struct rlimit64 { #define PRIO_USER 2 /* - * Limit the stack by to some sane default: root can always - * increase this limit if needed.. 8MB seems reasonable. - */ -#define _STK_LIM (8*1024*1024) - -/* * GPG2 wants 64kB of mlocked memory, to make sure pass phrases * and other sensitive information are never written to disk. 
*/ diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h index bc2994e..5e2da8d 100644 --- a/include/linux/rwlock.h +++ b/include/linux/rwlock.h @@ -61,8 +61,8 @@ do { \ #define read_trylock(lock) __cond_lock(lock, _raw_read_trylock(lock)) #define write_trylock(lock) __cond_lock(lock, _raw_write_trylock(lock)) -#define write_lock(lock) _raw_write_lock(lock) -#define read_lock(lock) _raw_read_lock(lock) +#define write_lock(lock) PICK_RWOP(_write_lock, lock) +#define read_lock(lock) PICK_RWOP(_read_lock, lock) #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) @@ -96,8 +96,8 @@ do { \ #define read_lock_bh(lock) _raw_read_lock_bh(lock) #define write_lock_irq(lock) _raw_write_lock_irq(lock) #define write_lock_bh(lock) _raw_write_lock_bh(lock) -#define read_unlock(lock) _raw_read_unlock(lock) -#define write_unlock(lock) _raw_write_unlock(lock) +#define read_unlock(lock) PICK_RWOP(_read_unlock, lock) +#define write_unlock(lock) PICK_RWOP(_write_unlock, lock) #define read_unlock_irq(lock) _raw_read_unlock_irq(lock) #define write_unlock_irq(lock) _raw_write_unlock_irq(lock) diff --git a/include/linux/rwlock_api_smp.h b/include/linux/rwlock_api_smp.h index 9c9f049..62c894150 100644 --- a/include/linux/rwlock_api_smp.h +++ b/include/linux/rwlock_api_smp.h @@ -141,7 +141,9 @@ static inline int __raw_write_trylock(rwlock_t *lock) * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are * not re-enabled during lock-acquire (which the preempt-spin-ops do): */ -#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) +#if !defined(CONFIG_GENERIC_LOCKBREAK) || \ + defined(CONFIG_DEBUG_LOCK_ALLOC) || \ + defined(CONFIG_IPIPE) static inline void __raw_read_lock(rwlock_t *lock) { diff --git a/include/linux/sched.h b/include/linux/sched.h index e63650f..beb101e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -62,6 +62,7 @@ struct sched_param { #include #include #include +#include #include #include @@ -193,9 +194,17 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #define TASK_DEAD 64 #define TASK_WAKEKILL 128 #define TASK_WAKING 256 +#ifdef CONFIG_IPIPE +#define TASK_HARDENING 512 +#define TASK_NOWAKEUP 1024 +#define TASK_STATE_MAX 2048 +#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWHN" +#else /* !CONFIG_IPIPE */ +#define TASK_HARDENING 0 +#define TASK_NOWAKEUP 0 #define TASK_STATE_MAX 512 - #define TASK_STATE_TO_CHAR_STR "RSDTtZXxKW" +#endif /* CONFIG_IPIPE */ extern char ___assert_task_state[1 - 2*!!( sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; @@ -308,6 +317,15 @@ extern void trap_init(void); extern void update_process_times(int user); extern void scheduler_tick(void); +#ifdef CONFIG_IPIPE +void update_root_process_times(struct pt_regs *regs); +#else /* !CONFIG_IPIPE */ +static inline void update_root_process_times(struct pt_regs *regs) +{ + update_process_times(user_mode(regs)); +} +#endif /* CONFIG_IPIPE */ + extern void sched_show_task(struct task_struct *p); #ifdef CONFIG_LOCKUP_DETECTOR @@ -440,6 +458,9 @@ extern int get_dumpable(struct mm_struct *mm); #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ #define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */ #define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ +#ifdef CONFIG_IPIPE +#define MMF_VM_PINNED 31 /* ondemand load up and COW disabled */ +#endif #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) @@ -1519,6 +1540,10 @@ struct task_struct { short pref_node_fork; #endif struct rcu_head rcu; + struct 
ipipe_task_info ipipe; +#ifdef CONFIG_IPIPE_LEGACY + void *ptd[IPIPE_ROOT_NPTDKEYS]; +#endif /* * cache last used pipe for splice @@ -1801,6 +1826,10 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ +/* p->ipipe.flags */ +#define PF_MAYDAY 0x1 /* MAYDAY call is pending */ +#define PF_EVNOTIFY 0x2 /* Notify head domain about kernel events */ + /* * Only the _current_ task can read/write to tsk->flags, but other * tasks can access tsk->flags in readonly mode for example diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 7d537ce..384cd7e 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -89,10 +89,12 @@ # include #endif +#include + #ifdef CONFIG_DEBUG_SPINLOCK extern void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, struct lock_class_key *key); -# define raw_spin_lock_init(lock) \ +# define __real_raw_spin_lock_init(lock) \ do { \ static struct lock_class_key __key; \ \ @@ -100,9 +102,10 @@ do { \ } while (0) #else -# define raw_spin_lock_init(lock) \ +# define __real_raw_spin_lock_init(lock) \ do { *(lock) = __RAW_SPIN_LOCK_UNLOCKED(lock); } while (0) #endif +#define raw_spin_lock_init(lock) PICK_SPINOP(_lock_init, lock) #define raw_spin_is_locked(lock) arch_spin_is_locked(&(lock)->raw_lock) @@ -165,9 +168,11 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) * various methods are defined as nops in the case they are not * required. */ -#define raw_spin_trylock(lock) __cond_lock(lock, _raw_spin_trylock(lock)) +#define __real_raw_spin_trylock(lock) __cond_lock(lock, _raw_spin_trylock(lock)) +#define raw_spin_trylock(lock) PICK_SPINOP_RET(_trylock, lock, int) -#define raw_spin_lock(lock) _raw_spin_lock(lock) +#define __real_raw_spin_lock(lock) _raw_spin_lock(lock) +#define raw_spin_lock(lock) PICK_SPINOP(_lock, lock) #ifdef CONFIG_DEBUG_LOCK_ALLOC # define raw_spin_lock_nested(lock, subclass) \ @@ -185,7 +190,7 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) -#define raw_spin_lock_irqsave(lock, flags) \ +#define __real_raw_spin_lock_irqsave(lock, flags) \ do { \ typecheck(unsigned long, flags); \ flags = _raw_spin_lock_irqsave(lock); \ @@ -207,7 +212,7 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) #else -#define raw_spin_lock_irqsave(lock, flags) \ +#define __real_raw_spin_lock_irqsave(lock, flags) \ do { \ typecheck(unsigned long, flags); \ _raw_spin_lock_irqsave(lock, flags); \ @@ -218,34 +223,46 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) #endif -#define raw_spin_lock_irq(lock) _raw_spin_lock_irq(lock) +#define raw_spin_lock_irqsave(lock, flags) \ + PICK_SPINLOCK_IRQSAVE(lock, flags) + +#define __real_raw_spin_lock_irq(lock) _raw_spin_lock_irq(lock) +#define raw_spin_lock_irq(lock) PICK_SPINOP(_lock_irq, lock) #define raw_spin_lock_bh(lock) _raw_spin_lock_bh(lock) -#define raw_spin_unlock(lock) _raw_spin_unlock(lock) -#define raw_spin_unlock_irq(lock) _raw_spin_unlock_irq(lock) +#define __real_raw_spin_unlock(lock) _raw_spin_unlock(lock) +#define raw_spin_unlock(lock) PICK_SPINOP(_unlock, lock) +#define __real_raw_spin_unlock_irq(lock) _raw_spin_unlock_irq(lock) +#define raw_spin_unlock_irq(lock) PICK_SPINOP(_unlock_irq, lock) -#define 
raw_spin_unlock_irqrestore(lock, flags) \ +#define __real_raw_spin_unlock_irqrestore(lock, flags) \ do { \ typecheck(unsigned long, flags); \ _raw_spin_unlock_irqrestore(lock, flags); \ } while (0) +#define raw_spin_unlock_irqrestore(lock, flags) \ + PICK_SPINUNLOCK_IRQRESTORE(lock, flags) + #define raw_spin_unlock_bh(lock) _raw_spin_unlock_bh(lock) #define raw_spin_trylock_bh(lock) \ __cond_lock(lock, _raw_spin_trylock_bh(lock)) -#define raw_spin_trylock_irq(lock) \ +#define __real_raw_spin_trylock_irq(lock) \ ({ \ local_irq_disable(); \ - raw_spin_trylock(lock) ? \ + __real_raw_spin_trylock(lock) ? \ 1 : ({ local_irq_enable(); 0; }); \ }) +#define raw_spin_trylock_irq(lock) PICK_SPINTRYLOCK_IRQ(lock) -#define raw_spin_trylock_irqsave(lock, flags) \ +#define __real_raw_spin_trylock_irqsave(lock, flags) \ ({ \ local_irq_save(flags); \ raw_spin_trylock(lock) ? \ 1 : ({ local_irq_restore(flags); 0; }); \ }) +#define raw_spin_trylock_irqsave(lock, flags) \ + PICK_SPINTRYLOCK_IRQSAVE(lock, flags) /** * raw_spin_can_lock - would raw_spin_trylock() succeed? @@ -276,24 +293,17 @@ static inline raw_spinlock_t *spinlock_check(spinlock_t *lock) #define spin_lock_init(_lock) \ do { \ - spinlock_check(_lock); \ - raw_spin_lock_init(&(_lock)->rlock); \ + raw_spin_lock_init(_lock); \ } while (0) -static inline void spin_lock(spinlock_t *lock) -{ - raw_spin_lock(&lock->rlock); -} +#define spin_lock(lock) raw_spin_lock(lock) static inline void spin_lock_bh(spinlock_t *lock) { raw_spin_lock_bh(&lock->rlock); } -static inline int spin_trylock(spinlock_t *lock) -{ - return raw_spin_trylock(&lock->rlock); -} +#define spin_trylock(lock) raw_spin_trylock(lock) #define spin_lock_nested(lock, subclass) \ do { \ @@ -305,14 +315,11 @@ do { \ raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock); \ } while (0) -static inline void spin_lock_irq(spinlock_t *lock) -{ - raw_spin_lock_irq(&lock->rlock); -} +#define spin_lock_irq(lock) raw_spin_lock_irq(lock) #define spin_lock_irqsave(lock, flags) \ do { \ - raw_spin_lock_irqsave(spinlock_check(lock), flags); \ + raw_spin_lock_irqsave(lock, flags); \ } while (0) #define spin_lock_irqsave_nested(lock, flags, subclass) \ @@ -320,39 +327,28 @@ do { \ raw_spin_lock_irqsave_nested(spinlock_check(lock), flags, subclass); \ } while (0) -static inline void spin_unlock(spinlock_t *lock) -{ - raw_spin_unlock(&lock->rlock); -} +#define spin_unlock(lock) raw_spin_unlock(lock) static inline void spin_unlock_bh(spinlock_t *lock) { raw_spin_unlock_bh(&lock->rlock); } -static inline void spin_unlock_irq(spinlock_t *lock) -{ - raw_spin_unlock_irq(&lock->rlock); -} +#define spin_unlock_irq(lock) raw_spin_unlock_irq(lock) -static inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) -{ - raw_spin_unlock_irqrestore(&lock->rlock, flags); -} +#define spin_unlock_irqrestore(lock, flags) \ + raw_spin_unlock_irqrestore(lock, flags) static inline int spin_trylock_bh(spinlock_t *lock) { return raw_spin_trylock_bh(&lock->rlock); } -static inline int spin_trylock_irq(spinlock_t *lock) -{ - return raw_spin_trylock_irq(&lock->rlock); -} +#define spin_trylock_irq(lock) raw_spin_trylock_irq(lock) #define spin_trylock_irqsave(lock, flags) \ ({ \ - raw_spin_trylock_irqsave(spinlock_check(lock), flags); \ + raw_spin_trylock_irqsave(lock, flags); \ }) static inline void spin_unlock_wait(spinlock_t *lock) diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h index 51df117..7e7b9a7 100644 --- a/include/linux/spinlock_api_smp.h +++ 
b/include/linux/spinlock_api_smp.h @@ -99,7 +99,9 @@ static inline int __raw_spin_trylock(raw_spinlock_t *lock) * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are * not re-enabled during lock-acquire (which the preempt-spin-ops do): */ -#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) +#if !defined(CONFIG_GENERIC_LOCKBREAK) || \ + defined(CONFIG_DEBUG_LOCK_ALLOC) || \ + defined(CONFIG_IPIPE) static inline unsigned long __raw_spin_lock_irqsave(raw_spinlock_t *lock) { @@ -113,7 +115,7 @@ static inline unsigned long __raw_spin_lock_irqsave(raw_spinlock_t *lock) * do_raw_spin_lock_flags() code, because lockdep assumes * that interrupts are not re-enabled during lock-acquire: */ -#ifdef CONFIG_LOCKDEP +#if defined(CONFIG_LOCKDEP) || defined(CONFIG_IPIPE) LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); #else do_raw_spin_lock_flags(lock, &flags); diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h index a26e2fb..71348f2 100644 --- a/include/linux/spinlock_up.h +++ b/include/linux/spinlock_up.h @@ -51,13 +51,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) /* * Read-write spinlocks. No debug version. */ -#define arch_read_lock(lock) do { (void)(lock); } while (0) -#define arch_write_lock(lock) do { (void)(lock); } while (0) -#define arch_read_trylock(lock) ({ (void)(lock); 1; }) -#define arch_write_trylock(lock) ({ (void)(lock); 1; }) -#define arch_read_unlock(lock) do { (void)(lock); } while (0) -#define arch_write_unlock(lock) do { (void)(lock); } while (0) - #else /* DEBUG_SPINLOCK */ #define arch_spin_is_locked(lock) ((void)(lock), 0) /* for sched.c and kernel_lock.c: */ @@ -67,6 +60,13 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) # define arch_spin_trylock(lock) ({ (void)(lock); 1; }) #endif /* DEBUG_SPINLOCK */ +#define arch_read_lock(lock) do { (void)(lock); } while (0) +#define arch_write_lock(lock) do { (void)(lock); } while (0) +#define arch_read_trylock(lock) ({ (void)(lock); 1; }) +#define arch_write_trylock(lock) ({ (void)(lock); 1; }) +#define arch_read_unlock(lock) do { (void)(lock); } while (0) +#define arch_write_unlock(lock) do { (void)(lock); } while (0) + #define arch_spin_is_contended(lock) (((void)(lock), 0)) #define arch_read_can_lock(lock) (((void)(lock), 1)) diff --git a/include/linux/time.h b/include/linux/time.h index b51e664..c7d91b9 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -151,6 +151,7 @@ struct timespec get_monotonic_coarse(void); void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, struct timespec *wtom, struct timespec *sleep); void timekeeping_inject_sleeptime(struct timespec *delta); +struct timespec get_wall_to_monotonic(void); #define CURRENT_TIME (current_kernel_time()) #define CURRENT_TIME_SEC ((struct timespec) { get_seconds(), 0 }) diff --git a/init/Kconfig b/init/Kconfig index d07dcf9..ea3d94c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -90,6 +90,7 @@ config CROSS_COMPILE config LOCALVERSION string "Local version - append to kernel release" + default "-rtai" help Append an extra string to the end of your kernel version. This will show up when you type uname, for example. 
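Usage sketch (not part of the patch) for the lock dispatching introduced by the ipipe_lock.h and spinlock.h changes above: a lock declared with IPIPE_DEFINE_SPINLOCK() keeps its regular spin_lock_irqsave()/spin_unlock_irqrestore() call sites, but the PICK_SPINLOCK_IRQSAVE()/PICK_SPINUNLOCK_IRQRESTORE() type dispatch routes them to the pipeline-safe __ipipe_spin_* helpers, while plain spinlock_t and raw_spinlock_t users keep the vanilla code path. This matters because the virtualized local_irq_save() only masks root-domain interrupts under the pipeline, so data shared with head-domain code needs a lock that really masks hard IRQs. The lock, counter and function names below are made up for illustration.

#include <linux/spinlock.h>	/* pulls in the I-pipe lock definitions with this patch applied */

/*
 * Hypothetical state shared between the root domain and an out-of-band
 * (head domain) handler. Since demo_lock is an __ipipe_spinlock_t, the
 * spin_lock_irqsave() below expands to __ipipe_spin_lock_irqsave() and
 * masks hard IRQs, not only the virtual ones of the root domain.
 */
static IPIPE_DEFINE_SPINLOCK(demo_lock);
static unsigned long demo_counter;

static void demo_update(void)
{
	unsigned long flags;

	spin_lock_irqsave(&demo_lock, flags);	/* hard IRQs off from here */
	demo_counter++;
	spin_unlock_irqrestore(&demo_lock, flags);
}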
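Similarly, a sketch (again not from the patch) of how a caller adapts to the irq_gc_lock()/irq_gc_unlock() signature change made in linux/irq.h above, which now hands back a flags word so the critical section can also run with hard IRQs off when the pipeline is enabled; the callback, mask computation and register offset below are hypothetical.

#include <linux/irq.h>
#include <linux/io.h>

/* Hypothetical mask callback of a generic irq chip. */
static void demo_gc_mask_irq(struct irq_data *d)
{
	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
	u32 bit = 1 << (d->irq - gc->irq_base);
	unsigned long flags;

	flags = irq_gc_lock(gc);	/* was: irq_gc_lock(gc); with no return value */
	writel(readl(gc->reg_base + 0x10) | bit, gc->reg_base + 0x10);
	irq_gc_unlock(gc, flags);	/* the saved flags must now be passed back */
}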
diff --git a/init/main.c b/init/main.c index b5cc0a7..70afdd4 100644 --- a/init/main.c +++ b/init/main.c @@ -482,7 +482,7 @@ asmlinkage void __init start_kernel(void) cgroup_init_early(); - local_irq_disable(); + hard_local_irq_disable(); early_boot_irqs_disabled = true; /* @@ -520,6 +520,7 @@ asmlinkage void __init start_kernel(void) pidhash_init(); vfs_caches_init_early(); sort_main_extable(); + __ipipe_init_early(); trap_init(); mm_init(); @@ -552,6 +553,11 @@ asmlinkage void __init start_kernel(void) softirq_init(); timekeeping_init(); time_init(); + /* + * We need to wait for the interrupt and time subsystems to be + * initialized before enabling the pipeline. + */ + __ipipe_init(); profile_init(); call_function_init(); if (!irqs_disabled()) @@ -773,6 +779,7 @@ static void __init do_basic_setup(void) shmem_init(); driver_init(); init_irq_proc(); + __ipipe_init_proc(); do_ctors(); usermodehelper_enable(); do_initcalls(); diff --git a/kernel/Makefile b/kernel/Makefile index c0cc67a..d01236c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -88,6 +88,7 @@ obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o obj-$(CONFIG_TINY_RCU) += rcutiny.o obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o obj-$(CONFIG_RELAY) += relay.o +obj-$(CONFIG_IPIPE) += ipipe/ obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o diff --git a/kernel/exit.c b/kernel/exit.c index 46ce8da..492e0e7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -991,6 +991,7 @@ void do_exit(long code) acct_process(); trace_sched_process_exit(tsk); + __ipipe_report_exit(tsk); exit_sem(tsk); exit_shm(tsk); exit_files(tsk); @@ -1882,3 +1883,36 @@ SYSCALL_DEFINE3(waitpid, pid_t, pid, int } #endif + + void rt_daemonize(void) + { + sigset_t blocked; + + /* + * We don't want to have TIF_FREEZE set if the system-wide hibernation + * or suspend transition begins right now. 
+ */ + current->flags |= (PF_NOFREEZE | PF_KTHREAD); + + if (current->nsproxy != &init_nsproxy) { + get_nsproxy(&init_nsproxy); + switch_task_namespaces(current, &init_nsproxy); + } + set_special_pids(&init_struct_pid); + proc_clear_tty(current); + + /* Block and flush all signals */ + sigfillset(&blocked); + sigprocmask(SIG_BLOCK, &blocked, NULL); + flush_signals(current); + + /* Become as one with the init task */ + daemonize_fs_struct(); + exit_files(current); + current->files = init_task.files; + atomic_inc(¤t->files->count); + + reparent_to_kthreadd(); + } + +EXPORT_SYMBOL(rt_daemonize); diff --git a/kernel/fork.c b/kernel/fork.c index f9d0499..234af7f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -315,6 +315,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) if (err) goto out; + __ipipe_init_threadinfo(&ti->ipipe_data); clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); stackend = end_of_stack(tsk); @@ -614,6 +615,7 @@ void mmput(struct mm_struct *mm) ksm_exit(mm); khugepaged_exit(mm); /* must run before exit_mmap */ exit_mmap(mm); + __ipipe_report_cleanup(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); @@ -1104,6 +1106,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p) new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); new_flags |= PF_FORKNOEXEC; p->flags = new_flags; + __ipipe_clear_taskflags(p); } SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) @@ -1492,6 +1495,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, cgroup_post_fork(p); if (clone_flags & CLONE_THREAD) threadgroup_change_end(current); + __ipipe_init_taskinfo(p); perf_event_fork(p); trace_task_newtask(p, clone_flags); diff --git a/kernel/ipipe/Kconfig b/kernel/ipipe/Kconfig new file mode 100644 index 0000000..ce56a9c --- /dev/null +++ b/kernel/ipipe/Kconfig @@ -0,0 +1,57 @@ +config IPIPE + bool "Interrupt pipeline" + default y + ---help--- + Activate this option if you want the interrupt pipeline to be + compiled in. + +config IPIPE_CORE + def_bool y + +config IPIPE_WANT_CLOCKSOURCE + bool + +config IPIPE_CORE_APIREV + int + depends on IPIPE + default 2 + ---help--- + The API revision level we implement. + +config IPIPE_WANT_APIREV_1 + bool + +config IPIPE_WANT_APIREV_2 + bool + +config IPIPE_TARGET_APIREV + int + depends on IPIPE + default 1 if IPIPE_WANT_APIREV_1 + default 2 if IPIPE_WANT_APIREV_2 + default 1 if IPIPE_LEGACY + default IPIPE_CORE_APIREV + ---help--- + The API revision level the we want (must be <= + IPIPE_CORE_APIREV). + +config IPIPE_LEGACY + bool "I-pipe legacy interface" + depends on IPIPE + default y + ---help--- + Activate this option if you want to control the interrupt + pipeline via the legacy interface. + +config IPIPE_HAVE_HOSTRT + bool + +config IPIPE_HAVE_PIC_MUTE + bool + +config HAVE_IPIPE_HOSTRT + depends on IPIPE_LEGACY + bool + +config IPIPE_DELAYED_ATOMICSW + def_bool y if IPIPE_LEGACY diff --git a/kernel/ipipe/Kconfig.debug b/kernel/ipipe/Kconfig.debug new file mode 100644 index 0000000..40c82a2 --- /dev/null +++ b/kernel/ipipe/Kconfig.debug @@ -0,0 +1,95 @@ +config IPIPE_DEBUG + bool "I-pipe debugging" + depends on IPIPE + +config IPIPE_DEBUG_CONTEXT + bool "Check for illicit cross-domain calls" + depends on IPIPE_DEBUG + default y + ---help--- + Enable this feature to arm checkpoints in the kernel that + verify the correct invocation context. On entry of critical + Linux services a warning is issued if the caller is not + running over the root domain. 
+ +config IPIPE_DEBUG_INTERNAL + bool "Enable internal debug checks" + depends on IPIPE_DEBUG + default y + ---help--- + When this feature is enabled, I-pipe will perform internal + consistency checks of its subsystems, e.g. on per-cpu variable + access. + +config IPIPE_TRACE + bool "Latency tracing" + depends on IPIPE_DEBUG + select ARCH_WANT_FRAME_POINTERS if !ARM_UNWIND + select FRAME_POINTER if !ARM_UNWIND + select KALLSYMS + select PROC_FS + ---help--- + Activate this option if you want to use per-function tracing of + the kernel. The tracer will collect data via instrumentation + features like the one below or with the help of explicit calls + of ipipe_trace_xxx(). See include/linux/ipipe_trace.h for the + in-kernel tracing API. The collected data and runtime control + are available via /proc/ipipe/trace/*. + +if IPIPE_TRACE + +config IPIPE_TRACE_ENABLE + bool "Enable tracing on boot" + default y + ---help--- + Disable this option if you want to arm the tracer manually after + booting ("echo 1 > /proc/ipipe/tracer/enable"). This can reduce + boot time on slow embedded devices due to the tracer overhead. + +config IPIPE_TRACE_MCOUNT + bool "Instrument function entries" + default y + select FTRACE + select FUNCTION_TRACER + ---help--- + When enabled, records every kernel function entry in the tracer + log. While this slows down the system noticeably, it provides + the highest level of information about the flow of events. + However, it can be switched off in order to record only explicit + I-pipe trace points. + +config IPIPE_TRACE_IRQSOFF + bool "Trace IRQs-off times" + default y + ---help--- + Activate this option if I-pipe shall trace the longest path + with hard-IRQs switched off. + +config IPIPE_TRACE_SHIFT + int "Depth of trace log (14 => 16Kpoints, 15 => 32Kpoints)" + range 10 18 + default 14 + ---help--- + The number of trace points to hold tracing data for each + trace path, as a power of 2. + +config IPIPE_TRACE_VMALLOC + bool "Use vmalloc'ed trace buffer" + default y if EMBEDDED + ---help--- + Instead of reserving static kernel data, the required buffer + is allocated via vmalloc during boot-up when this option is + enabled. This can help to start systems that are low on memory, + but it slightly degrades overall performance. Try this option + when a traced kernel hangs unexpectedly at boot time. + +config IPIPE_TRACE_PANIC + bool "Enable panic back traces" + default y + ---help--- + Provides services to freeze and dump a back trace on panic + situations. This is used on IPIPE_DEBUG_CONTEXT exceptions + as well as ordinary kernel oopses. You can control the number + of printed back trace points via /proc/ipipe/trace. + +endif diff --git a/kernel/ipipe/Makefile b/kernel/ipipe/Makefile new file mode 100644 index 0000000..c3ffe63 --- /dev/null +++ b/kernel/ipipe/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_IPIPE) += core.o timer.o +obj-$(CONFIG_IPIPE_TRACE) += tracer.o +obj-$(CONFIG_IPIPE_LEGACY) += compat.o diff --git a/kernel/ipipe/compat.c b/kernel/ipipe/compat.c new file mode 100644 index 0000000..1147bf4 --- /dev/null +++ b/kernel/ipipe/compat.c @@ -0,0 +1,268 @@ +/* -*- linux-c -*- + * linux/kernel/ipipe/compat.c + * + * Copyright (C) 2012 Philippe Gerum. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version.
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * I-pipe legacy interface. + */ +#include +#include +#include +#include +#include +#include + +static int ptd_key_count; + +static unsigned long ptd_key_map; + +IPIPE_DECLARE_SPINLOCK(__ipipe_lock); + +void ipipe_init_attr(struct ipipe_domain_attr *attr) +{ + attr->name = "anon"; + attr->domid = 1; + attr->entry = NULL; + attr->priority = IPIPE_ROOT_PRIO; + attr->pdd = NULL; +} +EXPORT_SYMBOL_GPL(ipipe_init_attr); + +int ipipe_register_domain(struct ipipe_domain *ipd, + struct ipipe_domain_attr *attr) +{ + struct ipipe_percpu_domain_data *p; + unsigned long flags; + + BUG_ON(attr->priority != IPIPE_HEAD_PRIORITY); + + ipipe_register_head(ipd, attr->name); + ipd->legacy.domid = attr->domid; + ipd->legacy.pdd = attr->pdd; + ipd->legacy.priority = INT_MAX; + + if (attr->entry == NULL) + return 0; + + flags = hard_smp_local_irq_save(); + __ipipe_set_current_domain(ipd); + hard_smp_local_irq_restore(flags); + + attr->entry(); + + flags = hard_local_irq_save(); + __ipipe_set_current_domain(ipipe_root_domain); + p = ipipe_this_cpu_root_context(); + if (__ipipe_ipending_p(p) && + !test_bit(IPIPE_STALL_FLAG, &p->status)) + __ipipe_sync_stage(); + hard_local_irq_restore(flags); + + return 0; +} +EXPORT_SYMBOL_GPL(ipipe_register_domain); + +int ipipe_unregister_domain(struct ipipe_domain *ipd) +{ + ipipe_unregister_head(ipd); + + return 0; +} +EXPORT_SYMBOL_GPL(ipipe_unregister_domain); + +int ipipe_alloc_ptdkey(void) +{ + unsigned long flags; + int key = -1; + + spin_lock_irqsave(&__ipipe_lock,flags); + + if (ptd_key_count < IPIPE_ROOT_NPTDKEYS) { + key = ffz(ptd_key_map); + set_bit(key,&ptd_key_map); + ptd_key_count++; + } + + spin_unlock_irqrestore(&__ipipe_lock,flags); + + return key; +} +EXPORT_SYMBOL_GPL(ipipe_alloc_ptdkey); + +int ipipe_free_ptdkey(int key) +{ + unsigned long flags; + + if (key < 0 || key >= IPIPE_ROOT_NPTDKEYS) + return -EINVAL; + + spin_lock_irqsave(&__ipipe_lock,flags); + + if (test_and_clear_bit(key,&ptd_key_map)) + ptd_key_count--; + + spin_unlock_irqrestore(&__ipipe_lock,flags); + + return 0; +} +EXPORT_SYMBOL_GPL(ipipe_free_ptdkey); + +int ipipe_set_ptd(int key, void *value) +{ + if (key < 0 || key >= IPIPE_ROOT_NPTDKEYS) + return -EINVAL; + + current->ptd[key] = value; + + return 0; +} +EXPORT_SYMBOL_GPL(ipipe_set_ptd); + +void *ipipe_get_ptd(int key) +{ + if (key < 0 || key >= IPIPE_ROOT_NPTDKEYS) + return NULL; + + return current->ptd[key]; +} +EXPORT_SYMBOL_GPL(ipipe_get_ptd); + +int ipipe_virtualize_irq(struct ipipe_domain *ipd, + unsigned int irq, + ipipe_irq_handler_t handler, + void *cookie, + ipipe_irq_ackfn_t ackfn, + unsigned int modemask) +{ + if (handler == NULL) { + ipipe_free_irq(ipd, irq); + return 0; + } + + return ipipe_request_irq(ipd, irq, handler, cookie, ackfn); +} +EXPORT_SYMBOL_GPL(ipipe_virtualize_irq); + +static int null_handler(unsigned int event, + struct ipipe_domain *from, void *data) +{ + /* + * Legacy mode users will trap all events, at worst most + * frequent ones. 
Therefore it is actually faster to run a + * dummy handler once in a while rather than testing for a + * null handler pointer each time an event is fired. + */ + return 0; +} + +ipipe_event_handler_t ipipe_catch_event(struct ipipe_domain *ipd, + unsigned int event, + ipipe_event_handler_t handler) +{ + ipipe_event_handler_t oldhandler; + int n, enables = 0; + + if (event & IPIPE_EVENT_SELF) { + event &= ~IPIPE_EVENT_SELF; + IPIPE_WARN(event >= IPIPE_NR_FAULTS); + } + + if (event >= IPIPE_NR_EVENTS) + return NULL; + + /* + * It makes no sense to run a SETSCHED notification handler + * over the head domain; this introduces a useless domain + * switch for doing work which ought to be root specific. + * Unfortunately, some client domains using the legacy + * interface still ask for this, so we silently fix their + * request. This prevents ipipe_set_hooks() from yelling at us + * because of an attempt to enable kernel event notifications + * for the head domain. + */ + if (event == IPIPE_EVENT_SETSCHED) + ipd = ipipe_root_domain; + + oldhandler = ipd->legacy.handlers[event]; + ipd->legacy.handlers[event] = handler ?: null_handler; + + for (n = 0; n < IPIPE_NR_FAULTS; n++) { + if (ipd->legacy.handlers[n] != null_handler) { + enables |= __IPIPE_TRAP_E; + break; + } + } + + for (n = IPIPE_FIRST_EVENT; n < IPIPE_LAST_EVENT; n++) { + if (ipd->legacy.handlers[n] != null_handler) { + enables |= __IPIPE_KEVENT_E; + break; + } + } + + if (ipd->legacy.handlers[IPIPE_EVENT_SYSCALL] != null_handler) + enables |= __IPIPE_SYSCALL_E; + + ipipe_set_hooks(ipd, enables); + + return oldhandler == null_handler ? NULL : oldhandler; +} +EXPORT_SYMBOL_GPL(ipipe_catch_event); + +int ipipe_setscheduler_root(struct task_struct *p, int policy, int prio) +{ + struct sched_param param = { .sched_priority = prio }; + return sched_setscheduler_nocheck(p, policy, &param); +} +EXPORT_SYMBOL_GPL(ipipe_setscheduler_root); + +int ipipe_syscall_hook(struct ipipe_domain *ipd, struct pt_regs *regs) +{ + const int event = IPIPE_EVENT_SYSCALL; + return ipipe_current_domain->legacy.handlers[event](event, ipd, regs); +} + +int ipipe_trap_hook(struct ipipe_trap_data *data) +{ + struct ipipe_domain *ipd = ipipe_head_domain; + struct pt_regs *regs = data->regs; + int ex = data->exception; + + return ipd->legacy.handlers[ex](ex, ipd, regs); +} + +int ipipe_kevent_hook(int kevent, void *data) +{ + unsigned int event = IPIPE_FIRST_EVENT + kevent; + struct ipipe_domain *ipd = ipipe_root_domain; + + return ipd->legacy.handlers[event](event, ipd, data); +} + +void __ipipe_legacy_init_stage(struct ipipe_domain *ipd) +{ + int n; + + for (n = 0; n < IPIPE_NR_EVENTS; n++) + ipd->legacy.handlers[n] = null_handler; + + if (ipd == &ipipe_root) { + ipd->legacy.domid = IPIPE_ROOT_ID; + ipd->legacy.priority = IPIPE_ROOT_PRIO; + } +} diff --git a/kernel/ipipe/core.c b/kernel/ipipe/core.c new file mode 100644 index 0000000..3bc6ef5 --- /dev/null +++ b/kernel/ipipe/core.c @@ -0,0 +1,1740 @@ +/* -*- linux-c -*- + * linux/kernel/ipipe/core.c + * + * Copyright (C) 2002-2012 Philippe Gerum. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version.
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Architecture-independent I-PIPE core support. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_PROC_FS +#include +#include +#endif /* CONFIG_PROC_FS */ +#include +#include +#include + +struct ipipe_domain ipipe_root; +EXPORT_SYMBOL_GPL(ipipe_root); + +struct ipipe_domain *ipipe_head_domain = &ipipe_root; +EXPORT_SYMBOL_GPL(ipipe_head_domain); + +#ifdef CONFIG_SMP +static __initdata struct ipipe_percpu_domain_data bootup_context = { + .status = IPIPE_STALL_MASK, + .domain = &ipipe_root, +}; +#else +#define bootup_context ipipe_percpu.root +#endif /* !CONFIG_SMP */ + +DEFINE_PER_CPU(struct ipipe_percpu_data, ipipe_percpu) = { + .root = { + .status = IPIPE_STALL_MASK, + .domain = &ipipe_root, + }, + .curr = &bootup_context, + .hrtimer_irq = -1, +#ifdef CONFIG_IPIPE_DEBUG_CONTEXT + .context_check = 1, +#endif +}; +EXPORT_PER_CPU_SYMBOL_GPL(ipipe_percpu); + +/* Up to 2k of pending work data per CPU. */ +#define WORKBUF_SIZE 2048 +static DEFINE_PER_CPU_ALIGNED(unsigned char[WORKBUF_SIZE], work_buf); +static DEFINE_PER_CPU(void *, work_tail); +static unsigned int __ipipe_work_virq; + +static void __ipipe_do_work(unsigned int virq, void *cookie); + +#ifdef CONFIG_SMP + +#define IPIPE_CRITICAL_TIMEOUT 1000000 +static cpumask_t __ipipe_cpu_sync_map; +static cpumask_t __ipipe_cpu_lock_map; +static cpumask_t __ipipe_cpu_pass_map; +static unsigned long __ipipe_critical_lock; +static IPIPE_DEFINE_SPINLOCK(__ipipe_cpu_barrier); +static atomic_t __ipipe_critical_count = ATOMIC_INIT(0); +static void (*__ipipe_cpu_sync) (void); + +#else /* !CONFIG_SMP */ +/* + * Create an alias to the unique root status, so that arch-dep code + * may get fast access to this percpu variable including from + * assembly. A hard-coded assumption is that root.status appears at + * offset #0 of the ipipe_percpu struct. 
+ */ +extern unsigned long __ipipe_root_status +__attribute__((alias(__stringify(ipipe_percpu)))); +EXPORT_SYMBOL_GPL(__ipipe_root_status); + +#endif /* !CONFIG_SMP */ + +IPIPE_DEFINE_SPINLOCK(__ipipe_lock); + +static unsigned long __ipipe_virtual_irq_map; + +#ifdef CONFIG_PRINTK +unsigned int __ipipe_printk_virq; +int __ipipe_printk_bypass; +#endif /* CONFIG_PRINTK */ + +#ifdef CONFIG_PROC_FS + +struct proc_dir_entry *ipipe_proc_root; + +static int __ipipe_version_info_proc(char *page, + char **start, + off_t off, int count, int *eof, void *data) +{ + int len = sprintf(page, "%d\n", IPIPE_CORE_RELEASE); + + len -= off; + + if (len <= off + count) + *eof = 1; + + *start = page + off; + + if(len > count) + len = count; + + if(len < 0) + len = 0; + + return len; +} + +static int __ipipe_common_info_show(struct seq_file *p, void *data) +{ + struct ipipe_domain *ipd = (struct ipipe_domain *)p->private; + char handling, lockbit, virtuality; + unsigned long ctlbits; + unsigned int irq; + + seq_printf(p, " +--- Handled\n"); + seq_printf(p, " |+-- Locked\n"); + seq_printf(p, " ||+- Virtual\n"); + seq_printf(p, "[IRQ] |||\n"); + + mutex_lock(&ipd->mutex); + + for (irq = 0; irq < IPIPE_NR_IRQS; irq++) { + ctlbits = ipd->irqs[irq].control; + /* + * There might be a hole between the last external IRQ + * and the first virtual one; skip it. + */ + if (irq >= IPIPE_NR_XIRQS && !ipipe_virtual_irq_p(irq)) + continue; + + if (ipipe_virtual_irq_p(irq) + && !test_bit(irq - IPIPE_VIRQ_BASE, &__ipipe_virtual_irq_map)) + /* Non-allocated virtual IRQ; skip it. */ + continue; + + if (ctlbits & IPIPE_HANDLE_MASK) + handling = 'H'; + else + handling = '.'; + + if (ctlbits & IPIPE_LOCK_MASK) + lockbit = 'L'; + else + lockbit = '.'; + + if (ipipe_virtual_irq_p(irq)) + virtuality = 'V'; + else + virtuality = '.'; + + seq_printf(p, " %3u: %c%c%c\n", + irq, handling, lockbit, virtuality); + } + + mutex_unlock(&ipd->mutex); + + return 0; +} + +static int __ipipe_common_info_open(struct inode *inode, struct file *file) +{ + return single_open(file, __ipipe_common_info_show, PROC_I(inode)->pde->data); +} + +static struct file_operations __ipipe_info_proc_ops = { + .owner = THIS_MODULE, + .open = __ipipe_common_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +void add_domain_proc(struct ipipe_domain *ipd) +{ + struct proc_dir_entry *e = create_proc_entry(ipd->name, 0444, ipipe_proc_root); + if (e) { + e->proc_fops = &__ipipe_info_proc_ops; + e->data = (void*) ipd; + } +} + +void remove_domain_proc(struct ipipe_domain *ipd) +{ + remove_proc_entry(ipd->name,ipipe_proc_root); +} + +void __init __ipipe_init_proc(void) +{ + ipipe_proc_root = create_proc_entry("ipipe",S_IFDIR, 0); + create_proc_read_entry("version",0444,ipipe_proc_root,&__ipipe_version_info_proc,NULL); + add_domain_proc(ipipe_root_domain); + + __ipipe_init_tracer(); +} + +#else + +static inline void add_domain_proc(struct ipipe_domain *ipd) +{ +} + +static inline void remove_domain_proc(struct ipipe_domain *ipd) +{ +} + +#endif /* CONFIG_PROC_FS */ + +static void init_stage(struct ipipe_domain *ipd) +{ + memset(&ipd->irqs, 0, sizeof(ipd->irqs)); + mutex_init(&ipd->mutex); + __ipipe_legacy_init_stage(ipd); + __ipipe_hook_critical_ipi(ipd); +} + +static inline int root_context_offset(void) +{ + void root_context_not_at_start_of_ipipe_percpu(void); + + /* ipipe_percpu.root must be found at offset #0. 
*/ + + if (offsetof(struct ipipe_percpu_data, root)) + root_context_not_at_start_of_ipipe_percpu(); + + return 0; +} + +#ifdef CONFIG_SMP + +static inline void fixup_percpu_data(void) +{ + struct ipipe_percpu_data *p; + int cpu; + + /* + * ipipe_percpu.curr cannot be assigned statically to + * &ipipe_percpu.root, due to the dynamic nature of percpu + * data. So we make ipipe_percpu.curr refer to a temporary + * boot up context in static memory, until we can fixup all + * context pointers in this routine, after per-cpu areas have + * been eventually set up. The temporary context data is + * copied to per_cpu(ipipe_percpu, 0).root in the same move. + * + * Obviously, this code must run over the boot CPU, before SMP + * operations start. + */ + BUG_ON(smp_processor_id() || !irqs_disabled()); + + per_cpu(ipipe_percpu, 0).root = bootup_context; + + for_each_possible_cpu(cpu) { + p = &per_cpu(ipipe_percpu, cpu); + p->curr = &p->root; + } +} + +#else /* !CONFIG_SMP */ + +static inline void fixup_percpu_data(void) { } + +#endif /* CONFIG_SMP */ + +void __init __ipipe_init_early(void) +{ + struct ipipe_domain *ipd = &ipipe_root; + int cpu; + + fixup_percpu_data(); + + /* + * A lightweight registration code for the root domain. We are + * running on the boot CPU, hw interrupts are off, and + * secondary CPUs are still lost in space. + */ + ipd->name = "Linux"; + ipd->context_offset = root_context_offset(); + init_stage(ipd); + + /* + * Do the early init stuff. First we do the per-arch pipeline + * core setup, then we run the per-client setup code. At this + * point, the kernel does not provide much services yet: be + * careful. + */ + __ipipe_early_core_setup(); + __ipipe_early_client_setup(); + +#ifdef CONFIG_PRINTK + __ipipe_printk_virq = ipipe_alloc_virq(); + ipd->irqs[__ipipe_printk_virq].handler = __ipipe_flush_printk; + ipd->irqs[__ipipe_printk_virq].cookie = NULL; + ipd->irqs[__ipipe_printk_virq].ackfn = NULL; + ipd->irqs[__ipipe_printk_virq].control = IPIPE_HANDLE_MASK; +#endif /* CONFIG_PRINTK */ + + __ipipe_work_virq = ipipe_alloc_virq(); + ipd->irqs[__ipipe_work_virq].handler = __ipipe_do_work; + ipd->irqs[__ipipe_work_virq].cookie = NULL; + ipd->irqs[__ipipe_work_virq].ackfn = NULL; + ipd->irqs[__ipipe_work_virq].control = IPIPE_HANDLE_MASK; + + for_each_possible_cpu(cpu) + per_cpu(work_tail, cpu) = per_cpu(work_buf, cpu); +} + +void __init __ipipe_init(void) +{ + /* Now we may engage the pipeline. */ + __ipipe_enable_pipeline(); + + printk(KERN_INFO "Interrupt pipeline (release #%d)\n", + IPIPE_CORE_RELEASE); +} + +static inline void init_head_stage(struct ipipe_domain *ipd) +{ + struct ipipe_percpu_domain_data *p; + int cpu; + + /* Must be set first, used in ipipe_percpu_context(). 
*/ + ipd->context_offset = offsetof(struct ipipe_percpu_data, head); + + for_each_online_cpu(cpu) { + p = ipipe_percpu_context(ipd, cpu); + memset(p, 0, sizeof(*p)); + p->domain = ipd; + } + + init_stage(ipd); +} + +void ipipe_register_head(struct ipipe_domain *ipd, const char *name) +{ + BUG_ON(!ipipe_root_p || ipd == &ipipe_root); + + ipd->name = name; + init_head_stage(ipd); + barrier(); + ipipe_head_domain = ipd; + add_domain_proc(ipd); + + printk(KERN_INFO "I-pipe: head domain %s registered.\n", name); +} +EXPORT_SYMBOL_GPL(ipipe_register_head); + +void ipipe_unregister_head(struct ipipe_domain *ipd) +{ + BUG_ON(!ipipe_root_p || ipd != ipipe_head_domain); + + ipipe_head_domain = &ipipe_root; + smp_mb(); + mutex_lock(&ipd->mutex); + remove_domain_proc(ipd); + mutex_unlock(&ipd->mutex); + + printk(KERN_INFO "I-pipe: head domain %s unregistered.\n", ipd->name); +} +EXPORT_SYMBOL_GPL(ipipe_unregister_head); + +void ipipe_unstall_root(void) +{ + struct ipipe_percpu_domain_data *p; + + hard_local_irq_disable(); + + /* This helps catching bad usage from assembly call sites. */ + ipipe_root_only(); + + p = ipipe_this_cpu_root_context(); + + __clear_bit(IPIPE_STALL_FLAG, &p->status); + + if (unlikely(__ipipe_ipending_p(p))) + __ipipe_sync_stage(); + + hard_local_irq_enable(); +} +EXPORT_SYMBOL_GPL(ipipe_unstall_root); + +void ipipe_restore_root(unsigned long x) +{ + ipipe_root_only(); + + if (x) + ipipe_stall_root(); + else + ipipe_unstall_root(); +} +EXPORT_SYMBOL_GPL(ipipe_restore_root); + +void __ipipe_restore_root_nosync(unsigned long x) +{ + struct ipipe_percpu_domain_data *p = ipipe_this_cpu_root_context(); + + if (raw_irqs_disabled_flags(x)) { + __set_bit(IPIPE_STALL_FLAG, &p->status); + trace_hardirqs_off(); + } else { + trace_hardirqs_on(); + __clear_bit(IPIPE_STALL_FLAG, &p->status); + } +} +EXPORT_SYMBOL_GPL(__ipipe_restore_root_nosync); + +void ipipe_unstall_head(void) +{ + struct ipipe_percpu_domain_data *p = ipipe_this_cpu_head_context(); + + hard_local_irq_disable(); + + __clear_bit(IPIPE_STALL_FLAG, &p->status); + + if (unlikely(__ipipe_ipending_p(p))) + __ipipe_sync_pipeline(ipipe_head_domain); + + hard_local_irq_enable(); +} +EXPORT_SYMBOL_GPL(ipipe_unstall_head); + +void __ipipe_restore_head(unsigned long x) /* hw interrupt off */ +{ + struct ipipe_percpu_domain_data *p = ipipe_this_cpu_head_context(); + + if (x) { +#ifdef CONFIG_DEBUG_KERNEL + static int warned; + if (!warned && + __test_and_set_bit(IPIPE_STALL_FLAG, &p->status)) { + /* + * Already stalled albeit ipipe_restore_head() + * should have detected it? Send a warning once. 
+ */ + hard_local_irq_enable(); + warned = 1; + printk(KERN_WARNING + "I-pipe: ipipe_restore_head() optimization failed.\n"); + dump_stack(); + hard_local_irq_disable(); + } +#else /* !CONFIG_DEBUG_KERNEL */ + __set_bit(IPIPE_STALL_FLAG, &p->status); +#endif /* CONFIG_DEBUG_KERNEL */ + } else { + __clear_bit(IPIPE_STALL_FLAG, &p->status); + if (unlikely(__ipipe_ipending_p(p))) + __ipipe_sync_pipeline(ipipe_head_domain); + hard_local_irq_enable(); + } +} +EXPORT_SYMBOL_GPL(__ipipe_restore_head); + +void __ipipe_spin_lock_irq(ipipe_spinlock_t *lock) +{ + hard_local_irq_disable(); + arch_spin_lock(&lock->arch_lock); + __set_bit(IPIPE_STALL_FLAG, &__ipipe_current_context->status); +} +EXPORT_SYMBOL_GPL(__ipipe_spin_lock_irq); + +void __ipipe_spin_unlock_irq(ipipe_spinlock_t *lock) +{ + arch_spin_unlock(&lock->arch_lock); + __clear_bit(IPIPE_STALL_FLAG, &__ipipe_current_context->status); + hard_local_irq_enable(); +} +EXPORT_SYMBOL_GPL(__ipipe_spin_unlock_irq); + +unsigned long __ipipe_spin_lock_irqsave(ipipe_spinlock_t *lock) +{ + unsigned long flags; + int s; + + flags = hard_local_irq_save(); + arch_spin_lock(&lock->arch_lock); + s = __test_and_set_bit(IPIPE_STALL_FLAG, &__ipipe_current_context->status); + + return arch_mangle_irq_bits(s, flags); +} +EXPORT_SYMBOL_GPL(__ipipe_spin_lock_irqsave); + +int __ipipe_spin_trylock_irqsave(ipipe_spinlock_t *lock, + unsigned long *x) +{ + unsigned long flags; + int s; + + flags = hard_local_irq_save(); + if (!arch_spin_trylock(&lock->arch_lock)) { + hard_local_irq_restore(flags); + return 0; + } + s = __test_and_set_bit(IPIPE_STALL_FLAG, &__ipipe_current_context->status); + *x = arch_mangle_irq_bits(s, flags); + + return 1; +} +EXPORT_SYMBOL_GPL(__ipipe_spin_trylock_irqsave); + +void __ipipe_spin_unlock_irqrestore(ipipe_spinlock_t *lock, + unsigned long x) +{ + arch_spin_unlock(&lock->arch_lock); + if (!arch_demangle_irq_bits(&x)) + __clear_bit(IPIPE_STALL_FLAG, &__ipipe_current_context->status); + hard_local_irq_restore(x); +} +EXPORT_SYMBOL_GPL(__ipipe_spin_unlock_irqrestore); + +int __ipipe_spin_trylock_irq(ipipe_spinlock_t *lock) +{ + unsigned long flags; + + flags = hard_local_irq_save(); + if (!arch_spin_trylock(&lock->arch_lock)) { + hard_local_irq_restore(flags); + return 0; + } + __set_bit(IPIPE_STALL_FLAG, &__ipipe_current_context->status); + + return 1; +} +EXPORT_SYMBOL_GPL(__ipipe_spin_trylock_irq); + +void __ipipe_spin_unlock_irqbegin(ipipe_spinlock_t *lock) +{ + arch_spin_unlock(&lock->arch_lock); +} + +void __ipipe_spin_unlock_irqcomplete(unsigned long x) +{ + if (!arch_demangle_irq_bits(&x)) + __clear_bit(IPIPE_STALL_FLAG, &__ipipe_current_context->status); + hard_local_irq_restore(x); +} + +#ifdef __IPIPE_3LEVEL_IRQMAP + +/* Must be called hw IRQs off. */ +static inline void __ipipe_set_irq_held(struct ipipe_percpu_domain_data *p, + unsigned int irq) +{ + __set_bit(irq, p->irqheld_map); + p->irqall[irq]++; +} + +/* Must be called hw IRQs off. 
*/ +void __ipipe_set_irq_pending(struct ipipe_domain *ipd, unsigned int irq) +{ + struct ipipe_percpu_domain_data *p = ipipe_this_cpu_context(ipd); + int l0b, l1b; + + IPIPE_WARN_ONCE(!hard_irqs_disabled()); + + l0b = irq / (BITS_PER_LONG * BITS_PER_LONG); + l1b = irq / BITS_PER_LONG; + + if (likely(!test_bit(IPIPE_LOCK_FLAG, &ipd->irqs[irq].control))) { + __set_bit(irq, p->irqpend_lomap); + __set_bit(l1b, p->irqpend_mdmap); + __set_bit(l0b, &p->irqpend_himap); + } else + __set_bit(irq, p->irqheld_map); + + p->irqall[irq]++; +} +EXPORT_SYMBOL_GPL(__ipipe_set_irq_pending); + +/* Must be called hw IRQs off. */ +void __ipipe_lock_irq(unsigned int irq) +{ + struct ipipe_domain *ipd = ipipe_root_domain; + struct ipipe_percpu_domain_data *p; + int l0b, l1b; + + IPIPE_WARN_ONCE(!hard_irqs_disabled()); + + /* + * Interrupts requested by a registered head domain cannot be + * locked, since this would make no sense: interrupts are + * globally masked at CPU level when the head domain is + * stalled, so there is no way we could encounter the + * situation IRQ locks are handling. + */ + if (test_and_set_bit(IPIPE_LOCK_FLAG, &ipd->irqs[irq].control)) + return; + + l0b = irq / (BITS_PER_LONG * BITS_PER_LONG); + l1b = irq / BITS_PER_LONG; + + p = ipipe_this_cpu_context(ipd); + if (__test_and_clear_bit(irq, p->irqpend_lomap)) { + __set_bit(irq, p->irqheld_map); + if (p->irqpend_lomap[l1b] == 0) { + __clear_bit(l1b, p->irqpend_mdmap); + if (p->irqpend_mdmap[l0b] == 0) + __clear_bit(l0b, &p->irqpend_himap); + } + } +} +EXPORT_SYMBOL_GPL(__ipipe_lock_irq); + +/* Must be called hw IRQs off. */ +void __ipipe_unlock_irq(unsigned int irq) +{ + struct ipipe_domain *ipd = ipipe_root_domain; + struct ipipe_percpu_domain_data *p; + int l0b, l1b, cpu; + + IPIPE_WARN_ONCE(!hard_irqs_disabled()); + + if (!test_and_clear_bit(IPIPE_LOCK_FLAG, &ipd->irqs[irq].control)) + return; + + l0b = irq / (BITS_PER_LONG * BITS_PER_LONG); + l1b = irq / BITS_PER_LONG; + + for_each_online_cpu(cpu) { + p = ipipe_this_cpu_root_context(); + if (test_and_clear_bit(irq, p->irqheld_map)) { + /* We need atomic ops here: */ + set_bit(irq, p->irqpend_lomap); + set_bit(l1b, p->irqpend_mdmap); + set_bit(l0b, &p->irqpend_himap); + } + } +} +EXPORT_SYMBOL_GPL(__ipipe_unlock_irq); + +static inline int __ipipe_next_irq(struct ipipe_percpu_domain_data *p) +{ + int l0b, l1b, l2b; + unsigned long l0m, l1m, l2m; + unsigned int irq; + + l0m = p->irqpend_himap; + if (unlikely(l0m == 0)) + return -1; + + l0b = __ipipe_ffnz(l0m); + l1m = p->irqpend_mdmap[l0b]; + if (unlikely(l1m == 0)) + return -1; + + l1b = __ipipe_ffnz(l1m) + l0b * BITS_PER_LONG; + l2m = p->irqpend_lomap[l1b]; + if (unlikely(l2m == 0)) + return -1; + + l2b = __ipipe_ffnz(l2m); + irq = l1b * BITS_PER_LONG + l2b; + + __clear_bit(irq, p->irqpend_lomap); + if (p->irqpend_lomap[l1b] == 0) { + __clear_bit(l1b, p->irqpend_mdmap); + if (p->irqpend_mdmap[l0b] == 0) + __clear_bit(l0b, &p->irqpend_himap); + } + + return irq; +} + +#else /* __IPIPE_2LEVEL_IRQMAP */ + +/* Must be called hw IRQs off. */ +static inline void __ipipe_set_irq_held(struct ipipe_percpu_domain_data *p, + unsigned int irq) +{ + __set_bit(irq, p->irqheld_map); + p->irqall[irq]++; +} + +/* Must be called hw IRQs off. 
*/ +void __ipipe_set_irq_pending(struct ipipe_domain *ipd, unsigned int irq) +{ + struct ipipe_percpu_domain_data *p = ipipe_this_cpu_context(ipd); + int l0b = irq / BITS_PER_LONG; + + IPIPE_WARN_ONCE(!hard_irqs_disabled()); + + if (likely(!test_bit(IPIPE_LOCK_FLAG, &ipd->irqs[irq].control))) { + __set_bit(irq, p->irqpend_lomap); + __set_bit(l0b, &p->irqpend_himap); + } else + __set_bit(irq, p->irqheld_map); + + p->irqall[irq]++; +} +EXPORT_SYMBOL_GPL(__ipipe_set_irq_pending); + +/* Must be called hw IRQs off. */ +void __ipipe_lock_irq(unsigned int irq) +{ + struct ipipe_percpu_domain_data *p; + int l0b = irq / BITS_PER_LONG; + + IPIPE_WARN_ONCE(!hard_irqs_disabled()); + + if (test_and_set_bit(IPIPE_LOCK_FLAG, + &ipipe_root_domain->irqs[irq].control)) + return; + + p = ipipe_this_cpu_root_context(); + if (__test_and_clear_bit(irq, p->irqpend_lomap)) { + __set_bit(irq, p->irqheld_map); + if (p->irqpend_lomap[l0b] == 0) + __clear_bit(l0b, &p->irqpend_himap); + } +} +EXPORT_SYMBOL_GPL(__ipipe_lock_irq); + +/* Must be called hw IRQs off. */ +void __ipipe_unlock_irq(unsigned int irq) +{ + struct ipipe_domain *ipd = ipipe_root_domain; + struct ipipe_percpu_domain_data *p; + int l0b = irq / BITS_PER_LONG, cpu; + + IPIPE_WARN_ONCE(!hard_irqs_disabled()); + + if (!test_and_clear_bit(IPIPE_LOCK_FLAG, &ipd->irqs[irq].control)) + return; + + for_each_online_cpu(cpu) { + p = ipipe_percpu_context(ipd, cpu); + if (test_and_clear_bit(irq, p->irqheld_map)) { + /* We need atomic ops here: */ + set_bit(irq, p->irqpend_lomap); + set_bit(l0b, &p->irqpend_himap); + } + } +} +EXPORT_SYMBOL_GPL(__ipipe_unlock_irq); + +static inline int __ipipe_next_irq(struct ipipe_percpu_domain_data *p) +{ + unsigned long l0m, l1m; + int l0b, l1b; + + l0m = p->irqpend_himap; + if (unlikely(l0m == 0)) + return -1; + + l0b = __ipipe_ffnz(l0m); + l1m = p->irqpend_lomap[l0b]; + if (unlikely(l1m == 0)) + return -1; + + l1b = __ipipe_ffnz(l1m); + __clear_bit(l1b, &p->irqpend_lomap[l0b]); + if (p->irqpend_lomap[l0b] == 0) + __clear_bit(l0b, &p->irqpend_himap); + + return l0b * BITS_PER_LONG + l1b; +} + +#endif /* __IPIPE_2LEVEL_IRQMAP */ + +void __ipipe_do_sync_pipeline(struct ipipe_domain *top) +{ + struct ipipe_percpu_domain_data *p; + struct ipipe_domain *ipd; + + /* We must enter over the root domain. */ + IPIPE_WARN_ONCE(__ipipe_current_domain != ipipe_root_domain); + ipd = top; +next: + p = ipipe_this_cpu_context(ipd); + if (test_bit(IPIPE_STALL_FLAG, &p->status)) + return; + + if (__ipipe_ipending_p(p)) { + if (ipd == ipipe_root_domain) + __ipipe_sync_stage(); + else { + /* Switching to head. 
*/ + p->coflags &= ~__IPIPE_ALL_R; + __ipipe_set_current_context(p); + __ipipe_sync_stage(); + __ipipe_set_current_domain(ipipe_root_domain); + } + } + + if (ipd != ipipe_root_domain) { + ipd = ipipe_root_domain; + goto next; + } +} +EXPORT_SYMBOL_GPL(__ipipe_do_sync_pipeline); + +unsigned int ipipe_alloc_virq(void) +{ + unsigned long flags, irq = 0; + int ipos; + + spin_lock_irqsave(&__ipipe_lock, flags); + + if (__ipipe_virtual_irq_map != ~0) { + ipos = ffz(__ipipe_virtual_irq_map); + set_bit(ipos, &__ipipe_virtual_irq_map); + irq = ipos + IPIPE_VIRQ_BASE; + } + + spin_unlock_irqrestore(&__ipipe_lock, flags); + + return irq; +} +EXPORT_SYMBOL_GPL(ipipe_alloc_virq); + +void ipipe_free_virq(unsigned int virq) +{ + clear_bit(virq - IPIPE_VIRQ_BASE, &__ipipe_virtual_irq_map); + smp_mb__after_clear_bit(); +} +EXPORT_SYMBOL_GPL(ipipe_free_virq); + +int ipipe_request_irq(struct ipipe_domain *ipd, + unsigned int irq, + ipipe_irq_handler_t handler, + void *cookie, + ipipe_irq_ackfn_t ackfn) +{ + unsigned long flags; + int ret = 0; + +#ifndef CONFIG_IPIPE_LEGACY + ipipe_root_only(); +#endif /* CONFIG_IPIPE_LEGACY */ + + if (handler == NULL || + (irq >= IPIPE_NR_XIRQS && !ipipe_virtual_irq_p(irq))) + return -EINVAL; + + spin_lock_irqsave(&__ipipe_lock, flags); + + if (ipd->irqs[irq].handler) { + ret = -EBUSY; + goto out; + } + + if (ackfn == NULL) + ackfn = ipipe_root_domain->irqs[irq].ackfn; + + ipd->irqs[irq].handler = handler; + ipd->irqs[irq].cookie = cookie; + ipd->irqs[irq].ackfn = ackfn; + ipd->irqs[irq].control = IPIPE_HANDLE_MASK; + + if (irq < NR_IRQS) + __ipipe_enable_irqdesc(ipd, irq); +out: + spin_unlock_irqrestore(&__ipipe_lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(ipipe_request_irq); + +void ipipe_free_irq(struct ipipe_domain *ipd, + unsigned int irq) +{ + unsigned long flags; + +#ifndef CONFIG_IPIPE_LEGACY + ipipe_root_only(); +#endif /* CONFIG_IPIPE_LEGACY */ + + spin_lock_irqsave(&__ipipe_lock, flags); + + if (ipd->irqs[irq].handler == NULL) + goto out; + + ipd->irqs[irq].handler = NULL; + ipd->irqs[irq].cookie = NULL; + ipd->irqs[irq].ackfn = NULL; + ipd->irqs[irq].control = 0; + + if (irq < NR_IRQS) + __ipipe_disable_irqdesc(ipd, irq); +out: + spin_unlock_irqrestore(&__ipipe_lock, flags); +} +EXPORT_SYMBOL_GPL(ipipe_free_irq); + +void ipipe_set_hooks(struct ipipe_domain *ipd, int enables) +{ + struct ipipe_percpu_domain_data *p; + unsigned long flags; + int cpu, wait; + + if (ipd == ipipe_root_domain) { + IPIPE_WARN(enables & __IPIPE_TRAP_E); + enables &= ~__IPIPE_TRAP_E; + } else { + IPIPE_WARN(enables & __IPIPE_KEVENT_E); + enables &= ~__IPIPE_KEVENT_E; + } + + flags = ipipe_critical_enter(NULL); + + for_each_online_cpu(cpu) { + p = ipipe_percpu_context(ipd, cpu); + p->coflags &= ~__IPIPE_ALL_E; + p->coflags |= enables; + } + + wait = (enables ^ __IPIPE_ALL_E) << __IPIPE_SHIFT_R; + if (wait == 0 || !__ipipe_root_p) { + ipipe_critical_exit(flags); + return; + } + + ipipe_this_cpu_context(ipd)->coflags &= ~wait; + + ipipe_critical_exit(flags); + + /* + * In case we cleared some hooks over the root domain, we have + * to wait for any ongoing execution to finish, since our + * caller might subsequently unmap the target domain code. + * + * We synchronize with the relevant __ipipe_notify_*() + * helpers, disabling all hooks before we start waiting for + * completion on all CPUs. 
+ */ + for_each_online_cpu(cpu) { + while (ipipe_percpu_context(ipd, cpu)->coflags & wait) + schedule_timeout_interruptible(HZ / 50); + } +} +EXPORT_SYMBOL_GPL(ipipe_set_hooks); + +int __weak ipipe_syscall_hook(struct ipipe_domain *ipd, struct pt_regs *regs) +{ + return 0; +} + +int __ipipe_notify_syscall(struct pt_regs *regs) +{ + struct ipipe_domain *caller_domain, *this_domain, *ipd; + struct ipipe_percpu_domain_data *p; + unsigned long flags; + int ret = 0; + + flags = hard_local_irq_save(); + caller_domain = this_domain = __ipipe_current_domain; + ipd = ipipe_head_domain; +next: + p = ipipe_this_cpu_context(ipd); + if (likely(p->coflags & __IPIPE_SYSCALL_E)) { + __ipipe_set_current_context(p); + p->coflags |= __IPIPE_SYSCALL_R; + hard_local_irq_restore(flags); + ret = ipipe_syscall_hook(caller_domain, regs); + flags = hard_local_irq_save(); + p->coflags &= ~__IPIPE_SYSCALL_R; + if (__ipipe_current_domain != ipd) + /* Account for domain migration. */ + this_domain = __ipipe_current_domain; + else + __ipipe_set_current_domain(this_domain); + } + + if (this_domain == ipipe_root_domain && + ipd != ipipe_root_domain && ret == 0) { + ipd = ipipe_root_domain; + goto next; + } + + hard_local_irq_restore(flags); + + return ret; +} + +int __weak ipipe_trap_hook(struct ipipe_trap_data *data) +{ + return 0; +} + +int __ipipe_notify_trap(int exception, struct pt_regs *regs) +{ +{ if (current->ipipe.flags & PF_EVNOTIFY) return ((int (*)(unsigned long, void *))ipipe_root_domain->legacy.handlers[exception])(exception, regs); } { + struct ipipe_percpu_domain_data *p; + struct ipipe_trap_data data; + unsigned long flags; + int ret = 0; + + flags = hard_local_irq_save(); + + /* + * We send a notification about all traps raised over a + * registered head domain only. 
+ */ + if (__ipipe_root_p) + goto out; + + p = ipipe_this_cpu_head_context(); + if (likely(p->coflags & __IPIPE_TRAP_E)) { + p->coflags |= __IPIPE_TRAP_R; + hard_local_irq_restore(flags); + data.exception = exception; + data.regs = regs; + ret = ipipe_trap_hook(&data); + flags = hard_local_irq_save(); + p->coflags &= ~__IPIPE_TRAP_R; + } +out: + hard_local_irq_restore(flags); + + return ret; +} } + +int __weak ipipe_kevent_hook(int kevent, void *data) +{ + return 0; +} + +int __ipipe_notify_kevent(int kevent, void *data) +{ +{ return ((int (*)(unsigned long, void *))ipipe_root_domain->legacy.handlers[IPIPE_FIRST_EVENT + kevent])(IPIPE_FIRST_EVENT + kevent, data); } { + struct ipipe_percpu_domain_data *p; + unsigned long flags; + int ret = 0; + + ipipe_root_only(); + + flags = hard_local_irq_save(); + + p = ipipe_this_cpu_root_context(); + if (likely(p->coflags & __IPIPE_KEVENT_E)) { + p->coflags |= __IPIPE_KEVENT_R; + hard_local_irq_restore(flags); + ret = ipipe_kevent_hook(kevent, data); + flags = hard_local_irq_save(); + p->coflags &= ~__IPIPE_KEVENT_R; + } + + hard_local_irq_restore(flags); + + return ret; +}} + +void __weak ipipe_migration_hook(struct task_struct *p) +{ +} + +#ifdef CONFIG_IPIPE_LEGACY + +static inline void complete_domain_migration(void) /* hw IRQs off */ +{ + current->state &= ~TASK_HARDENING; +} + +#else /* !CONFIG_IPIPE_LEGACY */ + +static void complete_domain_migration(void) /* hw IRQs off */ +{ + struct ipipe_percpu_domain_data *p; + struct ipipe_percpu_data *pd; + struct task_struct *t; + + ipipe_root_only(); + pd = __this_cpu_ptr(&ipipe_percpu); + t = pd->task_hijacked; + if (t == NULL) + return; + + pd->task_hijacked = NULL; + t->state &= ~TASK_HARDENING; + if (t->state != TASK_INTERRUPTIBLE) + /* Migration aborted (by signal). */ + return; + + p = ipipe_this_cpu_head_context(); + IPIPE_WARN_ONCE(test_bit(IPIPE_STALL_FLAG, &p->status)); + /* + * hw IRQs are disabled, but the completion hook assumes the + * head domain is logically stalled: fix it up. + */ + __set_bit(IPIPE_STALL_FLAG, &p->status); + ipipe_migration_hook(t); + __clear_bit(IPIPE_STALL_FLAG, &p->status); +} + +#endif /* !CONFIG_IPIPE_LEGACY */ + +void __ipipe_complete_domain_migration(void) +{ + unsigned long flags; + + flags = hard_local_irq_save(); + complete_domain_migration(); + hard_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(__ipipe_complete_domain_migration); + +int __ipipe_switch_tail(void) +{ + int x; + +#ifdef CONFIG_IPIPE_WANT_PREEMPTIBLE_SWITCH + hard_local_irq_disable(); +#endif + x = __ipipe_root_p; +#ifndef CONFIG_IPIPE_LEGACY + if (x) +#endif + complete_domain_migration(); + +#ifndef CONFIG_IPIPE_WANT_PREEMPTIBLE_SWITCH + if (x) +#endif + hard_local_irq_enable(); + + return !x; +} + +void __ipipe_notify_vm_preemption(void) +{ + struct ipipe_vm_notifier *vmf; + struct ipipe_percpu_data *p; + + ipipe_check_irqoff(); + p = __ipipe_this_cpu_ptr(&ipipe_percpu); + vmf = p->vm_notifier; + if (unlikely(vmf)) + vmf->handler(vmf); +} +EXPORT_SYMBOL_GPL(__ipipe_notify_vm_preemption); + +static void dispatch_irq_head(unsigned int irq) /* hw interrupts off */ +{ + struct ipipe_percpu_domain_data *p = ipipe_this_cpu_head_context(), *old; + struct ipipe_domain *head = p->domain; + + if (unlikely(test_bit(IPIPE_STALL_FLAG, &p->status))) { + __ipipe_set_irq_pending(head, irq); + return; + } + + /* Switch to the head domain if not current. 
*/ + old = __ipipe_current_context; + if (old != p) + __ipipe_set_current_context(p); + + p->irqall[irq]++; + __set_bit(IPIPE_STALL_FLAG, &p->status); + barrier(); + head->irqs[irq].handler(irq, head->irqs[irq].cookie); + __ipipe_run_irqtail(irq); + hard_local_irq_disable(); + p = ipipe_this_cpu_head_context(); + __clear_bit(IPIPE_STALL_FLAG, &p->status); + + /* Are we still running in the head domain? */ + if (likely(__ipipe_current_context == p)) { + /* Did we enter this code over the head domain? */ + if (old->domain == head) { + /* Yes, do immediate synchronization. */ + if (__ipipe_ipending_p(p)) + __ipipe_sync_stage(); + return; + } + __ipipe_set_current_context(ipipe_this_cpu_root_context()); + } + + /* + * We must be running over the root domain, synchronize + * the pipeline for high priority IRQs (slow path). + */ + __ipipe_do_sync_pipeline(head); +} + +void (*rtai_irq_handler)(int) = NULL; +EXPORT_SYMBOL(rtai_irq_handler); + +void __ipipe_dispatch_irq(unsigned int irq, int flags) /* hw interrupts off */ +{ + struct ipipe_domain *ipd; + struct irq_desc *desc; + unsigned long control; + int chained_irq; + + /* + * Survival kit when reading this code: + * + * - we have two main situations, leading to three cases for + * handling interrupts: + * + * a) the root domain is alone, no registered head domain + * => all interrupts are delivered via the fast dispatcher. + * b) a head domain is registered + * => head domain IRQs go through the fast dispatcher + * => root domain IRQs go through the interrupt log + * + * - when no head domain is registered, ipipe_head_domain == + * ipipe_root_domain == &ipipe_root. + * + * - the caller tells us whether we should acknowledge this + * IRQ. Even virtual IRQs may require acknowledge on some + * platforms (e.g. arm/SMP). + * + * - the caller tells us whether we may try to run the IRQ log + * syncer. Typically, demuxed IRQs won't be synced + * immediately. + * + * - multiplex IRQs most likely have a valid acknowledge + * handler and we may not be called with IPIPE_IRQF_NOACK + * for them. The ack handler for the multiplex IRQ actually + * decodes the demuxed interrupts. + */ + +#ifdef CONFIG_IPIPE_DEBUG + if (unlikely(irq >= IPIPE_NR_IRQS) || + (irq < NR_IRQS && irq_to_desc(irq) == NULL)) { + printk(KERN_ERR "I-pipe: spurious interrupt %u\n", irq); + return; + } +#endif + /* + * CAUTION: on some archs, virtual IRQs may have acknowledge + * handlers. Multiplex IRQs should have one too. + */ + if (unlikely(irq >= NR_IRQS)) { + desc = NULL; + chained_irq = 0; + } else { + desc = irq_to_desc(irq); + chained_irq = desc ? ipipe_chained_irq_p(desc) : 0; + } + if (flags & IPIPE_IRQF_NOACK) + IPIPE_WARN_ONCE(chained_irq); + else { + ipd = ipipe_head_domain; + control = ipd->irqs[irq].control; + if ((control & IPIPE_HANDLE_MASK) == 0) + ipd = ipipe_root_domain; + if (ipd->irqs[irq].ackfn) + ipd->irqs[irq].ackfn(irq, desc); + if (chained_irq) { + if ((flags & IPIPE_IRQF_NOSYNC) == 0) + /* Run demuxed IRQ handlers. */ + goto sync; + return; + } + } + + /* + * Sticky interrupts must be handled early and separately, so + * that we always process them on the current domain. + */ + ipd = __ipipe_current_domain; + control = ipd->irqs[irq].control; + if (control & IPIPE_STICKY_MASK) + goto log; + + /* + * In case we have no registered head domain + * (i.e. ipipe_head_domain == &ipipe_root), we always go + * through the interrupt log, and leave the dispatching work + * ultimately to __ipipe_sync_pipeline(). 
+ */ + ipd = ipipe_head_domain; + control = ipd->irqs[irq].control; + if (ipd == ipipe_root_domain) + /* + * The root domain must handle all interrupts, so + * testing the HANDLE bit would be pointless. + */ + goto log; + + if (control & IPIPE_HANDLE_MASK) { + if (unlikely(flags & IPIPE_IRQF_NOSYNC)) + __ipipe_set_irq_pending(ipd, irq); + else + rtai_irq_handler(irq); // dispatch_irq_head(irq); + return; + } + + ipd = ipipe_root_domain; +log: + __ipipe_set_irq_pending(ipd, irq); + + if (flags & IPIPE_IRQF_NOSYNC) + return; + + /* + * Optimize if we preempted a registered high priority head + * domain: we don't need to synchronize the pipeline unless + * there is a pending interrupt for it. + */ + if (!__ipipe_root_p && + !__ipipe_ipending_p(ipipe_this_cpu_head_context())) + return; +sync: + __ipipe_sync_pipeline(ipipe_head_domain); +} + +#ifdef CONFIG_PREEMPT + +asmlinkage void preempt_schedule_irq(void); + +asmlinkage void __sched __ipipe_preempt_schedule_irq(void) +{ + struct ipipe_percpu_domain_data *p; + unsigned long flags; + + BUG_ON(!hard_irqs_disabled()); + local_irq_save(flags); + hard_local_irq_enable(); + preempt_schedule_irq(); /* Ok, may reschedule now. */ + hard_local_irq_disable(); + + /* + * Flush any pending interrupt that may have been logged after + * preempt_schedule_irq() stalled the root stage before + * returning to us, and now. + */ + p = ipipe_this_cpu_root_context(); + if (unlikely(__ipipe_ipending_p(p))) { + add_preempt_count(PREEMPT_ACTIVE); + trace_hardirqs_on(); + __clear_bit(IPIPE_STALL_FLAG, &p->status); + __ipipe_sync_stage(); + sub_preempt_count(PREEMPT_ACTIVE); + } + + __ipipe_restore_root_nosync(flags); +} + +#else /* !CONFIG_PREEMPT */ + +#define __ipipe_preempt_schedule_irq() do { } while (0) + +#endif /* !CONFIG_PREEMPT */ + +#ifdef CONFIG_TRACE_IRQFLAGS +#define root_stall_after_handler() local_irq_disable() +#else +#define root_stall_after_handler() do { } while (0) +#endif + +/* + * __ipipe_do_sync_stage() -- Flush the pending IRQs for the current + * domain (and processor). This routine flushes the interrupt log (see + * "Optimistic interrupt protection" from D. Stodolsky et al. for more + * on the deferred interrupt scheme). Every interrupt that occurred + * while the pipeline was stalled gets played. + * + * WARNING: CPU migration may occur over this routine. + */ +void __ipipe_do_sync_stage(void) +{ + struct ipipe_percpu_domain_data *p; + struct ipipe_domain *ipd; + int irq; + + p = __ipipe_current_context; + ipd = p->domain; + + __set_bit(IPIPE_STALL_FLAG, &p->status); + smp_wmb(); + + if (ipd == ipipe_root_domain) + trace_hardirqs_off(); + + for (;;) { + irq = __ipipe_next_irq(p); + if (irq < 0) + break; + /* + * Make sure the compiler does not reorder wrongly, so + * that all updates to maps are done before the + * handler gets called. 
+ */ + barrier(); + + if (test_bit(IPIPE_LOCK_FLAG, &ipd->irqs[irq].control)) + continue; + + if (ipd != ipipe_head_domain) + hard_local_irq_enable(); + + if (likely(ipd != ipipe_root_domain)) { + ipd->irqs[irq].handler(irq, ipd->irqs[irq].cookie); + __ipipe_run_irqtail(irq); + hard_local_irq_disable(); + } else if (ipipe_virtual_irq_p(irq)) { + irq_enter(); + ipd->irqs[irq].handler(irq, ipd->irqs[irq].cookie); + irq_exit(); + root_stall_after_handler(); + hard_local_irq_disable(); + while (__ipipe_check_root_resched()) + __ipipe_preempt_schedule_irq(); + } else { + ipd->irqs[irq].handler(irq, ipd->irqs[irq].cookie); + root_stall_after_handler(); + hard_local_irq_disable(); + } + + p = __ipipe_current_context; + } + + if (ipd == ipipe_root_domain) + trace_hardirqs_on(); + + __clear_bit(IPIPE_STALL_FLAG, &p->status); +} + +EXPORT_SYMBOL(__ipipe_do_sync_stage); + +#ifdef CONFIG_SMP + +/* Always called with hw interrupts off. */ +void __ipipe_do_critical_sync(unsigned int irq, void *cookie) +{ + int cpu = ipipe_processor_id(); + + cpu_set(cpu, __ipipe_cpu_sync_map); + + /* + * Now we are in sync with the lock requestor running on + * another CPU. Enter a spinning wait until he releases the + * global lock. + */ + spin_lock(&__ipipe_cpu_barrier); + + /* Got it. Now get out. */ + + /* Call the sync routine if any. */ + if (__ipipe_cpu_sync) + __ipipe_cpu_sync(); + + cpu_set(cpu, __ipipe_cpu_pass_map); + + spin_unlock(&__ipipe_cpu_barrier); + + cpu_clear(cpu, __ipipe_cpu_sync_map); +} + +#endif /* CONFIG_SMP */ + +unsigned long ipipe_critical_enter(void (*syncfn)(void)) +{ + cpumask_t allbutself __maybe_unused, online __maybe_unused; + int cpu __maybe_unused, n __maybe_unused; + unsigned long flags, loops __maybe_unused; + + flags = hard_local_irq_save(); + + if (num_online_cpus() == 1) + return flags; + +#ifdef CONFIG_SMP + + cpu = ipipe_processor_id(); + if (!cpu_test_and_set(cpu, __ipipe_cpu_lock_map)) { + while (test_and_set_bit(0, &__ipipe_critical_lock)) { + n = 0; + hard_local_irq_enable(); + + do + cpu_relax(); + while (++n < cpu); + + hard_local_irq_disable(); + } +restart: + online = *cpu_online_mask; + spin_lock(&__ipipe_cpu_barrier); + + __ipipe_cpu_sync = syncfn; + + cpus_clear(__ipipe_cpu_pass_map); + cpu_set(cpu, __ipipe_cpu_pass_map); + + /* + * Send the sync IPI to all processors but the current + * one. + */ + cpus_andnot(allbutself, online, __ipipe_cpu_pass_map); + ipipe_send_ipi(IPIPE_CRITICAL_IPI, allbutself); + loops = IPIPE_CRITICAL_TIMEOUT; + + while (!cpus_equal(__ipipe_cpu_sync_map, allbutself)) { + if (--loops > 0) { + cpu_relax(); + continue; + } + /* + * We ran into a deadlock due to a contended + * rwlock. Cancel this round and retry. + */ + __ipipe_cpu_sync = NULL; + + spin_unlock(&__ipipe_cpu_barrier); + /* + * Ensure all CPUs consumed the IPI to avoid + * running __ipipe_cpu_sync prematurely. This + * usually resolves the deadlock reason too. 
+ */ + while (!cpus_equal(online, __ipipe_cpu_pass_map)) + cpu_relax(); + + goto restart; + } + } + + atomic_inc(&__ipipe_critical_count); + +#endif /* CONFIG_SMP */ + + return flags; +} +EXPORT_SYMBOL_GPL(ipipe_critical_enter); + +void ipipe_critical_exit(unsigned long flags) +{ + if (num_online_cpus() == 1) { + hard_local_irq_restore(flags); + return; + } + +#ifdef CONFIG_SMP + if (atomic_dec_and_test(&__ipipe_critical_count)) { + spin_unlock(&__ipipe_cpu_barrier); + while (!cpus_empty(__ipipe_cpu_sync_map)) + cpu_relax(); + cpu_clear(ipipe_processor_id(), __ipipe_cpu_lock_map); + clear_bit(0, &__ipipe_critical_lock); + smp_mb__after_clear_bit(); + } +#endif /* CONFIG_SMP */ + + hard_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(ipipe_critical_exit); + +#ifdef CONFIG_IPIPE_DEBUG_CONTEXT + +void ipipe_root_only(void) +{ + struct ipipe_domain *this_domain; + unsigned long flags; + + flags = hard_smp_local_irq_save(); + + this_domain = __ipipe_current_domain; + if (likely(this_domain == ipipe_root_domain && + !test_bit(IPIPE_STALL_FLAG, &__ipipe_head_status))) { + hard_smp_local_irq_restore(flags); + return; + } + + if (!__this_cpu_read(ipipe_percpu.context_check)) { + hard_smp_local_irq_restore(flags); + return; + } + + hard_smp_local_irq_restore(flags); + + ipipe_prepare_panic(); + ipipe_trace_panic_freeze(); + + if (this_domain != ipipe_root_domain) + printk(KERN_ERR + "I-pipe: Detected illicit call from head domain '%s'\n" + KERN_ERR " into a regular Linux service\n", + this_domain->name); + else + printk(KERN_ERR "I-pipe: Detected stalled head domain, " + "probably caused by a bug.\n" + " A critical section may have been " + "left unterminated.\n"); + dump_stack(); + ipipe_trace_panic_dump(); +} +EXPORT_SYMBOL_GPL(ipipe_root_only); + +#endif /* CONFIG_IPIPE_DEBUG_CONTEXT */ + +#if defined(CONFIG_IPIPE_DEBUG_INTERNAL) && defined(CONFIG_SMP) + +int notrace __ipipe_check_percpu_access(void) +{ + struct ipipe_percpu_domain_data *p; + struct ipipe_domain *this_domain; + unsigned long flags; + int ret = 0; + + flags = hard_local_irq_save_notrace(); + + /* + * Don't use __ipipe_current_domain here, this would recurse + * indefinitely. + */ + this_domain = __this_cpu_read(ipipe_percpu.curr)->domain; + + /* + * Only the root domain may implement preemptive CPU migration + * of tasks, so anything above in the pipeline should be fine. + */ + if (this_domain != ipipe_root_domain) + goto out; + + if (raw_irqs_disabled_flags(flags)) + goto out; + + /* + * Last chance: hw interrupts were enabled on entry while + * running over the root domain, but the root stage might be + * currently stalled, in which case preemption would be + * disabled, and no migration could occur. + */ + if (this_domain == ipipe_root_domain) { + p = ipipe_this_cpu_root_context(); + if (test_bit(IPIPE_STALL_FLAG, &p->status)) + goto out; + } + /* + * Our caller may end up accessing the wrong per-cpu variable + * instance due to CPU migration; tell it to complain about + * this. + */ + ret = 1; +out: + hard_local_irq_restore_notrace(flags); + + return ret; +} +EXPORT_SYMBOL_GPL(__ipipe_check_percpu_access); + +void __ipipe_spin_unlock_debug(unsigned long flags) +{ + /* + * We catch a nasty issue where spin_unlock_irqrestore() on a + * regular kernel spinlock is about to re-enable hw interrupts + * in a section entered with hw irqs off. This is clearly the + * sign of a massive breakage coming. Usual suspect is a + * regular spinlock which was overlooked, used within a + * section which must run with hw irqs disabled. 
+ */ + IPIPE_WARN_ONCE(!raw_irqs_disabled_flags(flags) && hard_irqs_disabled()); +} +EXPORT_SYMBOL_GPL(__ipipe_spin_unlock_debug); + +#endif /* CONFIG_IPIPE_DEBUG_INTERNAL && CONFIG_SMP */ + +void ipipe_prepare_panic(void) +{ +#ifdef CONFIG_PRINTK + __ipipe_printk_bypass = 1; +#endif + ipipe_context_check_off(); +} +EXPORT_SYMBOL_GPL(ipipe_prepare_panic); + +static void __ipipe_do_work(unsigned int virq, void *cookie) +{ + struct ipipe_work_header *work; + unsigned long flags; + void *curr, *tail; + int cpu; + + /* + * Work is dispatched in enqueuing order. This interrupt + * context can't migrate to another CPU. + */ + cpu = smp_processor_id(); + curr = per_cpu(work_buf, cpu); + + for (;;) { + flags = hard_local_irq_save(); + tail = per_cpu(work_tail, cpu); + if (curr == tail) { + per_cpu(work_tail, cpu) = per_cpu(work_buf, cpu); + hard_local_irq_restore(flags); + return; + } + work = curr; + curr += work->size; + hard_local_irq_restore(flags); + work->handler(work); + } +} + +void __ipipe_post_work_root(struct ipipe_work_header *work) +{ + unsigned long flags; + void *tail; + int cpu; + + /* + * Subtle: we want to use the head stall/unstall operators, + * not the hard_* routines to protect against races. This way, + * we ensure that a root-based caller will trigger the virq + * handling immediately when unstalling the head stage, as a + * result of calling __ipipe_sync_pipeline() under the hood. + */ + flags = ipipe_test_and_stall_head(); + cpu = ipipe_processor_id(); + tail = per_cpu(work_tail, cpu); + + if (WARN_ON_ONCE((unsigned char *)tail + work->size >= + per_cpu(work_buf, cpu) + WORKBUF_SIZE)) + goto out; + + /* Work handling is deferred, so data has to be copied. */ + memcpy(tail, work, work->size); + per_cpu(work_tail, cpu) = tail + work->size; + ipipe_post_irq_root(__ipipe_work_virq); +out: + ipipe_restore_head(flags); +} +EXPORT_SYMBOL_GPL(__ipipe_post_work_root); diff --git a/kernel/ipipe/timer.c b/kernel/ipipe/timer.c new file mode 100644 index 0000000..4281cb1 --- /dev/null +++ b/kernel/ipipe/timer.c @@ -0,0 +1,491 @@ +/* -*- linux-c -*- + * linux/kernel/ipipe/timer.c + * + * Copyright (C) 2012 Gilles Chanteperdrix + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * I-pipe timer request interface. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned long __ipipe_hrtimer_freq; + +static LIST_HEAD(timers); +static IPIPE_DEFINE_SPINLOCK(lock); + +static DEFINE_PER_CPU(struct ipipe_timer *, percpu_timer); + +#ifdef CONFIG_GENERIC_CLOCKEVENTS +/* + * Default request method: switch to oneshot mode if supported. 
+ */ +static void ipipe_timer_default_request(struct ipipe_timer *timer, int steal) +{ + struct clock_event_device *evtdev = timer->host_timer; + + if (!(evtdev->features & CLOCK_EVT_FEAT_ONESHOT)) + return; + + if (evtdev->mode != CLOCK_EVT_MODE_ONESHOT) { + evtdev->set_mode(CLOCK_EVT_MODE_ONESHOT, evtdev); + evtdev->set_next_event(timer->freq / HZ, evtdev); + } +} + +/* + * Default release method: return the timer to the mode it had when + * starting. + */ +static void ipipe_timer_default_release(struct ipipe_timer *timer) +{ + struct clock_event_device *evtdev = timer->host_timer; + + evtdev->set_mode(evtdev->mode, evtdev); + if (evtdev->mode == CLOCK_EVT_MODE_ONESHOT) + evtdev->set_next_event(timer->freq / HZ, evtdev); +} + +void ipipe_host_timer_register(struct clock_event_device *evtdev) +{ + struct ipipe_timer *timer = evtdev->ipipe_timer; + + if (timer == NULL) + return; + + if (timer->request == NULL) + timer->request = ipipe_timer_default_request; + + /* + * By default, use the same method as linux timer, on ARM at + * least, most set_next_event methods are safe to be called + * from Xenomai domain anyway. + */ + if (timer->set == NULL) { + timer->timer_set = evtdev; + timer->set = (typeof(timer->set))evtdev->set_next_event; + } + + if (timer->release == NULL) + timer->release = ipipe_timer_default_release; + + if (timer->name == NULL) + timer->name = evtdev->name; + + if (timer->rating == 0) + timer->rating = evtdev->rating; + + timer->freq = (1000000000ULL * evtdev->mult) >> evtdev->shift; + + if (timer->min_delay_ticks == 0) + timer->min_delay_ticks = + (evtdev->min_delta_ns * evtdev->mult) >> evtdev->shift; + + if (timer->cpumask == NULL) + timer->cpumask = evtdev->cpumask; + + timer->host_timer = evtdev; + + ipipe_timer_register(timer); +} +#endif /* CONFIG_GENERIC_CLOCKEVENTS */ + +/* + * register a timer: maintain them in a list sorted by rating + */ +void ipipe_timer_register(struct ipipe_timer *timer) +{ + struct ipipe_timer *t; + unsigned long flags; + + if (timer->timer_set == NULL) + timer->timer_set = timer; + + if (timer->cpumask == NULL) + timer->cpumask = cpumask_of(smp_processor_id()); + + spin_lock_irqsave(&lock, flags); + + list_for_each_entry(t, &timers, link) { + if (t->rating <= timer->rating) { + __list_add(&timer->link, t->link.prev, &t->link); + goto done; + } + } + list_add_tail(&timer->link, &timers); + done: + spin_unlock_irqrestore(&lock, flags); +} + +static void ipipe_timer_request_sync(void) +{ + struct ipipe_timer *timer = __ipipe_this_cpu_read(percpu_timer); + struct clock_event_device *evtdev; + int steal; + + evtdev = timer->host_timer; + +#ifdef CONFIG_GENERIC_CLOCKEVENTS + steal = evtdev != NULL && evtdev->mode != CLOCK_EVT_MODE_UNUSED; +#else /* !CONFIG_GENERIC_CLOCKEVENTS */ + steal = 1; +#endif /* !CONFIG_GENERIC_CLOCKEVENTS */ + + timer->request(timer, steal); +} + +/* Set up a timer as per-cpu timer for ipipe */ +static void install_pcpu_timer(unsigned cpu, unsigned hrclock_freq, + struct ipipe_timer *t) { + unsigned hrtimer_freq; + unsigned long long tmp; + + if (__ipipe_hrtimer_freq == 0) + __ipipe_hrtimer_freq = t->freq; + + per_cpu(ipipe_percpu.hrtimer_irq, cpu) = t->irq; + per_cpu(percpu_timer, cpu) = t; + + hrtimer_freq = t->freq; + if (__ipipe_hrclock_freq > UINT_MAX) + hrtimer_freq /= 1000; + + t->c2t_integ = hrtimer_freq / hrclock_freq; + tmp = (((unsigned long long) + (hrtimer_freq % hrclock_freq)) << 32) + + hrclock_freq - 1; + do_div(tmp, hrclock_freq); + t->c2t_frac = tmp; +} + +static void select_root_only_timer(unsigned 
cpu, unsigned hrclock_khz, + const struct cpumask *mask, + struct ipipe_timer *t) { + unsigned icpu; + struct clock_event_device *evtdev; + + /* + * If no ipipe-supported CPU shares an interrupt with the + * timer, we do not need to care about it. + */ + for_each_cpu(icpu, mask) { + if (t->irq == per_cpu(ipipe_percpu.hrtimer_irq, icpu)) { +#ifdef CONFIG_GENERIC_CLOCKEVENTS + evtdev = t->host_timer; + if (evtdev && evtdev->mode == CLOCK_EVT_MODE_SHUTDOWN) + continue; +#endif /* CONFIG_GENERIC_CLOCKEVENTS */ + goto found; + } + } + + return; + +found: + install_pcpu_timer(cpu, hrclock_khz, t); +} + +/* + * Choose per-cpu timers with the highest rating by traversing the + * rating-sorted list for each CPU. + */ +int ipipe_select_timers(const struct cpumask *mask) +{ + unsigned hrclock_freq; + unsigned long long tmp; + struct ipipe_timer *t; + struct clock_event_device *evtdev; + unsigned long flags; + unsigned cpu; + cpumask_t fixup; + + if (__ipipe_hrclock_freq > UINT_MAX) { + tmp = __ipipe_hrclock_freq; + do_div(tmp, 1000); + hrclock_freq = tmp; + } else + hrclock_freq = __ipipe_hrclock_freq; + + spin_lock_irqsave(&lock, flags); + + /* First, choose timers for the CPUs handled by ipipe */ + for_each_cpu(cpu, mask) { + list_for_each_entry(t, &timers, link) { + if (!cpumask_test_cpu(cpu, t->cpumask)) + continue; + +#ifdef CONFIG_GENERIC_CLOCKEVENTS + evtdev = t->host_timer; + if (evtdev && evtdev->mode == CLOCK_EVT_MODE_SHUTDOWN) + continue; +#endif /* CONFIG_GENERIC_CLOCKEVENTS */ + goto found; + } + + printk("I-pipe: could not find timer for cpu #%d\n", + cpu); + goto err_remove_all; +found: + install_pcpu_timer(cpu, hrclock_freq, t); + } + + /* + * Second, check if we need to fix up any CPUs not supported + * by ipipe (but by Linux) whose interrupt may need to be + * forwarded because they have the same IRQ as an ipipe-enabled + * timer. 
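+ * For instance, with cpu_online_mask = {0,1,2,3} and an I-pipe mask of
+ * {0,1}, the fixup set computed below is {2,3}; select_root_only_timer()
+ * then installs a per-cpu entry on CPU 2 or 3 only if its candidate
+ * timer shares its IRQ with a timer already grabbed on CPU 0 or 1.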
+ */ + cpumask_andnot(&fixup, cpu_online_mask, mask); + + for_each_cpu(cpu, &fixup) { + list_for_each_entry(t, &timers, link) { + if (!cpumask_test_cpu(cpu, t->cpumask)) + continue; + + select_root_only_timer(cpu, hrclock_freq, mask, t); + } + } + + spin_unlock_irqrestore(&lock, flags); + + flags = ipipe_critical_enter(ipipe_timer_request_sync); + ipipe_timer_request_sync(); + ipipe_critical_exit(flags); + + return 0; + +err_remove_all: + spin_unlock_irqrestore(&lock, flags); + + for_each_cpu(cpu, mask) { + per_cpu(ipipe_percpu.hrtimer_irq, cpu) = -1; + per_cpu(percpu_timer, cpu) = NULL; + } + __ipipe_hrtimer_freq = 0; + + return -ENODEV; +} + +EXPORT_SYMBOL(ipipe_select_timers); + +static void ipipe_timer_release_sync(void) +{ + struct ipipe_timer *timer = __ipipe_this_cpu_read(percpu_timer); + + timer->release(timer); +} + +void ipipe_timers_release(void) +{ + unsigned long flags; + unsigned cpu; + + flags = ipipe_critical_enter(ipipe_timer_release_sync); + ipipe_timer_release_sync(); + ipipe_critical_exit(flags); + + for_each_online_cpu(cpu) { + per_cpu(ipipe_percpu.hrtimer_irq, cpu) = -1; + per_cpu(percpu_timer, cpu) = NULL; + __ipipe_hrtimer_freq = 0; + } +} + +EXPORT_SYMBOL(ipipe_timers_release); + +static void __ipipe_ack_hrtimer_irq(unsigned int irq, struct irq_desc *desc) +{ + struct ipipe_timer *timer = __ipipe_this_cpu_read(percpu_timer); + + if (desc) + desc->ipipe_ack(irq, desc); + if (timer->ack) + timer->ack(); + if (desc) + desc->ipipe_end(irq, desc); +} + +int ipipe_timer_start(void (*tick_handler)(void), + void (*emumode)(enum clock_event_mode mode, + struct clock_event_device *cdev), + int (*emutick)(unsigned long evt, + struct clock_event_device *cdev), + unsigned cpu) +{ + struct clock_event_device *evtdev; + struct ipipe_timer *timer; + struct irq_desc *desc; + unsigned long flags; + int steal, ret; + + timer = per_cpu(percpu_timer, cpu); + evtdev = timer->host_timer; + + flags = ipipe_critical_enter(NULL); + + if (cpu == 0 || timer->irq != per_cpu(ipipe_percpu.hrtimer_irq, 0)) { + ret = ipipe_request_irq(ipipe_head_domain, timer->irq, + (ipipe_irq_handler_t)tick_handler, + NULL, __ipipe_ack_hrtimer_irq); + if (ret < 0) + goto done; + } + +#ifdef CONFIG_GENERIC_CLOCKEVENTS + steal = evtdev != NULL && evtdev->mode != CLOCK_EVT_MODE_UNUSED; + if (steal && evtdev->ipipe_stolen == 0) { + timer->real_mult = evtdev->mult; + timer->real_shift = evtdev->shift; + timer->real_set_mode = evtdev->set_mode; + timer->real_set_next_event = evtdev->set_next_event; + evtdev->mult = 1; + evtdev->shift = 0; + evtdev->set_mode = emumode; + evtdev->set_next_event = emutick; + evtdev->ipipe_stolen = 1; + } + + ret = evtdev ? 
evtdev->mode : CLOCK_EVT_MODE_UNUSED; +#else /* CONFIG_GENERIC_CLOCKEVENTS */ + steal = 1; + ret = 0; +#endif /* CONFIG_GENERIC_CLOCKEVENTS */ + + done: + ipipe_critical_exit(flags); + + desc = irq_to_desc(timer->irq); + if (desc && irqd_irq_disabled(&desc->irq_data)) + ipipe_enable_irq(timer->irq); + + return ret; +} + +EXPORT_SYMBOL(ipipe_timer_start); + +void ipipe_timer_stop(unsigned cpu) +{ + unsigned long __maybe_unused flags; + struct clock_event_device *evtdev; + struct ipipe_timer *timer; + struct irq_desc *desc; + + timer = per_cpu(percpu_timer, cpu); + evtdev = timer->host_timer; + + desc = irq_to_desc(timer->irq); + if (desc && irqd_irq_disabled(&desc->irq_data)) + ipipe_disable_irq(timer->irq); + +#ifdef CONFIG_GENERIC_CLOCKEVENTS + if (evtdev) { + flags = ipipe_critical_enter(NULL); + + if (evtdev->ipipe_stolen) { + evtdev->mult = timer->real_mult; + evtdev->shift = timer->real_shift; + evtdev->set_mode = timer->real_set_mode; + evtdev->set_next_event = timer->real_set_next_event; + timer->real_mult = timer->real_shift = 0; + timer->real_set_mode = NULL; + timer->real_set_next_event = NULL; + evtdev->ipipe_stolen = 0; + } + + ipipe_critical_exit(flags); + } +#endif /* CONFIG_GENERIC_CLOCKEVENTS */ + + ipipe_free_irq(ipipe_head_domain, timer->irq); +} + +EXPORT_SYMBOL(ipipe_timer_stop); + +void ipipe_timer_set(unsigned long cdelay) +{ + unsigned long tdelay; + struct ipipe_timer *t; + + t = __ipipe_this_cpu_read(percpu_timer); + + /* + * Even though some architectures may use a 64-bit delay + * here, we voluntarily limit it to 32 bits; 4 billion ticks + * should be enough for now. Should a timer need more, an + * extra call to the tick handler will simply occur after 4 + * billion ticks. + */ + if (cdelay > UINT_MAX) + cdelay = UINT_MAX; + + tdelay = cdelay; + if (t->c2t_integ != 1) + tdelay *= t->c2t_integ; + if (t->c2t_frac) + tdelay += ((unsigned long long)cdelay * t->c2t_frac) >> 32; + + if (tdelay < t->min_delay_ticks + || t->set(tdelay, t->timer_set) < 0) + ipipe_raise_irq(t->irq); +} +EXPORT_SYMBOL_GPL(ipipe_timer_set); + +const char *ipipe_timer_name(void) +{ + return per_cpu(percpu_timer, 0)->name; +} +EXPORT_SYMBOL_GPL(ipipe_timer_name); + +unsigned ipipe_timer_ns2ticks(struct ipipe_timer *timer, unsigned ns) +{ + unsigned long long tmp; + BUG_ON(!timer->freq); + tmp = (unsigned long long)ns * timer->freq; + do_div(tmp, 1000000000); + return tmp; +} + +#ifdef CONFIG_IPIPE_HAVE_HOSTRT +/* + * NOTE: The architecture specific code must only call this function + * when a clocksource suitable for CLOCK_HOST_REALTIME is enabled. + * The event receiver is responsible for providing proper locking. + */ +void ipipe_update_hostrt(struct timespec *wall_time, struct timespec *wtm, + struct clocksource *clock, u32 mult) +{ + struct ipipe_hostrt_data data; + + ipipe_root_only(); + data.live = 1; + data.cycle_last = clock->cycle_last; + data.mask = clock->mask; + data.mult = mult; + data.shift = clock->shift; + data.wall_time_sec = wall_time->tv_sec; + data.wall_time_nsec = wall_time->tv_nsec; + data.wall_to_monotonic = *wtm; + __ipipe_notify_kevent(IPIPE_KEVT_HOSTRT, &data); +} + +#endif /* CONFIG_IPIPE_HAVE_HOSTRT */ diff --git a/kernel/ipipe/tracer.c b/kernel/ipipe/tracer.c new file mode 100644 index 0000000..1d9997a --- /dev/null +++ b/kernel/ipipe/tracer.c @@ -0,0 +1,1442 @@ +/* -*- linux-c -*- + * kernel/ipipe/tracer.c + * + * Copyright (C) 2005 Luotao Fu. + * 2005-2008 Jan Kiszka.
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define IPIPE_TRACE_PATHS 4 /* Do not lower below 3 */ +#define IPIPE_DEFAULT_ACTIVE 0 +#define IPIPE_DEFAULT_MAX 1 +#define IPIPE_DEFAULT_FROZEN 2 + +#define IPIPE_TRACE_POINTS (1 << CONFIG_IPIPE_TRACE_SHIFT) +#define WRAP_POINT_NO(point) ((point) & (IPIPE_TRACE_POINTS-1)) + +#define IPIPE_DEFAULT_PRE_TRACE 10 +#define IPIPE_DEFAULT_POST_TRACE 10 +#define IPIPE_DEFAULT_BACK_TRACE 100 + +#define IPIPE_DELAY_NOTE 1000 /* in nanoseconds */ +#define IPIPE_DELAY_WARN 10000 /* in nanoseconds */ + +#define IPIPE_TFLG_NMI_LOCK 0x0001 +#define IPIPE_TFLG_NMI_HIT 0x0002 +#define IPIPE_TFLG_NMI_FREEZE_REQ 0x0004 + +#define IPIPE_TFLG_HWIRQ_OFF 0x0100 +#define IPIPE_TFLG_FREEZING 0x0200 +#define IPIPE_TFLG_CURRDOM_SHIFT 10 /* bits 10..11: current domain */ +#define IPIPE_TFLG_CURRDOM_MASK 0x0C00 +#define IPIPE_TFLG_DOMSTATE_SHIFT 12 /* bits 12..15: domain stalled? */ +#define IPIPE_TFLG_DOMSTATE_BITS 3 + +#define IPIPE_TFLG_DOMAIN_STALLED(point, n) \ + (point->flags & (1 << (n + IPIPE_TFLG_DOMSTATE_SHIFT))) +#define IPIPE_TFLG_CURRENT_DOMAIN(point) \ + ((point->flags & IPIPE_TFLG_CURRDOM_MASK) >> IPIPE_TFLG_CURRDOM_SHIFT) + +struct ipipe_trace_point { + short type; + short flags; + unsigned long eip; + unsigned long parent_eip; + unsigned long v; + unsigned long long timestamp; +}; + +struct ipipe_trace_path { + volatile int flags; + int dump_lock; /* separated from flags due to cross-cpu access */ + int trace_pos; /* next point to fill */ + int begin, end; /* finalised path begin and end */ + int post_trace; /* non-zero when in post-trace phase */ + unsigned long long length; /* max path length in cycles */ + unsigned long nmi_saved_eip; /* for deferred requests from NMIs */ + unsigned long nmi_saved_parent_eip; + unsigned long nmi_saved_v; + struct ipipe_trace_point point[IPIPE_TRACE_POINTS]; +} ____cacheline_aligned_in_smp; + +enum ipipe_trace_type +{ + IPIPE_TRACE_FUNC = 0, + IPIPE_TRACE_BEGIN, + IPIPE_TRACE_END, + IPIPE_TRACE_FREEZE, + IPIPE_TRACE_SPECIAL, + IPIPE_TRACE_PID, + IPIPE_TRACE_EVENT, +}; + +#define IPIPE_TYPE_MASK 0x0007 +#define IPIPE_TYPE_BITS 3 + +#ifdef CONFIG_IPIPE_TRACE_VMALLOC +static DEFINE_PER_CPU(struct ipipe_trace_path *, trace_path); +#else /* !CONFIG_IPIPE_TRACE_VMALLOC */ +static DEFINE_PER_CPU(struct ipipe_trace_path, trace_path[IPIPE_TRACE_PATHS]) = + { [0 ... 
IPIPE_TRACE_PATHS-1] = { .begin = -1, .end = -1 } }; +#endif /* CONFIG_IPIPE_TRACE_VMALLOC */ + +int ipipe_trace_enable = 0; + +static DEFINE_PER_CPU(int, active_path) = { IPIPE_DEFAULT_ACTIVE }; +static DEFINE_PER_CPU(int, max_path) = { IPIPE_DEFAULT_MAX }; +static DEFINE_PER_CPU(int, frozen_path) = { IPIPE_DEFAULT_FROZEN }; +static IPIPE_DEFINE_SPINLOCK(global_path_lock); +static int pre_trace = IPIPE_DEFAULT_PRE_TRACE; +static int post_trace = IPIPE_DEFAULT_POST_TRACE; +static int back_trace = IPIPE_DEFAULT_BACK_TRACE; +static int verbose_trace = 1; +static unsigned long trace_overhead; + +static unsigned long trigger_begin; +static unsigned long trigger_end; + +static DEFINE_MUTEX(out_mutex); +static struct ipipe_trace_path *print_path; +#ifdef CONFIG_IPIPE_TRACE_PANIC +static struct ipipe_trace_path *panic_path; +#endif /* CONFIG_IPIPE_TRACE_PANIC */ +static int print_pre_trace; +static int print_post_trace; + + +static long __ipipe_signed_tsc2us(long long tsc); +static void +__ipipe_trace_point_type(char *buf, struct ipipe_trace_point *point); +static void __ipipe_print_symname(struct seq_file *m, unsigned long eip); + +static inline void store_states(struct ipipe_domain *ipd, + struct ipipe_trace_point *point, int pos) +{ + if (test_bit(IPIPE_STALL_FLAG, &ipipe_this_cpu_context(ipd)->status)) + point->flags |= 1 << (pos + IPIPE_TFLG_DOMSTATE_SHIFT); + + if (ipd == __ipipe_current_domain) + point->flags |= pos << IPIPE_TFLG_CURRDOM_SHIFT; +} + +static notrace void +__ipipe_store_domain_states(struct ipipe_trace_point *point) +{ + store_states(ipipe_root_domain, point, 0); + if (ipipe_head_domain != ipipe_root_domain) + store_states(ipipe_head_domain, point, 1); +} + +static notrace int __ipipe_get_free_trace_path(int old, int cpu) +{ + int new_active = old; + struct ipipe_trace_path *tp; + + do { + if (++new_active == IPIPE_TRACE_PATHS) + new_active = 0; + tp = &per_cpu(trace_path, cpu)[new_active]; + } while (new_active == per_cpu(max_path, cpu) || + new_active == per_cpu(frozen_path, cpu) || + tp->dump_lock); + + return new_active; +} + +static notrace void +__ipipe_migrate_pre_trace(struct ipipe_trace_path *new_tp, + struct ipipe_trace_path *old_tp, int old_pos) +{ + int i; + + new_tp->trace_pos = pre_trace+1; + + for (i = new_tp->trace_pos; i > 0; i--) + memcpy(&new_tp->point[WRAP_POINT_NO(new_tp->trace_pos-i)], + &old_tp->point[WRAP_POINT_NO(old_pos-i)], + sizeof(struct ipipe_trace_point)); + + /* mark the end (i.e. the point before point[0]) invalid */ + new_tp->point[IPIPE_TRACE_POINTS-1].eip = 0; +} + +static notrace struct ipipe_trace_path * +__ipipe_trace_end(int cpu, struct ipipe_trace_path *tp, int pos) +{ + struct ipipe_trace_path *old_tp = tp; + long active = per_cpu(active_path, cpu); + unsigned long long length; + + /* do we have a new worst case? 
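+ * i.e. does the path that just ended span more cycles than the longest
+ * path recorded so far on this CPU? If so it becomes the new max path
+ * and tracing resumes on a free buffer, seeded with the pre-trace
+ * points migrated from the old one.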
*/ + length = tp->point[tp->end].timestamp - + tp->point[tp->begin].timestamp; + if (length > per_cpu(trace_path, cpu)[per_cpu(max_path, cpu)].length) { + /* we need protection here against other cpus trying + to start a proc dump */ + spin_lock(&global_path_lock); + + /* active path holds new worst case */ + tp->length = length; + per_cpu(max_path, cpu) = active; + + /* find next unused trace path */ + active = __ipipe_get_free_trace_path(active, cpu); + + spin_unlock(&global_path_lock); + + tp = &per_cpu(trace_path, cpu)[active]; + + /* migrate last entries for pre-tracing */ + __ipipe_migrate_pre_trace(tp, old_tp, pos); + } + + return tp; +} + +static notrace struct ipipe_trace_path * +__ipipe_trace_freeze(int cpu, struct ipipe_trace_path *tp, int pos) +{ + struct ipipe_trace_path *old_tp = tp; + long active = per_cpu(active_path, cpu); + int n; + + /* frozen paths have no core (begin=end) */ + tp->begin = tp->end; + + /* we need protection here against other cpus trying + * to set their frozen path or to start a proc dump */ + spin_lock(&global_path_lock); + + per_cpu(frozen_path, cpu) = active; + + /* find next unused trace path */ + active = __ipipe_get_free_trace_path(active, cpu); + + /* check if this is the first frozen path */ + for_each_possible_cpu(n) { + if (n != cpu && + per_cpu(trace_path, n)[per_cpu(frozen_path, n)].end >= 0) + tp->end = -1; + } + + spin_unlock(&global_path_lock); + + tp = &per_cpu(trace_path, cpu)[active]; + + /* migrate last entries for pre-tracing */ + __ipipe_migrate_pre_trace(tp, old_tp, pos); + + return tp; +} + +void notrace +__ipipe_trace(enum ipipe_trace_type type, unsigned long eip, + unsigned long parent_eip, unsigned long v) +{ + struct ipipe_trace_path *tp, *old_tp; + int pos, next_pos, begin; + struct ipipe_trace_point *point; + unsigned long flags; + int cpu; + + flags = hard_local_irq_save_notrace(); + + cpu = ipipe_processor_id(); + restart: + tp = old_tp = &per_cpu(trace_path, cpu)[per_cpu(active_path, cpu)]; + + /* here starts a race window with NMIs - catched below */ + + /* check for NMI recursion */ + if (unlikely(tp->flags & IPIPE_TFLG_NMI_LOCK)) { + tp->flags |= IPIPE_TFLG_NMI_HIT; + + /* first freeze request from NMI context? */ + if ((type == IPIPE_TRACE_FREEZE) && + !(tp->flags & IPIPE_TFLG_NMI_FREEZE_REQ)) { + /* save arguments and mark deferred freezing */ + tp->flags |= IPIPE_TFLG_NMI_FREEZE_REQ; + tp->nmi_saved_eip = eip; + tp->nmi_saved_parent_eip = parent_eip; + tp->nmi_saved_v = v; + } + return; /* no need for restoring flags inside IRQ */ + } + + /* clear NMI events and set lock (atomically per cpu) */ + tp->flags = (tp->flags & ~(IPIPE_TFLG_NMI_HIT | + IPIPE_TFLG_NMI_FREEZE_REQ)) + | IPIPE_TFLG_NMI_LOCK; + + /* check active_path again - some nasty NMI may have switched + * it meanwhile */ + if (unlikely(tp != + &per_cpu(trace_path, cpu)[per_cpu(active_path, cpu)])) { + /* release lock on wrong path and restart */ + tp->flags &= ~IPIPE_TFLG_NMI_LOCK; + + /* there is no chance that the NMI got deferred + * => no need to check for pending freeze requests */ + goto restart; + } + + /* get the point buffer */ + pos = tp->trace_pos; + point = &tp->point[pos]; + + /* store all trace point data */ + point->type = type; + point->flags = hard_irqs_disabled_flags(flags) ? 
IPIPE_TFLG_HWIRQ_OFF : 0; + point->eip = eip; + point->parent_eip = parent_eip; + point->v = v; + ipipe_read_tsc(point->timestamp); + + __ipipe_store_domain_states(point); + + /* forward to next point buffer */ + next_pos = WRAP_POINT_NO(pos+1); + tp->trace_pos = next_pos; + + /* only mark beginning if we haven't started yet */ + begin = tp->begin; + if (unlikely(type == IPIPE_TRACE_BEGIN) && (begin < 0)) + tp->begin = pos; + + /* end of critical path, start post-trace if not already started */ + if (unlikely(type == IPIPE_TRACE_END) && + (begin >= 0) && !tp->post_trace) + tp->post_trace = post_trace + 1; + + /* freeze only if the slot is free and we are not already freezing */ + if ((unlikely(type == IPIPE_TRACE_FREEZE) || + (unlikely(eip >= trigger_begin && eip <= trigger_end) && + type == IPIPE_TRACE_FUNC)) && + per_cpu(trace_path, cpu)[per_cpu(frozen_path, cpu)].begin < 0 && + !(tp->flags & IPIPE_TFLG_FREEZING)) { + tp->post_trace = post_trace + 1; + tp->flags |= IPIPE_TFLG_FREEZING; + } + + /* enforce end of trace in case of overflow */ + if (unlikely(WRAP_POINT_NO(next_pos + 1) == begin)) { + tp->end = pos; + goto enforce_end; + } + + /* stop tracing this path if we are in post-trace and + * a) that phase is over now or + * b) a new TRACE_BEGIN came in but we are not freezing this path */ + if (unlikely((tp->post_trace > 0) && ((--tp->post_trace == 0) || + ((type == IPIPE_TRACE_BEGIN) && + !(tp->flags & IPIPE_TFLG_FREEZING))))) { + /* store the path's end (i.e. excluding post-trace) */ + tp->end = WRAP_POINT_NO(pos - post_trace + tp->post_trace); + + enforce_end: + if (tp->flags & IPIPE_TFLG_FREEZING) + tp = __ipipe_trace_freeze(cpu, tp, pos); + else + tp = __ipipe_trace_end(cpu, tp, pos); + + /* reset the active path, maybe already start a new one */ + tp->begin = (type == IPIPE_TRACE_BEGIN) ? 
+ WRAP_POINT_NO(tp->trace_pos - 1) : -1; + tp->end = -1; + tp->post_trace = 0; + tp->flags = 0; + + /* update active_path not earlier to avoid races with NMIs */ + per_cpu(active_path, cpu) = tp - per_cpu(trace_path, cpu); + } + + /* we still have old_tp and point, + * let's reset NMI lock and check for catches */ + old_tp->flags &= ~IPIPE_TFLG_NMI_LOCK; + if (unlikely(old_tp->flags & IPIPE_TFLG_NMI_HIT)) { + /* well, this late tagging may not immediately be visible for + * other cpus already dumping this path - a minor issue */ + point->flags |= IPIPE_TFLG_NMI_HIT; + + /* handle deferred freezing from NMI context */ + if (old_tp->flags & IPIPE_TFLG_NMI_FREEZE_REQ) + __ipipe_trace(IPIPE_TRACE_FREEZE, old_tp->nmi_saved_eip, + old_tp->nmi_saved_parent_eip, + old_tp->nmi_saved_v); + } + + hard_local_irq_restore_notrace(flags); +} + +static unsigned long __ipipe_global_path_lock(void) +{ + unsigned long flags; + int cpu; + struct ipipe_trace_path *tp; + + spin_lock_irqsave(&global_path_lock, flags); + + cpu = ipipe_processor_id(); + restart: + tp = &per_cpu(trace_path, cpu)[per_cpu(active_path, cpu)]; + + /* here is small race window with NMIs - catched below */ + + /* clear NMI events and set lock (atomically per cpu) */ + tp->flags = (tp->flags & ~(IPIPE_TFLG_NMI_HIT | + IPIPE_TFLG_NMI_FREEZE_REQ)) + | IPIPE_TFLG_NMI_LOCK; + + /* check active_path again - some nasty NMI may have switched + * it meanwhile */ + if (tp != &per_cpu(trace_path, cpu)[per_cpu(active_path, cpu)]) { + /* release lock on wrong path and restart */ + tp->flags &= ~IPIPE_TFLG_NMI_LOCK; + + /* there is no chance that the NMI got deferred + * => no need to check for pending freeze requests */ + goto restart; + } + + return flags; +} + +static void __ipipe_global_path_unlock(unsigned long flags) +{ + int cpu; + struct ipipe_trace_path *tp; + + /* release spinlock first - it's not involved in the NMI issue */ + __ipipe_spin_unlock_irqbegin(&global_path_lock); + + cpu = ipipe_processor_id(); + tp = &per_cpu(trace_path, cpu)[per_cpu(active_path, cpu)]; + + tp->flags &= ~IPIPE_TFLG_NMI_LOCK; + + /* handle deferred freezing from NMI context */ + if (tp->flags & IPIPE_TFLG_NMI_FREEZE_REQ) + __ipipe_trace(IPIPE_TRACE_FREEZE, tp->nmi_saved_eip, + tp->nmi_saved_parent_eip, tp->nmi_saved_v); + + /* See __ipipe_spin_lock_irqsave() and friends. 
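+ * The unlock is done in two steps (irqbegin above, irqcomplete below)
+ * so that a freeze request deferred by an NMI can be replayed with the
+ * path lock already dropped but the saved interrupt state not yet
+ * restored.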
*/ + __ipipe_spin_unlock_irqcomplete(flags); +} + +void notrace asmlinkage +ipipe_trace_asm(enum ipipe_trace_type type, unsigned long eip, + unsigned long parent_eip, unsigned long v) +{ + if (!ipipe_trace_enable) + return; + __ipipe_trace(type, eip, parent_eip, v); +} + +void notrace ipipe_trace_begin(unsigned long v) +{ + if (!ipipe_trace_enable) + return; + __ipipe_trace(IPIPE_TRACE_BEGIN, __BUILTIN_RETURN_ADDRESS0, + __BUILTIN_RETURN_ADDRESS1, v); +} +EXPORT_SYMBOL_GPL(ipipe_trace_begin); + +void notrace ipipe_trace_end(unsigned long v) +{ + if (!ipipe_trace_enable) + return; + __ipipe_trace(IPIPE_TRACE_END, __BUILTIN_RETURN_ADDRESS0, + __BUILTIN_RETURN_ADDRESS1, v); +} +EXPORT_SYMBOL_GPL(ipipe_trace_end); + +void notrace ipipe_trace_freeze(unsigned long v) +{ + if (!ipipe_trace_enable) + return; + __ipipe_trace(IPIPE_TRACE_FREEZE, __BUILTIN_RETURN_ADDRESS0, + __BUILTIN_RETURN_ADDRESS1, v); +} +EXPORT_SYMBOL_GPL(ipipe_trace_freeze); + +void notrace ipipe_trace_special(unsigned char id, unsigned long v) +{ + if (!ipipe_trace_enable) + return; + __ipipe_trace(IPIPE_TRACE_SPECIAL | (id << IPIPE_TYPE_BITS), + __BUILTIN_RETURN_ADDRESS0, + __BUILTIN_RETURN_ADDRESS1, v); +} +EXPORT_SYMBOL_GPL(ipipe_trace_special); + +void notrace ipipe_trace_pid(pid_t pid, short prio) +{ + if (!ipipe_trace_enable) + return; + __ipipe_trace(IPIPE_TRACE_PID | (prio << IPIPE_TYPE_BITS), + __BUILTIN_RETURN_ADDRESS0, + __BUILTIN_RETURN_ADDRESS1, pid); +} +EXPORT_SYMBOL_GPL(ipipe_trace_pid); + +void notrace ipipe_trace_event(unsigned char id, unsigned long delay_tsc) +{ + if (!ipipe_trace_enable) + return; + __ipipe_trace(IPIPE_TRACE_EVENT | (id << IPIPE_TYPE_BITS), + __BUILTIN_RETURN_ADDRESS0, + __BUILTIN_RETURN_ADDRESS1, delay_tsc); +} +EXPORT_SYMBOL_GPL(ipipe_trace_event); + +int ipipe_trace_max_reset(void) +{ + int cpu; + unsigned long flags; + struct ipipe_trace_path *path; + int ret = 0; + + flags = __ipipe_global_path_lock(); + + for_each_possible_cpu(cpu) { + path = &per_cpu(trace_path, cpu)[per_cpu(max_path, cpu)]; + + if (path->dump_lock) { + ret = -EBUSY; + break; + } + + path->begin = -1; + path->end = -1; + path->trace_pos = 0; + path->length = 0; + } + + __ipipe_global_path_unlock(flags); + + return ret; +} +EXPORT_SYMBOL_GPL(ipipe_trace_max_reset); + +int ipipe_trace_frozen_reset(void) +{ + int cpu; + unsigned long flags; + struct ipipe_trace_path *path; + int ret = 0; + + flags = __ipipe_global_path_lock(); + + for_each_online_cpu(cpu) { + path = &per_cpu(trace_path, cpu)[per_cpu(frozen_path, cpu)]; + + if (path->dump_lock) { + ret = -EBUSY; + break; + } + + path->begin = -1; + path->end = -1; + path->trace_pos = 0; + path->length = 0; + } + + __ipipe_global_path_unlock(flags); + + return ret; +} +EXPORT_SYMBOL_GPL(ipipe_trace_frozen_reset); + +static void +__ipipe_get_task_info(char *task_info, struct ipipe_trace_point *point, + int trylock) +{ + struct task_struct *task = NULL; + char buf[8]; + int i; + int locked = 1; + + if (trylock) { + if (!read_trylock(&tasklist_lock)) + locked = 0; + } else + read_lock(&tasklist_lock); + + if (locked) + task = find_task_by_pid_ns((pid_t)point->v, &init_pid_ns); + + if (task) + strncpy(task_info, task->comm, 11); + else + strcpy(task_info, "--"); + + if (locked) + read_unlock(&tasklist_lock); + + for (i = strlen(task_info); i < 11; i++) + task_info[i] = ' '; + + sprintf(buf, " %d ", point->type >> IPIPE_TYPE_BITS); + strcpy(task_info + (11 - strlen(buf)), buf); +} + +static void +__ipipe_get_event_date(char *buf,struct ipipe_trace_path *path, + struct 
ipipe_trace_point *point) +{ + long time; + int type; + + time = __ipipe_signed_tsc2us(point->timestamp - + path->point[path->begin].timestamp + point->v); + type = point->type >> IPIPE_TYPE_BITS; + + if (type == 0) + /* + * Event type #0 is predefined, stands for the next + * timer tick. + */ + sprintf(buf, "tick@%-6ld", time); + else + sprintf(buf, "%3d@%-7ld", type, time); +} + +#ifdef CONFIG_IPIPE_TRACE_PANIC + +void ipipe_trace_panic_freeze(void) +{ + unsigned long flags; + int cpu; + + if (!ipipe_trace_enable) + return; + + ipipe_trace_enable = 0; + flags = hard_local_irq_save_notrace(); + + cpu = ipipe_processor_id(); + + panic_path = &per_cpu(trace_path, cpu)[per_cpu(active_path, cpu)]; + + hard_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(ipipe_trace_panic_freeze); + +void ipipe_trace_panic_dump(void) +{ + int cnt = back_trace; + int start, pos; + char buf[16]; + + if (!panic_path) + return; + + ipipe_context_check_off(); + + printk("I-pipe tracer log (%d points):\n", cnt); + + start = pos = WRAP_POINT_NO(panic_path->trace_pos-1); + + while (cnt-- > 0) { + struct ipipe_trace_point *point = &panic_path->point[pos]; + long time; + char info[16]; + int i; + + printk(" %c", + (point->flags & IPIPE_TFLG_HWIRQ_OFF) ? '|' : ' '); + + for (i = IPIPE_TFLG_DOMSTATE_BITS; i >= 0; i--) + printk("%c", + (IPIPE_TFLG_CURRENT_DOMAIN(point) == i) ? + (IPIPE_TFLG_DOMAIN_STALLED(point, i) ? + '#' : '+') : + (IPIPE_TFLG_DOMAIN_STALLED(point, i) ? + '*' : ' ')); + + if (!point->eip) + printk("--\n"); + else { + __ipipe_trace_point_type(buf, point); + printk("%s", buf); + + switch (point->type & IPIPE_TYPE_MASK) { + case IPIPE_TRACE_FUNC: + printk(" "); + break; + + case IPIPE_TRACE_PID: + __ipipe_get_task_info(info, + point, 1); + printk("%s", info); + break; + + case IPIPE_TRACE_EVENT: + __ipipe_get_event_date(info, + panic_path, point); + printk("%s", info); + break; + + default: + printk("0x%08lx ", point->v); + } + + time = __ipipe_signed_tsc2us(point->timestamp - + panic_path->point[start].timestamp); + printk(" %5ld ", time); + + __ipipe_print_symname(NULL, point->eip); + printk(" ("); + __ipipe_print_symname(NULL, point->parent_eip); + printk(")\n"); + } + pos = WRAP_POINT_NO(pos - 1); + } + + panic_path = NULL; +} +EXPORT_SYMBOL_GPL(ipipe_trace_panic_dump); + +#endif /* CONFIG_IPIPE_TRACE_PANIC */ + + +/* --- /proc output --- */ + +static notrace int __ipipe_in_critical_trpath(long point_no) +{ + return ((WRAP_POINT_NO(point_no-print_path->begin) < + WRAP_POINT_NO(print_path->end-print_path->begin)) || + ((print_path->end == print_path->begin) && + (WRAP_POINT_NO(point_no-print_path->end) > + print_post_trace))); +} + +static long __ipipe_signed_tsc2us(long long tsc) +{ + unsigned long long abs_tsc; + long us; + + /* ipipe_tsc2us works on unsigned => handle sign separately */ + abs_tsc = (tsc >= 0) ? 
tsc : -tsc; + us = ipipe_tsc2us(abs_tsc); + if (tsc < 0) + return -us; + else + return us; +} + +static void +__ipipe_trace_point_type(char *buf, struct ipipe_trace_point *point) +{ + switch (point->type & IPIPE_TYPE_MASK) { + case IPIPE_TRACE_FUNC: + strcpy(buf, "func "); + break; + + case IPIPE_TRACE_BEGIN: + strcpy(buf, "begin "); + break; + + case IPIPE_TRACE_END: + strcpy(buf, "end "); + break; + + case IPIPE_TRACE_FREEZE: + strcpy(buf, "freeze "); + break; + + case IPIPE_TRACE_SPECIAL: + sprintf(buf, "(0x%02x) ", + point->type >> IPIPE_TYPE_BITS); + break; + + case IPIPE_TRACE_PID: + sprintf(buf, "[%5d] ", (pid_t)point->v); + break; + + case IPIPE_TRACE_EVENT: + sprintf(buf, "event "); + break; + } +} + +static void +__ipipe_print_pathmark(struct seq_file *m, struct ipipe_trace_point *point) +{ + char mark = ' '; + int point_no = point - print_path->point; + int i; + + if (print_path->end == point_no) + mark = '<'; + else if (print_path->begin == point_no) + mark = '>'; + else if (__ipipe_in_critical_trpath(point_no)) + mark = ':'; + seq_printf(m, "%c%c", mark, + (point->flags & IPIPE_TFLG_HWIRQ_OFF) ? '|' : ' '); + + if (!verbose_trace) + return; + + for (i = IPIPE_TFLG_DOMSTATE_BITS; i >= 0; i--) + seq_printf(m, "%c", + (IPIPE_TFLG_CURRENT_DOMAIN(point) == i) ? + (IPIPE_TFLG_DOMAIN_STALLED(point, i) ? + '#' : '+') : + (IPIPE_TFLG_DOMAIN_STALLED(point, i) ? '*' : ' ')); +} + +static void +__ipipe_print_delay(struct seq_file *m, struct ipipe_trace_point *point) +{ + unsigned long delay = 0; + int next; + char *mark = " "; + + next = WRAP_POINT_NO(point+1 - print_path->point); + + if (next != print_path->trace_pos) + delay = ipipe_tsc2ns(print_path->point[next].timestamp - + point->timestamp); + + if (__ipipe_in_critical_trpath(point - print_path->point)) { + if (delay > IPIPE_DELAY_WARN) + mark = "! "; + else if (delay > IPIPE_DELAY_NOTE) + mark = "+ "; + } + seq_puts(m, mark); + + if (verbose_trace) + seq_printf(m, "%3lu.%03lu%c ", delay/1000, delay%1000, + (point->flags & IPIPE_TFLG_NMI_HIT) ? 'N' : ' '); + else + seq_puts(m, " "); +} + +static void __ipipe_print_symname(struct seq_file *m, unsigned long eip) +{ + char namebuf[KSYM_NAME_LEN+1]; + unsigned long size, offset; + const char *sym_name; + char *modname; + + sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf); + +#ifdef CONFIG_IPIPE_TRACE_PANIC + if (!m) { + /* panic dump */ + if (sym_name) { + printk("%s+0x%lx", sym_name, offset); + if (modname) + printk(" [%s]", modname); + } else + printk("<%08lx>", eip); + } else +#endif /* CONFIG_IPIPE_TRACE_PANIC */ + { + if (sym_name) { + if (verbose_trace) { + seq_printf(m, "%s+0x%lx", sym_name, offset); + if (modname) + seq_printf(m, " [%s]", modname); + } else + seq_puts(m, sym_name); + } else + seq_printf(m, "<%08lx>", eip); + } +} + +static void __ipipe_print_headline(struct seq_file *m) +{ + const char *name[2]; + + seq_printf(m, "Calibrated minimum trace-point overhead: %lu.%03lu " + "us\n\n", trace_overhead/1000, trace_overhead%1000); + + if (verbose_trace) { + name[0] = ipipe_root_domain->name; + if (ipipe_head_domain != ipipe_root_domain) + name[1] = ipipe_head_domain->name; + else + name[1] = ""; + + seq_printf(m, + " +----- Hard IRQs ('|': locked)\n" + " |+-- %s\n" + " ||+- %s%s\n" + " ||| +---------- " + "Delay flag ('+': > %d us, '!': > %d us)\n" + " ||| | +- " + "NMI noise ('N')\n" + " ||| | |\n" + " Type User Val. 
Time Delay Function " + "(Parent)\n", + name[1], name[0], + " ('*': domain stalled, '+': current, " + "'#': current+stalled)", + IPIPE_DELAY_NOTE/1000, IPIPE_DELAY_WARN/1000); + } else + seq_printf(m, + " +--------------- Hard IRQs ('|': locked)\n" + " | +- Delay flag " + "('+': > %d us, '!': > %d us)\n" + " | |\n" + " Type Time Function (Parent)\n", + IPIPE_DELAY_NOTE/1000, IPIPE_DELAY_WARN/1000); +} + +static void *__ipipe_max_prtrace_start(struct seq_file *m, loff_t *pos) +{ + loff_t n = *pos; + + mutex_lock(&out_mutex); + + if (!n) { + struct ipipe_trace_path *tp; + unsigned long length_usecs; + int points, cpu; + unsigned long flags; + + /* protect against max_path/frozen_path updates while we + * haven't locked our target path, also avoid recursively + * taking global_path_lock from NMI context */ + flags = __ipipe_global_path_lock(); + + /* find the longest of all per-cpu paths */ + print_path = NULL; + for_each_online_cpu(cpu) { + tp = &per_cpu(trace_path, cpu)[per_cpu(max_path, cpu)]; + if ((print_path == NULL) || + (tp->length > print_path->length)) { + print_path = tp; + break; + } + } + print_path->dump_lock = 1; + + __ipipe_global_path_unlock(flags); + + /* does this path actually contain data? */ + if (print_path->end == print_path->begin) + return NULL; + + /* number of points inside the critical path */ + points = WRAP_POINT_NO(print_path->end-print_path->begin+1); + + /* pre- and post-tracing length, post-trace length was frozen + in __ipipe_trace, pre-trace may have to be reduced due to + buffer overrun */ + print_pre_trace = pre_trace; + print_post_trace = WRAP_POINT_NO(print_path->trace_pos - + print_path->end - 1); + if (points+pre_trace+print_post_trace > IPIPE_TRACE_POINTS - 1) + print_pre_trace = IPIPE_TRACE_POINTS - 1 - points - + print_post_trace; + + length_usecs = ipipe_tsc2us(print_path->length); + seq_printf(m, "I-pipe worst-case tracing service on %s/ipipe release #%d\n" + "-------------------------------------------------------------\n", + UTS_RELEASE, IPIPE_CORE_RELEASE); + seq_printf(m, "CPU: %d, Begin: %lld cycles, Trace Points: " + "%d (-%d/+%d), Length: %lu us\n", + cpu, print_path->point[print_path->begin].timestamp, + points, print_pre_trace, print_post_trace, length_usecs); + __ipipe_print_headline(m); + } + + /* check if we are inside the trace range */ + if (n >= WRAP_POINT_NO(print_path->end - print_path->begin + 1 + + print_pre_trace + print_post_trace)) + return NULL; + + /* return the next point to be shown */ + return &print_path->point[WRAP_POINT_NO(print_path->begin - + print_pre_trace + n)]; +} + +static void *__ipipe_prtrace_next(struct seq_file *m, void *p, loff_t *pos) +{ + loff_t n = ++*pos; + + /* check if we are inside the trace range with the next entry */ + if (n >= WRAP_POINT_NO(print_path->end - print_path->begin + 1 + + print_pre_trace + print_post_trace)) + return NULL; + + /* return the next point to be shown */ + return &print_path->point[WRAP_POINT_NO(print_path->begin - + print_pre_trace + *pos)]; +} + +static void __ipipe_prtrace_stop(struct seq_file *m, void *p) +{ + if (print_path) + print_path->dump_lock = 0; + mutex_unlock(&out_mutex); +} + +static int __ipipe_prtrace_show(struct seq_file *m, void *p) +{ + long time; + struct ipipe_trace_point *point = p; + char buf[16]; + + if (!point->eip) { + seq_puts(m, "--\n"); + return 0; + } + + __ipipe_print_pathmark(m, point); + __ipipe_trace_point_type(buf, point); + seq_puts(m, buf); + if (verbose_trace) + switch (point->type & IPIPE_TYPE_MASK) { + case IPIPE_TRACE_FUNC: + 
seq_puts(m, " "); + break; + + case IPIPE_TRACE_PID: + __ipipe_get_task_info(buf, point, 0); + seq_puts(m, buf); + break; + + case IPIPE_TRACE_EVENT: + __ipipe_get_event_date(buf, print_path, point); + seq_puts(m, buf); + break; + + default: + seq_printf(m, "0x%08lx ", point->v); + } + + time = __ipipe_signed_tsc2us(point->timestamp - + print_path->point[print_path->begin].timestamp); + seq_printf(m, "%5ld", time); + + __ipipe_print_delay(m, point); + __ipipe_print_symname(m, point->eip); + seq_puts(m, " ("); + __ipipe_print_symname(m, point->parent_eip); + seq_puts(m, ")\n"); + + return 0; +} + +static struct seq_operations __ipipe_max_ptrace_ops = { + .start = __ipipe_max_prtrace_start, + .next = __ipipe_prtrace_next, + .stop = __ipipe_prtrace_stop, + .show = __ipipe_prtrace_show +}; + +static int __ipipe_max_prtrace_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &__ipipe_max_ptrace_ops); +} + +static ssize_t +__ipipe_max_reset(struct file *file, const char __user *pbuffer, + size_t count, loff_t *data) +{ + mutex_lock(&out_mutex); + ipipe_trace_max_reset(); + mutex_unlock(&out_mutex); + + return count; +} + +struct file_operations __ipipe_max_prtrace_fops = { + .open = __ipipe_max_prtrace_open, + .read = seq_read, + .write = __ipipe_max_reset, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void *__ipipe_frozen_prtrace_start(struct seq_file *m, loff_t *pos) +{ + loff_t n = *pos; + + mutex_lock(&out_mutex); + + if (!n) { + struct ipipe_trace_path *tp; + int cpu; + unsigned long flags; + + /* protect against max_path/frozen_path updates while we + * haven't locked our target path, also avoid recursively + * taking global_path_lock from NMI context */ + flags = __ipipe_global_path_lock(); + + /* find the first of all per-cpu frozen paths */ + print_path = NULL; + for_each_online_cpu(cpu) { + tp = &per_cpu(trace_path, cpu)[per_cpu(frozen_path, cpu)]; + if (tp->end >= 0) { + print_path = tp; + break; + } + } + if (print_path) + print_path->dump_lock = 1; + + __ipipe_global_path_unlock(flags); + + if (!print_path) + return NULL; + + /* back- and post-tracing length, post-trace length was frozen + in __ipipe_trace, back-trace may have to be reduced due to + buffer overrun */ + print_pre_trace = back_trace-1; /* substract freeze point */ + print_post_trace = WRAP_POINT_NO(print_path->trace_pos - + print_path->end - 1); + if (1+pre_trace+print_post_trace > IPIPE_TRACE_POINTS - 1) + print_pre_trace = IPIPE_TRACE_POINTS - 2 - + print_post_trace; + + seq_printf(m, "I-pipe frozen back-tracing service on %s/ipipe release #%d\n" + "------------------------------------------------------------\n", + UTS_RELEASE, IPIPE_CORE_RELEASE); + seq_printf(m, "CPU: %d, Freeze: %lld cycles, Trace Points: %d (+%d)\n", + cpu, print_path->point[print_path->begin].timestamp, + print_pre_trace+1, print_post_trace); + __ipipe_print_headline(m); + } + + /* check if we are inside the trace range */ + if (n >= print_pre_trace + 1 + print_post_trace) + return NULL; + + /* return the next point to be shown */ + return &print_path->point[WRAP_POINT_NO(print_path->begin- + print_pre_trace+n)]; +} + +static struct seq_operations __ipipe_frozen_ptrace_ops = { + .start = __ipipe_frozen_prtrace_start, + .next = __ipipe_prtrace_next, + .stop = __ipipe_prtrace_stop, + .show = __ipipe_prtrace_show +}; + +static int __ipipe_frozen_prtrace_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &__ipipe_frozen_ptrace_ops); +} + +static ssize_t +__ipipe_frozen_ctrl(struct file 
*file, const char __user *pbuffer, + size_t count, loff_t *data) +{ + char *end, buf[16]; + int val; + int n; + + n = (count > sizeof(buf) - 1) ? sizeof(buf) - 1 : count; + + if (copy_from_user(buf, pbuffer, n)) + return -EFAULT; + + buf[n] = '\0'; + val = simple_strtol(buf, &end, 0); + + if (((*end != '\0') && !isspace(*end)) || (val < 0)) + return -EINVAL; + + mutex_lock(&out_mutex); + ipipe_trace_frozen_reset(); + if (val > 0) + ipipe_trace_freeze(-1); + mutex_unlock(&out_mutex); + + return count; +} + +struct file_operations __ipipe_frozen_prtrace_fops = { + .open = __ipipe_frozen_prtrace_open, + .read = seq_read, + .write = __ipipe_frozen_ctrl, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __ipipe_rd_proc_val(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int len; + + len = sprintf(page, "%u\n", *(int *)data); + len -= off; + if (len <= off + count) + *eof = 1; + *start = page + off; + if (len > count) + len = count; + if (len < 0) + len = 0; + + return len; +} + +static int __ipipe_wr_proc_val(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + char *end, buf[16]; + int val; + int n; + + n = (count > sizeof(buf) - 1) ? sizeof(buf) - 1 : count; + + if (copy_from_user(buf, buffer, n)) + return -EFAULT; + + buf[n] = '\0'; + val = simple_strtol(buf, &end, 0); + + if (((*end != '\0') && !isspace(*end)) || (val < 0)) + return -EINVAL; + + mutex_lock(&out_mutex); + *(int *)data = val; + mutex_unlock(&out_mutex); + + return count; +} + +static int __ipipe_rd_trigger(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + int len; + + if (!trigger_begin) + return 0; + + len = sprint_symbol(page, trigger_begin); + page[len++] = '\n'; + + len -= off; + if (len <= off + count) + *eof = 1; + *start = page + off; + if (len > count) + len = count; + if (len < 0) + len = 0; + + return len; +} + +static int __ipipe_wr_trigger(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + char buf[KSYM_SYMBOL_LEN]; + unsigned long begin, end; + + if (count > sizeof(buf) - 1) + count = sizeof(buf) - 1; + if (copy_from_user(buf, buffer, count)) + return -EFAULT; + buf[count] = 0; + if (buf[count-1] == '\n') + buf[count-1] = 0; + + begin = kallsyms_lookup_name(buf); + if (!begin || !kallsyms_lookup_size_offset(begin, &end, NULL)) + return -ENOENT; + end += begin - 1; + + mutex_lock(&out_mutex); + /* invalidate the current range before setting a new one */ + trigger_end = 0; + wmb(); + ipipe_trace_frozen_reset(); + + /* set new range */ + trigger_begin = begin; + wmb(); + trigger_end = end; + mutex_unlock(&out_mutex); + + return count; +} + +#ifdef CONFIG_IPIPE_TRACE_MCOUNT +static void notrace +ipipe_trace_function(unsigned long ip, unsigned long parent_ip) +{ + if (!ipipe_trace_enable) + return; + __ipipe_trace(IPIPE_TRACE_FUNC, ip, parent_ip, 0); +} + +static struct ftrace_ops ipipe_trace_ops = { + .func = ipipe_trace_function +}; + +static int __ipipe_wr_enable(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + char *end, buf[16]; + int val; + int n; + + n = (count > sizeof(buf) - 1) ? 
sizeof(buf) - 1 : count; + + if (copy_from_user(buf, buffer, n)) + return -EFAULT; + + buf[n] = '\0'; + val = simple_strtol(buf, &end, 0); + + if (((*end != '\0') && !isspace(*end)) || (val < 0)) + return -EINVAL; + + mutex_lock(&out_mutex); + + if (ipipe_trace_enable) { + if (!val) + unregister_ftrace_function(&ipipe_trace_ops); + } else if (val) + register_ftrace_function(&ipipe_trace_ops); + + ipipe_trace_enable = val; + + mutex_unlock(&out_mutex); + + return count; +} +#endif /* CONFIG_IPIPE_TRACE_MCOUNT */ + +extern struct proc_dir_entry *ipipe_proc_root; + +static struct proc_dir_entry * __init +__ipipe_create_trace_proc_val(struct proc_dir_entry *trace_dir, + const char *name, int *value_ptr) +{ + struct proc_dir_entry *entry; + + entry = create_proc_entry(name, 0644, trace_dir); + if (entry) { + entry->data = value_ptr; + entry->read_proc = __ipipe_rd_proc_val; + entry->write_proc = __ipipe_wr_proc_val; + } + return entry; +} + +void __init __ipipe_init_tracer(void) +{ + struct proc_dir_entry *trace_dir; + struct proc_dir_entry *entry; + unsigned long long start, end, min = ULLONG_MAX; + int i; +#ifdef CONFIG_IPIPE_TRACE_VMALLOC + int cpu, path; + + for_each_possible_cpu(cpu) { + struct ipipe_trace_path *tp_buf; + + tp_buf = vmalloc_node(sizeof(struct ipipe_trace_path) * + IPIPE_TRACE_PATHS, cpu_to_node(cpu)); + if (!tp_buf) { + printk(KERN_ERR "I-pipe: " + "insufficient memory for trace buffer.\n"); + return; + } + memset(tp_buf, 0, + sizeof(struct ipipe_trace_path) * IPIPE_TRACE_PATHS); + for (path = 0; path < IPIPE_TRACE_PATHS; path++) { + tp_buf[path].begin = -1; + tp_buf[path].end = -1; + } + per_cpu(trace_path, cpu) = tp_buf; + } +#endif /* CONFIG_IPIPE_TRACE_VMALLOC */ + + /* Calculate minimum overhead of __ipipe_trace() */ + hard_local_irq_disable(); + for (i = 0; i < 100; i++) { + ipipe_read_tsc(start); + __ipipe_trace(IPIPE_TRACE_FUNC, __BUILTIN_RETURN_ADDRESS0, + __BUILTIN_RETURN_ADDRESS1, 0); + ipipe_read_tsc(end); + + end -= start; + if (end < min) + min = end; + } + hard_local_irq_enable(); + trace_overhead = ipipe_tsc2ns(min); + +#ifdef CONFIG_IPIPE_TRACE_ENABLE + ipipe_trace_enable = 1; +#ifdef CONFIG_IPIPE_TRACE_MCOUNT + ftrace_enabled = 1; + register_ftrace_function(&ipipe_trace_ops); +#endif /* CONFIG_IPIPE_TRACE_MCOUNT */ +#endif /* CONFIG_IPIPE_TRACE_ENABLE */ + + trace_dir = create_proc_entry("trace", S_IFDIR, ipipe_proc_root); + + entry = create_proc_entry("max", 0644, trace_dir); + if (entry) + entry->proc_fops = &__ipipe_max_prtrace_fops; + + entry = create_proc_entry("frozen", 0644, trace_dir); + if (entry) + entry->proc_fops = &__ipipe_frozen_prtrace_fops; + + entry = create_proc_entry("trigger", 0644, trace_dir); + if (entry) { + entry->read_proc = __ipipe_rd_trigger; + entry->write_proc = __ipipe_wr_trigger; + } + + __ipipe_create_trace_proc_val(trace_dir, "pre_trace_points", + &pre_trace); + __ipipe_create_trace_proc_val(trace_dir, "post_trace_points", + &post_trace); + __ipipe_create_trace_proc_val(trace_dir, "back_trace_points", + &back_trace); + __ipipe_create_trace_proc_val(trace_dir, "verbose", + &verbose_trace); + entry = __ipipe_create_trace_proc_val(trace_dir, "enable", + &ipipe_trace_enable); +#ifdef CONFIG_IPIPE_TRACE_MCOUNT + if (entry) + entry->write_proc = __ipipe_wr_enable; +#endif /* CONFIG_IPIPE_TRACE_MCOUNT */ +} diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index eebd6d5..c5433eb 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -15,6 +15,7 @@ #include #include #include +#include #include @@ -166,8 +167,10 @@ int 
irq_startup(struct irq_desc *desc, bool resend) desc->depth = 0; if (desc->irq_data.chip->irq_startup) { + unsigned long flags = hard_cond_local_irq_save(); ret = desc->irq_data.chip->irq_startup(&desc->irq_data); irq_state_clr_masked(desc); + hard_cond_local_irq_restore(flags); } else { irq_enable(desc); } @@ -191,12 +194,14 @@ void irq_shutdown(struct irq_desc *desc) void irq_enable(struct irq_desc *desc) { + unsigned long flags = hard_cond_local_irq_save(); irq_state_clr_disabled(desc); if (desc->irq_data.chip->irq_enable) desc->irq_data.chip->irq_enable(&desc->irq_data); else desc->irq_data.chip->irq_unmask(&desc->irq_data); irq_state_clr_masked(desc); + hard_cond_local_irq_restore(flags); } void irq_disable(struct irq_desc *desc) @@ -210,11 +215,13 @@ void irq_disable(struct irq_desc *desc) void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu) { + unsigned long flags = hard_cond_local_irq_save(); if (desc->irq_data.chip->irq_enable) desc->irq_data.chip->irq_enable(&desc->irq_data); else desc->irq_data.chip->irq_unmask(&desc->irq_data); cpumask_set_cpu(cpu, desc->percpu_enabled); + hard_cond_local_irq_restore(flags); } void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu) @@ -248,9 +255,13 @@ void mask_irq(struct irq_desc *desc) void unmask_irq(struct irq_desc *desc) { + unsigned long flags; + if (desc->irq_data.chip->irq_unmask) { + flags = hard_cond_local_irq_save(); desc->irq_data.chip->irq_unmask(&desc->irq_data); irq_state_clr_masked(desc); + hard_cond_local_irq_restore(flags); } } @@ -370,7 +381,9 @@ void handle_level_irq(unsigned int irq, struct irq_desc *desc) { raw_spin_lock(&desc->lock); +#ifndef CONFIG_IPIPE mask_ack_irq(desc); +#endif if (unlikely(irqd_irq_inprogress(&desc->irq_data))) if (!irq_check_poll(desc)) @@ -439,17 +452,26 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) goto out; } +#ifndef CONFIG_IPIPE if (desc->istate & IRQS_ONESHOT) mask_irq(desc); +#endif preflow_handler(desc); handle_irq_event(desc); +#ifdef CONFIG_IPIPE + /* XXX: IRQCHIP_EOI_IF_HANDLED is ignored. 
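+ * With CONFIG_IPIPE the line is acknowledged earlier, from the
+ * pipeline's ack handler (__ipipe_ack_fasteoi_irq() invoking the chip's
+ * irq_hold hook), so only the matching irq_release hook runs here in
+ * place of the usual cond_unmask/irq_eoi sequence.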
*/ + if (desc->irq_data.chip->irq_release) + desc->irq_data.chip->irq_release(&desc->irq_data); +out_eoi: +#else /* !CONFIG_IPIPE */ if (desc->istate & IRQS_ONESHOT) cond_unmask_irq(desc); out_eoi: desc->irq_data.chip->irq_eoi(&desc->irq_data); +#endif /* !CONFIG_IPIPE */ out_unlock: raw_spin_unlock(&desc->lock); return; @@ -497,7 +519,9 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) kstat_incr_irqs_this_cpu(irq, desc); /* Start handling the irq */ +#ifndef CONFIG_IPIPE desc->irq_data.chip->irq_ack(&desc->irq_data); +#endif do { if (unlikely(!desc->action)) { @@ -585,6 +609,12 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) kstat_incr_irqs_this_cpu(irq, desc); +#ifdef CONFIG_IPIPE + handle_irq_event_percpu(desc, desc->action); + + if (chip->irq_eoi && !irqd_irq_masked(&desc->irq_data)) + chip->irq_unmask(&desc->irq_data); +#else if (chip->irq_ack) chip->irq_ack(&desc->irq_data); @@ -592,6 +622,7 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) if (chip->irq_eoi) chip->irq_eoi(&desc->irq_data); +#endif } /** @@ -615,17 +646,134 @@ void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc) kstat_incr_irqs_this_cpu(irq, desc); +#ifndef CONFIG_IPIPE if (chip->irq_ack) chip->irq_ack(&desc->irq_data); +#else + (void)chip; +#endif trace_irq_handler_entry(irq, action); res = action->handler(irq, dev_id); trace_irq_handler_exit(irq, action, res); +#ifndef CONFIG_IPIPE if (chip->irq_eoi) chip->irq_eoi(&desc->irq_data); +#else + if (chip->irq_eoi && !irqd_irq_masked(&desc->irq_data)) + chip->irq_unmask(&desc->irq_data); +#endif } +#ifdef CONFIG_IPIPE + +void __ipipe_ack_level_irq(unsigned irq, struct irq_desc *desc) +{ + mask_ack_irq(desc); +} + +void __ipipe_end_level_irq(unsigned irq, struct irq_desc *desc) +{ + desc->irq_data.chip->irq_unmask(&desc->irq_data); +} + +void __ipipe_ack_fasteoi_irq(unsigned irq, struct irq_desc *desc) +{ + desc->irq_data.chip->irq_hold(&desc->irq_data); +} + +void __ipipe_end_fasteoi_irq(unsigned irq, struct irq_desc *desc) +{ + if (desc->irq_data.chip->irq_release) + desc->irq_data.chip->irq_release(&desc->irq_data); +} + +void __ipipe_ack_edge_irq(unsigned irq, struct irq_desc *desc) +{ + desc->irq_data.chip->irq_ack(&desc->irq_data); +} + +void __ipipe_ack_percpu_irq(unsigned irq, struct irq_desc *desc) +{ + if (desc->irq_data.chip->irq_ack) + desc->irq_data.chip->irq_ack(&desc->irq_data); + + if (desc->irq_data.chip->irq_eoi) + desc->irq_data.chip->irq_eoi(&desc->irq_data); +} + +void __ipipe_nop_irq(unsigned irq, struct irq_desc *desc) +{ +} + +void __ipipe_chained_irq(unsigned irq, struct irq_desc *desc) +{ + /* + * XXX: Do NOT fold this into __ipipe_nop_irq(), see + * ipipe_chained_irq_p(). 
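+ * (ipipe_chained_irq_p() presumably identifies chained interrupts by
+ * comparing the installed handler against this very symbol, so the two
+ * empty bodies must remain distinct functions.)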
+ */ +} + +static void __ipipe_ack_bad_irq(unsigned irq, struct irq_desc *desc) +{ + handle_bad_irq(irq, desc); + WARN_ON_ONCE(1); +} + +irq_flow_handler_t +__fixup_irq_handler(struct irq_desc *desc, irq_flow_handler_t handle, int is_chained) +{ + if (unlikely(handle == NULL)) { + desc->ipipe_ack = __ipipe_ack_bad_irq; + desc->ipipe_end = __ipipe_nop_irq; + } else { + if (is_chained) { + desc->ipipe_ack = handle; + desc->ipipe_end = __ipipe_nop_irq; + handle = __ipipe_chained_irq; + } else if (handle == handle_simple_irq) { + desc->ipipe_ack = __ipipe_nop_irq; + desc->ipipe_end = __ipipe_nop_irq; + } else if (handle == handle_level_irq) { + desc->ipipe_ack = __ipipe_ack_level_irq; + desc->ipipe_end = __ipipe_end_level_irq; + } else if (handle == handle_edge_irq) { + desc->ipipe_ack = __ipipe_ack_edge_irq; + desc->ipipe_end = __ipipe_nop_irq; + } else if (handle == handle_fasteoi_irq) { + desc->ipipe_ack = __ipipe_ack_fasteoi_irq; + desc->ipipe_end = __ipipe_end_fasteoi_irq; + } else if (handle == handle_percpu_irq || + handle == handle_percpu_devid_irq) { + desc->ipipe_ack = __ipipe_ack_percpu_irq; + desc->ipipe_end = __ipipe_nop_irq; + } else if (irq_desc_get_chip(desc) == &no_irq_chip) { + desc->ipipe_ack = __ipipe_nop_irq; + desc->ipipe_end = __ipipe_nop_irq; + } else { + desc->ipipe_ack = __ipipe_ack_bad_irq; + desc->ipipe_end = __ipipe_nop_irq; + } + } + + /* Suppress intermediate trampoline routine. */ + ipipe_root_domain->irqs[desc->irq_data.irq].ackfn = desc->ipipe_ack; + + return handle; +} + +#else /* !CONFIG_IPIPE */ + +irq_flow_handler_t +__fixup_irq_handler(struct irq_desc *desc, irq_flow_handler_t handle, int is_chained) +{ + return handle; +} + +#endif /* !CONFIG_IPIPE */ +EXPORT_SYMBOL_GPL(__fixup_irq_handler); + void __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, const char *name) @@ -643,6 +791,8 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, goto out; } + handle = __fixup_irq_handler(desc, handle, is_chained); + /* Uninstall? 
*/ if (handle == handle_bad_irq) { if (desc->irq_data.chip != &no_irq_chip) diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index c89295a..3c308ca 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -40,11 +40,12 @@ void irq_gc_mask_disable_reg(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); u32 mask = 1 << (d->irq - gc->irq_base); + unsigned long flags; - irq_gc_lock(gc); + flags = irq_gc_lock(gc); irq_reg_writel(mask, gc->reg_base + cur_regs(d)->disable); gc->mask_cache &= ~mask; - irq_gc_unlock(gc); + irq_gc_unlock(gc, flags); } /** @@ -58,11 +59,12 @@ void irq_gc_mask_set_bit(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); u32 mask = 1 << (d->irq - gc->irq_base); + unsigned long flags; - irq_gc_lock(gc); + flags = irq_gc_lock(gc); gc->mask_cache |= mask; irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); - irq_gc_unlock(gc); + irq_gc_unlock(gc, flags); } /** @@ -76,11 +78,12 @@ void irq_gc_mask_clr_bit(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); u32 mask = 1 << (d->irq - gc->irq_base); + unsigned long flags; - irq_gc_lock(gc); + flags = irq_gc_lock(gc); gc->mask_cache &= ~mask; irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); - irq_gc_unlock(gc); + irq_gc_unlock(gc, flags); } /** @@ -94,11 +97,12 @@ void irq_gc_unmask_enable_reg(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); u32 mask = 1 << (d->irq - gc->irq_base); + unsigned long flags; - irq_gc_lock(gc); + flags = irq_gc_lock(gc); irq_reg_writel(mask, gc->reg_base + cur_regs(d)->enable); gc->mask_cache |= mask; - irq_gc_unlock(gc); + irq_gc_unlock(gc, flags); } /** @@ -109,10 +113,11 @@ void irq_gc_ack_set_bit(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); u32 mask = 1 << (d->irq - gc->irq_base); + unsigned long flags; - irq_gc_lock(gc); + flags = irq_gc_lock(gc); irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); - irq_gc_unlock(gc); + irq_gc_unlock(gc, flags); } /** @@ -123,10 +128,11 @@ void irq_gc_ack_clr_bit(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); u32 mask = ~(1 << (d->irq - gc->irq_base)); + unsigned long flags; - irq_gc_lock(gc); + flags = irq_gc_lock(gc); irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); - irq_gc_unlock(gc); + irq_gc_unlock(gc, flags); } /** @@ -137,11 +143,12 @@ void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); u32 mask = 1 << (d->irq - gc->irq_base); + unsigned long flags; - irq_gc_lock(gc); + flags = irq_gc_lock(gc); irq_reg_writel(mask, gc->reg_base + cur_regs(d)->mask); irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); - irq_gc_unlock(gc); + irq_gc_unlock(gc, flags); } /** @@ -152,10 +159,11 @@ void irq_gc_eoi(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); u32 mask = 1 << (d->irq - gc->irq_base); + unsigned long flags; - irq_gc_lock(gc); + flags = irq_gc_lock(gc); irq_reg_writel(mask, gc->reg_base + cur_regs(d)->eoi); - irq_gc_unlock(gc); + irq_gc_unlock(gc, flags); } /** @@ -170,16 +178,17 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); u32 mask = 1 << (d->irq - gc->irq_base); + unsigned long flags; if (!(mask & gc->wake_enabled)) return -EINVAL; - irq_gc_lock(gc); + flags = irq_gc_lock(gc); if (on) gc->wake_active |= 
mask; else gc->wake_active &= ~mask; - irq_gc_unlock(gc); + irq_gc_unlock(gc, flags); return 0; } diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 192a302..7e9ab4de1f 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -270,10 +270,12 @@ int __init early_irq_init(void) return arch_early_irq_init(); } +#ifndef CONFIG_IPIPE struct irq_desc *irq_to_desc(unsigned int irq) { return (irq < NR_IRQS) ? irq_desc + irq : NULL; } +#endif /* CONFIG_IPIPE */ static void free_desc(unsigned int irq) { diff --git a/kernel/lockdep.c b/kernel/lockdep.c index ea9ee45..04ffafd 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2576,7 +2576,7 @@ void trace_hardirqs_on_caller(unsigned long ip) * already enabled, yet we find the hardware thinks they are in fact * enabled.. someone messed up their IRQ state tracing. */ - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled() && !hard_irqs_disabled())) return; /* @@ -2620,7 +2620,7 @@ void trace_hardirqs_off_caller(unsigned long ip) * So we're supposed to get called after you mask local IRQs, but for * some reason the hardware doesn't quite think you did a proper job. */ - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled() && !hard_irqs_disabled())) return; if (curr->hardirqs_enabled) { @@ -2656,7 +2656,7 @@ void trace_softirqs_on(unsigned long ip) * We fancy IRQs being disabled here, see softirq.c, avoids * funny state and nesting things. */ - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled() && !hard_irqs_disabled())) return; if (curr->softirqs_enabled) { @@ -2695,7 +2695,7 @@ void trace_softirqs_off(unsigned long ip) /* * We fancy IRQs being disabled here, see softirq.c */ - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled() && !hard_irqs_disabled())) return; if (curr->softirqs_enabled) { diff --git a/kernel/panic.c b/kernel/panic.c index d2a5f4e..e9037b0 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -23,6 +23,7 @@ #include #include #include +#include #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 @@ -352,6 +353,8 @@ void oops_enter(void) { tracing_off(); /* can't trust the integrity of the kernel anymore: */ + ipipe_trace_panic_freeze(); + ipipe_disable_context_check(); debug_locks_off(); do_oops_enter_exit(); } diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 4d46daf..3649e97 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -270,6 +270,7 @@ static int create_image(int platform_mode) goto Enable_cpus; local_irq_disable(); + hard_cond_local_irq_disable(); error = syscore_suspend(); if (error) { @@ -423,6 +424,7 @@ static int resume_target_kernel(bool platform_mode) goto Enable_cpus; local_irq_disable(); + hard_cond_local_irq_disable(); error = syscore_suspend(); if (error) @@ -538,6 +540,7 @@ int hibernation_platform_enter(void) goto Platform_finish; local_irq_disable(); + hard_cond_local_irq_disable(); syscore_suspend(); if (pm_wakeup_pending()) { error = -EAGAIN; diff --git a/kernel/printk.c b/kernel/printk.c index 146827f..8927a03 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1580,6 +1580,43 @@ asmlinkage int printk_emit(int facility, int level, } EXPORT_SYMBOL(printk_emit); +#ifdef CONFIG_IPIPE + +extern int __ipipe_printk_bypass; + +static IPIPE_DEFINE_SPINLOCK(__ipipe_printk_lock); + +static int __ipipe_printk_fill; + +static char __ipipe_printk_buf[__LOG_BUF_LEN]; + +void __ipipe_flush_printk (unsigned virq, void 
*cookie) +{ + char *p = __ipipe_printk_buf; + int len, lmax, out = 0; + unsigned long flags; + + goto start; + + do { + spin_unlock_irqrestore(&__ipipe_printk_lock, flags); + start: + lmax = __ipipe_printk_fill; + while (out < lmax) { + len = strlen(p) + 1; + printk("%s",p); + p += len; + out += len; + } + spin_lock_irqsave(&__ipipe_printk_lock, flags); + } + while (__ipipe_printk_fill != lmax); + + __ipipe_printk_fill = 0; + + spin_unlock_irqrestore(&__ipipe_printk_lock, flags); +} + /** * printk - print a kernel message * @fmt: format string @@ -1603,6 +1640,59 @@ EXPORT_SYMBOL(printk_emit); */ asmlinkage int printk(const char *fmt, ...) { + int sprintk = 1, cs = -1; + int r, fbytes, oldcount; + unsigned long flags; + va_list args; + + va_start(args, fmt); + + flags = hard_local_irq_save(); + + if (__ipipe_printk_bypass || oops_in_progress) + cs = ipipe_disable_context_check(); + else if (__ipipe_current_domain == ipipe_root_domain) { + if (ipipe_head_domain != ipipe_root_domain && + (raw_irqs_disabled_flags(flags) || + test_bit(IPIPE_STALL_FLAG, &__ipipe_head_status))) + sprintk = 0; + } else + sprintk = 0; + + hard_local_irq_restore(flags); + + if (sprintk) { + r = vprintk(fmt, args); + if (cs != -1) + ipipe_restore_context_check(cs); + goto out; + } + + spin_lock_irqsave(&__ipipe_printk_lock, flags); + + oldcount = __ipipe_printk_fill; + fbytes = __LOG_BUF_LEN - oldcount; + if (fbytes > 1) { + r = vscnprintf(__ipipe_printk_buf + __ipipe_printk_fill, + fbytes, fmt, args) + 1; + __ipipe_printk_fill += r; + } else + r = 0; + + spin_unlock_irqrestore(&__ipipe_printk_lock, flags); + + if (oldcount == 0) + ipipe_raise_irq(__ipipe_printk_virq); +out: + va_end(args); + + return r; +} + +#else /* !CONFIG_IPIPE */ + +asmlinkage int printk(const char *fmt, ...) +{ va_list args; int r; @@ -1620,6 +1710,8 @@ asmlinkage int printk(const char *fmt, ...) return r; } +#endif /* CONFIG_IPIPE */ + EXPORT_SYMBOL(printk); #else @@ -2304,7 +2396,7 @@ EXPORT_SYMBOL(register_console); int unregister_console(struct console *console) { - struct console *a, *b; + struct console *a, *b; int res = 1; #ifdef CONFIG_A11Y_BRAILLE_CONSOLE diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9f81a3a..f8c95cf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1499,7 +1499,9 @@ void scheduler_ipi(void) * however a fair share of IPIs are still resched only so this would * somewhat pessimize the simple resched case. */ +#ifndef IPIPE_ARCH_HAVE_VIRQ_IPI irq_enter(); +#endif sched_ttwu_pending(); /* @@ -1509,7 +1511,9 @@ void scheduler_ipi(void) this_rq()->idle_balance = 1; raise_softirq_irqoff(SCHED_SOFTIRQ); } +#ifndef IPIPE_ARCH_HAVE_VIRQ_IPI irq_exit(); +#endif } static void ttwu_queue_remote(struct task_struct *p, int cpu) @@ -1583,7 +1587,8 @@ try_to_wake_up(struct task_struct *p, un smp_wmb(); raw_spin_lock_irqsave(&p->pi_lock, flags); - if (!(p->state & state)) + if (!(p->state & state) || + (p->state & (TASK_NOWAKEUP|TASK_HARDENING))) goto out; success = 1; /* we're going to change ->state */ @@ -2022,6 +2027,8 @@ asmlinkage void schedule_tail(struct tas { struct rq *rq = this_rq(); + __ipipe_complete_domain_migration(); + finish_task_switch(rq, prev); /* @@ -2042,16 +2049,21 @@ asmlinkage void schedule_tail(struct tas * context_switch - switch to the new MM and the new * thread's register state. 
*/ -static inline void +int context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { struct mm_struct *mm, *oldmm; - prepare_task_switch(rq, prev, next); - mm = next->mm; oldmm = prev->active_mm; + +if (!rq) { + switch_mm(oldmm, next->active_mm, next); + if (!mm) enter_lazy_tlb(oldmm, next); +} else { + prepare_task_switch(rq, prev, next); + /* * For paravirt, this is coupled with an exit in switch_to to * combine the page table reload and the switch backend into @@ -2079,11 +2091,19 @@ context_switch(struct rq *rq, struct tas #ifndef __ARCH_WANT_UNLOCKED_CTXSW spin_release(&rq->lock.dep_map, 1, _THIS_IP_); #endif +} +#ifdef CONFIG_IPIPE + next->ptd[IPIPE_ROOT_NPTDKEYS - 1] = prev; +#endif /* CONFIG_IPIPE */ /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); barrier(); + +if (unlikely(rq)) { + if (unlikely(__ipipe_switch_tail())) + return 1; __ipipe_notify_kevent(-2, NULL); /* * this_rq must be evaluated again because prev may have moved * CPUs since it called schedule(), thus the 'rq' on its stack @@ -2091,6 +2111,10 @@ context_switch(struct rq *rq, struct tas */ finish_task_switch(this_rq(), prev); } + return 0; +} + +EXPORT_SYMBOL(context_switch); /* * nr_running, nr_uninterruptible and nr_context_switches: @@ -3248,6 +3272,7 @@ notrace unsigned long get_parent_ip(unsi void __kprobes add_preempt_count(int val) { + ipipe_root_only(); #ifdef CONFIG_DEBUG_PREEMPT /* * Underflow? @@ -3316,6 +3341,7 @@ static noinline void __schedule_bug(stru */ static inline void schedule_debug(struct task_struct *prev) { + ipipe_root_only(); /* * Test if we are atomic. Since do_exit() needs to call into * schedule() atomically, we ignore that path for now. @@ -3368,7 +3394,7 @@ pick_next_task(struct rq *rq) /* * __schedule() is the main scheduler function. */ -static void __sched __schedule(void) +static int __sched __schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; @@ -3382,6 +3408,10 @@ need_resched: rcu_note_context_switch(cpu); prev = rq->curr; + if (unlikely(prev->state & TASK_HARDENING)) + /* Pop one disable level -- one still remains. 
*/ + preempt_enable(); + schedule_debug(prev); if (sched_feat(HRTICK)) @@ -3428,7 +3458,8 @@ need_resched: rq->curr = next; ++*switch_count; - context_switch(rq, prev, next); /* unlocks the rq */ + if (context_switch(rq, prev, next)) /* unlocks the rq */ + return 1; /* task hijacked by higher domain */ /* * The context switch have flipped the stack from under us * and restored the local variables which were saved when @@ -3437,14 +3468,18 @@ need_resched: */ cpu = smp_processor_id(); rq = cpu_rq(cpu); - } else + } else { + prev->state &= ~TASK_HARDENING; raw_spin_unlock_irq(&rq->lock); + } post_schedule(rq); sched_preempt_enable_no_resched(); if (need_resched()) goto need_resched; + + return 0; } static inline void sched_submit_work(struct task_struct *tsk) @@ -3544,7 +3579,8 @@ asmlinkage void __sched notrace preempt_ do { add_preempt_count_notrace(PREEMPT_ACTIVE); - __schedule(); + if (__schedule()) + return; sub_preempt_count_notrace(PREEMPT_ACTIVE); /* @@ -3607,6 +3643,8 @@ static void __wake_up_common(wait_queue_ { wait_queue_t *curr, *next; + ipipe_root_only(); + list_for_each_entry_safe(curr, next, &q->task_list, task_list) { unsigned flags = curr->flags; @@ -4385,6 +4423,7 @@ recheck: oldprio = p->prio; prev_class = p->sched_class; __setscheduler(rq, p, policy, param->sched_priority); + __ipipe_report_setsched(p); if (running) p->sched_class->set_curr_task(rq); @@ -5132,6 +5171,7 @@ void __cpuinit init_idle(struct task_str /* Set the preempt count _outside_ the spinlocks! */ task_thread_info(idle)->preempt_count = 0; + ipipe_root_only(); /* * The idle tasks have their own, simple scheduling class: @@ -8559,3 +8599,40 @@ struct cgroup_subsys cpuacct_subsys = { .base_cftypes = files, }; #endif /* CONFIG_CGROUP_CPUACCT */ + +#ifdef CONFIG_IPIPE + +int __ipipe_migrate_head(void) +{ + struct task_struct *p = current; + + preempt_disable(); + + IPIPE_WARN_ONCE(__this_cpu_read(ipipe_percpu.task_hijacked) != NULL); + + __this_cpu_write(ipipe_percpu.task_hijacked, p); + set_current_state(TASK_INTERRUPTIBLE | TASK_HARDENING); + sched_submit_work(p); + if (likely(__schedule())) + return 0; + + if (signal_pending(p)) + return -ERESTARTSYS; + + BUG(); +} +EXPORT_SYMBOL_GPL(__ipipe_migrate_head); + +void __ipipe_reenter_root(void) +{ + struct rq *rq = this_rq(); + struct task_struct *p; + + p = __this_cpu_read(ipipe_percpu.rqlock_owner); + finish_task_switch(rq, p); + post_schedule(rq); + preempt_enable_no_resched(); +} +EXPORT_SYMBOL_GPL(__ipipe_reenter_root); + +#endif /* CONFIG_IPIPE */ diff --git a/kernel/signal.c b/kernel/signal.c index 6771027..402439a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -683,6 +683,8 @@ void signal_wake_up(struct task_struct *t, int resume) unsigned int mask; set_tsk_thread_flag(t, TIF_SIGPENDING); + /* TIF_SIGPENDING must be prior to reporting. 
*/ + __ipipe_report_sigwake(t); /* * For SIGKILL, we want to wake it up in the stopped/traced/killable diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 5cdd806..5ab3a87 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -26,7 +26,9 @@ * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are * not re-enabled during lock-acquire (which the preempt-spin-ops do): */ -#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) +#if !defined(CONFIG_GENERIC_LOCKBREAK) || \ + defined(CONFIG_DEBUG_LOCK_ALLOC) || \ + defined(CONFIG_IPIPE) /* * The __lock_function inlines are taken from * include/linux/spinlock_api_smp.h diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 7e1ce01..4da3cd9 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "tick-internal.h" @@ -282,6 +283,9 @@ void clockevents_register_device(struct clock_event_device *dev) unsigned long flags; BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); + + ipipe_host_timer_register(dev); + if (!dev->cpumask) { WARN_ON(num_possible_cpus() > 1); dev->cpumask = cpumask_of(smp_processor_id()); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c958338..0e71d18 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -30,6 +30,7 @@ #include /* for spin_unlock_irq() using preempt_count() m68k */ #include #include +#include void timecounter_init(struct timecounter *tc, const struct cyclecounter *cc, @@ -630,6 +631,95 @@ static int __init clocksource_done_booting(void) } fs_initcall(clocksource_done_booting); +#ifdef CONFIG_IPIPE_WANT_CLOCKSOURCE +unsigned long long __ipipe_cs_freq; +EXPORT_SYMBOL_GPL(__ipipe_cs_freq); + +struct clocksource *__ipipe_cs; +EXPORT_SYMBOL_GPL(__ipipe_cs); + +cycle_t (*__ipipe_cs_read)(struct clocksource *cs); +cycle_t __ipipe_cs_last_tsc; +cycle_t __ipipe_cs_mask; +unsigned __ipipe_cs_lat = 0xffffffff; + +static void ipipe_check_clocksource(struct clocksource *cs) +{ + cycle_t (*cread)(struct clocksource *cs); + cycle_t lat, mask, saved; + unsigned long long freq; + unsigned long flags; + unsigned i; + + if (cs->ipipe_read) { + mask = CLOCKSOURCE_MASK(64); + cread = cs->ipipe_read; + } else { + mask = cs->mask; + cread = cs->read; + + if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) == 0) + return; + + /* + * We only support masks such that cs->mask + 1 is a power of 2, + * 64 bits masks or masks lesser than 32 bits + */ + if (mask != CLOCKSOURCE_MASK(64) + && ((mask & (mask + 1)) != 0 || mask > 0xffffffff)) + return; + } + + /* + * We prefer a clocksource with a better resolution than 1us + */ + if (cs->shift <= 34) { + freq = 1000000000ULL << cs->shift; + do_div(freq, cs->mult); + } else { + freq = 1000000ULL << cs->shift; + do_div(freq, cs->mult); + freq *= 1000; + } + if (freq < 1000000) + return; + + /* Measure the clocksource latency */ + flags = hard_local_irq_save(); + saved = __ipipe_cs_last_tsc; + lat = cread(cs); + for (i = 0; i < 10; i++) + cread(cs); + lat = cread(cs) - lat; + __ipipe_cs_last_tsc = saved; + hard_local_irq_restore(flags); + lat = (lat * cs->mult) >> cs->shift; + do_div(lat, i + 1); + + if (!strcmp(cs->name, override_name)) + goto skip_tests; + + if (lat > __ipipe_cs_lat) + return; + + if (__ipipe_cs && !strcmp(__ipipe_cs->name, override_name)) + return; + + skip_tests: + flags = hard_local_irq_save(); + if (__ipipe_cs_last_tsc == 0) { + __ipipe_cs_lat = lat; + __ipipe_cs_freq = freq; + __ipipe_cs = cs; + __ipipe_cs_read 
= cread; + __ipipe_cs_mask = mask; + } + hard_local_irq_restore(flags); +} +#else /* !CONFIG_IPIPE_WANT_CLOCKSOURCE */ +#define ipipe_check_clocksource(cs) do { }while (0) +#endif /* !CONFIG_IPIPE_WANT_CLOCKSOURCE */ + /* * Enqueue the clocksource sorted by rating */ @@ -643,6 +733,8 @@ static void clocksource_enqueue(struct clocksource *cs) if (tmp->rating >= cs->rating) entry = &tmp->list; list_add(&cs->list, entry); + + ipipe_check_clocksource(cs); } /** diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index da6c9ec..0c4be00 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -72,7 +72,7 @@ static void tick_periodic(int cpu) write_sequnlock(&xtime_lock); } - update_process_times(user_mode(get_irq_regs())); + update_root_process_times(get_irq_regs()); profile_tick(CPU_PROFILING); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a057ed4..32138a1 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -657,7 +657,7 @@ static void tick_nohz_handler(struct clock_event_device *dev) ts->idle_jiffies++; } - update_process_times(user_mode(regs)); + update_root_process_times(regs); profile_tick(CPU_PROFILING); while (tick_nohz_reprogram(ts, now)) { @@ -814,7 +814,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) touch_softlockup_watchdog(); ts->idle_jiffies++; } - update_process_times(user_mode(regs)); + update_root_process_times(regs); profile_tick(CPU_PROFILING); } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 63c88c1..4779fb3 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1362,3 +1362,9 @@ void xtime_update(unsigned long ticks) do_timer(ticks); write_sequnlock(&xtime_lock); } + +struct timespec get_wall_to_monotonic(void) +{ + /* xtime_lock must be held. */ + return timekeeper.wall_to_monotonic; +} diff --git a/kernel/timer.c b/kernel/timer.c index 6ec7e7e..c3ad7cf 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1359,6 +1359,25 @@ void update_process_times(int user_tick) run_posix_cpu_timers(p); } +#ifdef CONFIG_IPIPE + +void update_root_process_times(struct pt_regs *regs) +{ + int cpu, user_tick = user_mode(regs); + + if (__ipipe_root_tick_p(regs)) { + update_process_times(user_tick); + return; + } + + run_local_timers(); + cpu = smp_processor_id(); + rcu_check_callbacks(cpu, user_tick); + run_posix_cpu_timers(current); +} + +#endif + /* * This function runs timers and the timer-tq in bottom half context. */ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8c4c070..64ac2e1 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -408,6 +408,7 @@ config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" depends on FUNCTION_TRACER depends on HAVE_DYNAMIC_FTRACE + depends on !IPIPE_TRACE_MCOUNT default y help This option will modify all the calls to ftrace dynamically diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a008663..a460831 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -32,6 +32,7 @@ #include #include #include +#include #include @@ -1862,6 +1863,9 @@ void __weak arch_ftrace_update_code(int command) static void ftrace_run_update_code(int command) { +#ifdef CONFIG_IPIPE + unsigned long flags; +#endif /* CONFIG_IPIPE */ int ret; ret = ftrace_arch_code_modify_prepare(); @@ -1880,7 +1884,13 @@ static void ftrace_run_update_code(int command) * is safe. The stop_machine() is the safest, but also * produces the most overhead. 
*/ +#ifdef CONFIG_IPIPE + flags = ipipe_critical_enter(NULL); + __ftrace_modify_code(&command); + ipipe_critical_exit(flags); +#else /* !CONFIG_IPIPE */ arch_ftrace_update_code(command); +#endif /* !CONFIG_IPIPE */ #ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST /* @@ -3770,10 +3780,10 @@ static int ftrace_process_locs(struct module *mod, * reason to cause large interrupt latencies while we do it. */ if (!mod) - local_irq_save(flags); + flags = hard_local_irq_save(); ftrace_update_code(mod); if (!mod) - local_irq_restore(flags); + hard_local_irq_restore(flags); ret = 0; out: mutex_unlock(&ftrace_lock); @@ -3877,9 +3887,9 @@ void __init ftrace_init(void) /* Keep the ftrace pointer to the stub */ addr = (unsigned long)ftrace_stub; - local_irq_save(flags); + flags = hard_local_irq_save_notrace(); ftrace_dyn_arch_init(&addr); - local_irq_restore(flags); + hard_local_irq_restore_notrace(flags); /* ftrace_dyn_arch_init places the return code in addr */ if (addr) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index ff5bdee..ac7cfa7 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -156,6 +156,8 @@ config DEBUG_SECTION_MISMATCH - Enable verbose reporting from modpost in order to help resolve the section mismatches that are reported. +source "kernel/ipipe/Kconfig.debug" + config DEBUG_KERNEL bool "Kernel debugging" help diff --git a/lib/bust_spinlocks.c b/lib/bust_spinlocks.c index 9681d54..2dba50c 100644 --- a/lib/bust_spinlocks.c +++ b/lib/bust_spinlocks.c @@ -13,6 +13,7 @@ #include #include #include +#include void __attribute__((weak)) bust_spinlocks(int yes) @@ -24,6 +25,7 @@ void __attribute__((weak)) bust_spinlocks(int yes) unblank_screen(); #endif console_unblank(); + ipipe_trace_panic_dump(); if (--oops_in_progress == 0) wake_up_klogd(); } diff --git a/lib/ioremap.c b/lib/ioremap.c index 0c9216c..ba85d06 100644 --- a/lib/ioremap.c +++ b/lib/ioremap.c @@ -86,8 +86,8 @@ int ioremap_page_range(unsigned long addr, if (err) break; } while (pgd++, addr = next, addr != end); - - flush_cache_vmap(start, end); + __ipipe_pin_range_globally(start, end); + flush_cache_vmap(start, end); return err; } diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index 4c0d0e5..886125d 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c @@ -12,10 +12,13 @@ notrace unsigned int debug_smp_processor_id(void) unsigned long preempt_count = preempt_count(); int this_cpu = raw_smp_processor_id(); + if (!ipipe_root_p) + goto out; + if (likely(preempt_count)) goto out; - if (irqs_disabled()) + if (irqs_disabled() || hard_irqs_disabled()) goto out; /* diff --git a/mm/Kconfig b/mm/Kconfig index 82fed4e..0ebccef 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -314,6 +314,7 @@ config NOMMU_INITIAL_TRIM_EXCESS config TRANSPARENT_HUGEPAGE bool "Transparent Hugepage Support" depends on X86 && MMU + depends on !IPIPE select COMPACTION help Transparent Hugepages allows the kernel to use huge pages and diff --git a/mm/memory.c b/mm/memory.c index 2466d12..4230192 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -822,6 +822,32 @@ out: return pfn_to_page(pfn); } +static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) +{ + /* + * If the source page was a PFN mapping, we don't have + * a "struct page" for it. We do a best-effort copy by + * just copying from the original user address. If that + * fails, we just zero-fill it. Live with it. 
+ */ + if (unlikely(!src)) { + void *kaddr = kmap_atomic(dst); + void __user *uaddr = (void __user *)(va & PAGE_MASK); + + /* + * This really shouldn't fail, because the page is there + * in the page tables. But it might just be unreadable, + * in which case we just give up and fill the result with + * zeroes. + */ + if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) + clear_page(kaddr); + kunmap_atomic(kaddr); + flush_dcache_page(dst); + } else + copy_user_highpage(dst, src, va, vma); +} + /* * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range @@ -830,8 +856,8 @@ out: static inline unsigned long copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, - unsigned long addr, int *rss) + pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, + unsigned long addr, int *rss, struct page *uncow_page) { unsigned long vm_flags = vma->vm_flags; pte_t pte = *src_pte; @@ -883,6 +909,21 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, * in the parent and the child */ if (is_cow_mapping(vm_flags)) { +#ifdef CONFIG_IPIPE + if (uncow_page) { + struct page *old_page = vm_normal_page(vma, addr, pte); + cow_user_page(uncow_page, old_page, addr, vma); + pte = mk_pte(uncow_page, vma->vm_page_prot); + + if (vm_flags & VM_SHARED) + pte = pte_mkclean(pte); + pte = pte_mkold(pte); + + page_add_new_anon_rmap(uncow_page, vma, addr); + rss[!!PageAnon(uncow_page)]++; + goto out_set_pte; + } +#endif /* CONFIG_IPIPE */ ptep_set_wrprotect(src_mm, addr, src_pte); pte = pte_wrprotect(pte); } @@ -920,13 +961,27 @@ int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, int progress = 0; int rss[NR_MM_COUNTERS]; swp_entry_t entry = (swp_entry_t){0}; - + struct page *uncow_page = NULL; +#ifdef CONFIG_IPIPE + int do_cow_break = 0; again: + if (do_cow_break) { + uncow_page = alloc_page_vma(GFP_HIGHUSER, vma, addr); + if (uncow_page == NULL) + return -ENOMEM; + do_cow_break = 0; + } +#else +again: +#endif init_rss_vec(rss); dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); - if (!dst_pte) + if (!dst_pte) { + if (uncow_page) + page_cache_release(uncow_page); return -ENOMEM; + } src_pte = pte_offset_map(src_pmd, addr); src_ptl = pte_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); @@ -949,8 +1004,25 @@ again: progress++; continue; } +#ifdef CONFIG_IPIPE + if (likely(uncow_page == NULL) && likely(pte_present(*src_pte))) { + if (is_cow_mapping(vma->vm_flags) && + test_bit(MMF_VM_PINNED, &src_mm->flags) && + ((vma->vm_flags|src_mm->def_flags) & VM_LOCKED)) { + arch_leave_lazy_mmu_mode(); + spin_unlock(src_ptl); + pte_unmap(src_pte); + add_mm_rss_vec(dst_mm, rss); + pte_unmap_unlock(dst_pte, dst_ptl); + cond_resched(); + do_cow_break = 1; + goto again; + } + } +#endif entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, - vma, addr, rss); + vma, addr, rss, uncow_page); + uncow_page = NULL; if (entry.val) break; progress += 8; @@ -2455,32 +2527,6 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, return same; } -static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) -{ - /* - * If the source page was a PFN mapping, we don't have - * a "struct page" for it. We do a best-effort copy by - * just copying from the original user address. If that - * fails, we just zero-fill it. Live with it. 
- */ - if (unlikely(!src)) { - void *kaddr = kmap_atomic(dst); - void __user *uaddr = (void __user *)(va & PAGE_MASK); - - /* - * This really shouldn't fail, because the page is there - * in the page tables. But it might just be unreadable, - * in which case we just give up and fill the result with - * zeroes. - */ - if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) - clear_page(kaddr); - kunmap_atomic(kaddr); - flush_dcache_page(dst); - } else - copy_user_highpage(dst, src, va, vma); -} - /* * This routine handles present pages, when users try to write * to a shared page. It is done by copying the page to a new address @@ -4026,3 +4072,38 @@ void copy_user_huge_page(struct page *dst, struct page *src, } } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ + +#ifdef CONFIG_IPIPE + +int __ipipe_disable_ondemand_mappings(struct task_struct *tsk) +{ + struct vm_area_struct *vma; + struct mm_struct *mm; + int result = 0; + + mm = get_task_mm(tsk); + if (!mm) + return -EPERM; + + down_write(&mm->mmap_sem); + if (test_bit(MMF_VM_PINNED, &mm->flags)) + goto done_mm; + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (is_cow_mapping(vma->vm_flags) && + (vma->vm_flags & VM_WRITE)) { + result = __ipipe_pin_vma(mm, vma); + if (result < 0) + goto done_mm; + } + } + set_bit(MMF_VM_PINNED, &mm->flags); + + done_mm: + up_write(&mm->mmap_sem); + mmput(mm); + return result; +} +EXPORT_SYMBOL_GPL(__ipipe_disable_ondemand_mappings); + +#endif diff --git a/mm/mlock.c b/mm/mlock.c index ef726e8..158828b 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -624,3 +624,21 @@ void user_shm_unlock(size_t size, struct user_struct *user) spin_unlock(&shmlock_user_lock); free_uid(user); } + +#ifdef CONFIG_IPIPE +int __ipipe_pin_vma(struct mm_struct *mm, struct vm_area_struct *vma) +{ + int ret; + + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + return 0; + + if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || + is_vm_hugetlb_page(vma) || vma == get_gate_vma(mm))) { + ret = __mlock_vma_pages_range(vma, vma->vm_start, vma->vm_end, + NULL); + return (ret < 0) ? 
ret : 0; + } else + return make_pages_present(vma->vm_start, vma->vm_end); +} +#endif diff --git a/mm/mmap.c b/mm/mmap.c index fa1f274..a715aeb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2174,7 +2174,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; - error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); + error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED | MAP_BRK); if (error & ~PAGE_MASK) return error; diff --git a/mm/mmu_context.c b/mm/mmu_context.c index 3dcfaf4..e4ac923 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c @@ -24,15 +24,18 @@ void use_mm(struct mm_struct *mm) { struct mm_struct *active_mm; struct task_struct *tsk = current; + unsigned long flags; task_lock(tsk); active_mm = tsk->active_mm; + ipipe_mm_switch_protect(flags); if (active_mm != mm) { atomic_inc(&mm->mm_count); tsk->active_mm = mm; } tsk->mm = mm; - switch_mm(active_mm, mm, tsk); + __switch_mm(active_mm, mm, tsk); + ipipe_mm_switch_unprotect(flags); task_unlock(tsk); if (active_mm != mm) diff --git a/mm/mprotect.c b/mm/mprotect.c index a409926..057b1d8 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -218,6 +218,12 @@ success: hugetlb_change_protection(vma, start, end, vma->vm_page_prot); else change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); +#ifdef CONFIG_IPIPE + if (test_bit(MMF_VM_PINNED, &mm->flags) && + ((vma->vm_flags | mm->def_flags) & VM_LOCKED) && + (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))) + __ipipe_pin_vma(mm, vma); +#endif mmu_notifier_invalidate_range_end(mm, start, end); vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2aad499..52aaaf7 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -171,6 +171,8 @@ static int vmap_page_range_noflush(unsigned long start, unsigned long end, return err; } while (pgd++, addr = next, addr != end); + __ipipe_pin_range_globally(start, end); + return nr; } diff --git a/scripts/package/builddeb b/scripts/package/builddeb index 6505ecd..acb8650 100644 (file) --- a/scripts/package/builddeb +++ b/scripts/package/builddeb @@ -243,7 +243,7 @@ EOF fi # Build header package -(cd $srctree; find . -name Makefile -o -name Kconfig\* -o -name \*.pl > "$objtree/debian/hdrsrcfiles") +(cd $srctree; find . -name Makefile\* -o -name Kconfig\* -o -name \*.pl > "$objtree/debian/hdrsrcfiles") (cd $srctree; find arch/$SRCARCH/include include scripts -type f >> "$objtree/debian/hdrsrcfiles") (cd $objtree; find arch/$SRCARCH/include .config Module.symvers include scripts -type f >> "$objtree/debian/hdrobjfiles") destdir=$kernel_headers_dir/usr/src/linux-headers-$version
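
The kernel/irq/generic-chip.c hunks above convert every irq_gc_lock()/irq_gc_unlock() pair from a plain lock/unlock to a convention where the lock helper returns the saved interrupt state and the unlock helper takes it back, which is what a pipeline-aware lock that also hard-disables the CPU interrupt line needs. The user-space sketch below only models that calling convention: all model_* names are invented, a pthread mutex stands in for the raw spinlock, and the flags cookie is a dummy value rather than a real CPU flags word.

#include <pthread.h>
#include <stdio.h>

struct model_chip {
	pthread_mutex_t lock;
	unsigned int mask_cache;
};

static unsigned long model_gc_lock(struct model_chip *gc)
{
	unsigned long flags = 0;	/* pretend: saved CPU interrupt state */

	pthread_mutex_lock(&gc->lock);
	return flags;
}

static void model_gc_unlock(struct model_chip *gc, unsigned long flags)
{
	pthread_mutex_unlock(&gc->lock);
	(void)flags;			/* pretend: restore the saved state */
}

/* Same shape as irq_gc_mask_set_bit() after the patch. */
static void model_mask_set_bit(struct model_chip *gc, unsigned int bit)
{
	unsigned long flags;

	flags = model_gc_lock(gc);
	gc->mask_cache |= 1u << bit;	/* cache/register update, fully covered */
	model_gc_unlock(gc, flags);
}

int main(void)
{
	struct model_chip gc = { PTHREAD_MUTEX_INITIALIZER, 0 };

	model_mask_set_bit(&gc, 3);
	printf("mask_cache = %#x\n", gc.mask_cache);
	return 0;
}

The point of the shape is that the mask_cache and register writes sit entirely between the save and the restore, so no context, including the head domain in the real kernel, can observe a half-updated mask.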
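
The kernel/lockdep.c and lib/smp_processor_id.c changes above relax several sanity checks from irqs_disabled() to irqs_disabled() || hard_irqs_disabled(). Under the pipeline, irqs_disabled() reflects only the root domain's virtual stall bit, while the hard_* primitives flip the CPU's real interrupt flag, so code may legitimately run with the virtual mask open but the real one closed. The toy predicate below, with both states reduced to booleans and all names local to the sketch, is only meant to make the two-flag situation concrete.

#include <stdbool.h>
#include <stdio.h>

/* Virtual state: the root stall bit, what local_irq_save() flips under the pipeline. */
static bool root_stalled;
/* Real state: the CPU interrupt flag, what the hard_local_irq_* helpers flip. */
static bool cpu_irqs_off;

static bool model_irqs_disabled(void)      { return root_stalled; }
static bool model_hard_irqs_disabled(void) { return cpu_irqs_off; }

/* What the relaxed DEBUG_LOCKS_WARN_ON() conditions effectively accept. */
static bool model_context_protected(void)
{
	return model_irqs_disabled() || model_hard_irqs_disabled();
}

int main(void)
{
	cpu_irqs_off = true;	/* e.g. running inside hard_local_irq_save() */
	root_stalled = false;	/* the virtual mask was never touched */

	printf("virtually off: %d, really off: %d, check passes: %d\n",
	       model_irqs_disabled(), model_hard_irqs_disabled(),
	       model_context_protected());
	return 0;
}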
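
The kernel/printk.c addition above defers messages that cannot be printed safely, i.e. when the caller runs over a non-root domain, or when the root domain is entered with hard interrupts off or the head stage stalled: the text is appended NUL-separated to a static buffer of __LOG_BUF_LEN bytes and a virtual IRQ is raised so that __ipipe_flush_printk() can replay it through the regular printk() path later. Below is a minimal user-space model of that buffer discipline, assuming a fixed 4 KiB buffer and using vsnprintf()/fputs() in place of vscnprintf() and the console code; the model_* names are invented for the sketch.

#include <stdarg.h>
#include <stdio.h>
#include <string.h>

#define MODEL_BUF_LEN 4096

static char model_buf[MODEL_BUF_LEN];
static int model_fill;			/* bytes used, including NUL separators */

/* Queue one message: print into the free tail, keep the trailing NUL. */
static int model_deferred_printk(const char *fmt, ...)
{
	int free_bytes = MODEL_BUF_LEN - model_fill;
	va_list args;
	int r;

	if (free_bytes <= 1)
		return 0;			/* buffer full: drop, as the patch does */

	va_start(args, fmt);
	r = vsnprintf(model_buf + model_fill, free_bytes, fmt, args);
	va_end(args);
	if (r >= free_bytes)
		r = free_bytes - 1;		/* message was truncated */
	model_fill += r + 1;			/* keep the NUL as record separator */
	return r;
}

/* Flush: walk the NUL-separated records and hand them to the console. */
static void model_flush(void)
{
	char *p = model_buf;
	int out = 0;

	while (out < model_fill) {
		int len = (int)strlen(p) + 1;

		fputs(p, stdout);
		p += len;
		out += len;
	}
	model_fill = 0;
}

int main(void)
{
	model_deferred_printk("queued from head domain: %d\n", 1);
	model_deferred_printk("queued from head domain: %d\n", 2);
	model_flush();		/* in the kernel this runs from the printk virq handler */
	return 0;
}

The real flush routine additionally rechecks the fill count under its spinlock in a loop, so records queued while the flush is in progress are picked up before the counter is reset; the sketch leaves that race handling out.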
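
The kernel/time/clocksource.c hook above (ipipe_check_clocksource()) filters which registered clocksources the head domain may read directly: unless the driver supplies a dedicated ipipe_read handler, the counter has to be continuous, its mask must be either the full 64-bit mask or a 2^n - 1 value no wider than 32 bits, the frequency derived from mult/shift has to be at least 1 MHz, and among the remaining candidates the one with the lowest measured read latency is preferred. The sketch below reproduces the mask test, the mult/shift-to-frequency conversion (including the shift <= 34 split that keeps 10^9 << shift within 64 bits) and a crude latency probe, using clock_gettime(CLOCK_MONOTONIC) as a stand-in for cs->read(); the mult/shift pair in main() is just an example.

#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

/* Reject masks that are not 2^n - 1, or wider than 32 bit (unless full 64 bit). */
static bool mask_usable(uint64_t mask)
{
	if (mask == UINT64_MAX)
		return true;
	return (mask & (mask + 1)) == 0 && mask <= 0xffffffffull;
}

/*
 * counter_hz = 10^9 * 2^shift / mult, computed the way the patch does it:
 * for shift <= 34 the intermediate 10^9 << shift still fits in 64 bits,
 * otherwise start from 10^6 and scale the result back up by 1000.
 */
static uint64_t freq_from_mult_shift(uint32_t mult, uint32_t shift)
{
	if (shift <= 34)
		return (1000000000ull << shift) / mult;
	return ((1000000ull << shift) / mult) * 1000;
}

/* Stand-in for cs->read(): nanoseconds from CLOCK_MONOTONIC. */
static uint64_t read_counter(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

int main(void)
{
	/* Example pair: a 1 GHz counter expressed as mult = 2^24, shift = 24. */
	uint32_t mult = 1u << 24, shift = 24;
	uint64_t freq = freq_from_mult_shift(mult, shift);
	uint64_t lat, i;

	printf("mask 0xffffffff usable: %d\n", mask_usable(0xffffffffull));
	printf("derived frequency: %" PRIu64 " Hz (must be >= 1000000)\n", freq);

	/* Latency probe: one baseline read, then 11 more; average the spread. */
	lat = read_counter();
	for (i = 0; i < 10; i++)
		read_counter();
	lat = read_counter() - lat;
	printf("average read latency: %" PRIu64 " ns\n", lat / 11);
	return 0;
}

Reading twelve times and dividing the spread by eleven mirrors the loop in the patch; the conversion from counter units to nanoseconds through mult/shift is skipped here because the stand-in counter already returns nanoseconds.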
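
In kernel/time/tick-common.c, tick-sched.c and kernel/timer.c above, update_process_times(user_mode(regs)) is replaced by update_root_process_times(regs): the full accounting path only runs when __ipipe_root_tick_p() says the tick that just fired was really destined to the root domain, otherwise only the cheap per-CPU pieces (timer wheel, RCU callbacks, POSIX CPU timers) are carried out. The skeleton below shows the split with stub bookkeeping functions; every name is invented for the sketch.

#include <stdbool.h>
#include <stdio.h>

/* Stub bookkeeping hooks; in the kernel these are the real accounting calls. */
static void full_process_accounting(bool user_tick) { printf("full accounting (user=%d)\n", user_tick); }
static void run_timer_wheel(void)                   { printf("timer wheel\n"); }
static void run_rcu_callbacks(void)                 { printf("rcu callbacks\n"); }
static void run_posix_cpu_timers_stub(void)         { printf("posix cpu timers\n"); }

/* Was the tick that just fired relayed to the root (Linux) domain? */
static bool root_tick;

static void model_update_root_process_times(bool user_tick)
{
	if (root_tick) {
		full_process_accounting(user_tick);
		return;
	}
	/* Tick intercepted on behalf of the head domain: keep only the cheap parts. */
	run_timer_wheel();
	run_rcu_callbacks();
	run_posix_cpu_timers_stub();
}

int main(void)
{
	root_tick = false;
	model_update_root_process_times(false);
	root_tick = true;
	model_update_root_process_times(true);
	return 0;
}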
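
The mm/memory.c rework of copy_pte_range()/copy_one_pte() above changes how fork() treats COW-able pages of an mm that has been pinned (MMF_VM_PINNED) and whose VMA is VM_LOCKED: instead of write-protecting both sides, the copy loop drops its PTE locks, allocates a fresh page, restarts, and hands that private copy to the child right away, so no copy-on-write fault can later hit the parent from a real-time context. The fragment below models only the decision and the allocate-outside-the-lock-then-retry shape; booleans and malloc() stand in for the kernel structures and every identifier is illustrative.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MODEL_PAGE_SIZE 4096

/* Condition mirrored from the patch: break COW eagerly only for pinned, locked mappings. */
static bool must_break_cow(bool cow_mapping, bool mm_pinned, bool vm_locked)
{
	return cow_mapping && mm_pinned && vm_locked;
}

/* Copy one "pte": either share the page (normal COW) or hand the child a private copy. */
static void copy_one_pte_model(const char *parent_page, char **child_page,
			       char *preallocated_copy)
{
	if (preallocated_copy) {
		memcpy(preallocated_copy, parent_page, MODEL_PAGE_SIZE);
		*child_page = preallocated_copy;	/* eager copy, no later fault */
	} else {
		*child_page = (char *)parent_page;	/* shared, would be write-protected */
	}
}

int main(void)
{
	char *parent_page = calloc(1, MODEL_PAGE_SIZE);
	char *child_page = NULL;
	char *uncow_page = NULL;
	bool pinned = true, locked = true, cow = true;

	if (!parent_page)
		return 1;
again:
	/* The allocation happens with no locks held, as in the patch. */
	if (must_break_cow(cow, pinned, locked) && !uncow_page) {
		uncow_page = malloc(MODEL_PAGE_SIZE);
		if (!uncow_page)
			return 1;
		goto again;	/* retry the copy loop with the page in hand */
	}

	copy_one_pte_model(parent_page, &child_page, uncow_page);
	printf("child has a %s page\n", child_page == parent_page ? "shared" : "private");
	free(parent_page);
	free(uncow_page);
	return 0;
}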
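
__ipipe_disable_ondemand_mappings(), added at the end of mm/memory.c above, walks every VMA of the target task, pre-commits the writable COW mappings through __ipipe_pin_vma() and then sets MMF_VM_PINNED so that later fork() and mprotect() calls keep the address space fault-free. The toy walk below mirrors that loop over a hand-built VMA list; from user space the comparable request is mlockall(MCL_CURRENT | MCL_FUTURE), which is presumably what sets the VM_LOCKED bits the pinned-mm tests in this patch look for. All model_* names are invented.

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-in for vm_area_struct: just the flags the walk cares about. */
struct model_vma {
	const char *name;
	bool cow;		/* private, copy-on-write mapping */
	bool writable;		/* VM_WRITE                       */
	struct model_vma *next;
};

struct model_mm {
	struct model_vma *mmap;
	bool pinned;		/* MMF_VM_PINNED equivalent       */
};

/* Stand-in for __ipipe_pin_vma(): pretend to fault every page in now. */
static int model_pin_vma(struct model_vma *vma)
{
	printf("pinning %s\n", vma->name);
	return 0;
}

static int model_disable_ondemand_mappings(struct model_mm *mm)
{
	struct model_vma *vma;
	int ret;

	if (mm->pinned)
		return 0;	/* already done */

	for (vma = mm->mmap; vma; vma = vma->next) {
		if (vma->cow && vma->writable) {
			ret = model_pin_vma(vma);
			if (ret < 0)
				return ret;
		}
	}
	mm->pinned = true;
	return 0;
}

int main(void)
{
	struct model_vma text = { "text (shared, ro)", false, false, NULL };
	struct model_vma heap = { "heap (cow, rw)",    true,  true,  &text };
	struct model_mm mm = { &heap, false };

	return model_disable_ondemand_mappings(&mm);
}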