path: root/kernel/kernel/time
author     José Pekkarinen <jose.pekkarinen@nokia.com>   2016-04-11 10:41:07 +0300
committer  José Pekkarinen <jose.pekkarinen@nokia.com>   2016-04-13 08:17:18 +0300
commit     e09b41010ba33a20a87472ee821fa407a5b8da36 (patch)
tree       d10dc367189862e7ca5c592f033dc3726e1df4e3 /kernel/kernel/time
parent     f93b97fd65072de626c074dbe099a1fff05ce060 (diff)
These changes are the raw update to linux-4.4.6-rt14. Kernel sources
are taken from kernel.org, and the rt patch from the rt wiki download page.

During the rebase, the following patch collided:
Force tick interrupt and get rid of softirq magic (I70131fb85).
The colliding hunks have been dropped because their logic is already
present in the updated sources.

Change-Id: I7f57a4081d9deaa0d9ccfc41a6c8daccdee3b769
Signed-off-by: José Pekkarinen <jose.pekkarinen@nokia.com>
Diffstat (limited to 'kernel/kernel/time')
-rw-r--r--  kernel/kernel/time/Kconfig                  |    2
-rw-r--r--  kernel/kernel/time/Makefile                 |   17
-rw-r--r--  kernel/kernel/time/alarmtimer.c             |   17
-rw-r--r--  kernel/kernel/time/clockevents.c            |  129
-rw-r--r--  kernel/kernel/time/clocksource.c            |   33
-rw-r--r--  kernel/kernel/time/hrtimer.c                |  904
-rw-r--r--  kernel/kernel/time/itimer.c                 |    2
-rw-r--r--  kernel/kernel/time/ntp.c                    |   82
-rw-r--r--  kernel/kernel/time/ntp_internal.h           |    3
-rw-r--r--  kernel/kernel/time/posix-clock.c            |    4
-rw-r--r--  kernel/kernel/time/posix-cpu-timers.c       |  141
-rw-r--r--  kernel/kernel/time/posix-timers.c           |   19
-rw-r--r--  kernel/kernel/time/tick-broadcast-hrtimer.c |   56
-rw-r--r--  kernel/kernel/time/tick-broadcast.c         |  257
-rw-r--r--  kernel/kernel/time/tick-common.c            |   61
-rw-r--r--  kernel/kernel/time/tick-internal.h          |   31
-rw-r--r--  kernel/kernel/time/tick-oneshot.c           |   22
-rw-r--r--  kernel/kernel/time/tick-sched.c             |  375
-rw-r--r--  kernel/kernel/time/tick-sched.h             |   12
-rw-r--r--  kernel/kernel/time/time.c                   |  131
-rw-r--r--  kernel/kernel/time/timeconst.bc             |    5
-rw-r--r--  kernel/kernel/time/timekeeping.c            |  224
-rw-r--r--  kernel/kernel/time/timekeeping.h            |   11
-rw-r--r--  kernel/kernel/time/timer.c                  |  441
-rw-r--r--  kernel/kernel/time/timer_list.c             |   99
-rw-r--r--  kernel/kernel/time/timer_stats.c            |   10
26 files changed, 1560 insertions(+), 1528 deletions(-)
diff --git a/kernel/kernel/time/Kconfig b/kernel/kernel/time/Kconfig
index 579ce1b92..4008d9f95 100644
--- a/kernel/kernel/time/Kconfig
+++ b/kernel/kernel/time/Kconfig
@@ -92,12 +92,10 @@ config NO_HZ_FULL
depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
# We need at least one periodic CPU for timekeeping
depends on SMP
- # RCU_USER_QS dependency
depends on HAVE_CONTEXT_TRACKING
# VIRT_CPU_ACCOUNTING_GEN dependency
depends on HAVE_VIRT_CPU_ACCOUNTING_GEN
select NO_HZ_COMMON
- select RCU_USER_QS
select RCU_NOCB_CPU
select VIRT_CPU_ACCOUNTING_GEN
select IRQ_WORK
diff --git a/kernel/kernel/time/Makefile b/kernel/kernel/time/Makefile
index 01f031241..49eca0bee 100644
--- a/kernel/kernel/time/Makefile
+++ b/kernel/kernel/time/Makefile
@@ -12,20 +12,3 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o
obj-$(CONFIG_TIMER_STATS) += timer_stats.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
-
-$(obj)/time.o: $(obj)/timeconst.h
-
-quiet_cmd_hzfile = HZFILE $@
- cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@
-
-targets += hz.bc
-$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE
- $(call if_changed,hzfile)
-
-quiet_cmd_bc = BC $@
- cmd_bc = bc -q $(filter-out FORCE,$^) > $@
-
-targets += timeconst.h
-$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
- $(call if_changed,bc)
-
diff --git a/kernel/kernel/time/alarmtimer.c b/kernel/kernel/time/alarmtimer.c
index 1b001ed1e..7fbba635a 100644
--- a/kernel/kernel/time/alarmtimer.c
+++ b/kernel/kernel/time/alarmtimer.c
@@ -317,19 +317,16 @@ EXPORT_SYMBOL_GPL(alarm_init);
* @alarm: ptr to alarm to set
* @start: time to run the alarm
*/
-int alarm_start(struct alarm *alarm, ktime_t start)
+void alarm_start(struct alarm *alarm, ktime_t start)
{
struct alarm_base *base = &alarm_bases[alarm->type];
unsigned long flags;
- int ret;
spin_lock_irqsave(&base->lock, flags);
alarm->node.expires = start;
alarmtimer_enqueue(base, alarm);
- ret = hrtimer_start(&alarm->timer, alarm->node.expires,
- HRTIMER_MODE_ABS);
+ hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS);
spin_unlock_irqrestore(&base->lock, flags);
- return ret;
}
EXPORT_SYMBOL_GPL(alarm_start);
@@ -338,12 +335,12 @@ EXPORT_SYMBOL_GPL(alarm_start);
* @alarm: ptr to alarm to set
* @start: time relative to now to run the alarm
*/
-int alarm_start_relative(struct alarm *alarm, ktime_t start)
+void alarm_start_relative(struct alarm *alarm, ktime_t start)
{
struct alarm_base *base = &alarm_bases[alarm->type];
start = ktime_add(start, base->gettime());
- return alarm_start(alarm, start);
+ alarm_start(alarm, start);
}
EXPORT_SYMBOL_GPL(alarm_start_relative);
@@ -495,12 +492,12 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
*/
static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
{
- clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid;
-
if (!alarmtimer_get_rtcdev())
return -EINVAL;
- return hrtimer_get_res(baseid, tp);
+ tp->tv_sec = 0;
+ tp->tv_nsec = hrtimer_resolution;
+ return 0;
}
/**
diff --git a/kernel/kernel/time/clockevents.c b/kernel/kernel/time/clockevents.c
index 637a09461..a9b76a403 100644
--- a/kernel/kernel/time/clockevents.c
+++ b/kernel/kernel/time/clockevents.c
@@ -94,23 +94,9 @@ u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
}
EXPORT_SYMBOL_GPL(clockevent_delta2ns);
-static int __clockevents_set_state(struct clock_event_device *dev,
- enum clock_event_state state)
+static int __clockevents_switch_state(struct clock_event_device *dev,
+ enum clock_event_state state)
{
- /* Transition with legacy set_mode() callback */
- if (dev->set_mode) {
- /* Legacy callback doesn't support new modes */
- if (state > CLOCK_EVT_STATE_ONESHOT)
- return -ENOSYS;
- /*
- * 'clock_event_state' and 'clock_event_mode' have 1-to-1
- * mapping until *_ONESHOT, and so a simple cast will work.
- */
- dev->set_mode((enum clock_event_mode)state, dev);
- dev->mode = (enum clock_event_mode)state;
- return 0;
- }
-
if (dev->features & CLOCK_EVT_FEAT_DUMMY)
return 0;
@@ -120,19 +106,37 @@ static int __clockevents_set_state(struct clock_event_device *dev,
/* The clockevent device is getting replaced. Shut it down. */
case CLOCK_EVT_STATE_SHUTDOWN:
- return dev->set_state_shutdown(dev);
+ if (dev->set_state_shutdown)
+ return dev->set_state_shutdown(dev);
+ return 0;
case CLOCK_EVT_STATE_PERIODIC:
/* Core internal bug */
if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC))
return -ENOSYS;
- return dev->set_state_periodic(dev);
+ if (dev->set_state_periodic)
+ return dev->set_state_periodic(dev);
+ return 0;
case CLOCK_EVT_STATE_ONESHOT:
/* Core internal bug */
if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
return -ENOSYS;
- return dev->set_state_oneshot(dev);
+ if (dev->set_state_oneshot)
+ return dev->set_state_oneshot(dev);
+ return 0;
+
+ case CLOCK_EVT_STATE_ONESHOT_STOPPED:
+ /* Core internal bug */
+ if (WARN_ONCE(!clockevent_state_oneshot(dev),
+ "Current state: %d\n",
+ clockevent_get_state(dev)))
+ return -EINVAL;
+
+ if (dev->set_state_oneshot_stopped)
+ return dev->set_state_oneshot_stopped(dev);
+ else
+ return -ENOSYS;
default:
return -ENOSYS;
@@ -140,26 +144,26 @@ static int __clockevents_set_state(struct clock_event_device *dev,
}
/**
- * clockevents_set_state - set the operating state of a clock event device
+ * clockevents_switch_state - set the operating state of a clock event device
* @dev: device to modify
* @state: new state
*
* Must be called with interrupts disabled !
*/
-void clockevents_set_state(struct clock_event_device *dev,
- enum clock_event_state state)
+void clockevents_switch_state(struct clock_event_device *dev,
+ enum clock_event_state state)
{
- if (dev->state != state) {
- if (__clockevents_set_state(dev, state))
+ if (clockevent_get_state(dev) != state) {
+ if (__clockevents_switch_state(dev, state))
return;
- dev->state = state;
+ clockevent_set_state(dev, state);
/*
* A nsec2cyc multiplicator of 0 is invalid and we'd crash
* on it, so fix it up and emit a warning:
*/
- if (state == CLOCK_EVT_STATE_ONESHOT) {
+ if (clockevent_state_oneshot(dev)) {
if (unlikely(!dev->mult)) {
dev->mult = 1;
WARN_ON(1);
@@ -174,7 +178,7 @@ void clockevents_set_state(struct clock_event_device *dev,
*/
void clockevents_shutdown(struct clock_event_device *dev)
{
- clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
+ clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
dev->next_event.tv64 = KTIME_MAX;
}
@@ -186,12 +190,8 @@ int clockevents_tick_resume(struct clock_event_device *dev)
{
int ret = 0;
- if (dev->set_mode) {
- dev->set_mode(CLOCK_EVT_MODE_RESUME, dev);
- dev->mode = CLOCK_EVT_MODE_RESUME;
- } else if (dev->tick_resume) {
+ if (dev->tick_resume)
ret = dev->tick_resume(dev);
- }
return ret;
}
@@ -248,7 +248,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
delta = dev->min_delta_ns;
dev->next_event = ktime_add_ns(ktime_get(), delta);
- if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
+ if (clockevent_state_shutdown(dev))
return 0;
dev->retries++;
@@ -285,7 +285,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
delta = dev->min_delta_ns;
dev->next_event = ktime_add_ns(ktime_get(), delta);
- if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
+ if (clockevent_state_shutdown(dev))
return 0;
dev->retries++;
@@ -317,9 +317,13 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
dev->next_event = expires;
- if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
+ if (clockevent_state_shutdown(dev))
return 0;
+ /* We must be in ONESHOT state here */
+ WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n",
+ clockevent_get_state(dev));
+
/* Shortcut for clockevent devices that can deal with ktime. */
if (dev->features & CLOCK_EVT_FEAT_KTIME)
return dev->set_next_ktime(expires, dev);
@@ -362,7 +366,7 @@ static int clockevents_replace(struct clock_event_device *ced)
struct clock_event_device *dev, *newdev = NULL;
list_for_each_entry(dev, &clockevent_devices, list) {
- if (dev == ced || dev->state != CLOCK_EVT_STATE_DETACHED)
+ if (dev == ced || !clockevent_state_detached(dev))
continue;
if (!tick_check_replacement(newdev, dev))
@@ -388,7 +392,7 @@ static int clockevents_replace(struct clock_event_device *ced)
static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu)
{
/* Fast track. Device is unused */
- if (ced->state == CLOCK_EVT_STATE_DETACHED) {
+ if (clockevent_state_detached(ced)) {
list_del_init(&ced->list);
return 0;
}
@@ -438,37 +442,6 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu)
}
EXPORT_SYMBOL_GPL(clockevents_unbind_device);
-/* Sanity check of state transition callbacks */
-static int clockevents_sanity_check(struct clock_event_device *dev)
-{
- /* Legacy set_mode() callback */
- if (dev->set_mode) {
- /* We shouldn't be supporting new modes now */
- WARN_ON(dev->set_state_periodic || dev->set_state_oneshot ||
- dev->set_state_shutdown || dev->tick_resume);
-
- BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
- return 0;
- }
-
- if (dev->features & CLOCK_EVT_FEAT_DUMMY)
- return 0;
-
- /* New state-specific callbacks */
- if (!dev->set_state_shutdown)
- return -EINVAL;
-
- if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
- !dev->set_state_periodic)
- return -EINVAL;
-
- if ((dev->features & CLOCK_EVT_FEAT_ONESHOT) &&
- !dev->set_state_oneshot)
- return -EINVAL;
-
- return 0;
-}
-
/**
* clockevents_register_device - register a clock event device
* @dev: device to register
@@ -477,10 +450,8 @@ void clockevents_register_device(struct clock_event_device *dev)
{
unsigned long flags;
- BUG_ON(clockevents_sanity_check(dev));
-
/* Initialize state to DETACHED */
- dev->state = CLOCK_EVT_STATE_DETACHED;
+ clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED);
if (!dev->cpumask) {
WARN_ON(num_possible_cpus() > 1);
@@ -545,11 +516,11 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 freq)
{
clockevents_config(dev, freq);
- if (dev->state == CLOCK_EVT_STATE_ONESHOT)
+ if (clockevent_state_oneshot(dev))
return clockevents_program_event(dev, dev->next_event, false);
- if (dev->state == CLOCK_EVT_STATE_PERIODIC)
- return __clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC);
+ if (clockevent_state_periodic(dev))
+ return __clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC);
return 0;
}
@@ -603,13 +574,13 @@ void clockevents_exchange_device(struct clock_event_device *old,
*/
if (old) {
module_put(old->owner);
- clockevents_set_state(old, CLOCK_EVT_STATE_DETACHED);
+ clockevents_switch_state(old, CLOCK_EVT_STATE_DETACHED);
list_del(&old->list);
list_add(&old->list, &clockevents_released);
}
if (new) {
- BUG_ON(new->state != CLOCK_EVT_STATE_DETACHED);
+ BUG_ON(!clockevent_state_detached(new));
clockevents_shutdown(new);
}
}
@@ -622,7 +593,7 @@ void clockevents_suspend(void)
struct clock_event_device *dev;
list_for_each_entry_reverse(dev, &clockevent_devices, list)
- if (dev->suspend)
+ if (dev->suspend && !clockevent_state_detached(dev))
dev->suspend(dev);
}
@@ -634,7 +605,7 @@ void clockevents_resume(void)
struct clock_event_device *dev;
list_for_each_entry(dev, &clockevent_devices, list)
- if (dev->resume)
+ if (dev->resume && !clockevent_state_detached(dev))
dev->resume(dev);
}
@@ -665,7 +636,7 @@ void tick_cleanup_dead_cpu(int cpu)
if (cpumask_test_cpu(cpu, dev->cpumask) &&
cpumask_weight(dev->cpumask) == 1 &&
!tick_is_broadcast_device(dev)) {
- BUG_ON(dev->state != CLOCK_EVT_STATE_DETACHED);
+ BUG_ON(!clockevent_state_detached(dev));
list_del(&dev->list);
}
}
diff --git a/kernel/kernel/time/clocksource.c b/kernel/kernel/time/clocksource.c
index 15facb1b9..1347882d1 100644
--- a/kernel/kernel/time/clocksource.c
+++ b/kernel/kernel/time/clocksource.c
@@ -23,6 +23,8 @@
* o Allow clocksource drivers to be unregistered
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/device.h>
#include <linux/clocksource.h>
#include <linux/init.h>
@@ -215,11 +217,12 @@ static void clocksource_watchdog(unsigned long data)
continue;
/* Check the deviation from the watchdog clocksource. */
- if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) {
- pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable, because the skew is too large:\n", cs->name);
- pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
+ if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
+ pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n",
+ cs->name);
+ pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
watchdog->name, wdnow, wdlast, watchdog->mask);
- pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n",
+ pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n",
cs->name, csnow, cslast, cs->mask);
__clocksource_unstable(cs);
continue;
@@ -476,7 +479,7 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
* return half the number of nanoseconds the hardware counter can technically
* cover. This is done so that we can potentially detect problems caused by
* delayed timers or bad hardware, which might result in time intervals that
- * are larger then what the math used can handle without overflows.
+ * are larger than what the math used can handle without overflows.
*/
u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc)
{
@@ -567,9 +570,8 @@ static void __clocksource_select(bool skipcur)
*/
if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) {
/* Override clocksource cannot be used. */
- printk(KERN_WARNING "Override clocksource %s is not "
- "HRT compatible. Cannot switch while in "
- "HRT/NOHZ mode\n", cs->name);
+ pr_warn("Override clocksource %s is not HRT compatible - cannot switch while in HRT/NOHZ mode\n",
+ cs->name);
override_name[0] = 0;
} else
/* Override clocksource can be used. */
@@ -593,16 +595,15 @@ static void __clocksource_select(bool skipcur)
*/
static void clocksource_select(void)
{
- return __clocksource_select(false);
+ __clocksource_select(false);
}
static void clocksource_select_fallback(void)
{
- return __clocksource_select(true);
+ __clocksource_select(true);
}
#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */
-
static inline void clocksource_select(void) { }
static inline void clocksource_select_fallback(void) { }
@@ -708,8 +709,8 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq
clocksource_update_max_deferment(cs);
- pr_info("clocksource %s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n",
- cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns);
+ pr_info("%s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n",
+ cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns);
}
EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale);
@@ -1008,12 +1009,10 @@ __setup("clocksource=", boot_override_clocksource);
static int __init boot_override_clock(char* str)
{
if (!strcmp(str, "pmtmr")) {
- printk("Warning: clock=pmtmr is deprecated. "
- "Use clocksource=acpi_pm.\n");
+ pr_warn("clock=pmtmr is deprecated - use clocksource=acpi_pm\n");
return boot_override_clocksource("acpi_pm");
}
- printk("Warning! clock= boot option is deprecated. "
- "Use clocksource=xyz\n");
+ pr_warn("clock= boot option is deprecated - use clocksource=xyz\n");
return boot_override_clocksource(str);
}
diff --git a/kernel/kernel/time/hrtimer.c b/kernel/kernel/time/hrtimer.c
index 5d193396e..2659c632a 100644
--- a/kernel/kernel/time/hrtimer.c
+++ b/kernel/kernel/time/hrtimer.c
@@ -61,40 +61,36 @@
/*
* The timer bases:
*
- * There are more clockids then hrtimer bases. Thus, we index
+ * There are more clockids than hrtimer bases. Thus, we index
* into the timer bases by the hrtimer_base_type enum. When trying
* to reach a base using a clockid, hrtimer_clockid_to_base()
* is used to convert from clockid to the proper hrtimer_base_type.
*/
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
-
.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
+ .seq = SEQCNT_ZERO(hrtimer_bases.seq),
.clock_base =
{
{
.index = HRTIMER_BASE_MONOTONIC,
.clockid = CLOCK_MONOTONIC,
.get_time = &ktime_get,
- .resolution = KTIME_LOW_RES,
},
{
.index = HRTIMER_BASE_REALTIME,
.clockid = CLOCK_REALTIME,
.get_time = &ktime_get_real,
- .resolution = KTIME_LOW_RES,
},
{
.index = HRTIMER_BASE_BOOTTIME,
.clockid = CLOCK_BOOTTIME,
.get_time = &ktime_get_boottime,
- .resolution = KTIME_LOW_RES,
},
{
.index = HRTIMER_BASE_TAI,
.clockid = CLOCK_TAI,
.get_time = &ktime_get_clocktai,
- .resolution = KTIME_LOW_RES,
},
}
};
@@ -111,27 +107,6 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
return hrtimer_clock_to_base_table[clock_id];
}
-
-/*
- * Get the coarse grained time at the softirq based on xtime and
- * wall_to_monotonic.
- */
-static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
-{
- ktime_t xtim, mono, boot, tai;
- ktime_t off_real, off_boot, off_tai;
-
- mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai);
- boot = ktime_add(mono, off_boot);
- xtim = ktime_add(mono, off_real);
- tai = ktime_add(mono, off_tai);
-
- base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
- base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
- base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
- base->clock_base[HRTIMER_BASE_TAI].softirq_time = tai;
-}
-
/*
* Functions and macros which are different for UP/SMP systems are kept in a
* single place
@@ -139,6 +114,18 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
#ifdef CONFIG_SMP
/*
+ * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
+ * such that hrtimer_callback_running() can unconditionally dereference
+ * timer->base->cpu_base
+ */
+static struct hrtimer_cpu_base migration_cpu_base = {
+ .seq = SEQCNT_ZERO(migration_cpu_base),
+ .clock_base = { { .cpu_base = &migration_cpu_base, }, },
+};
+
+#define migration_base migration_cpu_base.clock_base[0]
+
+/*
* We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
* means that all timers which are tied to this base via timer->base are
* locked, and the base itself is locked too.
@@ -147,8 +134,8 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
* be found on the lists/queues.
*
* When the timer's base is locked, and the timer removed from list, it is
- * possible to set timer->base = NULL and drop the lock: the timer remains
- * locked.
+ * possible to set timer->base = &migration_base and drop the lock: the timer
+ * remains locked.
*/
static
struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
@@ -158,7 +145,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
for (;;) {
base = timer->base;
- if (likely(base != NULL)) {
+ if (likely(base != &migration_base)) {
raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
if (likely(base == timer->base))
return base;
@@ -192,21 +179,47 @@ hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
#endif
}
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+static inline
+struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
+ int pinned)
+{
+ if (pinned || !base->migration_enabled)
+ return base;
+ return &per_cpu(hrtimer_bases, get_nohz_timer_target());
+}
+#else
+static inline
+struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
+ int pinned)
+{
+ return base;
+}
+#endif
+
/*
- * Switch the timer base to the current CPU when possible.
+ * We switch the timer base to a power-optimized selected CPU target,
+ * if:
+ * - NO_HZ_COMMON is enabled
+ * - timer migration is enabled
+ * - the timer callback is not running
+ * - the timer is not the first expiring timer on the new target
+ *
+ * If one of the above requirements is not fulfilled we move the timer
+ * to the current CPU or leave it on the previously assigned CPU if
+ * the timer callback is currently running.
*/
static inline struct hrtimer_clock_base *
switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
int pinned)
{
+ struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base;
struct hrtimer_clock_base *new_base;
- struct hrtimer_cpu_base *new_cpu_base;
- int this_cpu = smp_processor_id();
- int cpu = get_nohz_timer_target(pinned);
int basenum = base->index;
+ this_cpu_base = this_cpu_ptr(&hrtimer_bases);
+ new_cpu_base = get_target_base(this_cpu_base, pinned);
again:
- new_cpu_base = &per_cpu(hrtimer_bases, cpu);
new_base = &new_cpu_base->clock_base[basenum];
if (base != new_base) {
@@ -222,22 +235,24 @@ again:
if (unlikely(hrtimer_callback_running(timer)))
return base;
- /* See the comment in lock_timer_base() */
- timer->base = NULL;
+ /* See the comment in lock_hrtimer_base() */
+ timer->base = &migration_base;
raw_spin_unlock(&base->cpu_base->lock);
raw_spin_lock(&new_base->cpu_base->lock);
- if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
- cpu = this_cpu;
+ if (new_cpu_base != this_cpu_base &&
+ hrtimer_check_target(timer, new_base)) {
raw_spin_unlock(&new_base->cpu_base->lock);
raw_spin_lock(&base->cpu_base->lock);
+ new_cpu_base = this_cpu_base;
timer->base = base;
goto again;
}
timer->base = new_base;
} else {
- if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
- cpu = this_cpu;
+ if (new_cpu_base != this_cpu_base &&
+ hrtimer_check_target(timer, new_base)) {
+ new_cpu_base = this_cpu_base;
goto again;
}
}
@@ -445,24 +460,35 @@ static inline void debug_deactivate(struct hrtimer *timer)
}
#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
+static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base,
+ struct hrtimer *timer)
+{
+#ifdef CONFIG_HIGH_RES_TIMERS
+ cpu_base->next_timer = timer;
+#endif
+}
+
static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
{
struct hrtimer_clock_base *base = cpu_base->clock_base;
ktime_t expires, expires_next = { .tv64 = KTIME_MAX };
- int i;
+ unsigned int active = cpu_base->active_bases;
- for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
+ hrtimer_update_next_timer(cpu_base, NULL);
+ for (; active; base++, active >>= 1) {
struct timerqueue_node *next;
struct hrtimer *timer;
- next = timerqueue_getnext(&base->active);
- if (!next)
+ if (!(active & 0x01))
continue;
+ next = timerqueue_getnext(&base->active);
timer = container_of(next, struct hrtimer, node);
expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
- if (expires.tv64 < expires_next.tv64)
+ if (expires.tv64 < expires_next.tv64) {
expires_next = expires;
+ hrtimer_update_next_timer(cpu_base, timer);
+ }
}
/*
* clock_was_set() might have changed base->offset of any of
@@ -475,6 +501,16 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
}
#endif
+static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
+{
+ ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
+ ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
+ ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
+
+ return ktime_get_update_offsets_now(&base->clock_was_set_seq,
+ offs_real, offs_boot, offs_tai);
+}
+
/* High resolution timer related functions */
#ifdef CONFIG_HIGH_RES_TIMERS
@@ -482,6 +518,8 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
* High resolution timer enabled ?
*/
static int hrtimer_hres_enabled __read_mostly = 1;
+unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
+EXPORT_SYMBOL_GPL(hrtimer_resolution);
/*
* Enable / Disable high resolution mode
@@ -510,9 +548,14 @@ static inline int hrtimer_is_hres_enabled(void)
/*
* Is the high resolution mode active ?
*/
+static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
+{
+ return cpu_base->hres_active;
+}
+
static inline int hrtimer_hres_active(void)
{
- return __this_cpu_read(hrtimer_bases.hres_active);
+ return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases));
}
/*
@@ -523,7 +566,12 @@ static inline int hrtimer_hres_active(void)
static void
hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
{
- ktime_t expires_next = __hrtimer_get_next_event(cpu_base);
+ ktime_t expires_next;
+
+ if (!cpu_base->hres_active)
+ return;
+
+ expires_next = __hrtimer_get_next_event(cpu_base);
if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64)
return;
@@ -547,41 +595,40 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
if (cpu_base->hang_detected)
return;
- if (cpu_base->expires_next.tv64 != KTIME_MAX)
- tick_program_event(cpu_base->expires_next, 1);
+ tick_program_event(cpu_base->expires_next, 1);
}
/*
- * Shared reprogramming for clock_realtime and clock_monotonic
- *
* When a timer is enqueued and expires earlier than the already enqueued
* timers, we have to check, whether it expires earlier than the timer for
* which the clock event device was armed.
*
- * Note, that in case the state has HRTIMER_STATE_CALLBACK set, no reprogramming
- * and no expiry check happens. The timer gets enqueued into the rbtree. The
- * reprogramming and expiry check is done in the hrtimer_interrupt or in the
- * softirq.
- *
* Called with interrupts disabled and base->cpu_base.lock held
*/
-static int hrtimer_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base)
+static void hrtimer_reprogram(struct hrtimer *timer,
+ struct hrtimer_clock_base *base)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
- int res;
WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
/*
- * When the callback is running, we do not reprogram the clock event
- * device. The timer callback is either running on a different CPU or
- * the callback is executed in the hrtimer_interrupt context. The
- * reprogramming is handled at the end of the hrtimer_interrupt.
+ * If the timer is not on the current cpu, we cannot reprogram
+ * the other cpus clock event device.
*/
- if (hrtimer_callback_running(timer))
- return 0;
+ if (base->cpu_base != cpu_base)
+ return;
+
+ /*
+ * If the hrtimer interrupt is running, then it will
+ * reevaluate the clock bases and reprogram the clock event
+ * device. The callbacks are always executed in hard interrupt
+ * context so we don't need an extra check for a running
+ * callback.
+ */
+ if (cpu_base->in_hrtirq)
+ return;
if (base->cpu_base != cpu_base)
return 0;
@@ -591,24 +638,16 @@ static int hrtimer_reprogram(struct hrtimer *timer,
/*
* CLOCK_REALTIME timer might be requested with an absolute
- * expiry time which is less than base->offset. Nothing wrong
- * about that, just avoid to call into the tick code, which
- * has now objections against negative expiry values.
+ * expiry time which is less than base->offset. Set it to 0.
*/
if (expires.tv64 < 0)
- return -ETIME;
+ expires.tv64 = 0;
if (expires.tv64 >= cpu_base->expires_next.tv64)
- return 0;
+ return;
- /*
- * When the target cpu of the timer is currently executing
- * hrtimer_interrupt(), then we do not touch the clock event
- * device. hrtimer_interrupt() will reevaluate all clock bases
- * before reprogramming the device.
- */
- if (cpu_base->in_hrtirq)
- return 0;
+ /* Update the pointer to the next expiring timer */
+ cpu_base->next_timer = timer;
/*
* If a hang was detected in the last timer interrupt then we
@@ -617,19 +656,16 @@ static int hrtimer_reprogram(struct hrtimer *timer,
* to make progress.
*/
if (cpu_base->hang_detected)
- return 0;
+ return;
cpu_base->expires_next = expires;
/*
- * Clockevents returns -ETIME, when the event was in the past.
+ * Program the timer hardware. We enforce the expiry for
+ * events which are already in the past.
*/
- res = tick_program_event(expires, 1);
- return res;
+ tick_program_event(expires, 1);
}
-static void __run_hrtimer(struct hrtimer *timer, ktime_t *now);
-static int hrtimer_rt_defer(struct hrtimer *timer);
-
/*
* Initialize the high resolution related parts of cpu_base
*/
@@ -639,30 +675,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
base->hres_active = 0;
}
-static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base,
- int wakeup)
-{
- if (!hrtimer_reprogram(timer, base))
- return 0;
- if (!wakeup)
- return -ETIME;
-#ifdef CONFIG_PREEMPT_RT_BASE
- if (!hrtimer_rt_defer(timer))
- return -ETIME;
-#endif
- return 1;
-}
-
-static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
-{
- ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
- ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
- ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
-
- return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai);
-}
-
/*
* Retrigger next event is called after clock was set
*
@@ -672,7 +684,7 @@ static void retrigger_next_event(void *arg)
{
struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
- if (!hrtimer_hres_active())
+ if (!base->hres_active)
return;
raw_spin_lock(&base->lock);
@@ -684,32 +696,21 @@ static void retrigger_next_event(void *arg)
/*
* Switch to high resolution mode
*/
-static int hrtimer_switch_to_hres(void)
+static void hrtimer_switch_to_hres(void)
{
- int i, cpu = smp_processor_id();
- struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
- unsigned long flags;
-
- if (base->hres_active)
- return 1;
-
- local_irq_save(flags);
+ struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
if (tick_init_highres()) {
- local_irq_restore(flags);
printk(KERN_WARNING "Could not switch to high resolution "
- "mode on CPU %d\n", cpu);
- return 0;
+ "mode on CPU %d\n", base->cpu);
+ return;
}
base->hres_active = 1;
- for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
- base->clock_base[i].resolution = KTIME_HIGH_RES;
+ hrtimer_resolution = HIGH_RES_NSEC;
tick_setup_sched_timer();
/* "Retrigger" the interrupt to get things going */
retrigger_next_event(NULL);
- local_irq_restore(flags);
- return 1;
}
static void clock_was_set_work(struct work_struct *work)
@@ -769,25 +770,17 @@ void clock_was_set_delayed(void)
#else
+static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; }
static inline int hrtimer_hres_active(void) { return 0; }
static inline int hrtimer_is_hres_enabled(void) { return 0; }
-static inline int hrtimer_switch_to_hres(void) { return 0; }
+static inline void hrtimer_switch_to_hres(void) { }
static inline void
hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
-static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base,
- int wakeup)
-{
- return 0;
-}
-
-static inline int hrtimer_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base)
-{
- return 0;
-}
+static inline void hrtimer_reprogram(struct hrtimer *timer,
+ struct hrtimer_clock_base *base) { }
static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
static inline void retrigger_next_event(void *arg) { }
+
#endif /* CONFIG_HIGH_RES_TIMERS */
/*
@@ -872,6 +865,14 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
*
* Forward the timer expiry so it will expire in the future.
* Returns the number of overruns.
+ *
+ * Can be safely called from the callback function of @timer. If
+ * called from other contexts @timer must neither be enqueued nor
+ * running the callback and the caller needs to take care of
+ * serialization.
+ *
+ * Note: This only updates the timer expiry value and does not requeue
+ * the timer.
*/
u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
{
@@ -883,8 +884,11 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
if (delta.tv64 < 0)
return 0;
- if (interval.tv64 < timer->base->resolution.tv64)
- interval.tv64 = timer->base->resolution.tv64;
+ if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED))
+ return 0;
+
+ if (interval.tv64 < hrtimer_resolution)
+ interval.tv64 = hrtimer_resolution;
if (unlikely(delta.tv64 >= interval.tv64)) {
s64 incr = ktime_to_ns(interval);
@@ -924,7 +928,7 @@ void hrtimer_wait_for_timer(const struct hrtimer *timer)
if (base && base->cpu_base && !timer->irqsafe)
wait_event(base->cpu_base->wait,
- !(timer->state & HRTIMER_STATE_CALLBACK));
+ !(hrtimer_callback_running(timer)));
}
#else
@@ -944,16 +948,11 @@ static int enqueue_hrtimer(struct hrtimer *timer,
{
debug_activate(timer);
- timerqueue_add(&base->active, &timer->node);
base->cpu_base->active_bases |= 1 << base->index;
- /*
- * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
- * state of a possibly running callback.
- */
- timer->state |= HRTIMER_STATE_ENQUEUED;
+ timer->state = HRTIMER_STATE_ENQUEUED;
- return (&timer->node == base->active.next);
+ return timerqueue_add(&base->active, &timer->node);
}
/*
@@ -968,46 +967,45 @@ static int enqueue_hrtimer(struct hrtimer *timer,
*/
static void __remove_hrtimer(struct hrtimer *timer,
struct hrtimer_clock_base *base,
- unsigned long newstate, int reprogram)
+ u8 newstate, int reprogram)
{
- struct timerqueue_node *next_timer;
- if (!(timer->state & HRTIMER_STATE_ENQUEUED))
- goto out;
+ struct hrtimer_cpu_base *cpu_base = base->cpu_base;
+ u8 state = timer->state;
+
+ timer->state = newstate;
+ if (!(state & HRTIMER_STATE_ENQUEUED))
+ return;
if (unlikely(!list_empty(&timer->cb_entry))) {
list_del_init(&timer->cb_entry);
- goto out;
+ return;
}
- next_timer = timerqueue_getnext(&base->active);
- timerqueue_del(&base->active, &timer->node);
- if (&timer->node == next_timer) {
+ if (!timerqueue_del(&base->active, &timer->node))
+ cpu_base->active_bases &= ~(1 << base->index);
+
#ifdef CONFIG_HIGH_RES_TIMERS
- /* Reprogram the clock event device. if enabled */
- if (reprogram && hrtimer_hres_active()) {
- ktime_t expires;
-
- expires = ktime_sub(hrtimer_get_expires(timer),
- base->offset);
- if (base->cpu_base->expires_next.tv64 == expires.tv64)
- hrtimer_force_reprogram(base->cpu_base, 1);
- }
+ /*
+ * Note: If reprogram is false we do not update
+ * cpu_base->next_timer. This happens when we remove the first
+ * timer on a remote cpu. No harm as we never dereference
+ * cpu_base->next_timer. So the worst thing what can happen is
+ * an superflous call to hrtimer_force_reprogram() on the
+ * remote cpu later on if the same timer gets enqueued again.
+ */
+ if (reprogram && timer == cpu_base->next_timer)
+ hrtimer_force_reprogram(cpu_base, 1);
#endif
- }
- if (!timerqueue_getnext(&base->active))
- base->cpu_base->active_bases &= ~(1 << base->index);
-out:
- timer->state = newstate;
}
/*
* remove hrtimer, called with base lock held
*/
static inline int
-remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
+remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart)
{
if (hrtimer_is_queued(timer)) {
- unsigned long state;
+ u8 state = timer->state;
int reprogram;
/*
@@ -1021,44 +1019,56 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
debug_deactivate(timer);
timer_stats_hrtimer_clear_start_info(timer);
reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
- /*
- * We must preserve the CALLBACK state flag here,
- * otherwise we could move the timer base in
- * switch_hrtimer_base.
- */
- state = timer->state & HRTIMER_STATE_CALLBACK;
+
+ if (!restart)
+ state = HRTIMER_STATE_INACTIVE;
+
__remove_hrtimer(timer, base, state, reprogram);
return 1;
}
return 0;
}
-int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
- unsigned long delta_ns, const enum hrtimer_mode mode,
- int wakeup)
+static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
+ const enum hrtimer_mode mode)
+{
+#ifdef CONFIG_TIME_LOW_RES
+ /*
+ * CONFIG_TIME_LOW_RES indicates that the system has no way to return
+ * granular time values. For relative timers we add hrtimer_resolution
+ * (i.e. one jiffie) to prevent short timeouts.
+ */
+ timer->is_rel = mode & HRTIMER_MODE_REL;
+ if (timer->is_rel)
+ tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution));
+#endif
+ return tim;
+}
+
+/**
+ * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
+ * @timer: the timer to be added
+ * @tim: expiry time
+ * @delta_ns: "slack" range for the timer
+ * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
+ * relative (HRTIMER_MODE_REL)
+ */
+void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+ unsigned long delta_ns, const enum hrtimer_mode mode)
{
struct hrtimer_clock_base *base, *new_base;
unsigned long flags;
- int ret, leftmost;
+ int leftmost;
base = lock_hrtimer_base(timer, &flags);
/* Remove an active timer from the queue: */
- ret = remove_hrtimer(timer, base);
+ remove_hrtimer(timer, base, true);
- if (mode & HRTIMER_MODE_REL) {
+ if (mode & HRTIMER_MODE_REL)
tim = ktime_add_safe(tim, base->get_time());
- /*
- * CONFIG_TIME_LOW_RES is a temporary way for architectures
- * to signal that they simply return xtime in
- * do_gettimeoffset(). In this case we want to round up by
- * resolution when starting a relative timer, to avoid short
- * timeouts. This will go away with the GTOD framework.
- */
-#ifdef CONFIG_TIME_LOW_RES
- tim = ktime_add_safe(tim, base->resolution);
-#endif
- }
+
+ tim = hrtimer_update_lowres(timer, tim, mode);
hrtimer_set_expires_range_ns(timer, tim, delta_ns);
@@ -1077,94 +1087,25 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
}
#endif
leftmost = enqueue_hrtimer(timer, new_base);
-
- if (!leftmost) {
- unlock_hrtimer_base(timer, &flags);
- return ret;
- }
+ if (!leftmost)
+ goto unlock;
if (!hrtimer_is_hres_active(timer)) {
/*
* Kick to reschedule the next tick to handle the new timer
* on dynticks target.
*/
- wake_up_nohz_cpu(new_base->cpu_base->cpu);
- } else if (new_base->cpu_base == this_cpu_ptr(&hrtimer_bases)) {
-
- ret = hrtimer_enqueue_reprogram(timer, new_base, wakeup);
- if (ret < 0) {
- /*
- * In case we failed to reprogram the timer (mostly
- * because out current timer is already elapsed),
- * remove it again and report a failure. This avoids
- * stale base->first entries.
- */
- debug_deactivate(timer);
- __remove_hrtimer(timer, new_base,
- timer->state & HRTIMER_STATE_CALLBACK, 0);
- } else if (ret > 0) {
- /*
- * Only allow reprogramming if the new base is on this CPU.
- * (it might still be on another CPU if the timer was pending)
- *
- * XXX send_remote_softirq() ?
- */
- /*
- * We need to drop cpu_base->lock to avoid a
- * lock ordering issue vs. rq->lock.
- */
- raw_spin_unlock(&new_base->cpu_base->lock);
- raise_softirq_irqoff(HRTIMER_SOFTIRQ);
- local_irq_restore(flags);
- return 0;
- }
+ if (new_base->cpu_base->nohz_active)
+ wake_up_nohz_cpu(new_base->cpu_base->cpu);
+ } else {
+ hrtimer_reprogram(timer, new_base);
}
-
+unlock:
unlock_hrtimer_base(timer, &flags);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns);
-
-/**
- * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
- * @timer: the timer to be added
- * @tim: expiry time
- * @delta_ns: "slack" range for the timer
- * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
- * relative (HRTIMER_MODE_REL)
- *
- * Returns:
- * 0 on success
- * 1 when the timer was active
- */
-int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
- unsigned long delta_ns, const enum hrtimer_mode mode)
-{
- return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1);
}
EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
/**
- * hrtimer_start - (re)start an hrtimer on the current CPU
- * @timer: the timer to be added
- * @tim: expiry time
- * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
- * relative (HRTIMER_MODE_REL)
- *
- * Returns:
- * 0 on success
- * 1 when the timer was active
- */
-int
-hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
-{
- return __hrtimer_start_range_ns(timer, tim, 0, mode, 1);
-}
-EXPORT_SYMBOL_GPL(hrtimer_start);
-
-
-/**
* hrtimer_try_to_cancel - try to deactivate a timer
* @timer: hrtimer to stop
*
@@ -1180,10 +1121,19 @@ int hrtimer_try_to_cancel(struct hrtimer *timer)
unsigned long flags;
int ret = -1;
+ /*
+ * Check lockless first. If the timer is not active (neither
+ * enqueued nor running the callback, nothing to do here. The
+ * base lock does not serialize against a concurrent enqueue,
+ * so we can avoid taking it.
+ */
+ if (!hrtimer_active(timer))
+ return 0;
+
base = lock_hrtimer_base(timer, &flags);
if (!hrtimer_callback_running(timer))
- ret = remove_hrtimer(timer, base);
+ ret = remove_hrtimer(timer, base, false);
unlock_hrtimer_base(timer, &flags);
@@ -1215,44 +1165,44 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel);
/**
* hrtimer_get_remaining - get remaining time for the timer
* @timer: the timer to read
+ * @adjust: adjust relative timers when CONFIG_TIME_LOW_RES=y
*/
-ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
+ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust)
{
unsigned long flags;
ktime_t rem;
lock_hrtimer_base(timer, &flags);
- rem = hrtimer_expires_remaining(timer);
+ if (IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust)
+ rem = hrtimer_expires_remaining_adjusted(timer);
+ else
+ rem = hrtimer_expires_remaining(timer);
unlock_hrtimer_base(timer, &flags);
return rem;
}
-EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
+EXPORT_SYMBOL_GPL(__hrtimer_get_remaining);
#ifdef CONFIG_NO_HZ_COMMON
/**
* hrtimer_get_next_event - get the time until next expiry event
*
- * Returns the delta to the next expiry event or KTIME_MAX if no timer
- * is pending.
+ * Returns the next expiry time or KTIME_MAX if no timer is pending.
*/
-ktime_t hrtimer_get_next_event(void)
+u64 hrtimer_get_next_event(void)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
- ktime_t mindelta = { .tv64 = KTIME_MAX };
+ u64 expires = KTIME_MAX;
unsigned long flags;
raw_spin_lock_irqsave(&cpu_base->lock, flags);
- if (!hrtimer_hres_active())
- mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base),
- ktime_get());
+ if (!__hrtimer_hres_active(cpu_base))
+ expires = __hrtimer_get_next_event(cpu_base).tv64;
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
- if (mindelta.tv64 < 0)
- mindelta.tv64 = 0;
- return mindelta;
+ return expires;
}
#endif
@@ -1295,41 +1245,86 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
}
EXPORT_SYMBOL_GPL(hrtimer_init);
-/**
- * hrtimer_get_res - get the timer resolution for a clock
- * @which_clock: which clock to query
- * @tp: pointer to timespec variable to store the resolution
+/*
+ * A timer is active, when it is enqueued into the rbtree or the
+ * callback function is running or it's in the state of being migrated
+ * to another cpu.
*
- * Store the resolution of the clock selected by @which_clock in the
- * variable pointed to by @tp.
+ * It is important for this function to not return a false negative.
*/
-int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
+bool hrtimer_active(const struct hrtimer *timer)
{
struct hrtimer_cpu_base *cpu_base;
- int base = hrtimer_clockid_to_base(which_clock);
+ unsigned int seq;
- cpu_base = raw_cpu_ptr(&hrtimer_bases);
- *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution);
+ do {
+ cpu_base = READ_ONCE(timer->base->cpu_base);
+ seq = raw_read_seqcount_begin(&cpu_base->seq);
- return 0;
+ if (timer->state != HRTIMER_STATE_INACTIVE ||
+ cpu_base->running_soft == timer ||
+ cpu_base->running == timer)
+ return true;
+
+ } while (read_seqcount_retry(&cpu_base->seq, seq) ||
+ cpu_base != READ_ONCE(timer->base->cpu_base));
+
+ return false;
}
-EXPORT_SYMBOL_GPL(hrtimer_get_res);
+EXPORT_SYMBOL_GPL(hrtimer_active);
+
+/*
+ * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3
+ * distinct sections:
+ *
+ * - queued: the timer is queued
+ * - callback: the timer is being ran
+ * - post: the timer is inactive or (re)queued
+ *
+ * On the read side we ensure we observe timer->state and cpu_base->running
+ * from the same section, if anything changed while we looked at it, we retry.
+ * This includes timer->base changing because sequence numbers alone are
+ * insufficient for that.
+ *
+ * The sequence numbers are required because otherwise we could still observe
+ * a false negative if the read side got smeared over multiple consequtive
+ * __run_hrtimer() invocations.
+ */
-static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
+static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
+ struct hrtimer_clock_base *base,
+ struct hrtimer *timer, ktime_t *now)
{
- struct hrtimer_clock_base *base = timer->base;
- struct hrtimer_cpu_base *cpu_base = base->cpu_base;
enum hrtimer_restart (*fn)(struct hrtimer *);
int restart;
- WARN_ON(!irqs_disabled());
+ lockdep_assert_held(&cpu_base->lock);
debug_deactivate(timer);
- __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+ cpu_base->running = timer;
+
+ /*
+ * Separate the ->running assignment from the ->state assignment.
+ *
+ * As with a regular write barrier, this ensures the read side in
+ * hrtimer_active() cannot observe cpu_base->running == NULL &&
+ * timer->state == INACTIVE.
+ */
+ raw_write_seqcount_barrier(&cpu_base->seq);
+
+ __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
timer_stats_account_hrtimer(timer);
fn = timer->function;
/*
+ * Clear the 'is relative' flag for the TIME_LOW_RES case. If the
+ * timer is restarted with a period then it becomes an absolute
+ * timer. If its not restarted it does not matter.
+ */
+ if (IS_ENABLED(CONFIG_TIME_LOW_RES))
+ timer->is_rel = false;
+
+ /*
* Because we run timers from hardirq context, there is no chance
* they get migrated to another cpu, therefore its safe to unlock
* the timer base.
@@ -1341,69 +1336,57 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
raw_spin_lock(&cpu_base->lock);
/*
- * Note: We clear the CALLBACK bit after enqueue_hrtimer and
+ * Note: We clear the running state after enqueue_hrtimer and
* we do not reprogramm the event hardware. Happens either in
* hrtimer_start_range_ns() or in hrtimer_interrupt()
+ *
+ * Note: Because we dropped the cpu_base->lock above,
+ * hrtimer_start_range_ns() can have popped in and enqueued the timer
+ * for us already.
*/
- if (restart != HRTIMER_NORESTART) {
- BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
+ if (restart != HRTIMER_NORESTART &&
+ !(timer->state & HRTIMER_STATE_ENQUEUED))
enqueue_hrtimer(timer, base);
- }
- WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
+ /*
+ * Separate the ->running assignment from the ->state assignment.
+ *
+ * As with a regular write barrier, this ensures the read side in
+ * hrtimer_active() cannot observe cpu_base->running == NULL &&
+ * timer->state == INACTIVE.
+ */
+ raw_write_seqcount_barrier(&cpu_base->seq);
- timer->state &= ~HRTIMER_STATE_CALLBACK;
+ WARN_ON_ONCE(cpu_base->running != timer);
+ cpu_base->running = NULL;
}
-static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer);
-
#ifdef CONFIG_PREEMPT_RT_BASE
static void hrtimer_rt_reprogram(int restart, struct hrtimer *timer,
struct hrtimer_clock_base *base)
{
- /*
- * Note, we clear the callback flag before we requeue the
- * timer otherwise we trigger the callback_running() check
- * in hrtimer_reprogram().
- */
- timer->state &= ~HRTIMER_STATE_CALLBACK;
-
- if (restart != HRTIMER_NORESTART) {
- BUG_ON(hrtimer_active(timer));
- /*
- * Enqueue the timer, if it's the leftmost timer then
- * we need to reprogram it.
- */
- if (!enqueue_hrtimer(timer, base))
- return;
+ int leftmost;
-#ifndef CONFIG_HIGH_RES_TIMERS
- }
-#else
- if (base->cpu_base->hres_active &&
- hrtimer_reprogram(timer, base))
- goto requeue;
+ if (restart != HRTIMER_NORESTART &&
+ !(timer->state & HRTIMER_STATE_ENQUEUED)) {
- } else if (hrtimer_active(timer)) {
- /*
- * If the timer was rearmed on another CPU, reprogram
- * the event device.
- */
- if (&timer->node == base->active.next &&
- base->cpu_base->hres_active &&
- hrtimer_reprogram(timer, base))
- goto requeue;
- }
- return;
+ leftmost = enqueue_hrtimer(timer, base);
+ if (!leftmost)
+ return;
+#ifdef CONFIG_HIGH_RES_TIMERS
+ if (!hrtimer_is_hres_active(timer)) {
+ /*
+ * Kick to reschedule the next tick to handle the new timer
+ * on dynticks target.
+ */
+ if (base->cpu_base->nohz_active)
+ wake_up_nohz_cpu(base->cpu_base->cpu);
+ } else {
-requeue:
- /*
- * Timer is expired. Thus move it from tree to pending list
- * again.
- */
- __remove_hrtimer(timer, base, timer->state, 0);
- list_add_tail(&timer->cb_entry, &base->expired);
+ hrtimer_reprogram(timer, base);
+ }
#endif
+ }
}
/*
@@ -1436,8 +1419,11 @@ static void hrtimer_rt_run_pending(void)
* Same as the above __run_hrtimer function
* just we run with interrupts enabled.
*/
- debug_hrtimer_deactivate(timer);
- __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+ debug_deactivate(timer);
+ cpu_base->running_soft = timer;
+ raw_write_seqcount_barrier(&cpu_base->seq);
+
+ __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
timer_stats_account_hrtimer(timer);
fn = timer->function;
@@ -1446,6 +1432,10 @@ static void hrtimer_rt_run_pending(void)
raw_spin_lock_irq(&cpu_base->lock);
hrtimer_rt_reprogram(restart, timer, base);
+ raw_write_seqcount_barrier(&cpu_base->seq);
+
+ WARN_ON_ONCE(cpu_base->running_soft != timer);
+ cpu_base->running_soft = NULL;
}
}
@@ -1466,53 +1456,25 @@ static int hrtimer_rt_defer(struct hrtimer *timer)
#else
-static inline void hrtimer_rt_run_pending(void)
-{
- hrtimer_peek_ahead_timers();
-}
-
static inline int hrtimer_rt_defer(struct hrtimer *timer) { return 0; }
#endif
-#ifdef CONFIG_HIGH_RES_TIMERS
+static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer);
-/*
- * High resolution timer interrupt
- * Called with interrupts disabled
- */
-void hrtimer_interrupt(struct clock_event_device *dev)
+static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
{
- struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
- ktime_t expires_next, now, entry_time, delta;
- int i, retries = 0, raise = 0;
-
- BUG_ON(!cpu_base->hres_active);
- cpu_base->nr_events++;
- dev->next_event.tv64 = KTIME_MAX;
-
- raw_spin_lock(&cpu_base->lock);
- entry_time = now = hrtimer_update_base(cpu_base);
-retry:
- cpu_base->in_hrtirq = 1;
- /*
- * We set expires_next to KTIME_MAX here with cpu_base->lock
- * held to prevent that a timer is enqueued in our queue via
- * the migration code. This does not affect enqueueing of
- * timers which run their callback and need to be requeued on
- * this CPU.
- */
- cpu_base->expires_next.tv64 = KTIME_MAX;
+ struct hrtimer_clock_base *base = cpu_base->clock_base;
+ unsigned int active = cpu_base->active_bases;
+ int raise = 0;
- for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
- struct hrtimer_clock_base *base;
+ for (; active; base++, active >>= 1) {
struct timerqueue_node *node;
ktime_t basenow;
- if (!(cpu_base->active_bases & (1 << i)))
+ if (!(active & 0x01))
continue;
- base = cpu_base->clock_base + i;
basenow = ktime_add(now, base->offset);
while ((node = timerqueue_getnext(&base->active))) {
@@ -1545,11 +1507,46 @@ retry:
break;
if (!hrtimer_rt_defer(timer))
- __run_hrtimer(timer, &basenow);
+ __run_hrtimer(cpu_base, base, timer, &basenow);
else
raise = 1;
}
}
+ if (raise)
+ raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+}
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+
+/*
+ * High resolution timer interrupt
+ * Called with interrupts disabled
+ */
+void hrtimer_interrupt(struct clock_event_device *dev)
+{
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+ ktime_t expires_next, now, entry_time, delta;
+ int retries = 0;
+
+ BUG_ON(!cpu_base->hres_active);
+ cpu_base->nr_events++;
+ dev->next_event.tv64 = KTIME_MAX;
+
+ raw_spin_lock(&cpu_base->lock);
+ entry_time = now = hrtimer_update_base(cpu_base);
+retry:
+ cpu_base->in_hrtirq = 1;
+ /*
+ * We set expires_next to KTIME_MAX here with cpu_base->lock
+ * held to prevent that a timer is enqueued in our queue via
+ * the migration code. This does not affect enqueueing of
+ * timers which run their callback and need to be requeued on
+ * this CPU.
+ */
+ cpu_base->expires_next.tv64 = KTIME_MAX;
+
+ __hrtimer_run_queues(cpu_base, now);
+
/* Reevaluate the clock bases for the next expiry */
expires_next = __hrtimer_get_next_event(cpu_base);
/*
@@ -1561,10 +1558,9 @@ retry:
raw_spin_unlock(&cpu_base->lock);
/* Reprogramming necessary ? */
- if (expires_next.tv64 == KTIME_MAX ||
- !tick_program_event(expires_next, 0)) {
+ if (!tick_program_event(expires_next, 0)) {
cpu_base->hang_detected = 0;
- goto out;
+ return;
}
/*
@@ -1595,8 +1591,8 @@ retry:
cpu_base->hang_detected = 1;
raw_spin_unlock(&cpu_base->lock);
delta = ktime_sub(now, entry_time);
- if (delta.tv64 > cpu_base->max_hang_time.tv64)
- cpu_base->max_hang_time = delta;
+ if ((unsigned int)delta.tv64 > cpu_base->max_hang_time)
+ cpu_base->max_hang_time = (unsigned int) delta.tv64;
/*
* Limit it to a sensible value as we enforce a longer
* delay. Give the CPU at least 100ms to catch up.
@@ -1608,16 +1604,13 @@ retry:
tick_program_event(expires_next, 1);
printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n",
ktime_to_ns(delta));
-out:
- if (raise)
- raise_softirq_irqoff(HRTIMER_SOFTIRQ);
}
/*
* local version of hrtimer_peek_ahead_timers() called with interrupts
* disabled.
*/
-static void __hrtimer_peek_ahead_timers(void)
+static inline void __hrtimer_peek_ahead_timers(void)
{
struct tick_device *td;
@@ -1629,102 +1622,39 @@ static void __hrtimer_peek_ahead_timers(void)
hrtimer_interrupt(td->evtdev);
}
-/**
- * hrtimer_peek_ahead_timers -- run soft-expired timers now
- *
- * hrtimer_peek_ahead_timers will peek at the timer queue of
- * the current cpu and check if there are any timers for which
- * the soft expires time has passed. If any such timers exist,
- * they are run immediately and then removed from the timer queue.
- *
- */
-void hrtimer_peek_ahead_timers(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __hrtimer_peek_ahead_timers();
- local_irq_restore(flags);
-}
#else /* CONFIG_HIGH_RES_TIMERS */
static inline void __hrtimer_peek_ahead_timers(void) { }
#endif /* !CONFIG_HIGH_RES_TIMERS */
-
-static void run_hrtimer_softirq(struct softirq_action *h)
-{
- hrtimer_rt_run_pending();
-}
-
/*
- * Called from timer softirq every jiffy, expire hrtimers:
- *
- * For HRT its the fall back code to run the softirq in the timer
- * softirq context in case the hrtimer initialization failed or has
- * not been done yet.
+ * Called from run_local_timers in hardirq context every jiffy
*/
-void hrtimer_run_pending(void)
+void hrtimer_run_queues(void)
{
- if (hrtimer_hres_active())
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+ ktime_t now;
+
+ if (__hrtimer_hres_active(cpu_base))
return;
/*
- * This _is_ ugly: We have to check in the softirq context,
- * whether we can switch to highres and / or nohz mode. The
- * clocksource switch happens in the timer interrupt with
- * xtime_lock held. Notification from there only sets the
- * check bit in the tick_oneshot code, otherwise we might
- * deadlock vs. xtime_lock.
+ * This _is_ ugly: We have to check periodically whether we
+ * can switch to highres and / or nohz mode. The clocksource
+ * switch happens with xtime_lock held. Notification from
+ * there only sets the check bit in the tick_oneshot code,
+ * otherwise we might deadlock vs. xtime_lock.
*/
- if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
+ if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) {
hrtimer_switch_to_hres();
-}
-
-/*
- * Called from hardirq context every jiffy
- */
-void hrtimer_run_queues(void)
-{
- struct timerqueue_node *node;
- struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
- struct hrtimer_clock_base *base;
- int index, gettime = 1, raise = 0;
-
- if (hrtimer_hres_active())
return;
-
- for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
- base = &cpu_base->clock_base[index];
- if (!timerqueue_getnext(&base->active))
- continue;
-
- if (gettime) {
- hrtimer_get_softirq_time(cpu_base);
- gettime = 0;
- }
-
- raw_spin_lock(&cpu_base->lock);
-
- while ((node = timerqueue_getnext(&base->active))) {
- struct hrtimer *timer;
-
- timer = container_of(node, struct hrtimer, node);
- if (base->softirq_time.tv64 <=
- hrtimer_get_expires_tv64(timer))
- break;
-
- if (!hrtimer_rt_defer(timer))
- __run_hrtimer(timer, &base->softirq_time);
- else
- raise = 1;
- }
- raw_spin_unlock(&cpu_base->lock);
}
- if (raise)
- raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+ raw_spin_lock(&cpu_base->lock);
+ now = hrtimer_update_base(cpu_base);
+ __hrtimer_run_queues(cpu_base, now);
+ raw_spin_unlock(&cpu_base->lock);
}
/*
@@ -1759,8 +1689,6 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
do {
set_current_state(state);
hrtimer_start_expires(&t->timer, mode);
- if (!hrtimer_active(&t->timer))
- t->task = NULL;
if (likely(t->task))
freezable_schedule();
@@ -1937,11 +1865,11 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
debug_deactivate(timer);
/*
- * Mark it as STATE_MIGRATE not INACTIVE otherwise the
+ * Mark it as ENQUEUED not INACTIVE otherwise the
* timer could be seen as !active and just vanish away
* under us on another CPU
*/
- __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
+ __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
timer->base = new_base;
/*
* Enqueue the timers on the new cpu. This does not
@@ -1952,9 +1880,6 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
* event device.
*/
enqueue_hrtimer(timer, new_base);
-
- /* Clear the migration state bit */
- timer->state &= ~HRTIMER_STATE_MIGRATE;
}
}
@@ -2021,12 +1946,21 @@ static struct notifier_block hrtimers_nb = {
.notifier_call = hrtimer_cpu_notify,
};
+#ifdef CONFIG_PREEMPT_RT_BASE
+static void run_hrtimer_softirq(struct softirq_action *h)
+{
+ hrtimer_rt_run_pending();
+}
+#endif
+
void __init hrtimers_init(void)
{
hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
(void *)(long)smp_processor_id());
register_cpu_notifier(&hrtimers_nb);
+#ifdef CONFIG_PREEMPT_RT_BASE
open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
+#endif
}
/**
@@ -2065,8 +1999,6 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
hrtimer_init_sleeper(&t, current);
hrtimer_start_expires(&t.timer, mode);
- if (!hrtimer_active(&t.timer))
- t.task = NULL;
if (likely(t.task))
schedule();
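The reworked interrupt path above bounds the number of expiry retries (three in this code) and, once exceeded, marks a hang and forces the next event at least 100ms out instead of looping forever. A minimal standalone sketch of that retry/back-off pattern follows; the names and the toy clock are illustrative, not the kernel's ktime machinery:

#include <stdint.h>
#include <stdio.h>

#define MAX_RETRIES	3
#define MIN_DELAY_NS	100000000ULL	/* give the CPU at least 100 ms */

/* Toy clock and expiry routine so the sketch compiles standalone. */
static uint64_t fake_time = 1000;
static uint64_t clock_now(void) { return fake_time += 50; }
/* Pretend timers always expire just behind "now", as in a hang. */
static uint64_t run_expired_timers(uint64_t now) { return now - 1; }

/* Bounded retry, then back off: the shape of hrtimer_interrupt(). */
static uint64_t interrupt_once(void)
{
	uint64_t now = clock_now();
	int retries = 0;

	for (;;) {
		uint64_t next = run_expired_timers(now);

		if (next > now)			/* reprogramming will stick */
			return next;
		if (++retries >= MAX_RETRIES)	/* hang detected */
			return clock_now() + MIN_DELAY_NS;
		now = clock_now();		/* time moved on, try again */
	}
}

int main(void)
{
	printf("next event: %llu\n", (unsigned long long)interrupt_once());
	return 0;
}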
diff --git a/kernel/kernel/time/itimer.c b/kernel/kernel/time/itimer.c
index d0513909d..184de6751 100644
--- a/kernel/kernel/time/itimer.c
+++ b/kernel/kernel/time/itimer.c
@@ -26,7 +26,7 @@
*/
static struct timeval itimer_get_remtime(struct hrtimer *timer)
{
- ktime_t rem = hrtimer_get_remaining(timer);
+ ktime_t rem = __hrtimer_get_remaining(timer, true);
/*
* Racy but safe: if the itimer expires after the above
diff --git a/kernel/kernel/time/ntp.c b/kernel/kernel/time/ntp.c
index bd9c53985..cd925efb2 100644
--- a/kernel/kernel/time/ntp.c
+++ b/kernel/kernel/time/ntp.c
@@ -36,6 +36,7 @@ unsigned long tick_nsec;
static u64 tick_length;
static u64 tick_length_base;
+#define SECS_PER_DAY 86400
#define MAX_TICKADJ 500LL /* usecs */
#define MAX_TICKADJ_SCALED \
(((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
@@ -77,6 +78,9 @@ static long time_adjust;
/* constant (boot-param configurable) NTP tick adjustment (upscaled) */
static s64 ntp_tick_adj;
+/* second value of the next pending leapsecond, or TIME64_MAX if no leap */
+static time64_t ntp_next_leap_sec = TIME64_MAX;
+
#ifdef CONFIG_NTP_PPS
/*
@@ -96,7 +100,7 @@ static s64 ntp_tick_adj;
static int pps_valid; /* signal watchdog counter */
static long pps_tf[3]; /* phase median filter */
static long pps_jitter; /* current jitter (ns) */
-static struct timespec pps_fbase; /* beginning of the last freq interval */
+static struct timespec64 pps_fbase; /* beginning of the last freq interval */
static int pps_shift; /* current interval duration (s) (shift) */
static int pps_intcnt; /* interval counter */
static s64 pps_freq; /* frequency offset (scaled ns/s) */
@@ -350,6 +354,7 @@ void ntp_clear(void)
tick_length = tick_length_base;
time_offset = 0;
+ ntp_next_leap_sec = TIME64_MAX;
/* Clear PPS state variables */
pps_clear();
}
@@ -360,6 +365,21 @@ u64 ntp_tick_length(void)
return tick_length;
}
+/**
+ * ntp_get_next_leap - Returns the next leapsecond in CLOCK_REALTIME ktime_t
+ *
+ * Provides the time of the next leapsecond against CLOCK_REALTIME in
+ * a ktime_t format. Returns KTIME_MAX if no leapsecond is pending.
+ */
+ktime_t ntp_get_next_leap(void)
+{
+ ktime_t ret;
+
+ if ((time_state == TIME_INS) && (time_status & STA_INS))
+ return ktime_set(ntp_next_leap_sec, 0);
+ ret.tv64 = KTIME_MAX;
+ return ret;
+}
/*
* this routine handles the overflow of the microsecond field
@@ -383,15 +403,21 @@ int second_overflow(unsigned long secs)
*/
switch (time_state) {
case TIME_OK:
- if (time_status & STA_INS)
+ if (time_status & STA_INS) {
time_state = TIME_INS;
- else if (time_status & STA_DEL)
+ ntp_next_leap_sec = secs + SECS_PER_DAY -
+ (secs % SECS_PER_DAY);
+ } else if (time_status & STA_DEL) {
time_state = TIME_DEL;
+ ntp_next_leap_sec = secs + SECS_PER_DAY -
+ ((secs+1) % SECS_PER_DAY);
+ }
break;
case TIME_INS:
- if (!(time_status & STA_INS))
+ if (!(time_status & STA_INS)) {
+ ntp_next_leap_sec = TIME64_MAX;
time_state = TIME_OK;
- else if (secs % 86400 == 0) {
+ } else if (secs % SECS_PER_DAY == 0) {
leap = -1;
time_state = TIME_OOP;
printk(KERN_NOTICE
@@ -399,19 +425,21 @@ int second_overflow(unsigned long secs)
}
break;
case TIME_DEL:
- if (!(time_status & STA_DEL))
+ if (!(time_status & STA_DEL)) {
+ ntp_next_leap_sec = TIME64_MAX;
time_state = TIME_OK;
- else if ((secs + 1) % 86400 == 0) {
+ } else if ((secs + 1) % SECS_PER_DAY == 0) {
leap = 1;
+ ntp_next_leap_sec = TIME64_MAX;
time_state = TIME_WAIT;
printk(KERN_NOTICE
"Clock: deleting leap second 23:59:59 UTC\n");
}
break;
case TIME_OOP:
+ ntp_next_leap_sec = TIME64_MAX;
time_state = TIME_WAIT;
break;
-
case TIME_WAIT:
if (!(time_status & (STA_INS | STA_DEL)))
time_state = TIME_OK;
@@ -460,6 +488,11 @@ out:
}
#ifdef CONFIG_GENERIC_CMOS_UPDATE
+int __weak update_persistent_clock(struct timespec now)
+{
+ return -ENODEV;
+}
+
int __weak update_persistent_clock64(struct timespec64 now64)
{
struct timespec now;
@@ -477,7 +510,7 @@ static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
static void sync_cmos_clock(struct work_struct *work)
{
struct timespec64 now;
- struct timespec next;
+ struct timespec64 next;
int fail = 1;
/*
@@ -527,7 +560,7 @@ static void sync_cmos_clock(struct work_struct *work)
next.tv_nsec -= NSEC_PER_SEC;
}
queue_delayed_work(system_power_efficient_wq,
- &sync_cmos_work, timespec_to_jiffies(&next));
+ &sync_cmos_work, timespec64_to_jiffies(&next));
}
#ifdef CONFIG_PREEMPT_RT_FULL
@@ -590,6 +623,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec64 *ts)
if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
time_state = TIME_OK;
time_status = STA_UNSYNC;
+ ntp_next_leap_sec = TIME64_MAX;
/* restart PPS frequency calibration */
pps_reset_freq_interval();
}
@@ -754,6 +788,24 @@ int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai)
if (!(time_status & STA_NANO))
txc->time.tv_usec /= NSEC_PER_USEC;
+ /* Handle leapsec adjustments */
+ if (unlikely(ts->tv_sec >= ntp_next_leap_sec)) {
+ if ((time_state == TIME_INS) && (time_status & STA_INS)) {
+ result = TIME_OOP;
+ txc->tai++;
+ txc->time.tv_sec--;
+ }
+ if ((time_state == TIME_DEL) && (time_status & STA_DEL)) {
+ result = TIME_WAIT;
+ txc->tai--;
+ txc->time.tv_sec++;
+ }
+ if ((time_state == TIME_OOP) &&
+ (ts->tv_sec == ntp_next_leap_sec)) {
+ result = TIME_WAIT;
+ }
+ }
+
return result;
}
@@ -764,13 +816,13 @@ int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai)
* pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ]
* while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */
struct pps_normtime {
- __kernel_time_t sec; /* seconds */
+ s64 sec; /* seconds */
long nsec; /* nanoseconds */
};
/* normalize the timestamp so that nsec is in the
( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */
-static inline struct pps_normtime pps_normalize_ts(struct timespec ts)
+static inline struct pps_normtime pps_normalize_ts(struct timespec64 ts)
{
struct pps_normtime norm = {
.sec = ts.tv_sec,
@@ -852,7 +904,7 @@ static long hardpps_update_freq(struct pps_normtime freq_norm)
pps_errcnt++;
pps_dec_freq_interval();
printk_deferred(KERN_ERR
- "hardpps: PPSERROR: interval too long - %ld s\n",
+ "hardpps: PPSERROR: interval too long - %lld s\n",
freq_norm.sec);
return 0;
}
@@ -939,7 +991,7 @@ static void hardpps_update_phase(long error)
* This code is based on David Mills's reference nanokernel
* implementation. It was mostly rewritten but keeps the same idea.
*/
-void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
+void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
{
struct pps_normtime pts_norm, freq_norm;
@@ -960,7 +1012,7 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
}
/* ok, now we have a base for frequency calculation */
- freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase));
+ freq_norm = pps_normalize_ts(timespec64_sub(*raw_ts, pps_fbase));
/* check that the signal is in the range
* [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */
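The new ntp_next_leap_sec bookkeeping above rounds the current second up to the next UTC midnight for an insertion, and to 23:59:59 for a deletion. A standalone demo of just that arithmetic; the epoch value below (1435708800 = 2015-07-01 00:00:00 UTC, the boundary of the June 2015 leap second) is used purely as an example:

#include <stdint.h>
#include <stdio.h>

#define SECS_PER_DAY 86400

int main(void)
{
	/* Some second during 2015-06-30; 1435708800 is midnight UTC. */
	int64_t secs = 1435708800 - 4321;
	/* Insertion: leap lands on the next midnight boundary. */
	int64_t ins = secs + SECS_PER_DAY - (secs % SECS_PER_DAY);
	/* Deletion: leap when 23:59:59 is reached, one second earlier. */
	int64_t del = secs + SECS_PER_DAY - ((secs + 1) % SECS_PER_DAY);

	printf("now=%lld insert=%lld delete=%lld\n",
	       (long long)secs, (long long)ins, (long long)del);
	/* prints insert=1435708800, delete=1435708799 */
	return 0;
}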
diff --git a/kernel/kernel/time/ntp_internal.h b/kernel/kernel/time/ntp_internal.h
index bbd102ad9..af924470e 100644
--- a/kernel/kernel/time/ntp_internal.h
+++ b/kernel/kernel/time/ntp_internal.h
@@ -5,8 +5,9 @@ extern void ntp_init(void);
extern void ntp_clear(void);
/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */
extern u64 ntp_tick_length(void);
+extern ktime_t ntp_get_next_leap(void);
extern int second_overflow(unsigned long secs);
extern int ntp_validate_timex(struct timex *);
extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *);
-extern void __hardpps(const struct timespec *, const struct timespec *);
+extern void __hardpps(const struct timespec64 *, const struct timespec64 *);
#endif /* _LINUX_NTP_INTERNAL_H */
diff --git a/kernel/kernel/time/posix-clock.c b/kernel/kernel/time/posix-clock.c
index ce033c7aa..9cff0ab82 100644
--- a/kernel/kernel/time/posix-clock.c
+++ b/kernel/kernel/time/posix-clock.c
@@ -69,10 +69,10 @@ static ssize_t posix_clock_read(struct file *fp, char __user *buf,
static unsigned int posix_clock_poll(struct file *fp, poll_table *wait)
{
struct posix_clock *clk = get_posix_clock(fp);
- int result = 0;
+ unsigned int result = 0;
if (!clk)
- return -ENODEV;
+ return POLLERR;
if (clk->ops.poll)
result = clk->ops.poll(clk, fp, wait);
diff --git a/kernel/kernel/time/posix-cpu-timers.c b/kernel/kernel/time/posix-cpu-timers.c
index c8c7150f2..bf28ef601 100644
--- a/kernel/kernel/time/posix-cpu-timers.c
+++ b/kernel/kernel/time/posix-cpu-timers.c
@@ -197,39 +197,62 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
return 0;
}
-static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
+/*
+ * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
+ * to avoid race conditions with concurrent updates to cputime.
+ */
+static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
{
- if (b->utime > a->utime)
- a->utime = b->utime;
+ u64 curr_cputime;
+retry:
+ curr_cputime = atomic64_read(cputime);
+ if (sum_cputime > curr_cputime) {
+ if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
+ goto retry;
+ }
+}
- if (b->stime > a->stime)
- a->stime = b->stime;
+static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime *sum)
+{
+ __update_gt_cputime(&cputime_atomic->utime, sum->utime);
+ __update_gt_cputime(&cputime_atomic->stime, sum->stime);
+ __update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime);
+}
- if (b->sum_exec_runtime > a->sum_exec_runtime)
- a->sum_exec_runtime = b->sum_exec_runtime;
+/* Sample task_cputime_atomic values in "atomic_times", store results in "times". */
+static inline void sample_cputime_atomic(struct task_cputime *times,
+ struct task_cputime_atomic *atomic_times)
+{
+ times->utime = atomic64_read(&atomic_times->utime);
+ times->stime = atomic64_read(&atomic_times->stime);
+ times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime);
}
void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
{
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
struct task_cputime sum;
- unsigned long flags;
- if (!cputimer->running) {
+ /* Check if cputimer isn't running. This is accessed without locking. */
+ if (!READ_ONCE(cputimer->running)) {
/*
* The POSIX timer interface allows for absolute time expiry
* values through the TIMER_ABSTIME flag, therefore we have
- * to synchronize the timer to the clock every time we start
- * it.
+ * to synchronize the timer to the clock every time we start it.
*/
thread_group_cputime(tsk, &sum);
- raw_spin_lock_irqsave(&cputimer->lock, flags);
- cputimer->running = 1;
- update_gt_cputime(&cputimer->cputime, &sum);
- } else
- raw_spin_lock_irqsave(&cputimer->lock, flags);
- *times = cputimer->cputime;
- raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+ update_gt_cputime(&cputimer->cputime_atomic, &sum);
+
+ /*
+ * We're setting cputimer->running without a lock. Ensure
+ * this only gets written to in one operation. We set
+ * running after update_gt_cputime() as a small optimization,
+ * but barriers are not required because update_gt_cputime()
+ * can handle concurrent updates.
+ */
+ WRITE_ONCE(cputimer->running, true);
+ }
+ sample_cputime_atomic(times, &cputimer->cputime_atomic);
}
/*
@@ -583,7 +606,8 @@ bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
if (!task_cputime_zero(&tsk->cputime_expires))
return false;
- if (tsk->signal->cputimer.running)
+ /* Check if cputimer is running. This is accessed without locking. */
+ if (READ_ONCE(tsk->signal->cputimer.running))
return false;
return true;
@@ -841,6 +865,13 @@ static void check_thread_timers(struct task_struct *tsk,
unsigned long long expires;
unsigned long soft;
+ /*
+ * If cputime_expires is zero, then there are no active
+ * per thread CPU timers.
+ */
+ if (task_cputime_zero(&tsk->cputime_expires))
+ return;
+
expires = check_timers_list(timers, firing, prof_ticks(tsk));
tsk_expires->prof_exp = expires_to_cputime(expires);
@@ -853,10 +884,10 @@ static void check_thread_timers(struct task_struct *tsk,
/*
* Check for the special case thread timers.
*/
- soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
+ soft = READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
if (soft != RLIM_INFINITY) {
unsigned long hard =
- ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
+ READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
if (hard != RLIM_INFINITY &&
tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
@@ -883,14 +914,12 @@ static void check_thread_timers(struct task_struct *tsk,
}
}
-static void stop_process_timers(struct signal_struct *sig)
+static inline void stop_process_timers(struct signal_struct *sig)
{
struct thread_group_cputimer *cputimer = &sig->cputimer;
- unsigned long flags;
- raw_spin_lock_irqsave(&cputimer->lock, flags);
- cputimer->running = 0;
- raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+ /* Turn off cputimer->running. This is done without locking. */
+ WRITE_ONCE(cputimer->running, false);
}
static u32 onecputick;
@@ -941,6 +970,19 @@ static void check_process_timers(struct task_struct *tsk,
unsigned long soft;
/*
+ * If cputimer is not running, then there are no active
+ * process wide timers (POSIX 1.b, itimers, RLIMIT_CPU).
+ */
+ if (!READ_ONCE(tsk->signal->cputimer.running))
+ return;
+
+ /*
+ * Signify that a thread is checking for process timers.
+ * Write access to this field is protected by the sighand lock.
+ */
+ sig->cputimer.checking_timer = true;
+
+ /*
* Collect the current process totals.
*/
thread_group_cputimer(tsk, &cputime);
@@ -959,11 +1001,11 @@ static void check_process_timers(struct task_struct *tsk,
SIGPROF);
check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
SIGVTALRM);
- soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+ soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
if (soft != RLIM_INFINITY) {
unsigned long psecs = cputime_to_secs(ptime);
unsigned long hard =
- ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
+ READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
cputime_t x;
if (psecs >= hard) {
/*
@@ -994,6 +1036,8 @@ static void check_process_timers(struct task_struct *tsk,
sig->cputime_expires.sched_exp = sched_expires;
if (task_cputime_zero(&sig->cputime_expires))
stop_process_timers(sig);
+
+ sig->cputimer.checking_timer = false;
}
/*
@@ -1096,29 +1140,36 @@ static inline int task_cputime_expired(const struct task_cputime *sample,
static inline int fastpath_timer_check(struct task_struct *tsk)
{
struct signal_struct *sig;
- cputime_t utime, stime;
-
- task_cputime(tsk, &utime, &stime);
if (!task_cputime_zero(&tsk->cputime_expires)) {
- struct task_cputime task_sample = {
- .utime = utime,
- .stime = stime,
- .sum_exec_runtime = tsk->se.sum_exec_runtime
- };
+ struct task_cputime task_sample;
+ task_cputime(tsk, &task_sample.utime, &task_sample.stime);
+ task_sample.sum_exec_runtime = tsk->se.sum_exec_runtime;
if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
return 1;
}
sig = tsk->signal;
- if (sig->cputimer.running) {
+ /*
+ * Check if thread group timers expired when the cputimer is
+ * running and no other thread in the group is already checking
+ * for thread group cputimers. These fields are read without the
+ * sighand lock. However, this is fine because this is meant to
+ * be a fastpath heuristic to determine whether we should try to
+ * acquire the sighand lock to check/handle timers.
+ *
+ * In the worst case scenario, if 'running' or 'checking_timer' gets
+ * set but the current thread doesn't see the change yet, we'll wait
+ * until the next thread in the group gets a scheduler interrupt to
+ * handle the timer. This isn't an issue in practice because these
+ * types of delays with signals actually getting sent are expected.
+ */
+ if (READ_ONCE(sig->cputimer.running) &&
+ !READ_ONCE(sig->cputimer.checking_timer)) {
struct task_cputime group_sample;
- unsigned long flags;
- raw_spin_lock_irqsave(&sig->cputimer.lock, flags);
- group_sample = sig->cputimer.cputime;
- raw_spin_unlock_irqrestore(&sig->cputimer.lock, flags);
+ sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic);
if (task_cputime_expired(&group_sample, &sig->cputime_expires))
return 1;
@@ -1155,12 +1206,8 @@ static void __run_posix_cpu_timers(struct task_struct *tsk)
* put them on the firing list.
*/
check_thread_timers(tsk, &firing);
- /*
- * If there are any active process wide timers (POSIX 1.b, itimers,
- * RLIMIT_CPU) cputimer must be running.
- */
- if (tsk->signal->cputimer.running)
- check_process_timers(tsk, &firing);
+
+ check_process_timers(tsk, &firing);
/*
* We must release these locks before taking any timer's lock.
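The conversion above replaces cputimer->lock with two lockless ingredients: an atomic "only ever grow" cmpxchg update for the sampled cputimes, and a running flag accessed with READ_ONCE/WRITE_ONCE. A user-space sketch of the same idiom using C11 atomics; the names are illustrative, not the kernel's atomic64 API:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t utime;
static atomic_bool running;

/* Retry the compare-and-swap until the stored value is >= sample;
 * a concurrent writer that stored something bigger simply wins. */
static void update_gt(_Atomic uint64_t *cputime, uint64_t sample)
{
	uint64_t cur = atomic_load(cputime);

	while (sample > cur &&
	       !atomic_compare_exchange_weak(cputime, &cur, sample))
		;	/* on failure, cur was reloaded; recheck */
}

int main(void)
{
	update_gt(&utime, 100);
	update_gt(&utime, 50);		/* smaller sample: no effect */
	/* cf. WRITE_ONCE(cputimer->running, true) after the update */
	atomic_store(&running, true);
	printf("utime=%llu running=%d\n",
	       (unsigned long long)atomic_load(&utime),
	       (int)atomic_load(&running));
	return 0;
}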
diff --git a/kernel/kernel/time/posix-timers.c b/kernel/kernel/time/posix-timers.c
index 0f5d7eae6..464a98155 100644
--- a/kernel/kernel/time/posix-timers.c
+++ b/kernel/kernel/time/posix-timers.c
@@ -272,13 +272,20 @@ static int posix_get_tai(clockid_t which_clock, struct timespec *tp)
return 0;
}
+static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec *tp)
+{
+ tp->tv_sec = 0;
+ tp->tv_nsec = hrtimer_resolution;
+ return 0;
+}
+
/*
* Initialize everything, well, just everything in Posix clocks/timers ;)
*/
static __init int init_posix_timers(void)
{
struct k_clock clock_realtime = {
- .clock_getres = hrtimer_get_res,
+ .clock_getres = posix_get_hrtimer_res,
.clock_get = posix_clock_realtime_get,
.clock_set = posix_clock_realtime_set,
.clock_adj = posix_clock_realtime_adj,
@@ -290,7 +297,7 @@ static __init int init_posix_timers(void)
.timer_del = common_timer_del,
};
struct k_clock clock_monotonic = {
- .clock_getres = hrtimer_get_res,
+ .clock_getres = posix_get_hrtimer_res,
.clock_get = posix_ktime_get_ts,
.nsleep = common_nsleep,
.nsleep_restart = hrtimer_nanosleep_restart,
@@ -300,7 +307,7 @@ static __init int init_posix_timers(void)
.timer_del = common_timer_del,
};
struct k_clock clock_monotonic_raw = {
- .clock_getres = hrtimer_get_res,
+ .clock_getres = posix_get_hrtimer_res,
.clock_get = posix_get_monotonic_raw,
};
struct k_clock clock_realtime_coarse = {
@@ -312,7 +319,7 @@ static __init int init_posix_timers(void)
.clock_get = posix_get_monotonic_coarse,
};
struct k_clock clock_tai = {
- .clock_getres = hrtimer_get_res,
+ .clock_getres = posix_get_hrtimer_res,
.clock_get = posix_get_tai,
.nsleep = common_nsleep,
.nsleep_restart = hrtimer_nanosleep_restart,
@@ -322,7 +329,7 @@ static __init int init_posix_timers(void)
.timer_del = common_timer_del,
};
struct k_clock clock_boottime = {
- .clock_getres = hrtimer_get_res,
+ .clock_getres = posix_get_hrtimer_res,
.clock_get = posix_get_boottime,
.nsleep = common_nsleep,
.nsleep_restart = hrtimer_nanosleep_restart,
@@ -755,7 +762,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
(timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv);
- remaining = ktime_sub(hrtimer_get_expires(timer), now);
+ remaining = __hrtimer_expires_remaining_adjusted(timer, now);
/* Return 0 only, when the timer is expired and not pending */
if (remaining.tv64 <= 0) {
/*
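The user-visible effect of routing clock_getres() through posix_get_hrtimer_res() is that the hrtimer-backed clocks report hrtimer_resolution. That can be observed from user space with the standard POSIX call:

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec res;

	/* CLOCK_MONOTONIC is one of the clocks wired to
	 * posix_get_hrtimer_res() above. */
	if (clock_getres(CLOCK_MONOTONIC, &res) == 0)
		printf("resolution: %ld s %ld ns\n",
		       (long)res.tv_sec, res.tv_nsec);
	return 0;
}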
diff --git a/kernel/kernel/time/tick-broadcast-hrtimer.c b/kernel/kernel/time/tick-broadcast-hrtimer.c
index 6aac4beed..1b4ac3361 100644
--- a/kernel/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/kernel/time/tick-broadcast-hrtimer.c
@@ -18,29 +18,23 @@
static struct hrtimer bctimer;
-static void bc_set_mode(enum clock_event_mode mode,
- struct clock_event_device *bc)
+static int bc_shutdown(struct clock_event_device *evt)
{
- switch (mode) {
- case CLOCK_EVT_MODE_SHUTDOWN:
- /*
- * Note, we cannot cancel the timer here as we might
- * run into the following live lock scenario:
- *
- * cpu 0 cpu1
- * lock(broadcast_lock);
- * hrtimer_interrupt()
- * bc_handler()
- * tick_handle_oneshot_broadcast();
- * lock(broadcast_lock);
- * hrtimer_cancel()
- * wait_for_callback()
- */
- hrtimer_try_to_cancel(&bctimer);
- break;
- default:
- break;
- }
+ /*
+ * Note, we cannot cancel the timer here as we might
+ * run into the following live lock scenario:
+ *
+ * cpu 0 cpu1
+ * lock(broadcast_lock);
+ * hrtimer_interrupt()
+ * bc_handler()
+ * tick_handle_oneshot_broadcast();
+ * lock(broadcast_lock);
+ * hrtimer_cancel()
+ * wait_for_callback()
+ */
+ hrtimer_try_to_cancel(&bctimer);
+ return 0;
}
/*
@@ -66,9 +60,11 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
* hrtimer_{start/cancel} functions call into tracing,
* calls to these functions must be bound within RCU_NONIDLE.
*/
- RCU_NONIDLE(bc_moved = (hrtimer_try_to_cancel(&bctimer) >= 0) ?
- !hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED) :
- 0);
+ RCU_NONIDLE({
+ bc_moved = hrtimer_try_to_cancel(&bctimer) >= 0;
+ if (bc_moved)
+ hrtimer_start(&bctimer, expires,
+ HRTIMER_MODE_ABS_PINNED);});
if (bc_moved) {
/* Bind the "device" to the cpu */
bc->bound_on = smp_processor_id();
@@ -79,7 +75,7 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
}
static struct clock_event_device ce_broadcast_hrtimer = {
- .set_mode = bc_set_mode,
+ .set_state_shutdown = bc_shutdown,
.set_next_ktime = bc_set_next,
.features = CLOCK_EVT_FEAT_ONESHOT |
CLOCK_EVT_FEAT_KTIME |
@@ -99,15 +95,17 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t)
{
ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer);
- if (ce_broadcast_hrtimer.next_event.tv64 == KTIME_MAX)
- return HRTIMER_NORESTART;
+ if (clockevent_state_oneshot(&ce_broadcast_hrtimer))
+ if (ce_broadcast_hrtimer.next_event.tv64 != KTIME_MAX)
+ return HRTIMER_RESTART;
- return HRTIMER_RESTART;
+ return HRTIMER_NORESTART;
}
void tick_setup_hrtimer_broadcast(void)
{
hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
bctimer.function = bc_handler;
+ bctimer.irqsafe = true;
clockevents_register_device(&ce_broadcast_hrtimer);
}
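The livelock comment above hinges on hrtimer_try_to_cancel()'s return convention: -1 when the callback is currently running (do not wait), 0 when the timer was not active, 1 when it was dequeued. A return-code sketch of why the shutdown path must tolerate -1 rather than spinning the way hrtimer_cancel() would; the enum and state plumbing are illustrative only:

#include <stdio.h>

enum { TIMER_IDLE = 0, TIMER_QUEUED = 1, TIMER_CALLBACK_RUNNING = 2 };

/* Mirrors the hrtimer_try_to_cancel() contract: never wait for a
 * running callback, because that callback may be spinning on a lock
 * the caller already holds (the broadcast_lock scenario above). */
static int try_to_cancel(int state)
{
	if (state == TIMER_CALLBACK_RUNNING)
		return -1;	/* give up; waiting would deadlock */
	return state == TIMER_QUEUED ? 1 : 0;
}

int main(void)
{
	printf("idle=%d queued=%d running=%d\n",
	       try_to_cancel(TIMER_IDLE),
	       try_to_cancel(TIMER_QUEUED),
	       try_to_cancel(TIMER_CALLBACK_RUNNING));
	return 0;
}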
diff --git a/kernel/kernel/time/tick-broadcast.c b/kernel/kernel/time/tick-broadcast.c
index 7e8ca4f44..f6aae7977 100644
--- a/kernel/kernel/time/tick-broadcast.c
+++ b/kernel/kernel/time/tick-broadcast.c
@@ -159,7 +159,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
{
struct clock_event_device *bc = tick_broadcast_device.evtdev;
unsigned long flags;
- int ret;
+ int ret = 0;
raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
@@ -221,13 +221,14 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
* If we kept the cpu in the broadcast mask,
* tell the caller to leave the per cpu device
* in shutdown state. The periodic interrupt
- * is delivered by the broadcast device.
+ * is delivered by the broadcast device if
+ * the broadcast device exists and is not
+ * hrtimer based.
*/
- ret = cpumask_test_cpu(cpu, tick_broadcast_mask);
+ if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER))
+ ret = cpumask_test_cpu(cpu, tick_broadcast_mask);
break;
default:
- /* Nothing to do */
- ret = 0;
break;
}
}
@@ -255,18 +256,32 @@ int tick_receive_broadcast(void)
/*
* Broadcast the event to the cpus, which are set in the mask (mangled).
*/
-static void tick_do_broadcast(struct cpumask *mask)
+static bool tick_do_broadcast(struct cpumask *mask)
{
int cpu = smp_processor_id();
struct tick_device *td;
+ bool local = false;
/*
* Check, if the current cpu is in the mask
*/
if (cpumask_test_cpu(cpu, mask)) {
+ struct clock_event_device *bc = tick_broadcast_device.evtdev;
+
cpumask_clear_cpu(cpu, mask);
- td = &per_cpu(tick_cpu_device, cpu);
- td->evtdev->event_handler(td->evtdev);
+ /*
+ * We only run the local handler, if the broadcast
+ * device is not hrtimer based. Otherwise we run into
+ * a hrtimer recursion.
+ *
+ * local timer_interrupt()
+ * local_handler()
+ * expire_hrtimers()
+ * bc_handler()
+ * local_handler()
+ * expire_hrtimers()
+ */
+ local = !(bc->features & CLOCK_EVT_FEAT_HRTIMER);
}
if (!cpumask_empty(mask)) {
@@ -279,16 +294,17 @@ static void tick_do_broadcast(struct cpumask *mask)
td = &per_cpu(tick_cpu_device, cpumask_first(mask));
td->evtdev->broadcast(mask);
}
+ return local;
}
/*
* Periodic broadcast:
* - invoke the broadcast handlers
*/
-static void tick_do_periodic_broadcast(void)
+static bool tick_do_periodic_broadcast(void)
{
cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask);
- tick_do_broadcast(tmpmask);
+ return tick_do_broadcast(tmpmask);
}
/*
@@ -296,34 +312,33 @@ static void tick_do_periodic_broadcast(void)
*/
static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
{
- ktime_t next;
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
+ bool bc_local;
raw_spin_lock(&tick_broadcast_lock);
- tick_do_periodic_broadcast();
+ /* Handle spurious interrupts gracefully */
+ if (clockevent_state_shutdown(tick_broadcast_device.evtdev)) {
+ raw_spin_unlock(&tick_broadcast_lock);
+ return;
+ }
- /*
- * The device is in periodic mode. No reprogramming necessary:
- */
- if (dev->state == CLOCK_EVT_STATE_PERIODIC)
- goto unlock;
+ bc_local = tick_do_periodic_broadcast();
- /*
- * Setup the next period for devices, which do not have
- * periodic mode. We read dev->next_event first and add to it
- * when the event already expired. clockevents_program_event()
- * sets dev->next_event only when the event is really
- * programmed to the device.
- */
- for (next = dev->next_event; ;) {
- next = ktime_add(next, tick_period);
+ if (clockevent_state_oneshot(dev)) {
+ ktime_t next = ktime_add(dev->next_event, tick_period);
- if (!clockevents_program_event(dev, next, false))
- goto unlock;
- tick_do_periodic_broadcast();
+ clockevents_program_event(dev, next, true);
}
-unlock:
raw_spin_unlock(&tick_broadcast_lock);
+
+ /*
+ * We run the handler of the local cpu after dropping
+ * tick_broadcast_lock because the handler might deadlock when
+ * trying to switch to oneshot mode.
+ */
+ if (bc_local)
+ td->evtdev->event_handler(td->evtdev);
}
/**
@@ -366,8 +381,16 @@ void tick_broadcast_control(enum tick_broadcast_mode mode)
case TICK_BROADCAST_ON:
cpumask_set_cpu(cpu, tick_broadcast_on);
if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {
- if (tick_broadcast_device.mode ==
- TICKDEV_MODE_PERIODIC)
+ /*
+ * Only shut down the cpu local device if:
+ *
+ * - the broadcast device exists
+ * - the broadcast device is not an hrtimer-based one
+ * - the broadcast device is in periodic mode to
+ * avoid a hiccup during the switch to oneshot mode
+ */
+ if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER) &&
+ tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
clockevents_shutdown(dev);
}
break;
@@ -386,14 +409,16 @@ void tick_broadcast_control(enum tick_broadcast_mode mode)
break;
}
- if (cpumask_empty(tick_broadcast_mask)) {
- if (!bc_stopped)
- clockevents_shutdown(bc);
- } else if (bc_stopped) {
- if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
- tick_broadcast_start_periodic(bc);
- else
- tick_broadcast_setup_oneshot(bc);
+ if (bc) {
+ if (cpumask_empty(tick_broadcast_mask)) {
+ if (!bc_stopped)
+ clockevents_shutdown(bc);
+ } else if (bc_stopped) {
+ if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
+ tick_broadcast_start_periodic(bc);
+ else
+ tick_broadcast_setup_oneshot(bc);
+ }
}
raw_spin_unlock(&tick_broadcast_lock);
}
@@ -532,23 +557,19 @@ static void tick_broadcast_set_affinity(struct clock_event_device *bc,
irq_set_affinity(bc->irq, bc->cpumask);
}
-static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
- ktime_t expires, int force)
+static void tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
+ ktime_t expires)
{
- int ret;
-
- if (bc->state != CLOCK_EVT_STATE_ONESHOT)
- clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
+ if (!clockevent_state_oneshot(bc))
+ clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT);
- ret = clockevents_program_event(bc, expires, force);
- if (!ret)
- tick_broadcast_set_affinity(bc, cpumask_of(cpu));
- return ret;
+ clockevents_program_event(bc, expires, 1);
+ tick_broadcast_set_affinity(bc, cpumask_of(cpu));
}
static void tick_resume_broadcast_oneshot(struct clock_event_device *bc)
{
- clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
+ clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT);
}
/*
@@ -566,7 +587,7 @@ void tick_check_oneshot_broadcast_this_cpu(void)
* switched over, leave the device alone.
*/
if (td->mode == TICKDEV_MODE_ONESHOT) {
- clockevents_set_state(td->evtdev,
+ clockevents_switch_state(td->evtdev,
CLOCK_EVT_STATE_ONESHOT);
}
}
@@ -580,9 +601,9 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
struct tick_device *td;
ktime_t now, next_event;
int cpu, next_cpu = 0;
+ bool bc_local;
raw_spin_lock(&tick_broadcast_lock);
-again:
dev->next_event.tv64 = KTIME_MAX;
next_event.tv64 = KTIME_MAX;
cpumask_clear(tmpmask);
@@ -624,7 +645,7 @@ again:
/*
* Wakeup the cpus which have an expired event.
*/
- tick_do_broadcast(tmpmask);
+ bc_local = tick_do_broadcast(tmpmask);
/*
* Two reasons for reprogram:
@@ -636,15 +657,15 @@ again:
* - There are pending events on sleeping CPUs which were not
* in the event mask
*/
- if (next_event.tv64 != KTIME_MAX) {
- /*
- * Rearm the broadcast device. If event expired,
- * repeat the above
- */
- if (tick_broadcast_set_event(dev, next_cpu, next_event, 0))
- goto again;
- }
+ if (next_event.tv64 != KTIME_MAX)
+ tick_broadcast_set_event(dev, next_cpu, next_event);
+
raw_spin_unlock(&tick_broadcast_lock);
+
+ if (bc_local) {
+ td = this_cpu_ptr(&tick_cpu_device);
+ td->evtdev->event_handler(td->evtdev);
+ }
}
static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu)
@@ -670,77 +691,88 @@ static void broadcast_shutdown_local(struct clock_event_device *bc,
if (dev->next_event.tv64 < bc->next_event.tv64)
return;
}
- clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
+ clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
}
-/**
- * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode
- * @state: The target state (enter/exit)
- *
- * The system enters/leaves a state, where affected devices might stop
- * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups.
- *
- * Called with interrupts disabled, so clockevents_lock is not
- * required here because the local clock event device cannot go away
- * under us.
- */
-int tick_broadcast_oneshot_control(enum tick_broadcast_state state)
+int __tick_broadcast_oneshot_control(enum tick_broadcast_state state)
{
struct clock_event_device *bc, *dev;
- struct tick_device *td;
int cpu, ret = 0;
ktime_t now;
/*
- * Periodic mode does not care about the enter/exit of power
- * states
+ * If there is no broadcast device, tell the caller not to go
+ * into deep idle.
*/
- if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
- return 0;
+ if (!tick_broadcast_device.evtdev)
+ return -EBUSY;
- /*
- * We are called with preemtion disabled from the depth of the
- * idle code, so we can't be moved away.
- */
- td = this_cpu_ptr(&tick_cpu_device);
- dev = td->evtdev;
-
- if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
- return 0;
+ dev = this_cpu_ptr(&tick_cpu_device)->evtdev;
raw_spin_lock(&tick_broadcast_lock);
bc = tick_broadcast_device.evtdev;
cpu = smp_processor_id();
if (state == TICK_BROADCAST_ENTER) {
+ /*
+ * If the current CPU owns the hrtimer broadcast
+ * mechanism, it cannot go deep idle and we do not add
+ * the CPU to the broadcast mask. We don't have to go
+ * through the EXIT path as the local timer is not
+ * shutdown.
+ */
+ ret = broadcast_needs_cpu(bc, cpu);
+ if (ret)
+ goto out;
+
+ /*
+ * If the broadcast device is in periodic mode, we
+ * return.
+ */
+ if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
+ /* If it is a hrtimer based broadcast, return busy */
+ if (bc->features & CLOCK_EVT_FEAT_HRTIMER)
+ ret = -EBUSY;
+ goto out;
+ }
+
if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) {
WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
+
+ /* Conditionally shut down the local timer. */
broadcast_shutdown_local(bc, dev);
+
/*
* We only reprogram the broadcast timer if we
* did not mark ourself in the force mask and
* if the cpu local event is earlier than the
* broadcast event. If the current CPU is in
* the force mask, then we are going to be
- * woken by the IPI right away.
+ * woken by the IPI right away; we return
+ * busy, so the CPU does not try to go deep
+ * idle.
*/
- if (!cpumask_test_cpu(cpu, tick_broadcast_force_mask) &&
- dev->next_event.tv64 < bc->next_event.tv64)
- tick_broadcast_set_event(bc, cpu, dev->next_event, 1);
+ if (cpumask_test_cpu(cpu, tick_broadcast_force_mask)) {
+ ret = -EBUSY;
+ } else if (dev->next_event.tv64 < bc->next_event.tv64) {
+ tick_broadcast_set_event(bc, cpu, dev->next_event);
+ /*
+ * In case of hrtimer broadcasts the
+ * programming might have moved the
+ * timer to this cpu. If yes, remove
+ * us from the broadcast mask and
+ * return busy.
+ */
+ ret = broadcast_needs_cpu(bc, cpu);
+ if (ret) {
+ cpumask_clear_cpu(cpu,
+ tick_broadcast_oneshot_mask);
+ }
+ }
}
- /*
- * If the current CPU owns the hrtimer broadcast
- * mechanism, it cannot go deep idle and we remove the
- * CPU from the broadcast mask. We don't have to go
- * through the EXIT path as the local timer is not
- * shutdown.
- */
- ret = broadcast_needs_cpu(bc, cpu);
- if (ret)
- cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
} else {
if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
- clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
+ clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
/*
* The cpu which was handling the broadcast
* timer marked this cpu in the broadcast
@@ -807,7 +839,6 @@ out:
raw_spin_unlock(&tick_broadcast_lock);
return ret;
}
-EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control);
/*
* Reset the one shot broadcast for a cpu
@@ -842,7 +873,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
/* Set it up only once ! */
if (bc->event_handler != tick_handle_oneshot_broadcast) {
- int was_periodic = bc->state == CLOCK_EVT_STATE_PERIODIC;
+ int was_periodic = clockevent_state_periodic(bc);
bc->event_handler = tick_handle_oneshot_broadcast;
@@ -858,10 +889,10 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
tick_broadcast_oneshot_mask, tmpmask);
if (was_periodic && !cpumask_empty(tmpmask)) {
- clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
+ clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT);
tick_broadcast_init_next_event(tmpmask,
tick_next_period);
- tick_broadcast_set_event(bc, cpu, tick_next_period, 1);
+ tick_broadcast_set_event(bc, cpu, tick_next_period);
} else
bc->next_event.tv64 = KTIME_MAX;
} else {
@@ -949,6 +980,16 @@ bool tick_broadcast_oneshot_available(void)
return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false;
}
+#else
+int __tick_broadcast_oneshot_control(enum tick_broadcast_state state)
+{
+ struct clock_event_device *bc = tick_broadcast_device.evtdev;
+
+ if (!bc || (bc->features & CLOCK_EVT_FEAT_HRTIMER))
+ return -EBUSY;
+
+ return 0;
+}
#endif
void __init tick_broadcast_init(void)
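tick_do_broadcast() now returns whether the local handler still has to run, so callers invoke it only after dropping tick_broadcast_lock; for an hrtimer-based broadcast device it must never run inline, or the recursion sketched in the comment occurs. A compact illustration of that decision, with illustrative types standing in for struct clock_event_device:

#include <stdbool.h>
#include <stdio.h>

#define FEAT_HRTIMER (1u << 0)

struct clockdev { unsigned int features; };

/* Returns true when the caller should run the local handler itself,
 * after releasing the lock; false for an hrtimer-based broadcast
 * device, which would recurse back into the broadcast handler. */
static bool broadcast_runs_local(const struct clockdev *bc,
				 bool cpu_in_mask)
{
	return cpu_in_mask && !(bc->features & FEAT_HRTIMER);
}

int main(void)
{
	struct clockdev hw = { 0 };
	struct clockdev hrt = { FEAT_HRTIMER };

	printf("hw=%d hrtimer=%d\n",
	       broadcast_runs_local(&hw, true),
	       broadcast_runs_local(&hrt, true));
	return 0;
}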
diff --git a/kernel/kernel/time/tick-common.c b/kernel/kernel/time/tick-common.c
index 14a10917c..5a47f2e98 100644
--- a/kernel/kernel/time/tick-common.c
+++ b/kernel/kernel/time/tick-common.c
@@ -19,6 +19,7 @@
#include <linux/profile.h>
#include <linux/sched.h>
#include <linux/module.h>
+#include <trace/events/power.h>
#include <asm/irq_regs.h>
@@ -104,7 +105,17 @@ void tick_handle_periodic(struct clock_event_device *dev)
tick_periodic(cpu);
- if (dev->state != CLOCK_EVT_STATE_ONESHOT)
+#if defined(CONFIG_HIGH_RES_TIMERS) || defined(CONFIG_NO_HZ_COMMON)
+ /*
+ * The cpu might have transitioned to HIGHRES or NOHZ mode via
+ * update_process_times() -> run_local_timers() ->
+ * hrtimer_run_queues().
+ */
+ if (dev->event_handler != tick_handle_periodic)
+ return;
+#endif
+
+ if (!clockevent_state_oneshot(dev))
return;
for (;;) {
/*
@@ -142,7 +153,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
!tick_broadcast_oneshot_active()) {
- clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC);
+ clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC);
} else {
unsigned long seq;
ktime_t next;
@@ -152,7 +163,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
next = tick_next_period;
} while (read_seqcount_retry(&jiffies_seq, seq));
- clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
+ clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
for (;;) {
if (!clockevents_program_event(dev, next, false))
@@ -295,9 +306,6 @@ void tick_check_new_device(struct clock_event_device *newdev)
int cpu;
cpu = smp_processor_id();
- if (!cpumask_test_cpu(cpu, newdev->cpumask))
- goto out_bc;
-
td = &per_cpu(tick_cpu_device, cpu);
curdev = td->evtdev;
@@ -334,6 +342,28 @@ out_bc:
tick_install_broadcast_device(newdev);
}
+/**
+ * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode
+ * @state: The target state (enter/exit)
+ *
+ * The system enters/leaves a state where affected devices might stop
+ * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups.
+ *
+ * Called with interrupts disabled, so clockevents_lock is not
+ * required here because the local clock event device cannot go away
+ * under us.
+ */
+int tick_broadcast_oneshot_control(enum tick_broadcast_state state)
+{
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
+
+ if (!(td->evtdev->features & CLOCK_EVT_FEAT_C3STOP))
+ return 0;
+
+ return __tick_broadcast_oneshot_control(state);
+}
+EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control);
+
#ifdef CONFIG_HOTPLUG_CPU
/*
* Transfer the do_timer job away from a dying cpu.
@@ -369,8 +399,7 @@ void tick_shutdown(unsigned int cpu)
* Prevent that the clock events layer tries to call
* the set mode function!
*/
- dev->state = CLOCK_EVT_STATE_DETACHED;
- dev->mode = CLOCK_EVT_MODE_UNUSED;
+ clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED);
clockevents_exchange_device(dev, NULL);
dev->event_handler = clockevents_handle_noop;
td->evtdev = NULL;
@@ -442,6 +471,7 @@ void tick_resume(void)
tick_resume_local();
}
+#ifdef CONFIG_SUSPEND
static DEFINE_RAW_SPINLOCK(tick_freeze_lock);
static unsigned int tick_freeze_depth;
@@ -459,10 +489,13 @@ void tick_freeze(void)
raw_spin_lock(&tick_freeze_lock);
tick_freeze_depth++;
- if (tick_freeze_depth == num_online_cpus())
+ if (tick_freeze_depth == num_online_cpus()) {
+ trace_suspend_resume(TPS("timekeeping_freeze"),
+ smp_processor_id(), true);
timekeeping_suspend();
- else
+ } else {
tick_suspend_local();
+ }
raw_spin_unlock(&tick_freeze_lock);
}
@@ -480,15 +513,19 @@ void tick_unfreeze(void)
{
raw_spin_lock(&tick_freeze_lock);
- if (tick_freeze_depth == num_online_cpus())
+ if (tick_freeze_depth == num_online_cpus()) {
timekeeping_resume();
- else
+ trace_suspend_resume(TPS("timekeeping_freeze"),
+ smp_processor_id(), false);
+ } else {
tick_resume_local();
+ }
tick_freeze_depth--;
raw_spin_unlock(&tick_freeze_lock);
}
+#endif /* CONFIG_SUSPEND */
/**
* tick_init - initialize the tick control
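tick_freeze()/tick_unfreeze() above use a depth counter so that only the last CPU to freeze suspends timekeeping and only the first CPU to unfreeze resumes it; every other CPU just handles its local tick. A single-threaded sketch of that mirrored refcount (the kernel serializes the real thing with tick_freeze_lock):

#include <stdio.h>

static unsigned int freeze_depth;
static const unsigned int num_cpus = 4;

static void freeze_one(void)
{
	if (++freeze_depth == num_cpus)
		printf("last CPU in: suspend timekeeping\n");
	else
		printf("depth %u: suspend local tick\n", freeze_depth);
}

static void unfreeze_one(void)
{
	if (freeze_depth == num_cpus)
		printf("first CPU out: resume timekeeping\n");
	else
		printf("depth %u: resume local tick\n", freeze_depth);
	freeze_depth--;
}

int main(void)
{
	for (unsigned int i = 0; i < num_cpus; i++)
		freeze_one();
	for (unsigned int i = 0; i < num_cpus; i++)
		unfreeze_one();
	return 0;
}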
diff --git a/kernel/kernel/time/tick-internal.h b/kernel/kernel/time/tick-internal.h
index b64fdd805..966a5a6fd 100644
--- a/kernel/kernel/time/tick-internal.h
+++ b/kernel/kernel/time/tick-internal.h
@@ -36,11 +36,22 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
}
+static inline enum clock_event_state clockevent_get_state(struct clock_event_device *dev)
+{
+ return dev->state_use_accessors;
+}
+
+static inline void clockevent_set_state(struct clock_event_device *dev,
+ enum clock_event_state state)
+{
+ dev->state_use_accessors = state;
+}
+
extern void clockevents_shutdown(struct clock_event_device *dev);
extern void clockevents_exchange_device(struct clock_event_device *old,
struct clock_event_device *new);
-extern void clockevents_set_state(struct clock_event_device *dev,
- enum clock_event_state state);
+extern void clockevents_switch_state(struct clock_event_device *dev,
+ enum clock_event_state state);
extern int clockevents_program_event(struct clock_event_device *dev,
ktime_t expires, bool force);
extern void clockevents_handle_noop(struct clock_event_device *dev);
@@ -137,3 +148,19 @@ extern void tick_nohz_init(void);
# else
static inline void tick_nohz_init(void) { }
#endif
+
+#ifdef CONFIG_NO_HZ_COMMON
+extern unsigned long tick_nohz_active;
+#else
+#define tick_nohz_active (0)
+#endif
+
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+extern void timers_update_migration(bool update_nohz);
+#else
+static inline void timers_update_migration(bool update_nohz) { }
+#endif
+
+DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
+
+extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
diff --git a/kernel/kernel/time/tick-oneshot.c b/kernel/kernel/time/tick-oneshot.c
index 67a64b167..b51344652 100644
--- a/kernel/kernel/time/tick-oneshot.c
+++ b/kernel/kernel/time/tick-oneshot.c
@@ -28,6 +28,22 @@ int tick_program_event(ktime_t expires, int force)
{
struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
+ if (unlikely(expires.tv64 == KTIME_MAX)) {
+ /*
+ * We don't need the clock event device any more, stop it.
+ */
+ clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED);
+ return 0;
+ }
+
+ if (unlikely(clockevent_state_oneshot_stopped(dev))) {
+ /*
+ * We need the clock event again, configure it in ONESHOT mode
+ * before using it.
+ */
+ clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
+ }
+
return clockevents_program_event(dev, expires, force);
}
@@ -38,7 +54,7 @@ void tick_resume_oneshot(void)
{
struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
- clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
+ clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
clockevents_program_event(dev, ktime_get(), true);
}
@@ -50,7 +66,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
ktime_t next_event)
{
newdev->event_handler = handler;
- clockevents_set_state(newdev, CLOCK_EVT_STATE_ONESHOT);
+ clockevents_switch_state(newdev, CLOCK_EVT_STATE_ONESHOT);
clockevents_program_event(newdev, next_event, true);
}
@@ -81,7 +97,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
td->mode = TICKDEV_MODE_ONESHOT;
dev->event_handler = handler;
- clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
+ clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
tick_broadcast_switch_to_oneshot();
return 0;
}
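tick_program_event() above treats KTIME_MAX as "nothing pending": the device is parked in ONESHOT_STOPPED rather than programmed to a bogus far-future value, and is switched back to ONESHOT before the next real expiry is set. A toy state machine showing that flow; states and names are illustrative only:

#include <stdint.h>
#include <stdio.h>

#define KTIME_MAX INT64_MAX

enum state { ONESHOT, ONESHOT_STOPPED };
static enum state dev_state = ONESHOT;

static int program_event(int64_t expires)
{
	if (expires == KTIME_MAX) {
		dev_state = ONESHOT_STOPPED;	/* nothing to wait for */
		printf("device stopped\n");
		return 0;
	}
	if (dev_state == ONESHOT_STOPPED)
		dev_state = ONESHOT;		/* reactivate first */
	printf("programmed for %lld\n", (long long)expires);
	return 0;
}

int main(void)
{
	program_event(KTIME_MAX);	/* park the device */
	program_event(123456789);	/* wakes it up again */
	return 0;
}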
diff --git a/kernel/kernel/time/tick-sched.c b/kernel/kernel/time/tick-sched.c
index f61dbf202..d536824cb 100644
--- a/kernel/kernel/time/tick-sched.c
+++ b/kernel/kernel/time/tick-sched.c
@@ -207,27 +207,9 @@ static bool can_stop_full_tick(void)
return true;
}
-static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now);
-
-/*
- * Re-evaluate the need for the tick on the current CPU
- * and restart it if necessary.
- */
-void __tick_nohz_full_check(void)
-{
- struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
-
- if (tick_nohz_full_cpu(smp_processor_id())) {
- if (ts->tick_stopped && !is_idle_task(current)) {
- if (!can_stop_full_tick())
- tick_nohz_restart_sched_tick(ts, ktime_get());
- }
- }
-}
-
static void nohz_full_kick_work_func(struct irq_work *work)
{
- __tick_nohz_full_check();
+ /* Empty, the tick restart happens on tick_nohz_irq_exit() */
}
static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
@@ -263,7 +245,7 @@ void tick_nohz_full_kick_cpu(int cpu)
static void nohz_full_kick_ipi(void *info)
{
- __tick_nohz_full_check();
+ /* Empty, the tick restart happens on tick_nohz_irq_exit() */
}
/*
@@ -287,7 +269,7 @@ void tick_nohz_full_kick_all(void)
* It might need the tick due to per task/process properties:
* perf events, posix cpu timers, ...
*/
-void __tick_nohz_task_switch(struct task_struct *tsk)
+void __tick_nohz_task_switch(void)
{
unsigned long flags;
@@ -319,16 +301,17 @@ static int __init tick_nohz_full_setup(char *str)
__setup("nohz_full=", tick_nohz_full_setup);
static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+ unsigned long action,
+ void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_DOWN_PREPARE:
/*
- * If we handle the timekeeping duty for full dynticks CPUs,
- * we can't safely shutdown that CPU.
+ * The boot CPU handles housekeeping duty (unbound timers,
+ * workqueues, timekeeping, ...) on behalf of full dynticks
+ * CPUs. It must remain online when nohz full is enabled.
*/
if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
return NOTIFY_BAD;
@@ -399,6 +382,12 @@ void __init tick_nohz_init(void)
cpu_notifier(tick_nohz_cpu_down_callback, 0);
pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
cpumask_pr_args(tick_nohz_full_mask));
+
+ /*
+ * We need at least one CPU to handle housekeeping work such
+ * as timekeeping, unbound timers, workqueues, ...
+ */
+ WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
}
#endif
@@ -410,7 +399,7 @@ void __init tick_nohz_init(void)
* NO HZ enabled ?
*/
static int tick_nohz_enabled __read_mostly = 1;
-int tick_nohz_active __read_mostly;
+unsigned long tick_nohz_active __read_mostly;
/*
* Enable / Disable tickless mode
*/
@@ -578,179 +567,176 @@ EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
{
- hrtimer_cancel(&ts->sched_timer);
- hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
+ hrtimer_cancel(&ts->sched_timer);
+ hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
- /* Forward the time to expire in the future */
- hrtimer_forward(&ts->sched_timer, now, tick_period);
+ /* Forward the time to expire in the future */
+ hrtimer_forward(&ts->sched_timer, now, tick_period);
- if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
- hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
- else
- tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
+ if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
+ hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
+ else
+ tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
}
static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
ktime_t now, int cpu)
{
- unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
- ktime_t last_update, expires, ret = { .tv64 = 0 };
- unsigned long rcu_delta_jiffies;
struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
- u64 time_delta;
-
- time_delta = timekeeping_max_deferment();
+ u64 basemono, next_tick, next_tmr, next_rcu, delta, expires;
+ unsigned long seq, basejiff;
+ ktime_t tick;
/* Read jiffies and the time when jiffies were updated last */
do {
seq = read_seqcount_begin(&jiffies_seq);
- last_update = last_jiffies_update;
- last_jiffies = jiffies;
+ basemono = last_jiffies_update.tv64;
+ basejiff = jiffies;
} while (read_seqcount_retry(&jiffies_seq, seq));
+ ts->last_jiffies = basejiff;
- if (rcu_needs_cpu(&rcu_delta_jiffies) ||
+ if (rcu_needs_cpu(basemono, &next_rcu) ||
arch_needs_cpu() || irq_work_needs_cpu()) {
- next_jiffies = last_jiffies + 1;
- delta_jiffies = 1;
+ next_tick = basemono + TICK_NSEC;
} else {
- /* Get the next timer wheel timer */
- next_jiffies = get_next_timer_interrupt(last_jiffies);
- delta_jiffies = next_jiffies - last_jiffies;
- if (rcu_delta_jiffies < delta_jiffies) {
- next_jiffies = last_jiffies + rcu_delta_jiffies;
- delta_jiffies = rcu_delta_jiffies;
- }
+ /*
+ * Get the next pending timer. If high resolution
+ * timers are enabled this only takes the timer wheel
+ * timers into account. If high resolution timers are
+ * disabled this also looks at the next expiring
+ * hrtimer.
+ */
+ next_tmr = get_next_timer_interrupt(basejiff, basemono);
+ ts->next_timer = next_tmr;
+ /* Take the next rcu event into account */
+ next_tick = next_rcu < next_tmr ? next_rcu : next_tmr;
}
/*
- * Do not stop the tick, if we are only one off (or less)
- * or if the cpu is required for RCU:
+ * If the tick is due in the next period, keep it ticking or
+ * restart it properly.
*/
- if (!ts->tick_stopped && delta_jiffies <= 1)
- goto out;
-
- /* Schedule the tick, if we are at least one jiffie off */
- if ((long)delta_jiffies >= 1) {
-
- /*
- * If this cpu is the one which updates jiffies, then
- * give up the assignment and let it be taken by the
- * cpu which runs the tick timer next, which might be
- * this cpu as well. If we don't drop this here the
- * jiffies might be stale and do_timer() never
- * invoked. Keep track of the fact that it was the one
- * which had the do_timer() duty last. If this cpu is
- * the one which had the do_timer() duty last, we
- * limit the sleep time to the timekeeping
- * max_deferement value which we retrieved
- * above. Otherwise we can sleep as long as we want.
- */
- if (cpu == tick_do_timer_cpu) {
- tick_do_timer_cpu = TICK_DO_TIMER_NONE;
- ts->do_timer_last = 1;
- } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
- time_delta = KTIME_MAX;
- ts->do_timer_last = 0;
- } else if (!ts->do_timer_last) {
- time_delta = KTIME_MAX;
+ delta = next_tick - basemono;
+ if (delta <= (u64)TICK_NSEC) {
+ tick.tv64 = 0;
+ if (!ts->tick_stopped)
+ goto out;
+ if (delta == 0) {
+ /* Tick is stopped, but required now. Enforce it */
+ tick_nohz_restart(ts, now);
+ goto out;
}
+ }
+
+ /*
+ * If this cpu is the one which updates jiffies, then give up
+ * the assignment and let it be taken by the cpu which runs
+ * the tick timer next, which might be this cpu as well. If we
+ * don't drop this here, the jiffies might be stale and
+ * do_timer() never invoked. Keep track of the fact that it
+ * was the one which had the do_timer() duty last. If this cpu
+ * is the one which had the do_timer() duty last, we limit the
+ * sleep time to the timekeeping max_deferment value.
+ * Otherwise we can sleep as long as we want.
+ */
+ delta = timekeeping_max_deferment();
+ if (cpu == tick_do_timer_cpu) {
+ tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+ ts->do_timer_last = 1;
+ } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
+ delta = KTIME_MAX;
+ ts->do_timer_last = 0;
+ } else if (!ts->do_timer_last) {
+ delta = KTIME_MAX;
+ }
#ifdef CONFIG_NO_HZ_FULL
- if (!ts->inidle) {
- time_delta = min(time_delta,
- scheduler_tick_max_deferment());
- }
+ /* Limit the tick delta to the maximum scheduler deferment */
+ if (!ts->inidle)
+ delta = min(delta, scheduler_tick_max_deferment());
#endif
- /*
- * calculate the expiry time for the next timer wheel
- * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
- * that there is no timer pending or at least extremely
- * far into the future (12 days for HZ=1000). In this
- * case we set the expiry to the end of time.
- */
- if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
- /*
- * Calculate the time delta for the next timer event.
- * If the time delta exceeds the maximum time delta
- * permitted by the current clocksource then adjust
- * the time delta accordingly to ensure the
- * clocksource does not wrap.
- */
- time_delta = min_t(u64, time_delta,
- tick_period.tv64 * delta_jiffies);
- }
-
- if (time_delta < KTIME_MAX)
- expires = ktime_add_ns(last_update, time_delta);
- else
- expires.tv64 = KTIME_MAX;
-
- /* Skip reprogram of event if its not changed */
- if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
- goto out;
+ /* Calculate the next expiry time */
+ if (delta < (KTIME_MAX - basemono))
+ expires = basemono + delta;
+ else
+ expires = KTIME_MAX;
- ret = expires;
+ expires = min_t(u64, expires, next_tick);
+ tick.tv64 = expires;
- /*
- * nohz_stop_sched_tick can be called several times before
- * the nohz_restart_sched_tick is called. This happens when
- * interrupts arrive which do not cause a reschedule. In the
- * first call we save the current tick time, so we can restart
- * the scheduler tick in nohz_restart_sched_tick.
- */
- if (!ts->tick_stopped) {
- nohz_balance_enter_idle(cpu);
- calc_load_enter_idle();
-
- ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
- ts->tick_stopped = 1;
- trace_tick_stop(1, " ");
- }
+ /* Skip reprogram of the event if it's not changed */
+ if (ts->tick_stopped && (expires == dev->next_event.tv64))
+ goto out;
- /*
- * If the expiration time == KTIME_MAX, then
- * in this case we simply stop the tick timer.
- */
- if (unlikely(expires.tv64 == KTIME_MAX)) {
- if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
- hrtimer_cancel(&ts->sched_timer);
- goto out;
- }
+ /*
+ * nohz_stop_sched_tick can be called several times before
+ * the nohz_restart_sched_tick is called. This happens when
+ * interrupts arrive which do not cause a reschedule. In the
+ * first call we save the current tick time, so we can restart
+ * the scheduler tick in nohz_restart_sched_tick.
+ */
+ if (!ts->tick_stopped) {
+ nohz_balance_enter_idle(cpu);
+ calc_load_enter_idle();
- if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
- hrtimer_start(&ts->sched_timer, expires,
- HRTIMER_MODE_ABS_PINNED);
- else
- tick_program_event(expires, 1);
- } else {
- /* Tick is stopped, but required now. Enforce it */
- tick_nohz_restart(ts, now);
+ ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
+ ts->tick_stopped = 1;
+ trace_tick_stop(1, " ");
+ }
+ /*
+ * If the expiration time == KTIME_MAX, then we simply stop
+ * the tick timer.
+ */
+ if (unlikely(expires == KTIME_MAX)) {
+ if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
+ hrtimer_cancel(&ts->sched_timer);
+ goto out;
}
+
+ if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
+ hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED);
+ else
+ tick_program_event(tick, 1);
out:
- ts->next_jiffies = next_jiffies;
- ts->last_jiffies = last_jiffies;
+ /* Update the estimated sleep length */
ts->sleep_length = ktime_sub(dev->next_event, now);
+ return tick;
+}
- return ret;
+static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
+{
+ /* Update jiffies first */
+ tick_do_update_jiffies64(now);
+ update_cpu_load_nohz();
+
+ calc_load_exit_idle();
+ touch_softlockup_watchdog();
+ /*
+ * Cancel the scheduled timer and restore the tick
+ */
+ ts->tick_stopped = 0;
+ ts->idle_exittime = now;
+
+ tick_nohz_restart(ts, now);
}
-static void tick_nohz_full_stop_tick(struct tick_sched *ts)
+static void tick_nohz_full_update_tick(struct tick_sched *ts)
{
#ifdef CONFIG_NO_HZ_FULL
int cpu = smp_processor_id();
- if (!tick_nohz_full_cpu(cpu) || is_idle_task(current))
+ if (!tick_nohz_full_cpu(cpu))
return;
if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
return;
- if (!can_stop_full_tick())
- return;
-
- tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
+ if (can_stop_full_tick())
+ tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
+ else if (ts->tick_stopped)
+ tick_nohz_restart_sched_tick(ts, ktime_get());
#endif
}
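The reworked stop path above does the expiry calculation in plain u64 nanosecond arithmetic rather than ktime_t helpers; the one subtlety is clamping so that basemono + delta cannot wrap past KTIME_MAX. A minimal userspace sketch of that clamp (the values fed to it are illustrative):

#include <stdint.h>
#include <stdio.h>

#define KTIME_MAX INT64_MAX

static uint64_t clamped_expiry(uint64_t basemono, uint64_t delta)
{
	/* Only add when the sum cannot exceed KTIME_MAX */
	if (delta < (uint64_t)KTIME_MAX - basemono)
		return basemono + delta;
	return (uint64_t)KTIME_MAX;
}

int main(void)
{
	printf("%llu\n", (unsigned long long)clamped_expiry(1000, 500));
	printf("%llu\n", (unsigned long long)clamped_expiry((uint64_t)KTIME_MAX - 10, 500));
	return 0;
}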
@@ -873,7 +859,7 @@ void tick_nohz_irq_exit(void)
if (ts->inidle)
__tick_nohz_idle_enter(ts);
else
- tick_nohz_full_stop_tick(ts);
+ tick_nohz_full_update_tick(ts);
}
/**
@@ -888,23 +874,6 @@ ktime_t tick_nohz_get_sleep_length(void)
return ts->sleep_length;
}
-static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
-{
- /* Update jiffies first */
- tick_do_update_jiffies64(now);
- update_cpu_load_nohz();
-
- calc_load_exit_idle();
- touch_softlockup_watchdog();
- /*
- * Cancel the scheduled timer and restore the tick
- */
- ts->tick_stopped = 0;
- ts->idle_exittime = now;
-
- tick_nohz_restart(ts, now);
-}
-
static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
{
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
@@ -958,12 +927,6 @@ void tick_nohz_idle_exit(void)
local_irq_enable();
}
-static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
-{
- hrtimer_forward(&ts->sched_timer, now, tick_period);
- return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0);
-}
-
/*
* The nohz low res interrupt handler
*/
@@ -982,10 +945,18 @@ static void tick_nohz_handler(struct clock_event_device *dev)
if (unlikely(ts->tick_stopped))
return;
- while (tick_nohz_reprogram(ts, now)) {
- now = ktime_get();
- tick_do_update_jiffies64(now);
- }
+ hrtimer_forward(&ts->sched_timer, now, tick_period);
+ tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
+}
+
+static inline void tick_nohz_activate(struct tick_sched *ts, int mode)
+{
+ if (!tick_nohz_enabled)
+ return;
+ ts->nohz_mode = mode;
+ /* One update is enough */
+ if (!test_and_set_bit(0, &tick_nohz_active))
+ timers_update_migration(true);
}
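tick_nohz_activate() relies on test_and_set_bit() so that only the first activation anywhere in the system triggers the comparatively costly timers_update_migration() walk. The same one-shot pattern in portable C11 atomics, as a sketch (the printf stands in for the migration update):

#include <stdatomic.h>
#include <stdio.h>

static atomic_ulong tick_nohz_active;

static void activate(void)
{
	/* fetch_or returns the old value: only the first caller sees bit 0 clear */
	if (!(atomic_fetch_or(&tick_nohz_active, 1UL) & 1UL))
		printf("one-time migration update\n");
}

int main(void)
{
	activate();	/* prints once */
	activate();	/* no-op */
	return 0;
}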
/**
@@ -999,13 +970,8 @@ static void tick_nohz_switch_to_nohz(void)
if (!tick_nohz_enabled)
return;
- local_irq_disable();
- if (tick_switch_to_oneshot(tick_nohz_handler)) {
- local_irq_enable();
+ if (tick_switch_to_oneshot(tick_nohz_handler))
return;
- }
- tick_nohz_active = 1;
- ts->nohz_mode = NOHZ_MODE_LOWRES;
/*
* Recycle the hrtimer in ts, so we can share the
@@ -1015,13 +981,10 @@ static void tick_nohz_switch_to_nohz(void)
/* Get the next period */
next = tick_init_jiffy_update();
- for (;;) {
- hrtimer_set_expires(&ts->sched_timer, next);
- if (!tick_program_event(next, 0))
- break;
- next = ktime_add(next, tick_period);
- }
- local_irq_enable();
+ hrtimer_set_expires(&ts->sched_timer, next);
+ hrtimer_forward_now(&ts->sched_timer, tick_period);
+ tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
+ tick_nohz_activate(ts, NOHZ_MODE_LOWRES);
}
/*
@@ -1073,6 +1036,7 @@ static inline void tick_nohz_irq_enter(void)
static inline void tick_nohz_switch_to_nohz(void) { }
static inline void tick_nohz_irq_enter(void) { }
+static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { }
#endif /* CONFIG_NO_HZ_COMMON */
@@ -1154,22 +1118,9 @@ void tick_setup_sched_timer(void)
hrtimer_add_expires_ns(&ts->sched_timer, offset);
}
- for (;;) {
- hrtimer_forward(&ts->sched_timer, now, tick_period);
- hrtimer_start_expires(&ts->sched_timer,
- HRTIMER_MODE_ABS_PINNED);
- /* Check, if the timer was already in the past */
- if (hrtimer_active(&ts->sched_timer))
- break;
- now = ktime_get();
- }
-
-#ifdef CONFIG_NO_HZ_COMMON
- if (tick_nohz_enabled) {
- ts->nohz_mode = NOHZ_MODE_HIGHRES;
- tick_nohz_active = 1;
- }
-#endif
+ hrtimer_forward(&ts->sched_timer, now, tick_period);
+ hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
+ tick_nohz_activate(ts, NOHZ_MODE_HIGHRES);
}
#endif /* HIGH_RES_TIMERS */
@@ -1214,7 +1165,7 @@ void tick_oneshot_notify(void)
 * Called cyclically from the hrtimer softirq (driven by the timer
 * softirq). allow_nohz signals that we can switch into low-res nohz
* mode, because high resolution timers are disabled (either compile
- * or runtime).
+ * or runtime). Called with interrupts disabled.
*/
int tick_check_oneshot_change(int allow_nohz)
{
diff --git a/kernel/kernel/time/tick-sched.h b/kernel/kernel/time/tick-sched.h
index 28b5da3e1..a4a8d4e9b 100644
--- a/kernel/kernel/time/tick-sched.h
+++ b/kernel/kernel/time/tick-sched.h
@@ -57,7 +57,7 @@ struct tick_sched {
ktime_t iowait_sleeptime;
ktime_t sleep_length;
unsigned long last_jiffies;
- unsigned long next_jiffies;
+ u64 next_timer;
ktime_t idle_expires;
int do_timer_last;
};
@@ -71,4 +71,14 @@ extern void tick_cancel_sched_timer(int cpu);
static inline void tick_cancel_sched_timer(int cpu) { }
#endif
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+extern int __tick_broadcast_oneshot_control(enum tick_broadcast_state state);
+#else
+static inline int
+__tick_broadcast_oneshot_control(enum tick_broadcast_state state)
+{
+ return -EBUSY;
+}
+#endif
+
#endif
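The new __tick_broadcast_oneshot_control() declaration follows the usual kernel header idiom: a real prototype when the config option is set, a static inline stub returning an error when it is not, so callers need no #ifdefs of their own. Reduced to a self-contained sketch (CONFIG_FOO and foo_control() are made-up names):

#include <stdio.h>

#ifdef CONFIG_FOO
int foo_control(int state);		/* real implementation elsewhere */
#else
static inline int foo_control(int state)
{
	(void)state;
	return -16;			/* mirrors the -EBUSY fallback */
}
#endif

int main(void)
{
	printf("%d\n", foo_control(0));
	return 0;
}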
diff --git a/kernel/kernel/time/time.c b/kernel/kernel/time/time.c
index 2c85b7724..86751c68e 100644
--- a/kernel/kernel/time/time.c
+++ b/kernel/kernel/time/time.c
@@ -41,7 +41,7 @@
#include <asm/uaccess.h>
#include <asm/unistd.h>
-#include "timeconst.h"
+#include <generated/timeconst.h>
#include "timekeeping.h"
/*
@@ -173,6 +173,10 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
return error;
if (tz) {
+ /* Verify we're within the +-15 hrs range */
+ if (tz->tz_minuteswest > 15*60 || tz->tz_minuteswest < -15*60)
+ return -EINVAL;
+
sys_tz = *tz;
update_vsyscall_tz();
if (firsttime) {
@@ -264,10 +268,14 @@ EXPORT_SYMBOL(jiffies_to_msecs);
unsigned int jiffies_to_usecs(const unsigned long j)
{
-#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
+ /*
+ * HZ usually doesn't go much beyond MSEC_PER_SEC.
+ * jiffies_to_usecs() and usecs_to_jiffies() depend on that.
+ */
+ BUILD_BUG_ON(HZ > USEC_PER_SEC);
+
+#if !(USEC_PER_SEC % HZ)
return (USEC_PER_SEC / HZ) * j;
-#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
- return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
#else
# if BITS_PER_LONG == 32
return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32;
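The BUILD_BUG_ON(HZ > USEC_PER_SEC) above turns a silent assumption of the conversion helpers into a compile failure. In standalone C11 the same check is a _Static_assert; HZ=250 here is an assumed example value:

#include <stdio.h>

#define HZ 250L
#define USEC_PER_SEC 1000000L

_Static_assert(HZ <= USEC_PER_SEC,
	       "jiffies<->usecs conversion assumes HZ <= USEC_PER_SEC");

int main(void)
{
	printf("usecs per jiffy: %ld\n", USEC_PER_SEC / HZ);
	return 0;
}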
@@ -283,26 +291,20 @@ EXPORT_SYMBOL(jiffies_to_usecs);
* @t: Timespec
* @gran: Granularity in ns.
*
- * Truncate a timespec to a granularity. gran must be smaller than a second.
- * Always rounds down.
- *
- * This function should be only used for timestamps returned by
- * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because
- * it doesn't handle the better resolution of the latter.
+ * Truncate a timespec to a granularity. Always rounds down. gran must
+ * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns).
*/
struct timespec timespec_trunc(struct timespec t, unsigned gran)
{
- /*
- * Division is pretty slow so avoid it for common cases.
- * Currently current_kernel_time() never returns better than
- * jiffies resolution. Exploit that.
- */
- if (gran <= jiffies_to_usecs(1) * 1000) {
+ /* Avoid division in the common cases 1 ns and 1 s. */
+ if (gran == 1) {
/* nothing */
- } else if (gran == 1000000000) {
+ } else if (gran == NSEC_PER_SEC) {
t.tv_nsec = 0;
- } else {
+ } else if (gran > 1 && gran < NSEC_PER_SEC) {
t.tv_nsec -= t.tv_nsec % gran;
+ } else {
+ WARN(1, "illegal file time granularity: %u", gran);
}
return t;
}
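The rewritten timespec_trunc() keeps the two division-free fast paths (1 ns and 1 s) and now warns on out-of-range granularities instead of silently misbehaving. A userspace sketch of the same truncation, run at an assumed 1 ms granularity:

#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000L

/* Round a timestamp down to a granularity in ns, as timespec_trunc() does */
static struct timespec trunc_ts(struct timespec t, long gran)
{
	if (gran == NSEC_PER_SEC)
		t.tv_nsec = 0;
	else if (gran > 1 && gran < NSEC_PER_SEC)
		t.tv_nsec -= t.tv_nsec % gran;
	/* gran == 1: nothing to do; anything else is a caller bug */
	return t;
}

int main(void)
{
	struct timespec t = { 5, 123456789 };

	t = trunc_ts(t, 1000000);	/* 1 ms granularity */
	printf("%ld.%09ld\n", (long)t.tv_sec, t.tv_nsec);	/* 5.123000000 */
	return 0;
}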
@@ -483,9 +485,11 @@ struct timespec64 ns_to_timespec64(const s64 nsec)
}
EXPORT_SYMBOL(ns_to_timespec64);
#endif
-/*
- * When we convert to jiffies then we interpret incoming values
- * the following way:
+/**
+ * msecs_to_jiffies: - convert milliseconds to jiffies
+ * @m: time in milliseconds
+ *
+ * conversion is done as follows:
*
* - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET)
*
@@ -493,66 +497,36 @@ EXPORT_SYMBOL(ns_to_timespec64);
* MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
*
* - all other values are converted to jiffies by either multiplying
- * the input value by a factor or dividing it with a factor
- *
- * We must also be careful about 32-bit overflows.
+ * the input value by a factor or dividing it with a factor and
+ * handling any 32-bit overflows.
+ * for the details see __msecs_to_jiffies()
+ *
+ * msecs_to_jiffies() checks for the passed in value being a constant
+ * via __builtin_constant_p() allowing gcc to eliminate most of the
+ * code, __msecs_to_jiffies() is called if the value passed does not
+ * allow constant folding and the actual conversion must be done at
+ * runtime.
+ * the _msecs_to_jiffies helpers are the HZ dependent conversion
+ * routines found in include/linux/jiffies.h
*/
-unsigned long msecs_to_jiffies(const unsigned int m)
+unsigned long __msecs_to_jiffies(const unsigned int m)
{
/*
* Negative value, means infinite timeout:
*/
if ((int)m < 0)
return MAX_JIFFY_OFFSET;
-
-#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
- /*
- * HZ is equal to or smaller than 1000, and 1000 is a nice
- * round multiple of HZ, divide with the factor between them,
- * but round upwards:
- */
- return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);
-#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
- /*
- * HZ is larger than 1000, and HZ is a nice round multiple of
- * 1000 - simply multiply with the factor between them.
- *
- * But first make sure the multiplication result cannot
- * overflow:
- */
- if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
- return MAX_JIFFY_OFFSET;
-
- return m * (HZ / MSEC_PER_SEC);
-#else
- /*
- * Generic case - multiply, round and divide. But first
- * check that if we are doing a net multiplication, that
- * we wouldn't overflow:
- */
- if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
- return MAX_JIFFY_OFFSET;
-
- return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32)
- >> MSEC_TO_HZ_SHR32;
-#endif
+ return _msecs_to_jiffies(m);
}
-EXPORT_SYMBOL(msecs_to_jiffies);
+EXPORT_SYMBOL(__msecs_to_jiffies);
-unsigned long usecs_to_jiffies(const unsigned int u)
+unsigned long __usecs_to_jiffies(const unsigned int u)
{
if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET))
return MAX_JIFFY_OFFSET;
-#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
- return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ);
-#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
- return u * (HZ / USEC_PER_SEC);
-#else
- return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32)
- >> USEC_TO_HZ_SHR32;
-#endif
+ return _usecs_to_jiffies(u);
}
-EXPORT_SYMBOL(usecs_to_jiffies);
+EXPORT_SYMBOL(__usecs_to_jiffies);
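The __msecs_to_jiffies()/__usecs_to_jiffies() split exists so that a macro front end can use the GCC-style __builtin_constant_p() builtin to fold compile-time-constant arguments and only call out of line for runtime values. A reduced sketch of that dispatch, assuming HZ=250 and only the simple divide-and-round-up case (the real per-HZ _msecs_to_jiffies helpers live in include/linux/jiffies.h):

#include <stdio.h>

#define MSEC_PER_SEC 1000UL
#define HZ 250UL

unsigned long __msecs_to_jiffies(unsigned int m);	/* out-of-line slow path */

/* Constant arguments go through the inline arithmetic and fold away;
 * anything else calls the out-of-line helper. */
#define msecs_to_jiffies(m)						\
	(__builtin_constant_p(m)					\
		? ((m) + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ)	\
		: __msecs_to_jiffies(m))

unsigned long __msecs_to_jiffies(unsigned int m)
{
	return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);
}

int main(void)
{
	unsigned int m = 100;

	printf("%lu\n", msecs_to_jiffies(100));	/* folded at compile time */
	printf("%lu\n", msecs_to_jiffies(m));	/* runtime call */
	return 0;
}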
/*
* The TICK_NSEC - 1 rounds up the value to the next resolution. Note
@@ -570,7 +544,7 @@ EXPORT_SYMBOL(usecs_to_jiffies);
* value to a scaled second value.
*/
static unsigned long
-__timespec_to_jiffies(unsigned long sec, long nsec)
+__timespec64_to_jiffies(u64 sec, long nsec)
{
nsec = nsec + TICK_NSEC - 1;
@@ -578,22 +552,27 @@ __timespec_to_jiffies(unsigned long sec, long nsec)
sec = MAX_SEC_IN_JIFFIES;
nsec = 0;
}
- return (((u64)sec * SEC_CONVERSION) +
+ return ((sec * SEC_CONVERSION) +
(((u64)nsec * NSEC_CONVERSION) >>
(NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
}
-unsigned long
-timespec_to_jiffies(const struct timespec *value)
+static unsigned long
+__timespec_to_jiffies(unsigned long sec, long nsec)
{
- return __timespec_to_jiffies(value->tv_sec, value->tv_nsec);
+ return __timespec64_to_jiffies((u64)sec, nsec);
}
-EXPORT_SYMBOL(timespec_to_jiffies);
+unsigned long
+timespec64_to_jiffies(const struct timespec64 *value)
+{
+ return __timespec64_to_jiffies(value->tv_sec, value->tv_nsec);
+}
+EXPORT_SYMBOL(timespec64_to_jiffies);
void
-jiffies_to_timespec(const unsigned long jiffies, struct timespec *value)
+jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value)
{
/*
* Convert jiffies to nanoseconds and separate with
@@ -604,7 +583,7 @@ jiffies_to_timespec(const unsigned long jiffies, struct timespec *value)
NSEC_PER_SEC, &rem);
value->tv_nsec = rem;
}
-EXPORT_SYMBOL(jiffies_to_timespec);
+EXPORT_SYMBOL(jiffies_to_timespec64);
/*
* We could use a similar algorithm to timespec_to_jiffies (with a
diff --git a/kernel/kernel/time/timeconst.bc b/kernel/kernel/time/timeconst.bc
index 511bdf2ca..c48688904 100644
--- a/kernel/kernel/time/timeconst.bc
+++ b/kernel/kernel/time/timeconst.bc
@@ -39,7 +39,7 @@ define fmuls(b,n,d) {
}
define timeconst(hz) {
- print "/* Automatically generated by kernel/timeconst.bc */\n"
+ print "/* Automatically generated by kernel/time/timeconst.bc */\n"
print "/* Time conversion constants for HZ == ", hz, " */\n"
print "\n"
@@ -50,7 +50,7 @@ define timeconst(hz) {
print "#include <linux/types.h>\n\n"
print "#if HZ != ", hz, "\n"
- print "#error \qkernel/timeconst.h has the wrong HZ value!\q\n"
+ print "#error \qinclude/generated/timeconst.h has the wrong HZ value!\q\n"
print "#endif\n\n"
if (hz < 2) {
@@ -105,4 +105,5 @@ define timeconst(hz) {
halt
}
+hz = read();
timeconst(hz)
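With hz = read() added, the script takes the HZ value on standard input instead of from a generated hz.bc file, so the header can be produced with a single pipe, e.g. echo 250 | bc -q timeconst.bc > timeconst.h (the HZ value and output path here are illustrative).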
diff --git a/kernel/kernel/time/timekeeping.c b/kernel/kernel/time/timekeeping.c
index 17d4a8933..a1c5c6fc2 100644
--- a/kernel/kernel/time/timekeeping.c
+++ b/kernel/kernel/time/timekeeping.c
@@ -118,18 +118,6 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
#ifdef CONFIG_DEBUG_TIMEKEEPING
#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */
-/*
- * These simple flag variables are managed
- * without locks, which is racy, but ok since
- * we don't really care about being super
- * precise about how many events were seen,
- * just that a problem was observed.
- */
-static int timekeeping_underflow_seen;
-static int timekeeping_overflow_seen;
-
-/* last_warning is only modified under the timekeeping lock */
-static long timekeeping_last_warning;
static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
{
@@ -149,29 +137,30 @@ static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
}
}
- if (timekeeping_underflow_seen) {
- if (jiffies - timekeeping_last_warning > WARNING_FREQ) {
+ if (tk->underflow_seen) {
+ if (jiffies - tk->last_warning > WARNING_FREQ) {
printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name);
printk_deferred(" Please report this, consider using a different clocksource, if possible.\n");
printk_deferred(" Your kernel is probably still fine.\n");
- timekeeping_last_warning = jiffies;
+ tk->last_warning = jiffies;
}
- timekeeping_underflow_seen = 0;
+ tk->underflow_seen = 0;
}
- if (timekeeping_overflow_seen) {
- if (jiffies - timekeeping_last_warning > WARNING_FREQ) {
+ if (tk->overflow_seen) {
+ if (jiffies - tk->last_warning > WARNING_FREQ) {
printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name);
printk_deferred(" Please report this, consider using a different clocksource, if possible.\n");
printk_deferred(" Your kernel is probably still fine.\n");
- timekeeping_last_warning = jiffies;
+ tk->last_warning = jiffies;
}
- timekeeping_overflow_seen = 0;
+ tk->overflow_seen = 0;
}
}
static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
{
+ struct timekeeper *tk = &tk_core.timekeeper;
cycle_t now, last, mask, max, delta;
unsigned int seq;
@@ -197,13 +186,13 @@ static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
* mask-relative negative values.
*/
if (unlikely((~delta & mask) < (mask >> 3))) {
- timekeeping_underflow_seen = 1;
+ tk->underflow_seen = 1;
delta = 0;
}
/* Cap delta value to the max_cycles values to avoid mult overflows */
if (unlikely(delta > max)) {
- timekeeping_overflow_seen = 1;
+ tk->overflow_seen = 1;
delta = tkr->clock->max_cycles;
}
@@ -316,8 +305,7 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
delta = timekeeping_get_delta(tkr);
- nsec = delta * tkr->mult + tkr->xtime_nsec;
- nsec >>= tkr->shift;
+ nsec = (delta * tkr->mult + tkr->xtime_nsec) >> tkr->shift;
/* If arch requires, add in get_arch_timeoffset() */
return nsec + arch_gettimeoffset();
@@ -330,32 +318,7 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
* We want to use this from any context including NMI and tracing /
* instrumenting the timekeeping code itself.
*
- * So we handle this differently than the other timekeeping accessor
- * functions which retry when the sequence count has changed. The
- * update side does:
- *
- * smp_wmb(); <- Ensure that the last base[1] update is visible
- * tkf->seq++;
- * smp_wmb(); <- Ensure that the seqcount update is visible
- * update(tkf->base[0], tkr);
- * smp_wmb(); <- Ensure that the base[0] update is visible
- * tkf->seq++;
- * smp_wmb(); <- Ensure that the seqcount update is visible
- * update(tkf->base[1], tkr);
- *
- * The reader side does:
- *
- * do {
- * seq = tkf->seq;
- * smp_rmb();
- * idx = seq & 0x01;
- * now = now(tkf->base[idx]);
- * smp_rmb();
- * } while (seq != tkf->seq)
- *
- * As long as we update base[0] readers are forced off to
- * base[1]. Once base[0] is updated readers are redirected to base[0]
- * and the base[1] update takes place.
+ * Employ the latch technique; see @raw_write_seqcount_latch.
*
* So if a NMI hits the update of base[0] then it will use base[1]
 * which is still consistent. In the worst case this can result in a
@@ -418,7 +381,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
u64 now;
do {
- seq = raw_read_seqcount(&tkf->seq);
+ seq = raw_read_seqcount_latch(&tkf->seq);
tkr = tkf->base + (seq & 0x01);
now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr);
} while (read_seqcount_retry(&tkf->seq, seq));
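The @raw_write_seqcount_latch reference replaces the long inline description of the latch scheme: the writer keeps two copies of the data and bumps the sequence count around each half-update, so a reader — even an NMI that interrupts the writer — always finds one stable copy. A userspace sketch with C11 atomics (a sketch only; the kernel adds the precise memory-ordering guarantees via the seqcount helpers):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct sample { uint64_t base; uint64_t mult; };

static _Atomic unsigned int seq;
static struct sample copies[2];

static void latch_write(uint64_t base, uint64_t mult)
{
	unsigned int s = atomic_load(&seq);

	atomic_store(&seq, s + 1);	/* odd: readers move to copies[1] */
	copies[0] = (struct sample){ base, mult };
	atomic_store(&seq, s + 2);	/* even: readers move back to copies[0] */
	copies[1] = (struct sample){ base, mult };
}

static struct sample latch_read(void)
{
	unsigned int s;
	struct sample smp;

	do {
		s = atomic_load(&seq);
		smp = copies[s & 1];	/* always the copy not being written */
	} while (s != atomic_load(&seq));
	return smp;
}

int main(void)
{
	struct sample smp;

	latch_write(100, 3);
	smp = latch_read();
	printf("base=%llu mult=%llu\n",
	       (unsigned long long)smp.base, (unsigned long long)smp.mult);
	return 0;
}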
@@ -551,6 +514,17 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
/*
+ * tk_update_leap_state - helper to update the next_leap_ktime
+ */
+static inline void tk_update_leap_state(struct timekeeper *tk)
+{
+ tk->next_leap_ktime = ntp_get_next_leap();
+ if (tk->next_leap_ktime.tv64 != KTIME_MAX)
+ /* Convert to monotonic time */
+ tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real);
+}
+
+/*
* Update the ktime_t based scalar nsec members of the timekeeper
*/
static inline void tk_update_ktime_data(struct timekeeper *tk)
@@ -591,17 +565,25 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
ntp_clear();
}
+ tk_update_leap_state(tk);
tk_update_ktime_data(tk);
update_vsyscall(tk);
update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
+ update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
+ update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw);
+
+ if (action & TK_CLOCK_WAS_SET)
+ tk->clock_was_set_seq++;
+ /*
+ * The mirroring of the data to the shadow-timekeeper needs
+ * to happen last here to ensure we don't over-write the
+ * timekeeper structure on the next update with stale data
+ */
if (action & TK_MIRROR)
memcpy(&shadow_timekeeper, &tk_core.timekeeper,
sizeof(tk_core.timekeeper));
-
- update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
- update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw);
}
/**
@@ -699,6 +681,23 @@ ktime_t ktime_get(void)
}
EXPORT_SYMBOL_GPL(ktime_get);
+u32 ktime_get_resolution_ns(void)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ unsigned int seq;
+ u32 nsecs;
+
+ WARN_ON(timekeeping_suspended);
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ nsecs = tk->tkr_mono.mult >> tk->tkr_mono.shift;
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ return nsecs;
+}
+EXPORT_SYMBOL_GPL(ktime_get_resolution_ns);
+
static ktime_t *offsets[TK_OFFS_MAX] = {
[TK_OFFS_REAL] = &tk_core.timekeeper.offs_real,
[TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot,
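The new ktime_get_resolution_ns() above is a textbook seqcount reader: sample the count, read the data, retry if a writer intervened. A standalone sketch of that begin/retry pattern (the mult and shift values are hypothetical):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic unsigned int seqcount;
static uint32_t mult = 4194304, shift = 22;	/* hypothetical clocksource values */

static unsigned int read_begin(void)
{
	unsigned int s;

	while ((s = atomic_load(&seqcount)) & 1)	/* writer in progress */
		;
	return s;
}

static int read_retry(unsigned int s)
{
	return atomic_load(&seqcount) != s;
}

int main(void)
{
	unsigned int s;
	uint32_t res;

	do {
		s = read_begin();
		res = mult >> shift;	/* ns advanced per clocksource cycle */
	} while (read_retry(s));
	printf("resolution: %u ns\n", res);
	return 0;
}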
@@ -849,7 +848,7 @@ EXPORT_SYMBOL_GPL(ktime_get_real_seconds);
#ifdef CONFIG_NTP_PPS
/**
- * getnstime_raw_and_real - get day and raw monotonic time in timespec format
+ * ktime_get_raw_and_real_ts64 - get day and raw monotonic time in timespec format
* @ts_raw: pointer to the timespec to be set to raw monotonic time
* @ts_real: pointer to the timespec to be set to the time of day
*
@@ -857,7 +856,7 @@ EXPORT_SYMBOL_GPL(ktime_get_real_seconds);
* same time atomically and stores the resulting timestamps in timespec
* format.
*/
-void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
+void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, struct timespec64 *ts_real)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned long seq;
@@ -868,7 +867,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
do {
seq = read_seqcount_begin(&tk_core.seq);
- *ts_raw = timespec64_to_timespec(tk->raw_time);
+ *ts_raw = tk->raw_time;
ts_real->tv_sec = tk->xtime_sec;
ts_real->tv_nsec = 0;
@@ -877,10 +876,10 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
} while (read_seqcount_retry(&tk_core.seq, seq));
- timespec_add_ns(ts_raw, nsecs_raw);
- timespec_add_ns(ts_real, nsecs_real);
+ timespec64_add_ns(ts_raw, nsecs_raw);
+ timespec64_add_ns(ts_real, nsecs_real);
}
-EXPORT_SYMBOL(getnstime_raw_and_real);
+EXPORT_SYMBOL(ktime_get_raw_and_real_ts64);
#endif /* CONFIG_NTP_PPS */
@@ -911,6 +910,7 @@ int do_settimeofday64(const struct timespec64 *ts)
struct timekeeper *tk = &tk_core.timekeeper;
struct timespec64 ts_delta, xt;
unsigned long flags;
+ int ret = 0;
if (!timespec64_valid_strict(ts))
return -EINVAL;
@@ -924,10 +924,15 @@ int do_settimeofday64(const struct timespec64 *ts)
ts_delta.tv_sec = ts->tv_sec - xt.tv_sec;
ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec;
+ if (timespec64_compare(&tk->wall_to_monotonic, &ts_delta) > 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta));
tk_set_xtime(tk, ts);
-
+out:
timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
write_seqcount_end(&tk_core.seq);
@@ -936,7 +941,7 @@ int do_settimeofday64(const struct timespec64 *ts)
/* signal hrtimers about time change */
clock_was_set();
- return 0;
+ return ret;
}
EXPORT_SYMBOL(do_settimeofday64);
@@ -965,7 +970,8 @@ int timekeeping_inject_offset(struct timespec *ts)
/* Make sure the proposed value is valid */
tmp = timespec64_add(tk_xtime(tk), ts64);
- if (!timespec64_valid_strict(&tmp)) {
+ if (timespec64_compare(&tk->wall_to_monotonic, &ts64) > 0 ||
+ !timespec64_valid_strict(&tmp)) {
ret = -EINVAL;
goto error;
}
@@ -1179,28 +1185,20 @@ void __weak read_persistent_clock64(struct timespec64 *ts64)
}
/**
- * read_boot_clock - Return time of the system start.
+ * read_boot_clock64 - Return time of the system start.
*
* Weak dummy function for arches that do not yet support it.
* Function to read the exact time the system has been started.
- * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
+ * Returns a timespec64 with tv_sec=0 and tv_nsec=0 if unsupported.
*
* XXX - Do be sure to remove it once all arches implement it.
*/
-void __weak read_boot_clock(struct timespec *ts)
+void __weak read_boot_clock64(struct timespec64 *ts)
{
ts->tv_sec = 0;
ts->tv_nsec = 0;
}
-void __weak read_boot_clock64(struct timespec64 *ts64)
-{
- struct timespec ts;
-
- read_boot_clock(&ts);
- *ts64 = timespec_to_timespec64(ts);
-}
-
/* Flag for if timekeeping_resume() has injected sleeptime */
static bool sleeptime_injected;
@@ -1252,7 +1250,7 @@ void __init timekeeping_init(void)
set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec);
tk_set_wall_to_mono(tk, tmp);
- timekeeping_update(tk, TK_MIRROR);
+ timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@ -1675,7 +1673,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
/**
* accumulate_nsecs_to_secs - Accumulates nsecs into secs
*
- * Helper function that accumulates a the nsecs greater then a second
+ * Helper function that accumulates the nsecs greater than a second
* from the xtime_nsec field to the xtime_secs field.
* It also calls into the NTP code to handle leapsecond processing.
*
@@ -1727,7 +1725,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
cycle_t interval = tk->cycle_interval << shift;
u64 raw_nsecs;
- /* If the offset is smaller then a shifted interval, do nothing */
+ /* If the offset is smaller than a shifted interval, do nothing */
if (offset < interval)
return offset;
@@ -1836,8 +1834,9 @@ void update_wall_time(void)
* memcpy under the tk_core.seq against one before we start
* updating.
*/
+ timekeeping_update(tk, clock_set);
memcpy(real_tk, tk, sizeof(*tk));
- timekeeping_update(real_tk, clock_set);
+ /* The memcpy must come last. Do not put anything here! */
write_seqcount_end(&tk_core.seq);
out:
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@ -1881,7 +1880,7 @@ struct timespec __current_kernel_time(void)
return timespec64_to_timespec(tk_xtime(tk));
}
-struct timespec current_kernel_time(void)
+struct timespec64 current_kernel_time64(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
struct timespec64 now;
@@ -1893,9 +1892,9 @@ struct timespec current_kernel_time(void)
now = tk_xtime(tk);
} while (read_seqcount_retry(&tk_core.seq, seq));
- return timespec64_to_timespec(now);
+ return now;
}
-EXPORT_SYMBOL(current_kernel_time);
+EXPORT_SYMBOL(current_kernel_time64);
struct timespec64 get_monotonic_coarse64(void)
{
@@ -1926,47 +1925,20 @@ void do_timer(unsigned long ticks)
}
/**
- * ktime_get_update_offsets_tick - hrtimer helper
- * @offs_real: pointer to storage for monotonic -> realtime offset
- * @offs_boot: pointer to storage for monotonic -> boottime offset
- * @offs_tai: pointer to storage for monotonic -> clock tai offset
- *
- * Returns monotonic time at last tick and various offsets
- */
-ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot,
- ktime_t *offs_tai)
-{
- struct timekeeper *tk = &tk_core.timekeeper;
- unsigned int seq;
- ktime_t base;
- u64 nsecs;
-
- do {
- seq = read_seqcount_begin(&tk_core.seq);
-
- base = tk->tkr_mono.base;
- nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
-
- *offs_real = tk->offs_real;
- *offs_boot = tk->offs_boot;
- *offs_tai = tk->offs_tai;
- } while (read_seqcount_retry(&tk_core.seq, seq));
-
- return ktime_add_ns(base, nsecs);
-}
-
-#ifdef CONFIG_HIGH_RES_TIMERS
-/**
* ktime_get_update_offsets_now - hrtimer helper
+ * @cwsseq: pointer to check and store the clock was set sequence number
* @offs_real: pointer to storage for monotonic -> realtime offset
* @offs_boot: pointer to storage for monotonic -> boottime offset
* @offs_tai: pointer to storage for monotonic -> clock tai offset
*
- * Returns current monotonic time and updates the offsets
+ * Returns current monotonic time and updates the offsets if the
+ * sequence number in @cwsseq and timekeeper.clock_was_set_seq are
+ * different.
+ *
* Called from hrtimer_interrupt() or retrigger_next_event()
*/
-ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot,
- ktime_t *offs_tai)
+ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
+ ktime_t *offs_boot, ktime_t *offs_tai)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned int seq;
@@ -1978,15 +1950,23 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot,
base = tk->tkr_mono.base;
nsecs = timekeeping_get_ns(&tk->tkr_mono);
+ base = ktime_add_ns(base, nsecs);
+
+ if (*cwsseq != tk->clock_was_set_seq) {
+ *cwsseq = tk->clock_was_set_seq;
+ *offs_real = tk->offs_real;
+ *offs_boot = tk->offs_boot;
+ *offs_tai = tk->offs_tai;
+ }
+
+ /* Handle leapsecond insertion adjustments */
+ if (unlikely(base.tv64 >= tk->next_leap_ktime.tv64))
+ *offs_real = ktime_sub(tk->offs_real, ktime_set(1, 0));
- *offs_real = tk->offs_real;
- *offs_boot = tk->offs_boot;
- *offs_tai = tk->offs_tai;
} while (read_seqcount_retry(&tk_core.seq, seq));
- return ktime_add_ns(base, nsecs);
+ return base;
}
-#endif
/**
* do_adjtimex() - Accessor function to NTP __do_adjtimex function
@@ -2027,6 +2007,8 @@ int do_adjtimex(struct timex *txc)
__timekeeping_set_tai_offset(tk, tai);
timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
}
+ tk_update_leap_state(tk);
+
write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@ -2042,7 +2024,7 @@ int do_adjtimex(struct timex *txc)
/**
* hardpps() - Accessor function to NTP __hardpps function
*/
-void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
+void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
{
unsigned long flags;
diff --git a/kernel/kernel/time/timekeeping.h b/kernel/kernel/time/timekeeping.h
index d7a9120a9..763a3e512 100644
--- a/kernel/kernel/time/timekeeping.h
+++ b/kernel/kernel/time/timekeeping.h
@@ -3,19 +3,16 @@
/*
* Internal interfaces for kernel/time/
*/
-extern ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real,
- ktime_t *offs_boot,
- ktime_t *offs_tai);
-extern ktime_t ktime_get_update_offsets_now(ktime_t *offs_real,
- ktime_t *offs_boot,
- ktime_t *offs_tai);
+extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq,
+ ktime_t *offs_real,
+ ktime_t *offs_boot,
+ ktime_t *offs_tai);
extern int timekeeping_valid_for_hres(void);
extern u64 timekeeping_max_deferment(void);
extern int timekeeping_inject_offset(struct timespec *ts);
extern s32 timekeeping_get_tai_offset(void);
extern void timekeeping_set_tai_offset(s32 tai_offset);
-extern void timekeeping_clocktai(struct timespec *ts);
extern int timekeeping_suspend(void);
extern void timekeeping_resume(void);
diff --git a/kernel/kernel/time/timer.c b/kernel/kernel/time/timer.c
index adb1d82d6..fee8682c2 100644
--- a/kernel/kernel/time/timer.c
+++ b/kernel/kernel/time/timer.c
@@ -49,6 +49,8 @@
#include <asm/timex.h>
#include <asm/io.h>
+#include "tick-internal.h"
+
#define CREATE_TRACE_POINTS
#include <trace/events/timer.h>
@@ -68,11 +70,11 @@ EXPORT_SYMBOL(jiffies_64);
#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1))
struct tvec {
- struct list_head vec[TVN_SIZE];
+ struct hlist_head vec[TVN_SIZE];
};
struct tvec_root {
- struct list_head vec[TVR_SIZE];
+ struct hlist_head vec[TVR_SIZE];
};
struct tvec_base {
@@ -86,6 +88,8 @@ struct tvec_base {
unsigned long active_timers;
unsigned long all_timers;
int cpu;
+ bool migration_enabled;
+ bool nohz_active;
struct tvec_root tv1;
struct tvec tv2;
struct tvec tv3;
@@ -93,43 +97,60 @@ struct tvec_base {
struct tvec tv5;
} ____cacheline_aligned;
-/*
- * __TIMER_INITIALIZER() needs to set ->base to a valid pointer (because we've
- * made NULL special, hint: lock_timer_base()) and we cannot get a compile time
- * pointer to per-cpu entries because we don't know where we'll map the section,
- * even for the boot cpu.
- *
- * And so we use boot_tvec_bases for boot CPU and per-cpu __tvec_bases for the
- * rest of them.
- */
-struct tvec_base boot_tvec_bases;
-EXPORT_SYMBOL(boot_tvec_bases);
-static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
+static DEFINE_PER_CPU(struct tvec_base, tvec_bases);
-/* Functions below help us manage 'deferrable' flag */
-static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+unsigned int sysctl_timer_migration = 1;
+
+void timers_update_migration(bool update_nohz)
{
- return ((unsigned int)(unsigned long)base & TIMER_DEFERRABLE);
+ bool on = sysctl_timer_migration && tick_nohz_active;
+ unsigned int cpu;
+
+ /* Avoid the loop, if nothing to update */
+ if (this_cpu_read(tvec_bases.migration_enabled) == on)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ per_cpu(tvec_bases.migration_enabled, cpu) = on;
+ per_cpu(hrtimer_bases.migration_enabled, cpu) = on;
+ if (!update_nohz)
+ continue;
+ per_cpu(tvec_bases.nohz_active, cpu) = true;
+ per_cpu(hrtimer_bases.nohz_active, cpu) = true;
+ }
}
-static inline unsigned int tbase_get_irqsafe(struct tvec_base *base)
+int timer_migration_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
{
- return ((unsigned int)(unsigned long)base & TIMER_IRQSAFE);
+ static DEFINE_MUTEX(mutex);
+ int ret;
+
+ mutex_lock(&mutex);
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (!ret && write)
+ timers_update_migration(false);
+ mutex_unlock(&mutex);
+ return ret;
}
-static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
+static inline struct tvec_base *get_target_base(struct tvec_base *base,
+ int pinned)
{
- return ((struct tvec_base *)((unsigned long)base & ~TIMER_FLAG_MASK));
+ if (pinned || !base->migration_enabled)
+ return this_cpu_ptr(&tvec_bases);
+ return per_cpu_ptr(&tvec_bases, get_nohz_timer_target());
}
-
-static inline void
-timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
+#else
+static inline struct tvec_base *get_target_base(struct tvec_base *base,
+ int pinned)
{
- unsigned long flags = (unsigned long)timer->base & TIMER_FLAG_MASK;
-
- timer->base = (struct tvec_base *)((unsigned long)(new_base) | flags);
+ return this_cpu_ptr(&tvec_bases);
}
+#endif
static unsigned long round_jiffies_common(unsigned long j, int cpu,
bool force_up)
@@ -352,26 +373,12 @@ void set_timer_slack(struct timer_list *timer, int slack_hz)
}
EXPORT_SYMBOL_GPL(set_timer_slack);
-/*
- * If the list is empty, catch up ->timer_jiffies to the current time.
- * The caller must hold the tvec_base lock. Returns true if the list
- * was empty and therefore ->timer_jiffies was updated.
- */
-static bool catchup_timer_jiffies(struct tvec_base *base)
-{
- if (!base->all_timers) {
- base->timer_jiffies = jiffies;
- return true;
- }
- return false;
-}
-
static void
__internal_add_timer(struct tvec_base *base, struct timer_list *timer)
{
unsigned long expires = timer->expires;
unsigned long idx = expires - base->timer_jiffies;
- struct list_head *vec;
+ struct hlist_head *vec;
if (idx < TVR_SIZE) {
int i = expires & TVR_MASK;
@@ -404,25 +411,25 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer)
i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
vec = base->tv5.vec + i;
}
- /*
- * Timers are FIFO:
- */
- list_add_tail(&timer->entry, vec);
+
+ hlist_add_head(&timer->entry, vec);
}
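Switching the wheel buckets from list_head to hlist_head halves each bucket head to a single pointer — and there are TVR_SIZE + 4*TVN_SIZE of them per CPU — while hlist's pprev back-pointer still allows O(1) unlink without knowing the head. A minimal userspace re-creation of the structure (names shortened):

#include <stddef.h>
#include <stdio.h>

struct hnode { struct hnode *next, **pprev; };
struct hhead { struct hnode *first; };

static void hadd(struct hhead *h, struct hnode *n)
{
	n->next = h->first;
	if (h->first)
		h->first->pprev = &n->next;
	h->first = n;
	n->pprev = &h->first;
}

static void hdel(struct hnode *n)
{
	*n->pprev = n->next;	/* works whether or not n is first */
	if (n->next)
		n->next->pprev = n->pprev;
}

int main(void)
{
	struct hhead bucket = { 0 };
	struct hnode a, b;

	hadd(&bucket, &a);
	hadd(&bucket, &b);	/* list: b -> a */
	hdel(&a);		/* unlink without walking from the head */
	printf("first == &b: %d\n", bucket.first == &b);
	return 0;
}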
static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
{
- (void)catchup_timer_jiffies(base);
+ /* Advance base->timer_jiffies, if the base is empty */
+ if (!base->all_timers++)
+ base->timer_jiffies = jiffies;
+
__internal_add_timer(base, timer);
/*
* Update base->active_timers and base->next_timer
*/
- if (!tbase_get_deferrable(timer->base)) {
+ if (!(timer->flags & TIMER_DEFERRABLE)) {
if (!base->active_timers++ ||
time_before(timer->expires, base->next_timer))
base->next_timer = timer->expires;
}
- base->all_timers++;
/*
* Check whether the other CPU is in dynticks mode and needs
@@ -437,8 +444,11 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
* require special care against races with idle_cpu(), lets deal
* with that later.
*/
- if (!tbase_get_deferrable(base) || tick_nohz_full_cpu(base->cpu))
- wake_up_nohz_cpu(base->cpu);
+ if (base->nohz_active) {
+ if (!(timer->flags & TIMER_DEFERRABLE) ||
+ tick_nohz_full_cpu(base->cpu))
+ wake_up_nohz_cpu(base->cpu);
+ }
}
#ifdef CONFIG_TIMER_STATS
@@ -454,15 +464,19 @@ void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
static void timer_stats_account_timer(struct timer_list *timer)
{
- unsigned int flag = 0;
+ void *site;
- if (likely(!timer->start_site))
+ /*
+ * start_site can be concurrently reset by
+ * timer_stats_timer_clear_start_info()
+ */
+ site = READ_ONCE(timer->start_site);
+ if (likely(!site))
return;
- if (unlikely(tbase_get_deferrable(timer->base)))
- flag |= TIMER_STATS_FLAG_DEFERRABLE;
- timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
- timer->function, timer->start_comm, flag);
+ timer_stats_update_stats(timer, timer->start_pid, site,
+ timer->function, timer->start_comm,
+ timer->flags);
}
#else
@@ -519,8 +533,8 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state)
* statically initialized. We just make sure that it
* is tracked in the object tracker.
*/
- if (timer->entry.next == NULL &&
- timer->entry.prev == TIMER_ENTRY_STATIC) {
+ if (timer->entry.pprev == NULL &&
+ timer->entry.next == TIMER_ENTRY_STATIC) {
debug_object_init(timer, &timer_debug_descr);
debug_object_activate(timer, &timer_debug_descr);
return 0;
@@ -566,7 +580,7 @@ static int timer_fixup_assert_init(void *addr, enum debug_obj_state state)
switch (state) {
case ODEBUG_STATE_NOTAVAILABLE:
- if (timer->entry.prev == TIMER_ENTRY_STATIC) {
+ if (timer->entry.next == TIMER_ENTRY_STATIC) {
/*
* This is not really a fixup. The timer was
* statically initialized. We just make sure that it
@@ -651,7 +665,7 @@ static inline void
debug_activate(struct timer_list *timer, unsigned long expires)
{
debug_timer_activate(timer);
- trace_timer_start(timer, expires);
+ trace_timer_start(timer, expires, timer->flags);
}
static inline void debug_deactivate(struct timer_list *timer)
@@ -668,10 +682,8 @@ static inline void debug_assert_init(struct timer_list *timer)
static void do_init_timer(struct timer_list *timer, unsigned int flags,
const char *name, struct lock_class_key *key)
{
- struct tvec_base *base = raw_cpu_read(tvec_bases);
-
- timer->entry.next = NULL;
- timer->base = (void *)((unsigned long)base | flags);
+ timer->entry.pprev = NULL;
+ timer->flags = flags | raw_smp_processor_id();
timer->slack = -1;
#ifdef CONFIG_TIMER_STATS
timer->start_site = NULL;
@@ -702,24 +714,23 @@ EXPORT_SYMBOL(init_timer_key);
static inline void detach_timer(struct timer_list *timer, bool clear_pending)
{
- struct list_head *entry = &timer->entry;
+ struct hlist_node *entry = &timer->entry;
debug_deactivate(timer);
- __list_del(entry->prev, entry->next);
+ __hlist_del(entry);
if (clear_pending)
- entry->next = NULL;
- entry->prev = LIST_POISON2;
+ entry->pprev = NULL;
+ entry->next = LIST_POISON2;
}
static inline void
detach_expired_timer(struct timer_list *timer, struct tvec_base *base)
{
detach_timer(timer, true);
- if (!tbase_get_deferrable(timer->base))
+ if (!(timer->flags & TIMER_DEFERRABLE))
base->active_timers--;
base->all_timers--;
- (void)catchup_timer_jiffies(base);
}
static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
@@ -729,13 +740,14 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
return 0;
detach_timer(timer, clear_pending);
- if (!tbase_get_deferrable(timer->base)) {
+ if (!(timer->flags & TIMER_DEFERRABLE)) {
base->active_timers--;
if (timer->expires == base->next_timer)
base->next_timer = base->timer_jiffies;
}
- base->all_timers--;
- (void)catchup_timer_jiffies(base);
+ /* If this was the last timer, advance base->timer_jiffies */
+ if (!--base->all_timers)
+ base->timer_jiffies = jiffies;
return 1;
}
@@ -747,67 +759,68 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
* So __run_timers/migrate_timers can safely modify all timers which could
* be found on ->tvX lists.
*
- * When the timer's base is locked, and the timer removed from list, it is
- * possible to set timer->base = NULL and drop the lock: the timer remains
- * locked.
+ * When the timer's base is locked and the timer removed from the list, the
+ * TIMER_MIGRATING flag is set, FIXME
*/
static struct tvec_base *lock_timer_base(struct timer_list *timer,
unsigned long *flags)
__acquires(timer->base->lock)
{
- struct tvec_base *base;
-
for (;;) {
- struct tvec_base *prelock_base = timer->base;
- base = tbase_get_base(prelock_base);
- if (likely(base != NULL)) {
+ u32 tf = timer->flags;
+ struct tvec_base *base;
+
+ if (!(tf & TIMER_MIGRATING)) {
+ base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK);
spin_lock_irqsave(&base->lock, *flags);
- if (likely(prelock_base == timer->base))
+ if (timer->flags == tf)
return base;
- /* The timer has migrated to another CPU */
spin_unlock_irqrestore(&base->lock, *flags);
}
cpu_relax();
}
}
-
-#ifndef CONFIG_PREEMPT_RT_FULL
-static inline struct tvec_base *switch_timer_base(struct timer_list *timer,
- struct tvec_base *old,
- struct tvec_base *new)
-{
- /* See the comment in lock_timer_base() */
- timer_set_base(timer, NULL);
- spin_unlock(&old->lock);
- spin_lock(&new->lock);
- timer_set_base(timer, new);
- return new;
-}
-#else
+#ifdef CONFIG_PREEMPT_RT_FULL
static inline struct tvec_base *switch_timer_base(struct timer_list *timer,
struct tvec_base *old,
struct tvec_base *new)
{
/*
- * We cannot do the above because we might be preempted and
+ * We cannot do the below because we might be preempted and
* then the preempter would see NULL and loop forever.
*/
if (spin_trylock(&new->lock)) {
- timer_set_base(timer, new);
+ WRITE_ONCE(timer->flags,
+ (timer->flags & ~TIMER_BASEMASK) | new->cpu);
spin_unlock(&old->lock);
return new;
}
return old;
}
+
+#else
+static inline struct tvec_base *switch_timer_base(struct timer_list *timer,
+ struct tvec_base *old,
+ struct tvec_base *new)
+{
+ /* See the comment in lock_timer_base() */
+ timer->flags |= TIMER_MIGRATING;
+
+ spin_unlock(&old->lock);
+ spin_lock(&new->lock);
+ WRITE_ONCE(timer->flags,
+ (timer->flags & ~TIMER_BASEMASK) | new->cpu);
+ return new;
+}
#endif
static inline int
__mod_timer(struct timer_list *timer, unsigned long expires,
- bool pending_only, int pinned)
+ bool pending_only, int pinned)
{
struct tvec_base *base, *new_base;
unsigned long flags;
- int ret = 0 , cpu;
+ int ret = 0;
timer_stats_timer_set_start_info(timer);
BUG_ON(!timer->function);
@@ -820,8 +833,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
debug_activate(timer, expires);
- cpu = get_nohz_timer_target(pinned);
- new_base = per_cpu(tvec_bases, cpu);
+ new_base = get_target_base(base, pinned);
if (base != new_base) {
/*
@@ -890,7 +902,7 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
if (mask == 0)
return expires;
- bit = find_last_bit(&mask, BITS_PER_LONG);
+ bit = __fls(mask);
mask = (1UL << bit) - 1;
@@ -993,13 +1005,29 @@ EXPORT_SYMBOL(add_timer);
*/
void add_timer_on(struct timer_list *timer, int cpu)
{
- struct tvec_base *base = per_cpu(tvec_bases, cpu);
+ struct tvec_base *new_base = per_cpu_ptr(&tvec_bases, cpu);
+ struct tvec_base *base;
unsigned long flags;
timer_stats_timer_set_start_info(timer);
BUG_ON(timer_pending(timer) || !timer->function);
- spin_lock_irqsave(&base->lock, flags);
- timer_set_base(timer, base);
+
+ /*
+ * If @timer was on a different CPU, it should be migrated with the
+ * old base locked to prevent other operations proceeding with the
+ * wrong base locked. See lock_timer_base().
+ */
+ base = lock_timer_base(timer, &flags);
+ if (base != new_base) {
+ timer->flags |= TIMER_MIGRATING;
+
+ spin_unlock(&base->lock);
+ base = new_base;
+ spin_lock(&base->lock);
+ WRITE_ONCE(timer->flags,
+ (timer->flags & ~TIMER_BASEMASK) | cpu);
+ }
+
debug_activate(timer, timer->expires);
internal_add_timer(base, timer);
spin_unlock_irqrestore(&base->lock, flags);
@@ -1012,11 +1040,15 @@ EXPORT_SYMBOL_GPL(add_timer_on);
*/
static void wait_for_running_timer(struct timer_list *timer)
{
- struct tvec_base *base = timer->base;
+ struct tvec_base *base;
+ u32 tf = timer->flags;
- if (base->running_timer == timer)
- wait_event(base->wait_for_running_timer,
- base->running_timer != timer);
+ if (tf & TIMER_MIGRATING)
+ return;
+
+ base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK);
+ wait_event(base->wait_for_running_timer,
+ base->running_timer != timer);
}
# define wakeup_timer_waiters(b) wake_up(&(b)->wait_for_running_timer)
@@ -1087,8 +1119,6 @@ int try_to_del_timer_sync(struct timer_list *timer)
EXPORT_SYMBOL(try_to_del_timer_sync);
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
-static DEFINE_PER_CPU(struct tvec_base, __tvec_bases);
-
/**
* del_timer_sync - deactivate a timer and wait for the handler to finish.
* @timer: the timer to be deactivated
@@ -1143,7 +1173,7 @@ int del_timer_sync(struct timer_list *timer)
* don't use it in hardirq context, because it
* could lead to deadlock.
*/
- WARN_ON(in_irq() && !tbase_get_irqsafe(timer->base));
+ WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE));
for (;;) {
int ret = try_to_del_timer_sync(timer);
if (ret >= 0)
@@ -1157,17 +1187,17 @@ EXPORT_SYMBOL(del_timer_sync);
static int cascade(struct tvec_base *base, struct tvec *tv, int index)
{
/* cascade all the timers from tv up one level */
- struct timer_list *timer, *tmp;
- struct list_head tv_list;
+ struct timer_list *timer;
+ struct hlist_node *tmp;
+ struct hlist_head tv_list;
- list_replace_init(tv->vec + index, &tv_list);
+ hlist_move_list(tv->vec + index, &tv_list);
/*
* We are removing _all_ timers from the list, so we
* don't have to detach them individually.
*/
- list_for_each_entry_safe(timer, tmp, &tv_list, entry) {
- BUG_ON(tbase_get_base(timer->base) != base);
+ hlist_for_each_entry_safe(timer, tmp, &tv_list, entry) {
/* No accounting, while moving them */
__internal_add_timer(base, timer);
}
@@ -1232,14 +1262,18 @@ static inline void __run_timers(struct tvec_base *base)
struct timer_list *timer;
spin_lock_irq(&base->lock);
- if (catchup_timer_jiffies(base)) {
- spin_unlock_irq(&base->lock);
- return;
- }
+
while (time_after_eq(jiffies, base->timer_jiffies)) {
- struct list_head work_list;
- struct list_head *head = &work_list;
- int index = base->timer_jiffies & TVR_MASK;
+ struct hlist_head work_list;
+ struct hlist_head *head = &work_list;
+ int index;
+
+ if (!base->all_timers) {
+ base->timer_jiffies = jiffies;
+ break;
+ }
+
+ index = base->timer_jiffies & TVR_MASK;
/*
* Cascade timers:
@@ -1250,16 +1284,16 @@ static inline void __run_timers(struct tvec_base *base)
!cascade(base, &base->tv4, INDEX(2)))
cascade(base, &base->tv5, INDEX(3));
++base->timer_jiffies;
- list_replace_init(base->tv1.vec + index, head);
- while (!list_empty(head)) {
+ hlist_move_list(base->tv1.vec + index, head);
+ while (!hlist_empty(head)) {
void (*fn)(unsigned long);
unsigned long data;
bool irqsafe;
- timer = list_first_entry(head, struct timer_list,entry);
+ timer = hlist_entry(head->first, struct timer_list, entry);
fn = timer->function;
data = timer->data;
- irqsafe = tbase_get_irqsafe(timer->base);
+ irqsafe = timer->flags & TIMER_IRQSAFE;
timer_stats_account_timer(timer);
@@ -1300,8 +1334,8 @@ static unsigned long __next_timer_interrupt(struct tvec_base *base)
/* Look for timer events in tv1. */
index = slot = timer_jiffies & TVR_MASK;
do {
- list_for_each_entry(nte, base->tv1.vec + slot, entry) {
- if (tbase_get_deferrable(nte->base))
+ hlist_for_each_entry(nte, base->tv1.vec + slot, entry) {
+ if (nte->flags & TIMER_DEFERRABLE)
continue;
found = 1;
@@ -1331,8 +1365,8 @@ cascade:
index = slot = timer_jiffies & TVN_MASK;
do {
- list_for_each_entry(nte, varp->vec + slot, entry) {
- if (tbase_get_deferrable(nte->base))
+ hlist_for_each_entry(nte, varp->vec + slot, entry) {
+ if (nte->flags & TIMER_DEFERRABLE)
continue;
found = 1;
@@ -1363,54 +1397,48 @@ cascade:
* Check, if the next hrtimer event is before the next timer wheel
* event:
*/
-static unsigned long cmp_next_hrtimer_event(unsigned long now,
- unsigned long expires)
+static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
{
- ktime_t hr_delta = hrtimer_get_next_event();
- struct timespec tsdelta;
- unsigned long delta;
-
- if (hr_delta.tv64 == KTIME_MAX)
- return expires;
+ u64 nextevt = hrtimer_get_next_event();
/*
- * Expired timer available, let it expire in the next tick
+ * If high resolution timers are enabled
+ * hrtimer_get_next_event() returns KTIME_MAX.
*/
- if (hr_delta.tv64 <= 0)
- return now + 1;
-
- tsdelta = ktime_to_timespec(hr_delta);
- delta = timespec_to_jiffies(&tsdelta);
+ if (expires <= nextevt)
+ return expires;
/*
- * Limit the delta to the max value, which is checked in
- * tick_nohz_stop_sched_tick():
+ * If the next timer is already expired, return the tick base
+ * time so the tick is fired immediately.
*/
- if (delta > NEXT_TIMER_MAX_DELTA)
- delta = NEXT_TIMER_MAX_DELTA;
+ if (nextevt <= basem)
+ return basem;
/*
- * Take rounding errors in to account and make sure, that it
- * expires in the next tick. Otherwise we go into an endless
- * ping pong due to tick_nohz_stop_sched_tick() retriggering
- * the timer softirq
+ * Round up to the next jiffy. High resolution timers are
+ * off, so the hrtimers are expired in the tick and we need to
+ * make sure that this tick really expires the timer to avoid
+ * a ping pong of the nohz stop code.
+ *
+ * Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3
*/
- if (delta < 1)
- delta = 1;
- now += delta;
- if (time_before(now, expires))
- return now;
- return expires;
+ return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC;
}
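Rounding the next event up to a tick boundary keeps the nohz code from reprogramming the tick just short of the timer and ping-ponging. A worked sketch of that rounding, assuming HZ=250 so TICK_NSEC is 4 ms:

#include <stdint.h>
#include <stdio.h>

#define TICK_NSEC 4000000ULL	/* assumes HZ=250 */
#define DIV_ROUND_UP_ULL(x, d) (((x) + (d) - 1) / (d))

int main(void)
{
	uint64_t nextevt = 10000001;	/* just past a 4 ms boundary */

	/* Rounds up to the following tick: prints 12000000 */
	printf("%llu\n",
	       (unsigned long long)(DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC));
	return 0;
}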
/**
- * get_next_timer_interrupt - return the jiffy of the next pending timer
- * @now: current time (in jiffies)
+ * get_next_timer_interrupt - return the time (clock mono) of the next timer
+ * @basej: base time jiffies
+ * @basem: base time clock monotonic
+ *
+ * Returns the tick aligned clock monotonic time of the next pending
+ * timer or KTIME_MAX if no timer is pending.
*/
-unsigned long get_next_timer_interrupt(unsigned long now)
+u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
{
- struct tvec_base *base = __this_cpu_read(tvec_bases);
- unsigned long expires = now + NEXT_TIMER_MAX_DELTA;
+ struct tvec_base *base = this_cpu_ptr(&tvec_bases);
+ u64 expires = KTIME_MAX;
+ unsigned long nextevt;
/*
* Pretend that there is no timer pending if the cpu is offline.
@@ -1425,20 +1453,21 @@ unsigned long get_next_timer_interrupt(unsigned long now)
* the base lock to check when the next timer is pending and so
* we assume the next jiffy.
*/
- return now + 1;
+ return basem + TICK_NSEC;
#endif
spin_lock(&base->lock);
if (base->active_timers) {
if (time_before_eq(base->next_timer, base->timer_jiffies))
base->next_timer = __next_timer_interrupt(base);
- expires = base->next_timer;
+ nextevt = base->next_timer;
+ if (time_before_eq(nextevt, basej))
+ expires = basem;
+ else
+ expires = basem + (nextevt - basej) * TICK_NSEC;
}
spin_unlock(&base->lock);
- if (time_before_eq(expires, now))
- return now;
-
- return cmp_next_hrtimer_event(now, expires);
+ return cmp_next_hrtimer_event(basem, expires);
}
#endif
@@ -1455,7 +1484,7 @@ void update_process_times(int user_tick)
scheduler_tick();
run_local_timers();
rcu_check_callbacks(user_tick);
-#if defined(CONFIG_IRQ_WORK) && !defined(CONFIG_PREEMPT_RT_FULL)
+#if defined(CONFIG_IRQ_WORK)
if (in_irq())
irq_work_tick();
#endif
@@ -1467,13 +1496,9 @@ void update_process_times(int user_tick)
*/
static void run_timer_softirq(struct softirq_action *h)
{
- struct tvec_base *base = __this_cpu_read(tvec_bases);
-
- hrtimer_run_pending();
+ struct tvec_base *base = this_cpu_ptr(&tvec_bases);
-#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
- irq_work_tick();
-#endif
+ irq_work_tick_soft();
if (time_after_eq(jiffies, base->timer_jiffies))
__run_timers(base);
@@ -1609,15 +1634,16 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)
EXPORT_SYMBOL(schedule_timeout_uninterruptible);
#ifdef CONFIG_HOTPLUG_CPU
-static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head)
+static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head)
{
struct timer_list *timer;
+ int cpu = new_base->cpu;
- while (!list_empty(head)) {
- timer = list_first_entry(head, struct timer_list, entry);
+ while (!hlist_empty(head)) {
+ timer = hlist_entry(head->first, struct timer_list, entry);
/* We ignore the accounting on the dying cpu */
detach_timer(timer, false);
- timer_set_base(timer, new_base);
+ timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
internal_add_timer(new_base, timer);
}
}
@@ -1629,8 +1655,8 @@ static void migrate_timers(int cpu)
int i;
BUG_ON(cpu_online(cpu));
- old_base = per_cpu(tvec_bases, cpu);
- new_base = get_local_var(tvec_bases);
+ old_base = per_cpu_ptr(&tvec_bases, cpu);
+ new_base = get_local_ptr(&tvec_bases);
/*
* The caller is globally serialized and nobody else
* takes two locks at once, deadlock is not possible.
@@ -1654,7 +1680,7 @@ static void migrate_timers(int cpu)
spin_unlock(&old_base->lock);
spin_unlock_irq(&new_base->lock);
- put_local_var(tvec_bases);
+ put_local_ptr(&tvec_bases);
}
static int timer_cpu_notify(struct notifier_block *self,
@@ -1680,55 +1706,30 @@ static inline void timer_register_cpu_notifier(void)
static inline void timer_register_cpu_notifier(void) { }
#endif /* CONFIG_HOTPLUG_CPU */
-static void __init init_timer_cpu(struct tvec_base *base, int cpu)
+static void __init init_timer_cpu(int cpu)
{
- int j;
-
- BUG_ON(base != tbase_get_base(base));
+ struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu);
base->cpu = cpu;
- per_cpu(tvec_bases, cpu) = base;
spin_lock_init(&base->lock);
#ifdef CONFIG_PREEMPT_RT_FULL
init_waitqueue_head(&base->wait_for_running_timer);
#endif
- for (j = 0; j < TVN_SIZE; j++) {
- INIT_LIST_HEAD(base->tv5.vec + j);
- INIT_LIST_HEAD(base->tv4.vec + j);
- INIT_LIST_HEAD(base->tv3.vec + j);
- INIT_LIST_HEAD(base->tv2.vec + j);
- }
- for (j = 0; j < TVR_SIZE; j++)
- INIT_LIST_HEAD(base->tv1.vec + j);
-
base->timer_jiffies = jiffies;
base->next_timer = base->timer_jiffies;
}
static void __init init_timer_cpus(void)
{
- struct tvec_base *base;
- int local_cpu = smp_processor_id();
int cpu;
- for_each_possible_cpu(cpu) {
- if (cpu == local_cpu)
- base = &boot_tvec_bases;
-#ifdef CONFIG_SMP
- else
- base = per_cpu_ptr(&__tvec_bases, cpu);
-#endif
-
- init_timer_cpu(base, cpu);
- }
+ for_each_possible_cpu(cpu)
+ init_timer_cpu(cpu);
}
void __init init_timers(void)
{
- /* ensure there are enough low bits for flags in timer->base pointer */
- BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK);
-
init_timer_cpus();
init_timer_stats();
timer_register_cpu_notifier();
@@ -1764,14 +1765,14 @@ unsigned long msleep_interruptible(unsigned int msecs)
EXPORT_SYMBOL(msleep_interruptible);
-static int __sched do_usleep_range(unsigned long min, unsigned long max)
+static void __sched do_usleep_range(unsigned long min, unsigned long max)
{
ktime_t kmin;
unsigned long delta;
kmin = ktime_set(0, min * NSEC_PER_USEC);
delta = (max - min) * NSEC_PER_USEC;
- return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
+ schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
}
/**
@@ -1779,7 +1780,7 @@ static int __sched do_usleep_range(unsigned long min, unsigned long max)
* @min: Minimum time in usecs to sleep
* @max: Maximum time in usecs to sleep
*/
-void usleep_range(unsigned long min, unsigned long max)
+void __sched usleep_range(unsigned long min, unsigned long max)
{
__set_current_state(TASK_UNINTERRUPTIBLE);
do_usleep_range(min, max);
diff --git a/kernel/kernel/time/timer_list.c b/kernel/kernel/time/timer_list.c
index e878c2e0b..ba7d8b288 100644
--- a/kernel/kernel/time/timer_list.c
+++ b/kernel/kernel/time/timer_list.c
@@ -29,19 +29,24 @@ struct timer_list_iter {
typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes);
-DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
-
/*
* This allows printing both to /proc/timer_list and
* to the console (on SysRq-Q):
*/
-#define SEQ_printf(m, x...) \
- do { \
- if (m) \
- seq_printf(m, x); \
- else \
- printk(x); \
- } while (0)
+__printf(2, 3)
+static void SEQ_printf(struct seq_file *m, const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+
+ if (m)
+ seq_vprintf(m, fmt, args);
+ else
+ vprintk(fmt, args);
+
+ va_end(args);
+}
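Turning the SEQ_printf macro into a real __printf(2, 3) function lets the compiler type-check every format string at each call site. The same forwarding wrapper in plain C, as a sketch (stderr stands in for the console fallback):

#include <stdarg.h>
#include <stdio.h>

__attribute__((format(printf, 2, 3)))
static void seq_printf2(FILE *m, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	/* Forward to the stream if given, otherwise to the fallback */
	vfprintf(m ? m : stderr, fmt, args);
	va_end(args);
}

int main(void)
{
	seq_printf2(stdout, "timer state: %02x\n", 1);
	seq_printf2(NULL, "falls back to stderr: %d\n", 2);
	return 0;
}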
static void print_name_offset(struct seq_file *m, void *sym)
{
@@ -64,7 +69,7 @@ print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
print_name_offset(m, taddr);
SEQ_printf(m, ", ");
print_name_offset(m, timer->function);
- SEQ_printf(m, ", S:%02lx", timer->state);
+ SEQ_printf(m, ", S:%02x", timer->state);
#ifdef CONFIG_TIMER_STATS
SEQ_printf(m, ", ");
print_name_offset(m, timer->start_site);
@@ -120,10 +125,10 @@ static void
print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
{
SEQ_printf(m, " .base: %pK\n", base);
- SEQ_printf(m, " .index: %d\n",
- base->index);
- SEQ_printf(m, " .resolution: %Lu nsecs\n",
- (unsigned long long)ktime_to_ns(base->resolution));
+ SEQ_printf(m, " .index: %d\n", base->index);
+
+ SEQ_printf(m, " .resolution: %u nsecs\n", (unsigned) hrtimer_resolution);
+
SEQ_printf(m, " .get_time: ");
print_name_offset(m, base->get_time);
SEQ_printf(m, "\n");
@@ -132,7 +137,7 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
(unsigned long long) ktime_to_ns(base->offset));
#endif
SEQ_printf(m, "active timers:\n");
- print_active_timers(m, base, now);
+ print_active_timers(m, base, now + ktime_to_ns(base->offset));
}
static void print_cpu(struct seq_file *m, int cpu, u64 now)
@@ -158,7 +163,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
P(nr_events);
P(nr_retries);
P(nr_hangs);
- P_ns(max_hang_time);
+ P(max_hang_time);
#endif
#undef P
#undef P_ns
@@ -184,7 +189,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
P_ns(idle_sleeptime);
P_ns(iowait_sleeptime);
P(last_jiffies);
- P(next_jiffies);
+ P(next_timer);
P_ns(idle_expires);
SEQ_printf(m, "jiffies: %Lu\n",
(unsigned long long)jiffies);
@@ -220,7 +225,7 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
(unsigned long long) dev->min_delta_ns);
SEQ_printf(m, " mult: %u\n", dev->mult);
SEQ_printf(m, " shift: %u\n", dev->shift);
- SEQ_printf(m, " mode: %d\n", dev->mode);
+ SEQ_printf(m, " mode: %d\n", clockevent_get_state(dev));
SEQ_printf(m, " next_event: %Ld nsecs\n",
(unsigned long long) ktime_to_ns(dev->next_event));
@@ -228,34 +233,34 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
print_name_offset(m, dev->set_next_event);
SEQ_printf(m, "\n");
- if (dev->set_mode) {
- SEQ_printf(m, " set_mode: ");
- print_name_offset(m, dev->set_mode);
+ if (dev->set_state_shutdown) {
+ SEQ_printf(m, " shutdown: ");
+ print_name_offset(m, dev->set_state_shutdown);
SEQ_printf(m, "\n");
- } else {
- if (dev->set_state_shutdown) {
- SEQ_printf(m, " shutdown: ");
- print_name_offset(m, dev->set_state_shutdown);
- SEQ_printf(m, "\n");
- }
+ }
- if (dev->set_state_periodic) {
- SEQ_printf(m, " periodic: ");
- print_name_offset(m, dev->set_state_periodic);
- SEQ_printf(m, "\n");
- }
+ if (dev->set_state_periodic) {
+ SEQ_printf(m, " periodic: ");
+ print_name_offset(m, dev->set_state_periodic);
+ SEQ_printf(m, "\n");
+ }
- if (dev->set_state_oneshot) {
- SEQ_printf(m, " oneshot: ");
- print_name_offset(m, dev->set_state_oneshot);
- SEQ_printf(m, "\n");
- }
+ if (dev->set_state_oneshot) {
+ SEQ_printf(m, " oneshot: ");
+ print_name_offset(m, dev->set_state_oneshot);
+ SEQ_printf(m, "\n");
+ }
- if (dev->tick_resume) {
- SEQ_printf(m, " resume: ");
- print_name_offset(m, dev->tick_resume);
- SEQ_printf(m, "\n");
- }
+ if (dev->set_state_oneshot_stopped) {
+ SEQ_printf(m, " oneshot stopped: ");
+ print_name_offset(m, dev->set_state_oneshot_stopped);
+ SEQ_printf(m, "\n");
+ }
+
+ if (dev->tick_resume) {
+ SEQ_printf(m, " resume: ");
+ print_name_offset(m, dev->tick_resume);
+ SEQ_printf(m, "\n");
}
SEQ_printf(m, " event_handler: ");
@@ -269,11 +274,11 @@ static void timer_list_show_tickdevices_header(struct seq_file *m)
{
#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
print_tickdevice(m, tick_get_broadcast_device(), -1);
- SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
- cpumask_bits(tick_get_broadcast_mask())[0]);
+ SEQ_printf(m, "tick_broadcast_mask: %*pb\n",
+ cpumask_pr_args(tick_get_broadcast_mask()));
#ifdef CONFIG_TICK_ONESHOT
- SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n",
- cpumask_bits(tick_get_broadcast_oneshot_mask())[0]);
+ SEQ_printf(m, "tick_broadcast_oneshot_mask: %*pb\n",
+ cpumask_pr_args(tick_get_broadcast_oneshot_mask()));
#endif
SEQ_printf(m, "\n");
#endif
@@ -282,7 +287,7 @@ static void timer_list_show_tickdevices_header(struct seq_file *m)
static inline void timer_list_header(struct seq_file *m, u64 now)
{
- SEQ_printf(m, "Timer List Version: v0.7\n");
+ SEQ_printf(m, "Timer List Version: v0.8\n");
SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
SEQ_printf(m, "\n");
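(Illustration, not part of the patch: the first timer_list.c hunk above replaces the SEQ_printf macro with a real variadic function annotated __printf(2, 3). The standalone sketch below shows the same forwarding pattern with vfprintf()/vprintf() standing in for seq_vprintf()/vprintk(); the function name and the use of FILE * are assumptions for a userspace build.)

#include <stdarg.h>
#include <stdio.h>

/* Hedged sketch: a variadic function that forwards its arguments once,
 * either to a destination "sequence" or to the console fallback.  The
 * format attribute is what the kernel's __printf(2, 3) expands to and
 * lets the compiler type-check format strings at every call site. */
__attribute__((format(printf, 2, 3)))
static void seq_printf_sketch(FILE *m, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	if (m)
		vfprintf(m, fmt, args);     /* ~ seq_vprintf(m, fmt, args) */
	else
		vprintf(fmt, args);         /* ~ vprintk(fmt, args) */
	va_end(args);
}

int main(void)
{
	seq_printf_sketch(stdout, "mode: %d\n", 1);
	seq_printf_sketch(NULL, "now at %llu nsecs\n", 123ULL);
	return 0;
}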
diff --git a/kernel/kernel/time/timer_stats.c b/kernel/kernel/time/timer_stats.c
index 1fb08f213..1adecb4b8 100644
--- a/kernel/kernel/time/timer_stats.c
+++ b/kernel/kernel/time/timer_stats.c
@@ -68,7 +68,7 @@ struct entry {
* Number of timeout events:
*/
unsigned long count;
- unsigned int timer_flag;
+ u32 flags;
/*
* We save the command-line string to preserve
@@ -227,13 +227,13 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm)
* @startf: pointer to the function which did the timer setup
* @timerf: pointer to the timer callback function of the timer
* @comm: name of the process which set up the timer
+ * @tflags: The flags field of the timer
*
* When the timer is already registered, then the event counter is
* incremented. Otherwise the timer is registered in a free slot.
*/
void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
- void *timerf, char *comm,
- unsigned int timer_flag)
+ void *timerf, char *comm, u32 tflags)
{
/*
* It doesn't matter which lock we take:
@@ -251,7 +251,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
input.start_func = startf;
input.expire_func = timerf;
input.pid = pid;
- input.timer_flag = timer_flag;
+ input.flags = tflags;
raw_spin_lock_irqsave(lock, flags);
if (!timer_stats_active)
@@ -306,7 +306,7 @@ static int tstats_show(struct seq_file *m, void *v)
for (i = 0; i < nr_entries; i++) {
entry = entries + i;
- if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) {
+ if (entry->flags & TIMER_DEFERRABLE) {
seq_printf(m, "%4luD, %5d %-16s ",
entry->count, entry->pid, entry->comm);
} else {
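(Illustration, not part of the patch: the timer_stats.c hunks above store the timer's flags word directly in the entry, so the deferrable check becomes a plain bit test against TIMER_DEFERRABLE. In the sketch below the bit value and struct layout are placeholders, not the kernel's definitions.)

#include <stdio.h>
#include <stdint.h>

/* Hedged sketch of the flags-based check in tstats_show(): deferrable
 * timers get a 'D' suffix after their event count. */
#define TIMER_DEFERRABLE  (1U << 22)        /* placeholder bit, for illustration only */

struct entry_sketch {
	unsigned long count;
	uint32_t flags;                     /* was: unsigned int timer_flag */
};

int main(void)
{
	struct entry_sketch e = { .count = 42, .flags = TIMER_DEFERRABLE };

	if (e.flags & TIMER_DEFERRABLE)
		printf("%4luD\n", e.count);
	else
		printf("%4lu\n", e.count);
	return 0;
}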