Mailing List Archive

[PATCH v6] hrtimer: avoid retrigger_next_event IPI
Setting the realtime clock triggers an IPI to all CPUs to reprogram
the clock event device.

However, only realtime and TAI clocks have their offsets updated
(and therefore potentially require a reprogram).

Instead of sending an IPI unconditionally, check each per CPU hrtimer base
whether it has active timers in the CLOCK_REALTIME and CLOCK_TAI bases. If
that's not the case, update the realtime and TAI base offsets remotely and
skip the IPI. This ensures that any subsequently armed timers on
CLOCK_REALTIME and CLOCK_TAI are evaluated with the correct offsets.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

---

v6:
- Do not take softirq_raised into account (Peter Xu).
- Include BOOTTIME as base that requires IPI (Thomas).
- Unconditional reprogram on resume path, since there is
nothing to gain on that path anyway.

v5:
- Add missing hrtimer_update_base (Peter Xu).

v4:
- Drop unused code (Thomas).

v3:
- Nicer changelog (Thomas).
- Code style fixes (Thomas).
- Compilation warning with CONFIG_HIGH_RES_TIMERS=n (Thomas).
- Shrink preemption disabled section (Thomas).

v2:
- Only REALTIME and TAI bases are affected by offset-to-monotonic changes (Thomas).
- Don't special case nohz_full CPUs (Thomas).


diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index bb5e7b0a4274..14a6e449b221 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -318,7 +318,7 @@ struct clock_event_device;

extern void hrtimer_interrupt(struct clock_event_device *dev);

-extern void clock_was_set_delayed(void);
+extern void clock_was_set_delayed(bool force_reprogram);

extern unsigned int hrtimer_resolution;

@@ -326,7 +326,7 @@ extern unsigned int hrtimer_resolution;

#define hrtimer_resolution (unsigned int)LOW_RES_NSEC

-static inline void clock_was_set_delayed(void) { }
+static inline void clock_was_set_delayed(bool force_reprogram) { }

#endif

@@ -351,7 +351,7 @@ hrtimer_expires_remaining_adjusted(const struct hrtimer *timer)
timer->base->get_time());
}

-extern void clock_was_set(void);
+extern void clock_was_set(bool);
#ifdef CONFIG_TIMERFD
extern void timerfd_clock_was_set(void);
#else
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 5c9d968187ae..2258782fd714 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -758,9 +758,17 @@ static void hrtimer_switch_to_hres(void)
retrigger_next_event(NULL);
}

+static void clock_was_set_force_reprogram_work(struct work_struct *work)
+{
+ clock_was_set(true);
+}
+
+static DECLARE_WORK(hrtimer_force_reprogram_work, clock_was_set_force_reprogram_work);
+
+
static void clock_was_set_work(struct work_struct *work)
{
- clock_was_set();
+ clock_was_set(false);
}

static DECLARE_WORK(hrtimer_work, clock_was_set_work);
@@ -769,9 +777,12 @@ static DECLARE_WORK(hrtimer_work, clock_was_set_work);
* Called from timekeeping and resume code to reprogram the hrtimer
* interrupt device on all cpus.
*/
-void clock_was_set_delayed(void)
+void clock_was_set_delayed(bool force_reprogram)
{
- schedule_work(&hrtimer_work);
+ if (force_reprogram)
+ schedule_work(&hrtimer_force_reprogram_work);
+ else
+ schedule_work(&hrtimer_work);
}

#else
@@ -871,6 +882,18 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
tick_program_event(expires, 1);
}

+#define CLOCK_SET_BASES ((1U << HRTIMER_BASE_REALTIME) | \
+ (1U << HRTIMER_BASE_REALTIME_SOFT) | \
+ (1U << HRTIMER_BASE_TAI) | \
+ (1U << HRTIMER_BASE_TAI_SOFT) | \
+ (1U << HRTIMER_BASE_BOOTTIME) | \
+ (1U << HRTIMER_BASE_BOOTTIME_SOFT))
+
+static bool need_reprogram_timer(struct hrtimer_cpu_base *cpu_base)
+{
+ return (cpu_base->active_bases & CLOCK_SET_BASES) != 0;
+}
+
/*
* Clock realtime was set
*
@@ -882,11 +905,42 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
* resolution timer interrupts. On UP we just disable interrupts and
* call the high resolution interrupt code.
*/
-void clock_was_set(void)
+void clock_was_set(bool force_reprogram)
{
#ifdef CONFIG_HIGH_RES_TIMERS
- /* Retrigger the CPU local events everywhere */
- on_each_cpu(retrigger_next_event, NULL, 1);
+ cpumask_var_t mask;
+ int cpu;
+
+ if (force_reprogram == true) {
+ on_each_cpu(retrigger_next_event, NULL, 1);
+ goto set_timerfd;
+ }
+
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
+ on_each_cpu(retrigger_next_event, NULL, 1);
+ goto set_timerfd;
+ }
+
+ /* Avoid interrupting CPUs if possible */
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ unsigned long flags;
+ struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
+
+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
+ if (need_reprogram_timer(cpu_base))
+ cpumask_set_cpu(cpu, mask);
+ else
+ hrtimer_update_base(cpu_base);
+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+ }
+
+ preempt_disable();
+ smp_call_function_many(mask, retrigger_next_event, NULL, 1);
+ preempt_enable();
+ cpus_read_unlock();
+ free_cpumask_var(mask);
+set_timerfd:
#endif
timerfd_clock_was_set();
}
@@ -903,7 +957,7 @@ void hrtimers_resume(void)
/* Retrigger on the local CPU */
retrigger_next_event(NULL);
/* And schedule a retrigger for all others */
- clock_was_set_delayed();
+ clock_was_set_delayed(true);
}

/*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 6aee5768c86f..3fef237267bd 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1323,7 +1323,7 @@ int do_settimeofday64(const struct timespec64 *ts)
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

/* signal hrtimers about time change */
- clock_was_set();
+ clock_was_set(false);

if (!ret)
audit_tk_injoffset(ts_delta);
@@ -1371,7 +1371,7 @@ static int timekeeping_inject_offset(const struct timespec64 *ts)
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

/* signal hrtimers about time change */
- clock_was_set();
+ clock_was_set(false);

return ret;
}
@@ -1736,7 +1736,7 @@ void timekeeping_inject_sleeptime64(const struct timespec64 *delta)
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

/* signal hrtimers about time change */
- clock_was_set();
+ clock_was_set(true);
}
#endif

@@ -2187,7 +2187,7 @@ static void timekeeping_advance(enum timekeeping_adv_mode mode)
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
if (clock_set)
/* Have to call _delayed version, since in irq context*/
- clock_was_set_delayed();
+ clock_was_set_delayed(false);
}

/**
@@ -2425,7 +2425,7 @@ int do_adjtimex(struct __kernel_timex *txc)
timekeeping_advance(TK_ADV_FREQ);

if (tai != orig_tai)
- clock_was_set();
+ clock_was_set(false);

ntp_notify_cmos_timer();
Re: [PATCH v6] hrtimer: avoid retrigger_next_event IPI [ In reply to ]
On Mon, Apr 19 2021 at 16:39, Marcelo Tosatti wrote:
>
> +static void clock_was_set_force_reprogram_work(struct work_struct *work)
> +{
> + clock_was_set(true);
> +}
> +
> +static DECLARE_WORK(hrtimer_force_reprogram_work, clock_was_set_force_reprogram_work);
> +
> +
> static void clock_was_set_work(struct work_struct *work)
> {
> - clock_was_set();
> + clock_was_set(false);
> }
>
> static DECLARE_WORK(hrtimer_work, clock_was_set_work);
> @@ -769,9 +777,12 @@ static DECLARE_WORK(hrtimer_work, clock_was_set_work);
> * Called from timekeeping and resume code to reprogram the hrtimer
> * interrupt device on all cpus.
> */
> -void clock_was_set_delayed(void)
> +void clock_was_set_delayed(bool force_reprogram)
> {
> - schedule_work(&hrtimer_work);
> + if (force_reprogram)
> + schedule_work(&hrtimer_force_reprogram_work);
> + else
> + schedule_work(&hrtimer_work);
> }
>
> #else
> @@ -871,6 +882,18 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
> tick_program_event(expires, 1);
> }
>
> +#define CLOCK_SET_BASES ((1U << HRTIMER_BASE_REALTIME) | \
> + (1U << HRTIMER_BASE_REALTIME_SOFT) | \
> + (1U << HRTIMER_BASE_TAI) | \
> + (1U << HRTIMER_BASE_TAI_SOFT) | \
> + (1U << HRTIMER_BASE_BOOTTIME) | \
> + (1U << HRTIMER_BASE_BOOTTIME_SOFT))
> +
> +static bool need_reprogram_timer(struct hrtimer_cpu_base *cpu_base)
> +{
> + return (cpu_base->active_bases & CLOCK_SET_BASES) != 0;
> +}
> +
> /*
> * Clock realtime was set
> *
> @@ -882,11 +905,42 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
> * resolution timer interrupts. On UP we just disable interrupts and
> * call the high resolution interrupt code.
> */
> -void clock_was_set(void)
> +void clock_was_set(bool force_reprogram)
> {
> #ifdef CONFIG_HIGH_RES_TIMERS
> - /* Retrigger the CPU local events everywhere */
> - on_each_cpu(retrigger_next_event, NULL, 1);
> + cpumask_var_t mask;
> + int cpu;
> +
> + if (force_reprogram == true) {
> + on_each_cpu(retrigger_next_event, NULL, 1);
> + goto set_timerfd;
> + }
> +
> + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
> + on_each_cpu(retrigger_next_event, NULL, 1);
> + goto set_timerfd;
> + }

This is really horrible and the proper thing to do is to separate the
s2idle/resume related functionality, which allows for some other
cleanups as well.

I already started to look at that over the weekend, but got stuck due to
a couple of correctness issues I found with the whole clock_was_set()
mechanism while looking. That stuff needs to be fixed first.

Back to the above. For the s2idle resume path there is no reason for an
IPI at all. It's just that way because it has been bolted on the
existing machinery.

So today it does:

tick_unfreeze() {
if (first_cpu_to_unfreeze()) {
timekeeping_resume();
tick_resume();
tick_resume_local();
clock_was_set_delayed();
} else {
tick_resume_local();
}
}

Then after everything is unfrozen including workqueues the delayed
clock_was_set() runs and does the IPI dance. That's just silly.

We can be smarter than that:

tick_unfreeze() {
if (first_cpu_to_unfreeze()) {
timekeeping_resume();
tick_resume();
tick_resume_local();
hrtimer_resume_local();
} else {
tick_resume_local();
hrtimer_resume_local();
}
}

See? No IPI required at all and no magic force argument and as a bonus
we get also understandable code.

Splitting this properly allows to be smarter about the affected clocks
because most of the clock_was_set() events only care about
CLOCK_REALTIME and CLOCK_TAI and just the sleep time injection affects
CLOCK_BOOTTIME as well.

There are a few other things which can be done to avoid even more IPIs,
but let me address the correctness issues of clock_was_set() first.

Thanks,

tglx