diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl
index a0d479d..f66f4df 100644
--- a/Documentation/DocBook/kernel-locking.tmpl
+++ b/Documentation/DocBook/kernel-locking.tmpl
@@ -1645,7 +1645,9 @@ the amount of locking which needs to be done.
all the readers who were traversing the list when we deleted the
element are finished. We use call_rcu() to
register a callback which will actually destroy the object once
- the readers are finished.
+ all pre-existing readers are finished. Alternatively,
+ synchronize_rcu() may be used to block until
+ all pre-existing are finished.
But how does Read Copy Update know when the readers are
@@ -1714,7 +1716,7 @@ the amount of locking which needs to be done.
- object_put(obj);
+ list_del_rcu(&obj->list);
cache_num--;
-+ call_rcu(&obj->rcu, cache_delete_rcu, obj);
++ call_rcu(&obj->rcu, cache_delete_rcu);
}
/* Must be holding cache_lock */
@@ -1725,14 +1727,6 @@ the amount of locking which needs to be done.
if (++cache_num > MAX_CACHE_SIZE) {
struct object *i, *outcast = NULL;
list_for_each_entry(i, &cache, list) {
-@@ -85,6 +94,7 @@
- obj->popularity = 0;
- atomic_set(&obj->refcnt, 1); /* The cache holds a reference */
- spin_lock_init(&obj->lock);
-+ INIT_RCU_HEAD(&obj->rcu);
-
- spin_lock_irqsave(&cache_lock, flags);
- __cache_add(obj);
@@ -104,12 +114,11 @@
struct object *cache_find(int id)
{
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt
index 790d1a8..0c134f8 100644
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@@ -218,13 +218,22 @@ over a rather long period of time, but improvements are always welcome!
include:
a. Keeping a count of the number of data-structure elements
- used by the RCU-protected data structure, including those
- waiting for a grace period to elapse. Enforce a limit
- on this number, stalling updates as needed to allow
- previously deferred frees to complete.
-
- Alternatively, limit only the number awaiting deferred
- free rather than the total number of elements.
+ used by the RCU-protected data structure, including
+ those waiting for a grace period to elapse. Enforce a
+ limit on this number, stalling updates as needed to allow
+ previously deferred frees to complete. Alternatively,
+ limit only the number awaiting deferred free rather than
+ the total number of elements.
+
+ One way to stall the updates is to acquire the update-side
+ mutex. (Don't try this with a spinlock -- other CPUs
+ spinning on the lock could prevent the grace period
+ from ever ending.) Another way to stall the updates
+ is for the updates to use a wrapper function around
+ the memory allocator, so that this wrapper function
+ simulates OOM when there is too much memory awaiting an
+ RCU grace period. There are of course many other
+ variations on this theme.
b. Limiting update rate. For example, if updates occur only
once per hour, then no explicit rate limiting is required,
@@ -365,3 +374,26 @@ over a rather long period of time, but improvements are always welcome!
and the compiler to freely reorder code into and out of RCU
read-side critical sections. It is the responsibility of the
RCU update-side primitives to deal with this.
+
+17. Use CONFIG_PROVE_RCU, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and
+ the __rcu sparse checks to validate your RCU code. These
+ can help find problems as follows:
+
+ CONFIG_PROVE_RCU: check that accesses to RCU-protected data
+ structures are carried out under the proper RCU
+ read-side critical section, while holding the right
+ combination of locks, or whatever other conditions
+ are appropriate.
+
+ CONFIG_DEBUG_OBJECTS_RCU_HEAD: check that you don't pass the
+ same object to call_rcu() (or friends) before an RCU
+ grace period has elapsed since the last time that you
+ passed that same object to call_rcu() (or friends).
+
+ __rcu sparse checks: tag the pointer to the RCU-protected data
+ structure with __rcu, and sparse will warn you if you
+ access that pointer without the services of one of the
+ variants of rcu_dereference().
+
+ These debugging aids can help you find problems that are
+ otherwise extremely difficult to spot.
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
index 44c6dcc..862c08e 100644
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -80,6 +80,24 @@ o A CPU looping with bottom halves disabled. This condition can
o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel
without invoking schedule().
+o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might
+ happen to preempt a low-priority task in the middle of an RCU
+ read-side critical section. This is especially damaging if
+ that low-priority task is not permitted to run on any other CPU,
+ in which case the next RCU grace period can never complete, which
+ will eventually cause the system to run out of memory and hang.
+ While the system is in the process of running itself out of
+ memory, you might see stall-warning messages.
+
+o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that
+ is running at a higher priority than the RCU softirq threads.
+ This will prevent RCU callbacks from ever being invoked,
+ and in a CONFIG_TREE_PREEMPT_RCU kernel will further prevent
+ RCU grace periods from ever completing. Either way, the
+ system will eventually run out of memory and hang. In the
+ CONFIG_TREE_PREEMPT_RCU case, you might see stall-warning
+ messages.
+
o A bug in the RCU implementation.
o A hardware failure. This is quite unlikely, but has occurred
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index efd8cc9..e731ad2 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -1,39 +1,43 @@
CONFIG_RCU_TRACE debugfs Files and Formats
-The rcutree implementation of RCU provides debugfs trace output that
-summarizes counters and state. This information is useful for debugging
-RCU itself, and can sometimes also help to debug abuses of RCU.
-The following sections describe the debugfs files and formats.
+The rcutree and rcutiny implementations of RCU provide debugfs trace
+output that summarizes counters and state. This information is useful for
+debugging RCU itself, and can sometimes also help to debug abuses of RCU.
+The following sections describe the debugfs files and formats, first
+for rcutree and next for rcutiny.
-Hierarchical RCU debugfs Files and Formats
+CONFIG_TREE_RCU and CONFIG_TREE_PREEMPT_RCU debugfs Files and Formats
-This implementation of RCU provides three debugfs files under the
+These implementations of RCU provides five debugfs files under the
top-level directory RCU: rcu/rcudata (which displays fields in struct
-rcu_data), rcu/rcugp (which displays grace-period counters), and
-rcu/rcuhier (which displays the struct rcu_node hierarchy).
+rcu_data), rcu/rcudata.csv (which is a .csv spreadsheet version of
+rcu/rcudata), rcu/rcugp (which displays grace-period counters),
+rcu/rcuhier (which displays the struct rcu_node hierarchy), and
+rcu/rcu_pending (which displays counts of the reasons that the
+rcu_pending() function decided that there was core RCU work to do).
The output of "cat rcu/rcudata" looks as follows:
rcu_sched:
- 0 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=10951/1 dn=0 df=1101 of=0 ri=36 ql=0 b=10
- 1 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=16117/1 dn=0 df=1015 of=0 ri=0 ql=0 b=10
- 2 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1445/1 dn=0 df=1839 of=0 ri=0 ql=0 b=10
- 3 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=6681/1 dn=0 df=1545 of=0 ri=0 ql=0 b=10
- 4 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1003/1 dn=0 df=1992 of=0 ri=0 ql=0 b=10
- 5 c=17829 g=17830 pq=1 pqc=17829 qp=1 dt=3887/1 dn=0 df=3331 of=0 ri=4 ql=2 b=10
- 6 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=859/1 dn=0 df=3224 of=0 ri=0 ql=0 b=10
- 7 c=17829 g=17830 pq=0 pqc=17829 qp=1 dt=3761/1 dn=0 df=1818 of=0 ri=0 ql=2 b=10
+ 0 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=10951/1/0 df=1101 of=0 ri=36 ql=0 b=10
+ 1 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=16117/1/0 df=1015 of=0 ri=0 ql=0 b=10
+ 2 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1445/1/0 df=1839 of=0 ri=0 ql=0 b=10
+ 3 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=6681/1/0 df=1545 of=0 ri=0 ql=0 b=10
+ 4 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1003/1/0 df=1992 of=0 ri=0 ql=0 b=10
+ 5 c=17829 g=17830 pq=1 pqc=17829 qp=1 dt=3887/1/0 df=3331 of=0 ri=4 ql=2 b=10
+ 6 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=859/1/0 df=3224 of=0 ri=0 ql=0 b=10
+ 7 c=17829 g=17830 pq=0 pqc=17829 qp=1 dt=3761/1/0 df=1818 of=0 ri=0 ql=2 b=10
rcu_bh:
- 0 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=10951/1 dn=0 df=0 of=0 ri=0 ql=0 b=10
- 1 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=16117/1 dn=0 df=13 of=0 ri=0 ql=0 b=10
- 2 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=1445/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
- 3 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=6681/1 dn=0 df=9 of=0 ri=0 ql=0 b=10
- 4 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=1003/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
- 5 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3887/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
- 6 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=859/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
- 7 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3761/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
+ 0 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=10951/1/0 df=0 of=0 ri=0 ql=0 b=10
+ 1 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=16117/1/0 df=13 of=0 ri=0 ql=0 b=10
+ 2 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=1445/1/0 df=15 of=0 ri=0 ql=0 b=10
+ 3 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=6681/1/0 df=9 of=0 ri=0 ql=0 b=10
+ 4 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=1003/1/0 df=15 of=0 ri=0 ql=0 b=10
+ 5 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3887/1/0 df=15 of=0 ri=0 ql=0 b=10
+ 6 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=859/1/0 df=15 of=0 ri=0 ql=0 b=10
+ 7 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3761/1/0 df=15 of=0 ri=0 ql=0 b=10
The first section lists the rcu_data structures for rcu_sched, the second
for rcu_bh. Note that CONFIG_TREE_PREEMPT_RCU kernels will have an
@@ -81,18 +85,10 @@ o "qp" indicates that RCU still expects a quiescent state from
o "dt" is the current value of the dyntick counter that is incremented
when entering or leaving dynticks idle state, either by the
- scheduler or by irq. The number after the "/" is the interrupt
- nesting depth when in dyntick-idle state, or one greater than
- the interrupt-nesting depth otherwise.
-
- This field is displayed only for CONFIG_NO_HZ kernels.
-
-o "dn" is the current value of the dyntick counter that is incremented
- when entering or leaving dynticks idle state via NMI. If both
- the "dt" and "dn" values are even, then this CPU is in dynticks
- idle mode and may be ignored by RCU. If either of these two
- counters is odd, then RCU must be alert to the possibility of
- an RCU read-side critical section running on this CPU.
+ scheduler or by irq. The number after the first "/" is the
+ interrupt nesting depth when in dyntick-idle state, or one
+ greater than the interrupt-nesting depth otherwise. The number
+ after the second "/" is the NMI nesting depth.
This field is displayed only for CONFIG_NO_HZ kernels.
@@ -125,6 +121,18 @@ o "b" is the batch limit for this CPU. If more than this number
of RCU callbacks is ready to invoke, then the remainder will
be deferred.
+o "ci" is the number of RCU callbacks that have been invoked for
+ this CPU. Note that ci+ql is the number of callbacks that have
+ been registered in absence of CPU-hotplug activity.
+
+o "co" is the number of RCU callbacks that have been orphaned due to
+ this CPU going offline. These orphaned callbacks have been moved
+ to an arbitrarily chosen online CPU.
+
+o "ca" is the number of RCU callbacks that have been adopted due to
+ other CPUs going offline. Note that ci+co-ca+ql is the number of
+ RCU callbacks registered on this CPU.
+
There is also an rcu/rcudata.csv file with the same information in
comma-separated-variable spreadsheet format.
@@ -157,12 +165,12 @@ o "gpnum" is the number of grace periods that have started. It is
The output of "cat rcu/rcuhier" looks as follows, with very long lines:
-c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 oqlen=0
+c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6
1/1 .>. 0:127 ^0
3/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3
3/3f .>. 0:5 ^0 2/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3
rcu_bh:
-c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0 oqlen=0
+c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0
0/1 .>. 0:127 ^0
0/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3
0/3f .>. 0:5 ^0 0/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3
@@ -180,7 +188,7 @@ o "s" is the "signaled" state that drives force_quiescent_state()'s
o "jfq" is the number of jiffies remaining for this grace period
before force_quiescent_state() is invoked to help push things
- along. Note that CPUs in dyntick-idle mode thoughout the grace
+ along. Note that CPUs in dyntick-idle mode throughout the grace
period will not report on their own, but rather must be check by
some other CPU via force_quiescent_state().
@@ -201,11 +209,6 @@ o "fqlh" is the number of calls to force_quiescent_state() that
exited immediately (without even being counted in nfqs above)
due to contention on ->fqslock.
-o "oqlen" is the number of callbacks on the "orphan" callback
- list. RCU callbacks are placed on this list by CPUs going
- offline, and are "adopted" either by the CPU helping the outgoing
- CPU or by the next rcu_barrier*() call, whichever comes first.
-
o Each element of the form "1/1 0:127 ^0" represents one struct
rcu_node. Each line represents one level of the hierarchy, from
root to leaves. It is best to think of the rcu_data structures
@@ -315,3 +318,115 @@ o "nn" is the number of times that this CPU needed nothing. Alert
readers will note that the rcu "nn" number for a given CPU very
closely matches the rcu_bh "np" number for that same CPU. This
is due to short-circuit evaluation in rcu_pending().
+
+
+CONFIG_TINY_RCU and CONFIG_TINY_PREEMPT_RCU debugfs Files and Formats
+
+These implementations of RCU provides a single debugfs file under the
+top-level directory RCU, namely rcu/rcudata, which displays fields in
+rcu_bh_ctrlblk, rcu_sched_ctrlblk and, for CONFIG_TINY_PREEMPT_RCU,
+rcu_preempt_ctrlblk.
+
+The output of "cat rcu/rcudata" is as follows:
+
+rcu_preempt: qlen=24 gp=1097669 g197/p197/c197 tasks=...
+ ttb=. btg=no ntb=184 neb=0 nnb=183 j=01f7 bt=0274
+ normal balk: nt=1097669 gt=0 bt=371 b=0 ny=25073378 nos=0
+ exp balk: bt=0 nos=0
+rcu_sched: qlen: 0
+rcu_bh: qlen: 0
+
+This is split into rcu_preempt, rcu_sched, and rcu_bh sections, with the
+rcu_preempt section appearing only in CONFIG_TINY_PREEMPT_RCU builds.
+The last three lines of the rcu_preempt section appear only in
+CONFIG_RCU_BOOST kernel builds. The fields are as follows:
+
+o "qlen" is the number of RCU callbacks currently waiting either
+ for an RCU grace period or waiting to be invoked. This is the
+ only field present for rcu_sched and rcu_bh, due to the
+ short-circuiting of grace period in those two cases.
+
+o "gp" is the number of grace periods that have completed.
+
+o "g197/p197/c197" displays the grace-period state, with the
+ "g" number being the number of grace periods that have started
+ (mod 256), the "p" number being the number of grace periods
+ that the CPU has responded to (also mod 256), and the "c"
+ number being the number of grace periods that have completed
+ (once again mode 256).
+
+ Why have both "gp" and "g"? Because the data flowing into
+ "gp" is only present in a CONFIG_RCU_TRACE kernel.
+
+o "tasks" is a set of bits. The first bit is "T" if there are
+ currently tasks that have recently blocked within an RCU
+ read-side critical section, the second bit is "N" if any of the
+ aforementioned tasks are blocking the current RCU grace period,
+ and the third bit is "E" if any of the aforementioned tasks are
+ blocking the current expedited grace period. Each bit is "."
+ if the corresponding condition does not hold.
+
+o "ttb" is a single bit. It is "B" if any of the blocked tasks
+ need to be priority boosted and "." otherwise.
+
+o "btg" indicates whether boosting has been carried out during
+ the current grace period, with "exp" indicating that boosting
+ is in progress for an expedited grace period, "no" indicating
+ that boosting has not yet started for a normal grace period,
+ "begun" indicating that boosting has bebug for a normal grace
+ period, and "done" indicating that boosting has completed for
+ a normal grace period.
+
+o "ntb" is the total number of tasks subjected to RCU priority boosting
+ periods since boot.
+
+o "neb" is the number of expedited grace periods that have had
+ to resort to RCU priority boosting since boot.
+
+o "nnb" is the number of normal grace periods that have had
+ to resort to RCU priority boosting since boot.
+
+o "j" is the low-order 12 bits of the jiffies counter in hexadecimal.
+
+o "bt" is the low-order 12 bits of the value that the jiffies counter
+ will have at the next time that boosting is scheduled to begin.
+
+o In the line beginning with "normal balk", the fields are as follows:
+
+ o "nt" is the number of times that the system balked from
+ boosting because there were no blocked tasks to boost.
+ Note that the system will balk from boosting even if the
+ grace period is overdue when the currently running task
+ is looping within an RCU read-side critical section.
+ There is no point in boosting in this case, because
+ boosting a running task won't make it run any faster.
+
+ o "gt" is the number of times that the system balked
+ from boosting because, although there were blocked tasks,
+ none of them were preventing the current grace period
+ from completing.
+
+ o "bt" is the number of times that the system balked
+ from boosting because boosting was already in progress.
+
+ o "b" is the number of times that the system balked from
+ boosting because boosting had already completed for
+ the grace period in question.
+
+ o "ny" is the number of times that the system balked from
+ boosting because it was not yet time to start boosting
+ the grace period in question.
+
+ o "nos" is the number of times that the system balked from
+ boosting for inexplicable ("not otherwise specified")
+ reasons. This can actually happen due to races involving
+ increments of the jiffies counter.
+
+o In the line beginning with "exp balk", the fields are as follows:
+
+ o "bt" is the number of times that the system balked from
+ boosting because there were no blocked tasks to boost.
+
+ o "nos" is the number of times that the system balked from
+ boosting for inexplicable ("not otherwise specified")
+ reasons.
diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c
index c908c5f..5808731 100644
--- a/drivers/input/evdev.c
+++ b/drivers/input/evdev.c
@@ -28,7 +28,7 @@ struct evdev {
int minor;
struct input_handle handle;
wait_queue_head_t wait;
- struct evdev_client *grab;
+ struct evdev_client __rcu *grab;
struct list_head client_list;
spinlock_t client_lock; /* protects client_list */
struct mutex mutex;
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 7c80082..17927b1 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -127,7 +127,10 @@ static void handle_tx(struct vhost_net *net)
size_t len, total_len = 0;
int err, wmem;
size_t hdr_size;
- struct socket *sock = rcu_dereference(vq->private_data);
+ struct socket *sock;
+
+ sock = rcu_dereference_check(vq->private_data,
+ lockdep_is_held(&vq->mutex));
if (!sock)
return;
@@ -582,7 +585,10 @@ static void vhost_net_disable_vq(struct vhost_net *n,
static void vhost_net_enable_vq(struct vhost_net *n,
struct vhost_virtqueue *vq)
{
- struct socket *sock = vq->private_data;
+ struct socket *sock;
+
+ sock = rcu_dereference_protected(vq->private_data,
+ lockdep_is_held(&vq->mutex));
if (!sock)
return;
if (vq == n->vqs + VHOST_NET_VQ_TX) {
@@ -598,7 +604,8 @@ static struct socket *vhost_net_stop_vq(struct vhost_net *n,
struct socket *sock;
mutex_lock(&vq->mutex);
- sock = vq->private_data;
+ sock = rcu_dereference_protected(vq->private_data,
+ lockdep_is_held(&vq->mutex));
vhost_net_disable_vq(n, vq);
rcu_assign_pointer(vq->private_data, NULL);
mutex_unlock(&vq->mutex);
@@ -736,7 +743,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
}
/* start polling new socket */
- oldsock = vq->private_data;
+ oldsock = rcu_dereference_protected(vq->private_data,
+ lockdep_is_held(&vq->mutex));
if (sock != oldsock) {
vhost_net_disable_vq(n, vq);
rcu_assign_pointer(vq->private_data, sock);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index dd3d6f7..8b5a1b3 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -320,7 +320,7 @@ long vhost_dev_reset_owner(struct vhost_dev *dev)
vhost_dev_cleanup(dev);
memory->nregions = 0;
- dev->memory = memory;
+ RCU_INIT_POINTER(dev->memory, memory);
return 0;
}
@@ -352,8 +352,9 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
fput(dev->log_file);
dev->log_file = NULL;
/* No one will access memory at this point */
- kfree(dev->memory);
- dev->memory = NULL;
+ kfree(rcu_dereference_protected(dev->memory,
+ lockdep_is_held(&dev->mutex)));
+ RCU_INIT_POINTER(dev->memory, NULL);
if (dev->mm)
mmput(dev->mm);
dev->mm = NULL;
@@ -440,14 +441,22 @@ static int vq_access_ok(unsigned int num,
/* Caller should have device mutex but not vq mutex */
int vhost_log_access_ok(struct vhost_dev *dev)
{
- return memory_access_ok(dev, dev->memory, 1);
+ struct vhost_memory *mp;
+
+ mp = rcu_dereference_protected(dev->memory,
+ lockdep_is_held(&dev->mutex));
+ return memory_access_ok(dev, mp, 1);
}
/* Verify access for write logging. */
/* Caller should have vq mutex and device mutex */
static int vq_log_access_ok(struct vhost_virtqueue *vq, void __user *log_base)
{
- return vq_memory_access_ok(log_base, vq->dev->memory,
+ struct vhost_memory *mp;
+
+ mp = rcu_dereference_protected(vq->dev->memory,
+ lockdep_is_held(&vq->mutex));
+ return vq_memory_access_ok(log_base, mp,
vhost_has_feature(vq->dev, VHOST_F_LOG_ALL)) &&
(!vq->log_used || log_access_ok(log_base, vq->log_addr,
sizeof *vq->used +
@@ -487,7 +496,8 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
kfree(newmem);
return -EFAULT;
}
- oldmem = d->memory;
+ oldmem = rcu_dereference_protected(d->memory,
+ lockdep_is_held(&d->mutex));
rcu_assign_pointer(d->memory, newmem);
synchronize_rcu();
kfree(oldmem);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index afd7729..af3c11d 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -106,7 +106,7 @@ struct vhost_virtqueue {
* vhost_work execution acts instead of rcu_read_lock() and the end of
* vhost_work execution acts instead of rcu_read_lock().
* Writers use virtqueue mutex. */
- void *private_data;
+ void __rcu *private_data;
/* Log write descriptors */
void __user *log_base;
struct vhost_log log[VHOST_NET_MAX_SG];
@@ -116,7 +116,7 @@ struct vhost_dev {
/* Readers use RCU to access memory table pointer
* log base pointer and features.
* Writers use mutex below.*/
- struct vhost_memory *memory;
+ struct vhost_memory __rcu *memory;
struct mm_struct *mm;
struct mutex mutex;
unsigned acked_features;
@@ -173,7 +173,11 @@ enum {
static inline int vhost_has_feature(struct vhost_dev *dev, int bit)
{
- unsigned acked_features = rcu_dereference(dev->acked_features);
+ unsigned acked_features;
+
+ acked_features =
+ rcu_dereference_index_check(dev->acked_features,
+ lockdep_is_held(&dev->mutex));
return acked_features & (1 << bit);
}
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 0c99102..709dfb9 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -75,7 +75,7 @@ struct cgroup_subsys_state {
unsigned long flags;
/* ID for this css, if possible */
- struct css_id *id;
+ struct css_id __rcu *id;
};
/* bits in struct cgroup_subsys_state flags field */
@@ -205,7 +205,7 @@ struct cgroup {
struct list_head children; /* my children */
struct cgroup *parent; /* my parent */
- struct dentry *dentry; /* cgroup fs entry, RCU protected */
+ struct dentry __rcu *dentry; /* cgroup fs entry, RCU protected */
/* Private pointers for each registered subsystem */
struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index c1a62c5..320d6c9 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -16,7 +16,11 @@
# define __release(x) __context__(x,-1)
# define __cond_lock(x,c) ((c) ? ({ __acquire(x); 1; }) : 0)
# define __percpu __attribute__((noderef, address_space(3)))
+#ifdef CONFIG_SPARSE_RCU_POINTER
+# define __rcu __attribute__((noderef, address_space(4)))
+#else
# define __rcu
+#endif
extern void __chk_user_ptr(const volatile void __user *);
extern void __chk_io_ptr(const volatile void __iomem *);
#else
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 4d2c395..4aaeab3 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -84,7 +84,7 @@ struct thread_group_cred {
atomic_t usage;
pid_t tgid; /* thread group process ID */
spinlock_t lock;
- struct key *session_keyring; /* keyring inherited over fork */
+ struct key __rcu *session_keyring; /* keyring inherited over fork */
struct key *process_keyring; /* keyring private to this process */
struct rcu_head rcu; /* RCU deletion hook */
};
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index f59ed29..133c0ba 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -31,7 +31,7 @@ struct embedded_fd_set {
struct fdtable {
unsigned int max_fds;
- struct file ** fd; /* current fd array */
+ struct file __rcu **fd; /* current fd array */
fd_set *close_on_exec;
fd_set *open_fds;
struct rcu_head rcu;
@@ -46,7 +46,7 @@ struct files_struct {
* read mostly part
*/
atomic_t count;
- struct fdtable *fdt;
+ struct fdtable __rcu *fdt;
struct fdtable fdtab;
/*
* written part on a separate cache line in SMP
@@ -55,7 +55,7 @@ struct files_struct {
int next_fd;
struct embedded_fd_set close_on_exec_init;
struct embedded_fd_set open_fds_init;
- struct file * fd_array[NR_OPEN_DEFAULT];
+ struct file __rcu * fd_array[NR_OPEN_DEFAULT];
};
#define rcu_dereference_check_fdtable(files, fdtfd) \
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 63d069b..3168dcf 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1384,7 +1384,7 @@ struct super_block {
* Saved mount options for lazy filesystems using
* generic_show_options()
*/
- char *s_options;
+ char __rcu *s_options;
};
extern struct timespec current_fs_time(struct super_block *sb);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 5f2f4c4..af3f06b 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -129,8 +129,8 @@ struct blk_scsi_cmd_filter {
struct disk_part_tbl {
struct rcu_head rcu_head;
int len;
- struct hd_struct *last_lookup;
- struct hd_struct *part[];
+ struct hd_struct __rcu *last_lookup;
+ struct hd_struct __rcu *part[];
};
struct gendisk {
@@ -149,7 +149,7 @@ struct gendisk {
* non-critical accesses use RCU. Always access through
* helpers.
*/
- struct disk_part_tbl *part_tbl;
+ struct disk_part_tbl __rcu *part_tbl;
struct hd_struct part0;
const struct block_device_operations *fops;
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index d5b3876..1f4517d 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -139,7 +139,7 @@ static inline void account_system_vtime(struct task_struct *tsk)
#endif
#if defined(CONFIG_NO_HZ)
-#if defined(CONFIG_TINY_RCU)
+#if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU)
extern void rcu_enter_nohz(void);
extern void rcu_exit_nohz(void);
diff --git a/include/linux/idr.h b/include/linux/idr.h
index e968db7..cdb715e 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -50,14 +50,14 @@
struct idr_layer {
unsigned long bitmap; /* A zero bit means "space here" */
- struct idr_layer *ary[1<mm == mm
* new_owner->alloc_lock is held
*/
- struct task_struct *owner;
+ struct task_struct __rcu *owner;
#endif
#ifdef CONFIG_PROC_FS
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 508f8cf..d0edf7d 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -185,7 +185,7 @@ struct nfs_inode {
struct nfs4_cached_acl *nfs4_acl;
/* NFSv4 state */
struct list_head open_states;
- struct nfs_delegation *delegation;
+ struct nfs_delegation __rcu *delegation;
fmode_t delegation_state;
struct rw_semaphore rwsem;
#endif /* CONFIG_NFS_V4*/
diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index b2f1a4d..2026f9e 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -49,28 +49,28 @@
struct notifier_block {
int (*notifier_call)(struct notifier_block *, unsigned long, void *);
- struct notifier_block *next;
+ struct notifier_block __rcu *next;
int priority;
};
struct atomic_notifier_head {
spinlock_t lock;
- struct notifier_block *head;
+ struct notifier_block __rcu *head;
};
struct blocking_notifier_head {
struct rw_semaphore rwsem;
- struct notifier_block *head;
+ struct notifier_block __rcu *head;
};
struct raw_notifier_head {
- struct notifier_block *head;
+ struct notifier_block __rcu *head;
};
struct srcu_notifier_head {
struct mutex mutex;
struct srcu_struct srcu;
- struct notifier_block *head;
+ struct notifier_block __rcu *head;
};
#define ATOMIC_INIT_NOTIFIER_HEAD(name) do { \
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 634b8e6..a39cbed 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -47,6 +47,8 @@ static inline void *radix_tree_indirect_to_ptr(void *ptr)
{
return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR);
}
+#define radix_tree_indirect_to_ptr(ptr) \
+ radix_tree_indirect_to_ptr((void __force *)(ptr))
static inline int radix_tree_is_indirect_ptr(void *ptr)
{
@@ -61,7 +63,7 @@ static inline int radix_tree_is_indirect_ptr(void *ptr)
struct radix_tree_root {
unsigned int height;
gfp_t gfp_mask;
- struct radix_tree_node *rnode;
+ struct radix_tree_node __rcu *rnode;
};
#define RADIX_TREE_INIT(mask) { \
diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 4ec3b38..f31ef61 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -10,6 +10,21 @@
#include
/*
+ * Why is there no list_empty_rcu()? Because list_empty() serves this
+ * purpose. The list_empty() function fetches the RCU-protected pointer
+ * and compares it to the address of the list head, but neither dereferences
+ * this pointer itself nor provides this pointer to the caller. Therefore,
+ * it is not necessary to use rcu_dereference(), so that list_empty() can
+ * be used anywhere you would want to use a list_empty_rcu().
+ */
+
+/*
+ * return the ->next pointer of a list_head in an rcu safe
+ * way, we must not access it directly
+ */
+#define list_next_rcu(list) (*((struct list_head __rcu **)(&(list)->next)))
+
+/*
* Insert a new entry between two known consecutive entries.
*
* This is only for internal list manipulation where we know
@@ -20,7 +35,7 @@ static inline void __list_add_rcu(struct list_head *new,
{
new->next = next;
new->prev = prev;
- rcu_assign_pointer(prev->next, new);
+ rcu_assign_pointer(list_next_rcu(prev), new);
next->prev = new;
}
@@ -138,7 +153,7 @@ static inline void list_replace_rcu(struct list_head *old,
{
new->next = old->next;
new->prev = old->prev;
- rcu_assign_pointer(new->prev->next, new);
+ rcu_assign_pointer(list_next_rcu(new->prev), new);
new->next->prev = new;
old->prev = LIST_POISON2;
}
@@ -193,7 +208,7 @@ static inline void list_splice_init_rcu(struct list_head *list,
*/
last->next = at;
- rcu_assign_pointer(head->next, first);
+ rcu_assign_pointer(list_next_rcu(head), first);
first->prev = head;
at->prev = last;
}
@@ -208,7 +223,9 @@ static inline void list_splice_init_rcu(struct list_head *list,
* primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
*/
#define list_entry_rcu(ptr, type, member) \
- container_of(rcu_dereference_raw(ptr), type, member)
+ ({typeof (*ptr) __rcu *__ptr = (typeof (*ptr) __rcu __force *)ptr; \
+ container_of((typeof(ptr))rcu_dereference_raw(__ptr), type, member); \
+ })
/**
* list_first_entry_rcu - get the first element from a list
@@ -225,9 +242,9 @@ static inline void list_splice_init_rcu(struct list_head *list,
list_entry_rcu((ptr)->next, type, member)
#define __list_for_each_rcu(pos, head) \
- for (pos = rcu_dereference_raw((head)->next); \
+ for (pos = rcu_dereference_raw(list_next_rcu(head)); \
pos != (head); \
- pos = rcu_dereference_raw(pos->next))
+ pos = rcu_dereference_raw(list_next_rcu((pos)))
/**
* list_for_each_entry_rcu - iterate over rcu list of given type
@@ -257,9 +274,9 @@ static inline void list_splice_init_rcu(struct list_head *list,
* as long as the traversal is guarded by rcu_read_lock().
*/
#define list_for_each_continue_rcu(pos, head) \
- for ((pos) = rcu_dereference_raw((pos)->next); \
+ for ((pos) = rcu_dereference_raw(list_next_rcu(pos)); \
prefetch((pos)->next), (pos) != (head); \
- (pos) = rcu_dereference_raw((pos)->next))
+ (pos) = rcu_dereference_raw(list_next_rcu(pos)))
/**
* list_for_each_entry_continue_rcu - continue iteration over list of given type
@@ -314,12 +331,19 @@ static inline void hlist_replace_rcu(struct hlist_node *old,
new->next = next;
new->pprev = old->pprev;
- rcu_assign_pointer(*new->pprev, new);
+ rcu_assign_pointer(*(struct hlist_node __rcu **)new->pprev, new);
if (next)
new->next->pprev = &new->next;
old->pprev = LIST_POISON2;
}
+/*
+ * return the first or the next element in an RCU protected hlist
+ */
+#define hlist_first_rcu(head) (*((struct hlist_node __rcu **)(&(head)->first)))
+#define hlist_next_rcu(node) (*((struct hlist_node __rcu **)(&(node)->next)))
+#define hlist_pprev_rcu(node) (*((struct hlist_node __rcu **)((node)->pprev)))
+
/**
* hlist_add_head_rcu
* @n: the element to add to the hash list.
@@ -346,7 +370,7 @@ static inline void hlist_add_head_rcu(struct hlist_node *n,
n->next = first;
n->pprev = &h->first;
- rcu_assign_pointer(h->first, n);
+ rcu_assign_pointer(hlist_first_rcu(h), n);
if (first)
first->pprev = &n->next;
}
@@ -374,7 +398,7 @@ static inline void hlist_add_before_rcu(struct hlist_node *n,
{
n->pprev = next->pprev;
n->next = next;
- rcu_assign_pointer(*(n->pprev), n);
+ rcu_assign_pointer(hlist_pprev_rcu(n), n);
next->pprev = &n->next;
}
@@ -401,15 +425,15 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev,
{
n->next = prev->next;
n->pprev = &prev->next;
- rcu_assign_pointer(prev->next, n);
+ rcu_assign_pointer(hlist_next_rcu(prev), n);
if (n->next)
n->next->pprev = &n->next;
}
-#define __hlist_for_each_rcu(pos, head) \
- for (pos = rcu_dereference((head)->first); \
- pos && ({ prefetch(pos->next); 1; }); \
- pos = rcu_dereference(pos->next))
+#define __hlist_for_each_rcu(pos, head) \
+ for (pos = rcu_dereference(hlist_first_rcu(head)); \
+ pos && ({ prefetch(pos->next); 1; }); \
+ pos = rcu_dereference(hlist_next_rcu(pos)))
/**
* hlist_for_each_entry_rcu - iterate over rcu list of given type
@@ -422,11 +446,11 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev,
* the _rcu list-mutation primitives such as hlist_add_head_rcu()
* as long as the traversal is guarded by rcu_read_lock().
*/
-#define hlist_for_each_entry_rcu(tpos, pos, head, member) \
- for (pos = rcu_dereference_raw((head)->first); \
+#define hlist_for_each_entry_rcu(tpos, pos, head, member) \
+ for (pos = rcu_dereference_raw(hlist_first_rcu(head)); \
pos && ({ prefetch(pos->next); 1; }) && \
({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \
- pos = rcu_dereference_raw(pos->next))
+ pos = rcu_dereference_raw(hlist_next_rcu(pos)))
/**
* hlist_for_each_entry_rcu_bh - iterate over rcu list of given type
diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
index b70ffe5..2ae1371 100644
--- a/include/linux/rculist_nulls.h
+++ b/include/linux/rculist_nulls.h
@@ -37,6 +37,12 @@ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
}
}
+#define hlist_nulls_first_rcu(head) \
+ (*((struct hlist_nulls_node __rcu __force **)&(head)->first))
+
+#define hlist_nulls_next_rcu(node) \
+ (*((struct hlist_nulls_node __rcu __force **)&(node)->next))
+
/**
* hlist_nulls_del_rcu - deletes entry from hash list without re-initialization
* @n: the element to delete from the hash list.
@@ -88,7 +94,7 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
n->next = first;
n->pprev = &h->first;
- rcu_assign_pointer(h->first, n);
+ rcu_assign_pointer(hlist_nulls_first_rcu(h), n);
if (!is_a_nulls(first))
first->pprev = &n->next;
}
@@ -100,11 +106,11 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
* @member: the name of the hlist_nulls_node within the struct.
*
*/
-#define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \
- for (pos = rcu_dereference_raw((head)->first); \
- (!is_a_nulls(pos)) && \
+#define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \
+ for (pos = rcu_dereference_raw(hlist_nulls_first_rcu(head)); \
+ (!is_a_nulls(pos)) && \
({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \
- pos = rcu_dereference_raw(pos->next))
+ pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos)))
#endif
#endif
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 83af1f8..af56148 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -41,11 +41,17 @@
#include
#include
#include
+#include
#ifdef CONFIG_RCU_TORTURE_TEST
extern int rcutorture_runnable; /* for sysctl */
#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
+#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b))
+#define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b))
+#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
+#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
+
/**
* struct rcu_head - callback structure for use with RCU
* @next: next update requests in a list
@@ -57,29 +63,92 @@ struct rcu_head {
};
/* Exported common interfaces */
-extern void rcu_barrier(void);
+extern void call_rcu_sched(struct rcu_head *head,
+ void (*func)(struct rcu_head *rcu));
+extern void synchronize_sched(void);
extern void rcu_barrier_bh(void);
extern void rcu_barrier_sched(void);
-extern void synchronize_sched_expedited(void);
extern int sched_expedited_torture_stats(char *page);
+static inline void __rcu_read_lock_bh(void)
+{
+ local_bh_disable();
+}
+
+static inline void __rcu_read_unlock_bh(void)
+{
+ local_bh_enable();
+}
+
+#ifdef CONFIG_PREEMPT_RCU
+
+extern void __rcu_read_lock(void);
+extern void __rcu_read_unlock(void);
+void synchronize_rcu(void);
+
+/*
+ * Defined as a macro as it is a very low level header included from
+ * areas that don't even know about current. This gives the rcu_read_lock()
+ * nesting depth, but makes sense only if CONFIG_PREEMPT_RCU -- in other
+ * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
+ */
+#define rcu_preempt_depth() (current->rcu_read_lock_nesting)
+
+#else /* #ifdef CONFIG_PREEMPT_RCU */
+
+static inline void __rcu_read_lock(void)
+{
+ preempt_disable();
+}
+
+static inline void __rcu_read_unlock(void)
+{
+ preempt_enable();
+}
+
+static inline void synchronize_rcu(void)
+{
+ synchronize_sched();
+}
+
+static inline int rcu_preempt_depth(void)
+{
+ return 0;
+}
+
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+
/* Internal to kernel */
-extern void rcu_init(void);
+extern void rcu_sched_qs(int cpu);
+extern void rcu_bh_qs(int cpu);
+extern void rcu_check_callbacks(int cpu, int user);
+struct notifier_block;
+
+#ifdef CONFIG_NO_HZ
+
+extern void rcu_enter_nohz(void);
+extern void rcu_exit_nohz(void);
+
+#else /* #ifdef CONFIG_NO_HZ */
+
+static inline void rcu_enter_nohz(void)
+{
+}
+
+static inline void rcu_exit_nohz(void)
+{
+}
+
+#endif /* #else #ifdef CONFIG_NO_HZ */
#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU)
#include
-#elif defined(CONFIG_TINY_RCU)
+#elif defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU)
#include
#else
#error "Unknown RCU implementation specified to kernel configuration"
#endif
-#define RCU_HEAD_INIT { .next = NULL, .func = NULL }
-#define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT
-#define INIT_RCU_HEAD(ptr) do { \
- (ptr)->next = NULL; (ptr)->func = NULL; \
-} while (0)
-
/*
* init_rcu_head_on_stack()/destroy_rcu_head_on_stack() are needed for dynamic
* initialization and destruction of rcu_head on the stack. rcu_head structures
@@ -120,14 +189,15 @@ extern struct lockdep_map rcu_sched_lock_map;
extern int debug_lockdep_rcu_enabled(void);
/**
- * rcu_read_lock_held - might we be in RCU read-side critical section?
+ * rcu_read_lock_held() - might we be in RCU read-side critical section?
*
* If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an RCU
* read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC,
* this assumes we are in an RCU read-side critical section unless it can
- * prove otherwise.
+ * prove otherwise. This is useful for debug checks in functions that
+ * require that they be called within an RCU read-side critical section.
*
- * Check debug_lockdep_rcu_enabled() to prevent false positives during boot
+ * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot
* and while lockdep is disabled.
*/
static inline int rcu_read_lock_held(void)
@@ -144,14 +214,16 @@ static inline int rcu_read_lock_held(void)
extern int rcu_read_lock_bh_held(void);
/**
- * rcu_read_lock_sched_held - might we be in RCU-sched read-side critical section?
+ * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
*
* If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an
* RCU-sched read-side critical section. In absence of
* CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side
* critical section unless it can prove otherwise. Note that disabling
* of preemption (including disabling irqs) counts as an RCU-sched
- * read-side critical section.
+ * read-side critical section. This is useful for debug checks in functions
+ * that required that they be called within an RCU-sched read-side
+ * critical section.
*
* Check debug_lockdep_rcu_enabled() to prevent false positives during boot
* and while lockdep is disabled.
@@ -211,7 +283,11 @@ static inline int rcu_read_lock_sched_held(void)
extern int rcu_my_thread_group_empty(void);
-#define __do_rcu_dereference_check(c) \
+/**
+ * rcu_lockdep_assert - emit lockdep splat if specified condition not met
+ * @c: condition to check
+ */
+#define rcu_lockdep_assert(c) \
do { \
static bool __warned; \
if (debug_lockdep_rcu_enabled() && !__warned && !(c)) { \
@@ -220,41 +296,163 @@ extern int rcu_my_thread_group_empty(void);
} \
} while (0)
+#else /* #ifdef CONFIG_PROVE_RCU */
+
+#define rcu_lockdep_assert(c) do { } while (0)
+
+#endif /* #else #ifdef CONFIG_PROVE_RCU */
+
+/*
+ * Helper functions for rcu_dereference_check(), rcu_dereference_protected()
+ * and rcu_assign_pointer(). Some of these could be folded into their
+ * callers, but they are left separate in order to ease introduction of
+ * multiple flavors of pointers to match the multiple flavors of RCU
+ * (e.g., __rcu_bh, * __rcu_sched, and __srcu), should this make sense in
+ * the future.
+ */
+
+#ifdef __CHECKER__
+#define rcu_dereference_sparse(p, space) \
+ ((void)(((typeof(*p) space *)p) == p))
+#else /* #ifdef __CHECKER__ */
+#define rcu_dereference_sparse(p, space)
+#endif /* #else #ifdef __CHECKER__ */
+
+#define __rcu_access_pointer(p, space) \
+ ({ \
+ typeof(*p) *_________p1 = (typeof(*p)*__force )ACCESS_ONCE(p); \
+ rcu_dereference_sparse(p, space); \
+ ((typeof(*p) __force __kernel *)(_________p1)); \
+ })
+#define __rcu_dereference_check(p, c, space) \
+ ({ \
+ typeof(*p) *_________p1 = (typeof(*p)*__force )ACCESS_ONCE(p); \
+ rcu_lockdep_assert(c); \
+ rcu_dereference_sparse(p, space); \
+ smp_read_barrier_depends(); \
+ ((typeof(*p) __force __kernel *)(_________p1)); \
+ })
+#define __rcu_dereference_protected(p, c, space) \
+ ({ \
+ rcu_lockdep_assert(c); \
+ rcu_dereference_sparse(p, space); \
+ ((typeof(*p) __force __kernel *)(p)); \
+ })
+
+#define __rcu_dereference_index_check(p, c) \
+ ({ \
+ typeof(p) _________p1 = ACCESS_ONCE(p); \
+ rcu_lockdep_assert(c); \
+ smp_read_barrier_depends(); \
+ (_________p1); \
+ })
+#define __rcu_assign_pointer(p, v, space) \
+ ({ \
+ if (!__builtin_constant_p(v) || \
+ ((v) != NULL)) \
+ smp_wmb(); \
+ (p) = (typeof(*v) __force space *)(v); \
+ })
+
+
+/**
+ * rcu_access_pointer() - fetch RCU pointer with no dereferencing
+ * @p: The pointer to read
+ *
+ * Return the value of the specified RCU-protected pointer, but omit the
+ * smp_read_barrier_depends() and keep the ACCESS_ONCE(). This is useful
+ * when the value of this pointer is accessed, but the pointer is not
+ * dereferenced, for example, when testing an RCU-protected pointer against
+ * NULL. Although rcu_access_pointer() may also be used in cases where
+ * update-side locks prevent the value of the pointer from changing, you
+ * should instead use rcu_dereference_protected() for this use case.
+ */
+#define rcu_access_pointer(p) __rcu_access_pointer((p), __rcu)
+
/**
- * rcu_dereference_check - rcu_dereference with debug checking
+ * rcu_dereference_check() - rcu_dereference with debug checking
* @p: The pointer to read, prior to dereferencing
* @c: The conditions under which the dereference will take place
*
* Do an rcu_dereference(), but check that the conditions under which the
- * dereference will take place are correct. Typically the conditions indicate
- * the various locking conditions that should be held at that point. The check
- * should return true if the conditions are satisfied.
+ * dereference will take place are correct. Typically the conditions
+ * indicate the various locking conditions that should be held at that
+ * point. The check should return true if the conditions are satisfied.
+ * An implicit check for being in an RCU read-side critical section
+ * (rcu_read_lock()) is included.
*
* For example:
*
- * bar = rcu_dereference_check(foo->bar, rcu_read_lock_held() ||
- * lockdep_is_held(&foo->lock));
+ * bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock));
*
* could be used to indicate to lockdep that foo->bar may only be dereferenced
- * if either the RCU read lock is held, or that the lock required to replace
+ * if either rcu_read_lock() is held, or that the lock required to replace
* the bar struct at foo->bar is held.
*
* Note that the list of conditions may also include indications of when a lock
* need not be held, for example during initialisation or destruction of the
* target struct:
*
- * bar = rcu_dereference_check(foo->bar, rcu_read_lock_held() ||
- * lockdep_is_held(&foo->lock) ||
+ * bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock) ||
* atomic_read(&foo->usage) == 0);
+ *
+ * Inserts memory barriers on architectures that require them
+ * (currently only the Alpha), prevents the compiler from refetching
+ * (and from merging fetches), and, more importantly, documents exactly
+ * which pointers are protected by RCU and checks that the pointer is
+ * annotated as __rcu.
*/
#define rcu_dereference_check(p, c) \
- ({ \
- __do_rcu_dereference_check(c); \
- rcu_dereference_raw(p); \
- })
+ __rcu_dereference_check((p), rcu_read_lock_held() || (c), __rcu)
+
+/**
+ * rcu_dereference_bh_check() - rcu_dereference_bh with debug checking
+ * @p: The pointer to read, prior to dereferencing
+ * @c: The conditions under which the dereference will take place
+ *
+ * This is the RCU-bh counterpart to rcu_dereference_check().
+ */
+#define rcu_dereference_bh_check(p, c) \
+ __rcu_dereference_check((p), rcu_read_lock_bh_held() || (c), __rcu)
/**
- * rcu_dereference_protected - fetch RCU pointer when updates prevented
+ * rcu_dereference_sched_check() - rcu_dereference_sched with debug checking
+ * @p: The pointer to read, prior to dereferencing
+ * @c: The conditions under which the dereference will take place
+ *
+ * This is the RCU-sched counterpart to rcu_dereference_check().
+ */
+#define rcu_dereference_sched_check(p, c) \
+ __rcu_dereference_check((p), rcu_read_lock_sched_held() || (c), \
+ __rcu)
+
+#define rcu_dereference_raw(p) rcu_dereference_check(p, 1) /*@@@ needed? @@@*/
+
+/**
+ * rcu_dereference_index_check() - rcu_dereference for indices with debug checking
+ * @p: The pointer to read, prior to dereferencing
+ * @c: The conditions under which the dereference will take place
+ *
+ * Similar to rcu_dereference_check(), but omits the sparse checking.
+ * This allows rcu_dereference_index_check() to be used on integers,
+ * which can then be used as array indices. Attempting to use
+ * rcu_dereference_check() on an integer will give compiler warnings
+ * because the sparse address-space mechanism relies on dereferencing
+ * the RCU-protected pointer. Dereferencing integers is not something
+ * that even gcc will put up with.
+ *
+ * Note that this function does not implicitly check for RCU read-side
+ * critical sections. If this function gains lots of uses, it might
+ * make sense to provide versions for each flavor of RCU, but it does
+ * not make sense as of early 2010.
+ */
+#define rcu_dereference_index_check(p, c) \
+ __rcu_dereference_index_check((p), (c))
+
+/**
+ * rcu_dereference_protected() - fetch RCU pointer when updates prevented
+ * @p: The pointer to read, prior to dereferencing
+ * @c: The conditions under which the dereference will take place
*
* Return the value of the specified RCU-protected pointer, but omit
* both the smp_read_barrier_depends() and the ACCESS_ONCE(). This
@@ -263,35 +461,61 @@ extern int rcu_my_thread_group_empty(void);
* prevent the compiler from repeating this reference or combining it
* with other references, so it should not be used without protection
* of appropriate locks.
+ *
+ * This function is only for update-side use. Using this function
+ * when protected only by rcu_read_lock() will result in infrequent
+ * but very ugly failures.
*/
#define rcu_dereference_protected(p, c) \
- ({ \
- __do_rcu_dereference_check(c); \
- (p); \
- })
+ __rcu_dereference_protected((p), (c), __rcu)
-#else /* #ifdef CONFIG_PROVE_RCU */
+/**
+ * rcu_dereference_bh_protected() - fetch RCU-bh pointer when updates prevented
+ * @p: The pointer to read, prior to dereferencing
+ * @c: The conditions under which the dereference will take place
+ *
+ * This is the RCU-bh counterpart to rcu_dereference_protected().
+ */
+#define rcu_dereference_bh_protected(p, c) \
+ __rcu_dereference_protected((p), (c), __rcu)
-#define rcu_dereference_check(p, c) rcu_dereference_raw(p)
-#define rcu_dereference_protected(p, c) (p)
+/**
+ * rcu_dereference_sched_protected() - fetch RCU-sched pointer when updates prevented
+ * @p: The pointer to read, prior to dereferencing
+ * @c: The conditions under which the dereference will take place
+ *
+ * This is the RCU-sched counterpart to rcu_dereference_protected().
+ */
+#define rcu_dereference_sched_protected(p, c) \
+ __rcu_dereference_protected((p), (c), __rcu)
-#endif /* #else #ifdef CONFIG_PROVE_RCU */
/**
- * rcu_access_pointer - fetch RCU pointer with no dereferencing
+ * rcu_dereference() - fetch RCU-protected pointer for dereferencing
+ * @p: The pointer to read, prior to dereferencing
*
- * Return the value of the specified RCU-protected pointer, but omit the
- * smp_read_barrier_depends() and keep the ACCESS_ONCE(). This is useful
- * when the value of this pointer is accessed, but the pointer is not
- * dereferenced, for example, when testing an RCU-protected pointer against
- * NULL. This may also be used in cases where update-side locks prevent
- * the value of the pointer from changing, but rcu_dereference_protected()
- * is a lighter-weight primitive for this use case.
+ * This is a simple wrapper around rcu_dereference_check().
+ */
+#define rcu_dereference(p) rcu_dereference_check(p, 0)
+
+/**
+ * rcu_dereference_bh() - fetch an RCU-bh-protected pointer for dereferencing
+ * @p: The pointer to read, prior to dereferencing
+ *
+ * Makes rcu_dereference_check() do the dirty work.
+ */
+#define rcu_dereference_bh(p) rcu_dereference_bh_check(p, 0)
+
+/**
+ * rcu_dereference_sched() - fetch RCU-sched-protected pointer for dereferencing
+ * @p: The pointer to read, prior to dereferencing
+ *
+ * Makes rcu_dereference_check() do the dirty work.
*/
-#define rcu_access_pointer(p) ACCESS_ONCE(p)
+#define rcu_dereference_sched(p) rcu_dereference_sched_check(p, 0)
/**
- * rcu_read_lock - mark the beginning of an RCU read-side critical section.
+ * rcu_read_lock() - mark the beginning of an RCU read-side critical section
*
* When synchronize_rcu() is invoked on one CPU while other CPUs
* are within RCU read-side critical sections, then the
@@ -302,7 +526,7 @@ extern int rcu_my_thread_group_empty(void);
* until after the all the other CPUs exit their critical sections.
*
* Note, however, that RCU callbacks are permitted to run concurrently
- * with RCU read-side critical sections. One way that this can happen
+ * with new RCU read-side critical sections. One way that this can happen
* is via the following sequence of events: (1) CPU 0 enters an RCU
* read-side critical section, (2) CPU 1 invokes call_rcu() to register
* an RCU callback, (3) CPU 0 exits the RCU read-side critical section,
@@ -317,7 +541,20 @@ extern int rcu_my_thread_group_empty(void);
* will be deferred until the outermost RCU read-side critical section
* completes.
*
- * It is illegal to block while in an RCU read-side critical section.
+ * You can avoid reading and understanding the next paragraph by
+ * following this rule: don't put anything in an rcu_read_lock() RCU
+ * read-side critical section that would block in a !PREEMPT kernel.
+ * But if you want the full story, read on!
+ *
+ * In non-preemptible RCU implementations (TREE_RCU and TINY_RCU), it
+ * is illegal to block while in an RCU read-side critical section. In
+ * preemptible RCU implementations (TREE_PREEMPT_RCU and TINY_PREEMPT_RCU)
+ * in CONFIG_PREEMPT kernel builds, RCU read-side critical sections may
+ * be preempted, but explicit blocking is illegal. Finally, in preemptible
+ * RCU implementations in real-time (CONFIG_PREEMPT_RT) kernel builds,
+ * RCU read-side critical sections may be preempted and they may also
+ * block, but only when acquiring spinlocks that are subject to priority
+ * inheritance.
*/
static inline void rcu_read_lock(void)
{
@@ -337,7 +574,7 @@ static inline void rcu_read_lock(void)
*/
/**
- * rcu_read_unlock - marks the end of an RCU read-side critical section.
+ * rcu_read_unlock() - marks the end of an RCU read-side critical section.
*
* See rcu_read_lock() for more information.
*/
@@ -349,15 +586,16 @@ static inline void rcu_read_unlock(void)
}
/**
- * rcu_read_lock_bh - mark the beginning of a softirq-only RCU critical section
+ * rcu_read_lock_bh() - mark the beginning of an RCU-bh critical section
*
* This is equivalent of rcu_read_lock(), but to be used when updates
- * are being done using call_rcu_bh(). Since call_rcu_bh() callbacks
- * consider completion of a softirq handler to be a quiescent state,
- * a process in RCU read-side critical section must be protected by
- * disabling softirqs. Read-side critical sections in interrupt context
- * can use just rcu_read_lock().
- *
+ * are being done using call_rcu_bh() or synchronize_rcu_bh(). Since
+ * both call_rcu_bh() and synchronize_rcu_bh() consider completion of a
+ * softirq handler to be a quiescent state, a process in RCU read-side
+ * critical section must be protected by disabling softirqs. Read-side
+ * critical sections in interrupt context can use just rcu_read_lock(),
+ * though this should at least be commented to avoid confusing people
+ * reading the code.
*/
static inline void rcu_read_lock_bh(void)
{
@@ -379,13 +617,12 @@ static inline void rcu_read_unlock_bh(void)
}
/**
- * rcu_read_lock_sched - mark the beginning of a RCU-classic critical section
+ * rcu_read_lock_sched() - mark the beginning of a RCU-sched critical section
*
- * Should be used with either
- * - synchronize_sched()
- * or
- * - call_rcu_sched() and rcu_barrier_sched()
- * on the write-side to insure proper synchronization.
+ * This is equivalent of rcu_read_lock(), but to be used when updates
+ * are being done using call_rcu_sched() or synchronize_rcu_sched().
+ * Read-side critical sections can also be introduced by anything that
+ * disables preemption, including local_irq_disable() and friends.
*/
static inline void rcu_read_lock_sched(void)
{
@@ -420,54 +657,14 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
preempt_enable_notrace();
}
-
/**
- * rcu_dereference_raw - fetch an RCU-protected pointer
+ * rcu_assign_pointer() - assign to RCU-protected pointer
+ * @p: pointer to assign to
+ * @v: value to assign (publish)
*
- * The caller must be within some flavor of RCU read-side critical
- * section, or must be otherwise preventing the pointer from changing,
- * for example, by holding an appropriate lock. This pointer may later
- * be safely dereferenced. It is the caller's responsibility to have
- * done the right thing, as this primitive does no checking of any kind.
- *
- * Inserts memory barriers on architectures that require them
- * (currently only the Alpha), and, more importantly, documents
- * exactly which pointers are protected by RCU.
- */
-#define rcu_dereference_raw(p) ({ \
- typeof(p) _________p1 = ACCESS_ONCE(p); \
- smp_read_barrier_depends(); \
- (_________p1); \
- })
-
-/**
- * rcu_dereference - fetch an RCU-protected pointer, checking for RCU
- *
- * Makes rcu_dereference_check() do the dirty work.
- */
-#define rcu_dereference(p) \
- rcu_dereference_check(p, rcu_read_lock_held())
-
-/**
- * rcu_dereference_bh - fetch an RCU-protected pointer, checking for RCU-bh
- *
- * Makes rcu_dereference_check() do the dirty work.
- */
-#define rcu_dereference_bh(p) \
- rcu_dereference_check(p, rcu_read_lock_bh_held() || irqs_disabled())
-
-/**
- * rcu_dereference_sched - fetch RCU-protected pointer, checking for RCU-sched
- *
- * Makes rcu_dereference_check() do the dirty work.
- */
-#define rcu_dereference_sched(p) \
- rcu_dereference_check(p, rcu_read_lock_sched_held())
-
-/**
- * rcu_assign_pointer - assign (publicize) a pointer to a newly
- * initialized structure that will be dereferenced by RCU read-side
- * critical sections. Returns the value assigned.
+ * Assigns the specified value to the specified RCU-protected
+ * pointer, ensuring that any concurrent RCU readers will see
+ * any prior initialization. Returns the value assigned.
*
* Inserts memory barriers on architectures that require them
* (pretty much all of them other than x86), and also prevents
@@ -476,14 +673,17 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
* call documents which pointers will be dereferenced by RCU read-side
* code.
*/
-
#define rcu_assign_pointer(p, v) \
- ({ \
- if (!__builtin_constant_p(v) || \
- ((v) != NULL)) \
- smp_wmb(); \
- (p) = (v); \
- })
+ __rcu_assign_pointer((p), (v), __rcu)
+
+/**
+ * RCU_INIT_POINTER() - initialize an RCU protected pointer
+ *
+ * Initialize an RCU-protected pointer in such a way to avoid RCU-lockdep
+ * splats.
+ */
+#define RCU_INIT_POINTER(p, v) \
+ p = (typeof(*v) __force __rcu *)(v)
/* Infrastructure to implement the synchronize_() primitives. */
@@ -494,26 +694,37 @@ struct rcu_synchronize {
extern void wakeme_after_rcu(struct rcu_head *head);
+#ifdef CONFIG_PREEMPT_RCU
+
/**
- * call_rcu - Queue an RCU callback for invocation after a grace period.
+ * call_rcu() - Queue an RCU callback for invocation after a grace period.
* @head: structure to be used for queueing the RCU updates.
- * @func: actual update function to be invoked after the grace period
+ * @func: actual callback function to be invoked after the grace period
*
- * The update function will be invoked some time after a full grace
- * period elapses, in other words after all currently executing RCU
- * read-side critical sections have completed. RCU read-side critical
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all pre-existing RCU read-side
+ * critical sections have completed. However, the callback function
+ * might well execute concurrently with RCU read-side critical sections
+ * that started after call_rcu() was invoked. RCU read-side critical
* sections are delimited by rcu_read_lock() and rcu_read_unlock(),
* and may be nested.
*/
extern void call_rcu(struct rcu_head *head,
void (*func)(struct rcu_head *head));
+#else /* #ifdef CONFIG_PREEMPT_RCU */
+
+/* In classic RCU, call_rcu() is just call_rcu_sched(). */
+#define call_rcu call_rcu_sched
+
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+
/**
- * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
+ * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period.
* @head: structure to be used for queueing the RCU updates.
- * @func: actual update function to be invoked after the grace period
+ * @func: actual callback function to be invoked after the grace period
*
- * The update function will be invoked some time after a full grace
+ * The callback function will be invoked some time after a full grace
* period elapses, in other words after all currently executing RCU
* read-side critical sections have completed. call_rcu_bh() assumes
* that the read-side critical sections end on completion of a softirq
@@ -566,37 +777,4 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
}
#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
-#ifndef CONFIG_PROVE_RCU
-#define __do_rcu_dereference_check(c) do { } while (0)
-#endif /* #ifdef CONFIG_PROVE_RCU */
-
-#define __rcu_dereference_index_check(p, c) \
- ({ \
- typeof(p) _________p1 = ACCESS_ONCE(p); \
- __do_rcu_dereference_check(c); \
- smp_read_barrier_depends(); \
- (_________p1); \
- })
-
-/**
- * rcu_dereference_index_check() - rcu_dereference for indices with debug checking
- * @p: The pointer to read, prior to dereferencing
- * @c: The conditions under which the dereference will take place
- *
- * Similar to rcu_dereference_check(), but omits the sparse checking.
- * This allows rcu_dereference_index_check() to be used on integers,
- * which can then be used as array indices. Attempting to use
- * rcu_dereference_check() on an integer will give compiler warnings
- * because the sparse address-space mechanism relies on dereferencing
- * the RCU-protected pointer. Dereferencing integers is not something
- * that even gcc will put up with.
- *
- * Note that this function does not implicitly check for RCU read-side
- * critical sections. If this function gains lots of uses, it might
- * make sense to provide versions for each flavor of RCU, but it does
- * not make sense as of early 2010.
- */
-#define rcu_dereference_index_check(p, c) \
- __rcu_dereference_index_check((p), (c))
-
#endif /* __LINUX_RCUPDATE_H */
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index e2e8931..30ebd7c 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -27,116 +27,117 @@
#include
-void rcu_sched_qs(int cpu);
-void rcu_bh_qs(int cpu);
-static inline void rcu_note_context_switch(int cpu)
+static inline void rcu_init(void)
{
- rcu_sched_qs(cpu);
}
-#define __rcu_read_lock() preempt_disable()
-#define __rcu_read_unlock() preempt_enable()
-#define __rcu_read_lock_bh() local_bh_disable()
-#define __rcu_read_unlock_bh() local_bh_enable()
-#define call_rcu_sched call_rcu
-
-#define rcu_init_sched() do { } while (0)
-extern void rcu_check_callbacks(int cpu, int user);
+#ifdef CONFIG_TINY_RCU
-static inline int rcu_needs_cpu(int cpu)
+static inline void synchronize_rcu_expedited(void)
{
- return 0;
+ synchronize_sched(); /* Only one CPU, so pretty fast anyway!!! */
}
-/*
- * Return the number of grace periods.
- */
-static inline long rcu_batches_completed(void)
+static inline void rcu_barrier(void)
{
- return 0;
+ rcu_barrier_sched(); /* Only one CPU, so only one list of callbacks! */
}
-/*
- * Return the number of bottom-half grace periods.
- */
-static inline long rcu_batches_completed_bh(void)
-{
- return 0;
-}
+#else /* #ifdef CONFIG_TINY_RCU */
-static inline void rcu_force_quiescent_state(void)
+void rcu_barrier(void);
+void synchronize_rcu_expedited(void);
+
+#endif /* #else #ifdef CONFIG_TINY_RCU */
+
+static inline void synchronize_rcu_bh(void)
{
+ synchronize_sched();
}
-static inline void rcu_bh_force_quiescent_state(void)
+static inline void synchronize_rcu_bh_expedited(void)
{
+ synchronize_sched();
}
-static inline void rcu_sched_force_quiescent_state(void)
+static inline void synchronize_sched_expedited(void)
{
+ synchronize_sched();
}
-extern void synchronize_sched(void);
+#ifdef CONFIG_TINY_RCU
-static inline void synchronize_rcu(void)
+static inline void rcu_preempt_note_context_switch(void)
{
- synchronize_sched();
}
-static inline void synchronize_rcu_bh(void)
+static inline void exit_rcu(void)
{
- synchronize_sched();
}
-static inline void synchronize_rcu_expedited(void)
+static inline int rcu_needs_cpu(int cpu)
{
- synchronize_sched();
+ return 0;
}
-static inline void synchronize_rcu_bh_expedited(void)
+#else /* #ifdef CONFIG_TINY_RCU */
+
+void rcu_preempt_note_context_switch(void);
+extern void exit_rcu(void);
+int rcu_preempt_needs_cpu(void);
+
+static inline int rcu_needs_cpu(int cpu)
{
- synchronize_sched();
+ return rcu_preempt_needs_cpu();
}
-struct notifier_block;
-
-#ifdef CONFIG_NO_HZ
+#endif /* #else #ifdef CONFIG_TINY_RCU */
-extern void rcu_enter_nohz(void);
-extern void rcu_exit_nohz(void);
+static inline void rcu_note_context_switch(int cpu)
+{
+ rcu_sched_qs(cpu);
+ rcu_preempt_note_context_switch();
+}
-#else /* #ifdef CONFIG_NO_HZ */
+/*
+ * Return the number of grace periods.
+ */
+static inline long rcu_batches_completed(void)
+{
+ return 0;
+}
-static inline void rcu_enter_nohz(void)
+/*
+ * Return the number of bottom-half grace periods.
+ */
+static inline long rcu_batches_completed_bh(void)
{
+ return 0;
}
-static inline void rcu_exit_nohz(void)
+static inline void rcu_force_quiescent_state(void)
{
}
-#endif /* #else #ifdef CONFIG_NO_HZ */
+static inline void rcu_bh_force_quiescent_state(void)
+{
+}
-static inline void exit_rcu(void)
+static inline void rcu_sched_force_quiescent_state(void)
{
}
-static inline int rcu_preempt_depth(void)
+static inline void rcu_cpu_stall_reset(void)
{
- return 0;
}
#ifdef CONFIG_DEBUG_LOCK_ALLOC
-
extern int rcu_scheduler_active __read_mostly;
extern void rcu_scheduler_starting(void);
-
#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-
static inline void rcu_scheduler_starting(void)
{
}
-
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
#endif /* __LINUX_RCUTINY_H */
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index c0ed1c0..3a93348 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -30,64 +30,25 @@
#ifndef __LINUX_RCUTREE_H
#define __LINUX_RCUTREE_H
-struct notifier_block;
-
-extern void rcu_sched_qs(int cpu);
-extern void rcu_bh_qs(int cpu);
+extern void rcu_init(void);
extern void rcu_note_context_switch(int cpu);
extern int rcu_needs_cpu(int cpu);
+extern void rcu_cpu_stall_reset(void);
#ifdef CONFIG_TREE_PREEMPT_RCU
-extern void __rcu_read_lock(void);
-extern void __rcu_read_unlock(void);
-extern void synchronize_rcu(void);
extern void exit_rcu(void);
-/*
- * Defined as macro as it is a very low level header
- * included from areas that don't even know about current
- */
-#define rcu_preempt_depth() (current->rcu_read_lock_nesting)
-
#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
-static inline void __rcu_read_lock(void)
-{
- preempt_disable();
-}
-
-static inline void __rcu_read_unlock(void)
-{
- preempt_enable();
-}
-
-#define synchronize_rcu synchronize_sched
-
static inline void exit_rcu(void)
{
}
-static inline int rcu_preempt_depth(void)
-{
- return 0;
-}
-
#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
-static inline void __rcu_read_lock_bh(void)
-{
- local_bh_disable();
-}
-static inline void __rcu_read_unlock_bh(void)
-{
- local_bh_enable();
-}
-
-extern void call_rcu_sched(struct rcu_head *head,
- void (*func)(struct rcu_head *rcu));
extern void synchronize_rcu_bh(void);
-extern void synchronize_sched(void);
+extern void synchronize_sched_expedited(void);
extern void synchronize_rcu_expedited(void);
static inline void synchronize_rcu_bh_expedited(void)
@@ -95,7 +56,7 @@ static inline void synchronize_rcu_bh_expedited(void)
synchronize_sched_expedited();
}
-extern void rcu_check_callbacks(int cpu, int user);
+extern void rcu_barrier(void);
extern long rcu_batches_completed(void);
extern long rcu_batches_completed_bh(void);
@@ -104,18 +65,6 @@ extern void rcu_force_quiescent_state(void);
extern void rcu_bh_force_quiescent_state(void);
extern void rcu_sched_force_quiescent_state(void);
-#ifdef CONFIG_NO_HZ
-void rcu_enter_nohz(void);
-void rcu_exit_nohz(void);
-#else /* CONFIG_NO_HZ */
-static inline void rcu_enter_nohz(void)
-{
-}
-static inline void rcu_exit_nohz(void)
-{
-}
-#endif /* CONFIG_NO_HZ */
-
/* A context switch is a grace period for RCU-sched and RCU-bh. */
static inline int rcu_blocking_is_gp(void)
{
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1e2a6db..ed1a9bc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1202,12 +1202,17 @@ struct task_struct {
unsigned int policy;
cpumask_t cpus_allowed;
-#ifdef CONFIG_TREE_PREEMPT_RCU
+#ifdef CONFIG_PREEMPT_RCU
int rcu_read_lock_nesting;
char rcu_read_unlock_special;
- struct rcu_node *rcu_blocked_node;
struct list_head rcu_node_entry;
+#endif /* #ifdef CONFIG_PREEMPT_RCU */
+#ifdef CONFIG_TREE_PREEMPT_RCU
+ struct rcu_node *rcu_blocked_node;
#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+#ifdef CONFIG_RCU_BOOST
+ struct rt_mutex *rcu_boost_mutex;
+#endif /* #ifdef CONFIG_RCU_BOOST */
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
struct sched_info sched_info;
@@ -1288,9 +1293,9 @@ struct task_struct {
struct list_head cpu_timers[3];
/* process credentials */
- const struct cred *real_cred; /* objective and real subjective task
+ const struct cred __rcu *real_cred; /* objective and real subjective task
* credentials (COW) */
- const struct cred *cred; /* effective (overridable) subjective task
+ const struct cred __rcu *cred; /* effective (overridable) subjective task
* credentials (COW) */
struct mutex cred_guard_mutex; /* guard against foreign influences on
* credential calculations
@@ -1418,7 +1423,7 @@ struct task_struct {
#endif
#ifdef CONFIG_CGROUPS
/* Control Group info protected by css_set_lock */
- struct css_set *cgroups;
+ struct css_set __rcu *cgroups;
/* cg_list protected by css_set_lock and tsk->alloc_lock */
struct list_head cg_list;
#endif
@@ -1740,16 +1745,22 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
#define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
#define used_math() tsk_used_math(current)
-#ifdef CONFIG_TREE_PREEMPT_RCU
+#ifdef CONFIG_PREEMPT_RCU
#define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
-#define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
+#define RCU_READ_UNLOCK_BOOSTED (1 << 1) /* boosted while in RCU read-side. */
+#define RCU_READ_UNLOCK_NEED_QS (1 << 2) /* RCU core needs CPU response. */
static inline void rcu_copy_process(struct task_struct *p)
{
p->rcu_read_lock_nesting = 0;
p->rcu_read_unlock_special = 0;
+#ifdef CONFIG_TREE_PREEMPT_RCU
p->rcu_blocked_node = NULL;
+#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+#ifdef CONFIG_RCU_BOOST
+ p->rcu_boost_mutex = NULL;
+#endif /* #ifdef CONFIG_RCU_BOOST */
INIT_LIST_HEAD(&p->rcu_node_entry);
}
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 4d5d2f5..58971e8 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -108,19 +108,43 @@ static inline int srcu_read_lock_held(struct srcu_struct *sp)
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
/**
- * srcu_dereference - fetch SRCU-protected pointer with checking
+ * srcu_dereference_check - fetch SRCU-protected pointer for later dereferencing
+ * @p: the pointer to fetch and protect for later dereferencing
+ * @sp: pointer to the srcu_struct, which is used to check that we
+ * really are in an SRCU read-side critical section.
+ * @c: condition to check for update-side use
*
- * Makes rcu_dereference_check() do the dirty work.
+ * If PROVE_RCU is enabled, invoking this outside of an RCU read-side
+ * critical section will result in an RCU-lockdep splat, unless @c evaluates
+ * to 1. The @c argument will normally be a logical expression containing
+ * lockdep_is_held() calls.
*/
-#define srcu_dereference(p, sp) \
- rcu_dereference_check(p, srcu_read_lock_held(sp))
+#define srcu_dereference_check(p, sp, c) \
+ __rcu_dereference_check((p), srcu_read_lock_held(sp) || (c), __rcu)
+
+/**
+ * srcu_dereference - fetch SRCU-protected pointer for later dereferencing
+ * @p: the pointer to fetch and protect for later dereferencing
+ * @sp: pointer to the srcu_struct, which is used to check that we
+ * really are in an SRCU read-side critical section.
+ *
+ * Makes rcu_dereference_check() do the dirty work. If PROVE_RCU
+ * is enabled, invoking this outside of an RCU read-side critical
+ * section will result in an RCU-lockdep splat.
+ */
+#define srcu_dereference(p, sp) srcu_dereference_check((p), (sp), 0)
/**
* srcu_read_lock - register a new reader for an SRCU-protected structure.
* @sp: srcu_struct in which to register the new reader.
*
* Enter an SRCU read-side critical section. Note that SRCU read-side
- * critical sections may be nested.
+ * critical sections may be nested. However, it is illegal to
+ * call anything that waits on an SRCU grace period for the same
+ * srcu_struct, whether directly or indirectly. Please note that
+ * one way to indirectly wait on an SRCU grace period is to acquire
+ * a mutex that is held elsewhere while calling synchronize_srcu() or
+ * synchronize_srcu_expedited().
*/
static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp)
{
diff --git a/include/linux/sunrpc/auth_gss.h b/include/linux/sunrpc/auth_gss.h
index 671538d..8eee9db 100644
--- a/include/linux/sunrpc/auth_gss.h
+++ b/include/linux/sunrpc/auth_gss.h
@@ -69,7 +69,7 @@ struct gss_cl_ctx {
enum rpc_gss_proc gc_proc;
u32 gc_seq;
spinlock_t gc_seq_lock;
- struct gss_ctx *gc_gss_ctx;
+ struct gss_ctx __rcu *gc_gss_ctx;
struct xdr_netobj gc_wire_ctx;
u32 gc_win;
unsigned long gc_expiry;
@@ -80,7 +80,7 @@ struct gss_upcall_msg;
struct gss_cred {
struct rpc_cred gc_base;
enum rpc_gss_svc gc_service;
- struct gss_cl_ctx *gc_ctx;
+ struct gss_cl_ctx __rcu *gc_ctx;
struct gss_upcall_msg *gc_upcall;
unsigned long gc_upcall_timestamp;
unsigned char gc_machine_cred : 1;
diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h
index ef6c24a..a4dc5b0 100644
--- a/include/net/cls_cgroup.h
+++ b/include/net/cls_cgroup.h
@@ -51,7 +51,8 @@ static inline u32 task_cls_classid(struct task_struct *p)
return 0;
rcu_read_lock();
- id = rcu_dereference(net_cls_subsys_id);
+ id = rcu_dereference_index_check(net_cls_subsys_id,
+ rcu_read_lock_held());
if (id >= 0)
classid = container_of(task_subsys_state(p, id),
struct cgroup_cls_state, css)->classid;
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index e624dae..caf17db 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -75,7 +75,7 @@ struct nf_conntrack_helper;
/* nf_conn feature for connections that have a helper */
struct nf_conn_help {
/* Helper. if any */
- struct nf_conntrack_helper *helper;
+ struct nf_conntrack_helper __rcu *helper;
union nf_conntrack_help help;
diff --git a/init/Kconfig b/init/Kconfig
index 2de5b1c..046f963 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -340,6 +340,7 @@ choice
config TREE_RCU
bool "Tree-based hierarchical RCU"
+ depends on !PREEMPT && SMP
help
This option selects the RCU implementation that is
designed for very large SMP system with hundreds or
@@ -347,7 +348,7 @@ config TREE_RCU
smaller systems.
config TREE_PREEMPT_RCU
- bool "Preemptable tree-based hierarchical RCU"
+ bool "Preemptible tree-based hierarchical RCU"
depends on PREEMPT
help
This option selects the RCU implementation that is
@@ -365,11 +366,24 @@ config TINY_RCU
is not required. This option greatly reduces the
memory footprint of RCU.
+config TINY_PREEMPT_RCU
+ bool "Preemptible UP-only small-memory-footprint RCU"
+ depends on !SMP && PREEMPT
+ help
+ This option selects the RCU implementation that is designed
+ for real-time UP systems. This option greatly reduces the
+ memory footprint of RCU.
+
endchoice
+config PREEMPT_RCU
+ def_bool ( TREE_PREEMPT_RCU || TINY_PREEMPT_RCU )
+ help
+ This option enables preemptible-RCU code that is common between
+ the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations.
+
config RCU_TRACE
bool "Enable tracing for RCU"
- depends on TREE_RCU || TREE_PREEMPT_RCU
help
This option provides tracing in RCU which presents stats
in debugfs for debugging RCU implementation.
@@ -387,9 +401,12 @@ config RCU_FANOUT
help
This option controls the fanout of hierarchical implementations
of RCU, allowing RCU to work efficiently on machines with
- large numbers of CPUs. This value must be at least the cube
- root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit
- systems and up to 262,144 for 64-bit systems.
+ large numbers of CPUs. This value must be at least the fourth
+ root of NR_CPUS, which allows NR_CPUS to be insanely large.
+ The default value of RCU_FANOUT should be used for production
+ systems, but if you are stress-testing the RCU implementation
+ itself, small RCU_FANOUT values allow you to test large-system
+ code paths on small(er) systems.
Select a specific number if testing RCU itself.
Take the default if unsure.
@@ -432,6 +449,45 @@ config TREE_RCU_TRACE
TREE_PREEMPT_RCU implementations, permitting Makefile to
trivially select kernel/rcutree_trace.c.
+config RCU_BOOST
+ bool "Enable RCU priority boosting"
+ depends on TINY_PREEMPT_RCU
+ default n
+ help
+ This option boosts the priority of preempted RCU readers that
+ block the current preemptible RCU grace period for too long.
+ This option also prevents heavy loads from blocking RCU
+ callback invocation for all flavors of RCU.
+
+ Say Y here if you are working with real-time apps or heavy loads
+ Say N here if you are unsure.
+
+config RCU_BOOST_PRIO
+ int "Real-time priority to boost RCU readers to"
+ range 1 99
+ depends on RCU_BOOST
+ default 1
+ help
+ This option specifies the real-time priority to which preempted
+ RCU readers are to be boosted. If you are working with CPU-bound
+ real-time applications, you should specify a priority higher then
+ the highest-priority CPU-bound application.
+
+ Specify the real-time priority, or take the default if unsure.
+
+config RCU_BOOST_DELAY
+ int "Milliseconds to delay boosting after RCU grace-period start"
+ range 0 3000
+ depends on RCU_BOOST
+ default 500
+ help
+ This option specifies the time to wait after the beginning of
+ a given grace period before priority-boosting preempted RCU
+ readers blocking that grace period. Note that any RCU reader
+ blocking an expedited RCU grace period is boosted immediately.
+
+ Accept the default if unsure.
+
endmenu # "RCU Subsystem"
config IKCONFIG
diff --git a/kernel/Makefile b/kernel/Makefile
index 0b72d1a..17046b6 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -86,6 +86,7 @@ obj-$(CONFIG_TREE_RCU) += rcutree.o
obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
obj-$(CONFIG_TINY_RCU) += rcutiny.o
+obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o
obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c9483d8..291ba3d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -138,7 +138,7 @@ struct css_id {
* is called after synchronize_rcu(). But for safe use, css_is_removed()
* css_tryget() should be used for avoiding race.
*/
- struct cgroup_subsys_state *css;
+ struct cgroup_subsys_state __rcu *css;
/*
* ID of this css.
*/
diff --git a/kernel/pid.c b/kernel/pid.c
index d55c6fb..39b65b6 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -401,7 +401,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
struct task_struct *result = NULL;
if (pid) {
struct hlist_node *first;
- first = rcu_dereference_check(pid->tasks[type].first,
+ first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
rcu_read_lock_held() ||
lockdep_tasklist_lock_is_held());
if (first)
@@ -416,6 +416,7 @@ EXPORT_SYMBOL(pid_task);
*/
struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
{
+ rcu_lockdep_assert(rcu_read_lock_held());
return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
}
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 4d16983..a23a57a 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -73,12 +73,14 @@ int debug_lockdep_rcu_enabled(void)
EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
/**
- * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section?
+ * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
*
* Check for bottom half being disabled, which covers both the
* CONFIG_PROVE_RCU and not cases. Note that if someone uses
* rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled)
- * will show the situation.
+ * will show the situation. This is useful for debug checks in functions
+ * that require that they be called within an RCU read-side critical
+ * section.
*
* Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
*/
@@ -86,7 +88,7 @@ int rcu_read_lock_bh_held(void)
{
if (!debug_lockdep_rcu_enabled())
return 1;
- return in_softirq();
+ return in_softirq() || irqs_disabled();
}
EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 196ec02..0344937 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -36,28 +36,21 @@
#include
#include
-/* Global control variables for rcupdate callback mechanism. */
-struct rcu_ctrlblk {
- struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
- struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
- struct rcu_head **curtail; /* ->next pointer of last CB. */
-};
-
-/* Definition for rcupdate control block. */
-static struct rcu_ctrlblk rcu_sched_ctrlblk = {
- .donetail = &rcu_sched_ctrlblk.rcucblist,
- .curtail = &rcu_sched_ctrlblk.rcucblist,
-};
-
-static struct rcu_ctrlblk rcu_bh_ctrlblk = {
- .donetail = &rcu_bh_ctrlblk.rcucblist,
- .curtail = &rcu_bh_ctrlblk.rcucblist,
-};
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-int rcu_scheduler_active __read_mostly;
-EXPORT_SYMBOL_GPL(rcu_scheduler_active);
-#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
+static struct task_struct *rcu_kthread_task;
+static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
+static unsigned long have_rcu_kthread_work;
+static void invoke_rcu_kthread(void);
+
+/* Forward declarations for rcutiny_plugin.h. */
+struct rcu_ctrlblk;
+static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
+static int rcu_kthread(void *arg);
+static void __call_rcu(struct rcu_head *head,
+ void (*func)(struct rcu_head *rcu),
+ struct rcu_ctrlblk *rcp);
+
+#include "rcutiny_plugin.h"
#ifdef CONFIG_NO_HZ
@@ -115,7 +108,7 @@ void rcu_sched_qs(int cpu)
{
if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
rcu_qsctr_help(&rcu_bh_ctrlblk))
- raise_softirq(RCU_SOFTIRQ);
+ invoke_rcu_kthread();
}
/*
@@ -124,7 +117,7 @@ void rcu_sched_qs(int cpu)
void rcu_bh_qs(int cpu)
{
if (rcu_qsctr_help(&rcu_bh_ctrlblk))
- raise_softirq(RCU_SOFTIRQ);
+ invoke_rcu_kthread();
}
/*
@@ -140,16 +133,18 @@ void rcu_check_callbacks(int cpu, int user)
rcu_sched_qs(cpu);
else if (!in_softirq())
rcu_bh_qs(cpu);
+ rcu_preempt_check_callbacks();
}
/*
- * Helper function for rcu_process_callbacks() that operates on the
- * specified rcu_ctrlkblk structure.
+ * Invoke the RCU callbacks on the specified rcu_ctrlkblk structure
+ * whose grace period has elapsed.
*/
-static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
+static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
{
struct rcu_head *next, *list;
unsigned long flags;
+ RCU_TRACE(int cb_count = 0);
/* If no RCU callbacks ready to invoke, just return. */
if (&rcp->rcucblist == rcp->donetail)
@@ -162,6 +157,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
*rcp->donetail = NULL;
if (rcp->curtail == rcp->donetail)
rcp->curtail = &rcp->rcucblist;
+ rcu_preempt_remove_callbacks(rcp);
rcp->donetail = &rcp->rcucblist;
local_irq_restore(flags);
@@ -170,18 +166,58 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
next = list->next;
prefetch(next);
debug_rcu_head_unqueue(list);
+ local_bh_disable();
list->func(list);
+ local_bh_enable();
list = next;
+ RCU_TRACE(cb_count++);
+ }
+ RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
+}
+
+/*
+ * This kthread invokes RCU callbacks whose grace periods have
+ * elapsed. It is awakened as needed, and takes the place of the
+ * RCU_SOFTIRQ that was used previously for this purpose.
+ * This is a kthread, but it is never stopped, at least not until
+ * the system goes down.
+ */
+static int rcu_kthread(void *arg)
+{
+ unsigned long work;
+ unsigned long morework;
+ unsigned long flags;
+
+ for (;;) {
+ wait_event(rcu_kthread_wq, have_rcu_kthread_work != 0);
+ morework = rcu_boost();
+ local_irq_save(flags);
+ work = have_rcu_kthread_work;
+ have_rcu_kthread_work = morework;
+ local_irq_restore(flags);
+ if (work) {
+ rcu_process_callbacks(&rcu_sched_ctrlblk);
+ rcu_process_callbacks(&rcu_bh_ctrlblk);
+ rcu_preempt_process_callbacks();
+ }
+ schedule_timeout_interruptible(1); /* Leave CPU for others. */
}
+
+ return 0; /* Not reached, but needed to shut gcc up. */
}
/*
- * Invoke any callbacks whose grace period has completed.
+ * Wake up rcu_kthread() to process callbacks now eligible for invocation
+ * or to boost readers.
*/
-static void rcu_process_callbacks(struct softirq_action *unused)
+static void invoke_rcu_kthread(void)
{
- __rcu_process_callbacks(&rcu_sched_ctrlblk);
- __rcu_process_callbacks(&rcu_bh_ctrlblk);
+ unsigned long flags;
+
+ local_irq_save(flags);
+ have_rcu_kthread_work = 1;
+ wake_up(&rcu_kthread_wq);
+ local_irq_restore(flags);
}
/*
@@ -219,19 +255,20 @@ static void __call_rcu(struct rcu_head *head,
local_irq_save(flags);
*rcp->curtail = head;
rcp->curtail = &head->next;
+ RCU_TRACE(rcp->qlen++);
local_irq_restore(flags);
}
/*
- * Post an RCU callback to be invoked after the end of an RCU grace
+ * Post an RCU callback to be invoked after the end of an RCU-sched grace
* period. But since we have but one CPU, that would be after any
* quiescent state.
*/
-void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
{
__call_rcu(head, func, &rcu_sched_ctrlblk);
}
-EXPORT_SYMBOL_GPL(call_rcu);
+EXPORT_SYMBOL_GPL(call_rcu_sched);
/*
* Post an RCU bottom-half callback to be invoked after any subsequent
@@ -243,20 +280,6 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
}
EXPORT_SYMBOL_GPL(call_rcu_bh);
-void rcu_barrier(void)
-{
- struct rcu_synchronize rcu;
-
- init_rcu_head_on_stack(&rcu.head);
- init_completion(&rcu.completion);
- /* Will wake me after RCU finished. */
- call_rcu(&rcu.head, wakeme_after_rcu);
- /* Wait for it. */
- wait_for_completion(&rcu.completion);
- destroy_rcu_head_on_stack(&rcu.head);
-}
-EXPORT_SYMBOL_GPL(rcu_barrier);
-
void rcu_barrier_bh(void)
{
struct rcu_synchronize rcu;
@@ -285,9 +308,16 @@ void rcu_barrier_sched(void)
}
EXPORT_SYMBOL_GPL(rcu_barrier_sched);
-void __init rcu_init(void)
+/*
+ * Spawn the kthread that invokes RCU callbacks.
+ */
+static int __init rcu_spawn_kthreads(void)
{
- open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
-}
+ struct sched_param sp;
-#include "rcutiny_plugin.h"
+ rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
+ sp.sched_priority = RCU_BOOST_PRIO;
+ sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
+ return 0;
+}
+early_initcall(rcu_spawn_kthreads);
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index d223a92..8cd197c 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -1,7 +1,7 @@
/*
- * Read-Copy Update mechanism for mutual exclusion (tree-based version)
+ * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition
* Internal non-public definitions that provide either classic
- * or preemptable semantics.
+ * or preemptible semantics.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -17,23 +17,1004 @@
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
- * Copyright IBM Corporation, 2009
+ * Copyright (c) 2010 Linaro
*
* Author: Paul E. McKenney
*/
+#include
+#include
+#include
+
+#ifdef CONFIG_RCU_TRACE
+#define RCU_TRACE(stmt) stmt
+#else /* #ifdef CONFIG_RCU_TRACE */
+#define RCU_TRACE(stmt)
+#endif /* #else #ifdef CONFIG_RCU_TRACE */
+
+/* Global control variables for rcupdate callback mechanism. */
+struct rcu_ctrlblk {
+ struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
+ struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
+ struct rcu_head **curtail; /* ->next pointer of last CB. */
+ RCU_TRACE(long qlen); /* Number of pending CBs. */
+};
+
+/* Definition for rcupdate control block. */
+static struct rcu_ctrlblk rcu_sched_ctrlblk = {
+ .donetail = &rcu_sched_ctrlblk.rcucblist,
+ .curtail = &rcu_sched_ctrlblk.rcucblist,
+};
+
+static struct rcu_ctrlblk rcu_bh_ctrlblk = {
+ .donetail = &rcu_bh_ctrlblk.rcucblist,
+ .curtail = &rcu_bh_ctrlblk.rcucblist,
+};
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+int rcu_scheduler_active __read_mostly;
+EXPORT_SYMBOL_GPL(rcu_scheduler_active);
+#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+#ifdef CONFIG_TINY_PREEMPT_RCU
+
+#include
+
+/* Global control variables for preemptible RCU. */
+struct rcu_preempt_ctrlblk {
+ struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. */
+ struct rcu_head **nexttail;
+ /* Tasks blocked in a preemptible RCU */
+ /* read-side critical section while an */
+ /* preemptible-RCU grace period is in */
+ /* progress must wait for a later grace */
+ /* period. This pointer points to the */
+ /* ->next pointer of the last task that */
+ /* must wait for a later grace period, or */
+ /* to &->rcb.rcucblist if there is no */
+ /* such task. */
+ struct list_head blkd_tasks;
+ /* Tasks blocked in RCU read-side critical */
+ /* section. Tasks are placed at the head */
+ /* of this list and age towards the tail. */
+ struct list_head *gp_tasks;
+ /* Pointer to the first task blocking the */
+ /* current grace period, or NULL if there */
+ /* is not such task. */
+ struct list_head *exp_tasks;
+ /* Pointer to first task blocking the */
+ /* current expedited grace period, or NULL */
+ /* if there is no such task. If there */
+ /* is no current expedited grace period, */
+ /* then there cannot be any such task. */
+#ifdef CONFIG_RCU_BOOST
+ struct list_head *boost_tasks;
+ /* Pointer to first task that needs to be */
+ /* priority-boosted, or NULL if no priority */
+ /* boosting is needed. If there is no */
+ /* current or expedited grace period, there */
+ /* can be no such task. */
+#endif /* #ifdef CONFIG_RCU_BOOST */
+ u8 gpnum; /* Current grace period. */
+ u8 gpcpu; /* Last grace period blocked by the CPU. */
+ u8 completed; /* Last grace period completed. */
+ /* If all three are equal, RCU is idle. */
+#ifdef CONFIG_RCU_BOOST
+ s8 boosted_this_gp; /* Has boosting already happened? */
+ unsigned long boost_time; /* When to start boosting (jiffies) */
+#endif /* #ifdef CONFIG_RCU_BOOST */
+#ifdef CONFIG_RCU_TRACE
+ unsigned long n_grace_periods;
+#ifdef CONFIG_RCU_BOOST
+ unsigned long n_tasks_boosted;
+ unsigned long n_exp_boosts;
+ unsigned long n_normal_boosts;
+ unsigned long n_normal_balk_blkd_tasks;
+ unsigned long n_normal_balk_gp_tasks;
+ unsigned long n_normal_balk_boost_tasks;
+ unsigned long n_normal_balk_boosted;
+ unsigned long n_normal_balk_notyet;
+ unsigned long n_normal_balk_nos;
+ unsigned long n_exp_balk_blkd_tasks;
+ unsigned long n_exp_balk_nos;
+#endif /* #ifdef CONFIG_RCU_BOOST */
+#endif /* #ifdef CONFIG_RCU_TRACE */
+};
+
+static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
+ .rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist,
+ .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist,
+ .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist,
+ .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks),
+};
+
+static int rcu_preempted_readers_exp(void);
+static void rcu_report_exp_done(void);
+
+/*
+ * Return true if the CPU has not yet responded to the current grace period.
+ */
+static int rcu_cpu_blocking_cur_gp(void)
+{
+ return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum;
+}
+
+/*
+ * Check for a running RCU reader. Because there is only one CPU,
+ * there can be but one running RCU reader at a time. ;-)
+ */
+static int rcu_preempt_running_reader(void)
+{
+ return current->rcu_read_lock_nesting;
+}
+
+/*
+ * Check for preempted RCU readers blocking any grace period.
+ * If the caller needs a reliable answer, it must disable hard irqs.
+ */
+static int rcu_preempt_blocked_readers_any(void)
+{
+ return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks);
+}
+
+/*
+ * Check for preempted RCU readers blocking the current grace period.
+ * If the caller needs a reliable answer, it must disable hard irqs.
+ */
+static int rcu_preempt_blocked_readers_cgp(void)
+{
+ return rcu_preempt_ctrlblk.gp_tasks != NULL;
+}
+
+/*
+ * Return true if another preemptible-RCU grace period is needed.
+ */
+static int rcu_preempt_needs_another_gp(void)
+{
+ return *rcu_preempt_ctrlblk.rcb.curtail != NULL;
+}
+
+/*
+ * Return true if a preemptible-RCU grace period is in progress.
+ * The caller must disable hardirqs.
+ */
+static int rcu_preempt_gp_in_progress(void)
+{
+ return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum;
+}
+
+/*
+ * Advance a ->blkd_tasks-list pointer to the next entry, instead
+ * returning NULL if at the end of the list.
+ */
+static struct list_head *rcu_next_node_entry(struct task_struct *t)
+{
+ struct list_head *np;
+
+ np = t->rcu_node_entry.next;
+ if (np == &rcu_preempt_ctrlblk.blkd_tasks)
+ np = NULL;
+ return np;
+}
+
+#ifdef CONFIG_RCU_TRACE
+
+#ifdef CONFIG_RCU_BOOST
+static void rcu_initiate_boost_trace(void);
+static void rcu_initiate_exp_boost_trace(void);
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
+/*
+ * Dump additional statistice for TINY_PREEMPT_RCU.
+ */
+static void show_tiny_preempt_stats(struct seq_file *m)
+{
+ seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n",
+ rcu_preempt_ctrlblk.rcb.qlen,
+ rcu_preempt_ctrlblk.n_grace_periods,
+ rcu_preempt_ctrlblk.gpnum,
+ rcu_preempt_ctrlblk.gpcpu,
+ rcu_preempt_ctrlblk.completed,
+ "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)],
+ "N."[!rcu_preempt_ctrlblk.gp_tasks],
+ "E."[!rcu_preempt_ctrlblk.exp_tasks]);
+#ifdef CONFIG_RCU_BOOST
+ seq_printf(m, " ttb=%c btg=",
+ "B."[!rcu_preempt_ctrlblk.boost_tasks]);
+ switch (rcu_preempt_ctrlblk.boosted_this_gp) {
+ case -1:
+ seq_puts(m, "exp");
+ break;
+ case 0:
+ seq_puts(m, "no");
+ break;
+ case 1:
+ seq_puts(m, "begun");
+ break;
+ case 2:
+ seq_puts(m, "done");
+ break;
+ default:
+ seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp);
+ }
+ seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
+ rcu_preempt_ctrlblk.n_tasks_boosted,
+ rcu_preempt_ctrlblk.n_exp_boosts,
+ rcu_preempt_ctrlblk.n_normal_boosts,
+ (int)(jiffies & 0xffff),
+ (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
+ seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n",
+ "normal balk",
+ rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks,
+ rcu_preempt_ctrlblk.n_normal_balk_gp_tasks,
+ rcu_preempt_ctrlblk.n_normal_balk_boost_tasks,
+ rcu_preempt_ctrlblk.n_normal_balk_boosted,
+ rcu_preempt_ctrlblk.n_normal_balk_notyet,
+ rcu_preempt_ctrlblk.n_normal_balk_nos);
+ seq_printf(m, " exp balk: bt=%lu nos=%lu\n",
+ rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks,
+ rcu_preempt_ctrlblk.n_exp_balk_nos);
+#endif /* #ifdef CONFIG_RCU_BOOST */
+}
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
+
+#ifdef CONFIG_RCU_BOOST
+
+#include "rtmutex_common.h"
+
+/*
+ * Carry out RCU priority boosting on the task indicated by ->boost_tasks,
+ * and advance ->boost_tasks to the next task in the ->blkd_tasks list.
+ */
+static int rcu_boost(void)
+{
+ unsigned long flags;
+ struct rt_mutex mtx;
+ struct list_head *np;
+ struct task_struct *t;
+
+ if (rcu_preempt_ctrlblk.boost_tasks == NULL)
+ return 0; /* Nothing to boost. */
+ raw_local_irq_save(flags);
+ rcu_preempt_ctrlblk.boosted_this_gp++;
+ t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct,
+ rcu_node_entry);
+ np = rcu_next_node_entry(t);
+ rt_mutex_init_proxy_locked(&mtx, t);
+ t->rcu_boost_mutex = &mtx;
+ t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
+ raw_local_irq_restore(flags);
+ rt_mutex_lock(&mtx);
+ RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
+ rcu_preempt_ctrlblk.boosted_this_gp++;
+ rt_mutex_unlock(&mtx);
+ return rcu_preempt_ctrlblk.boost_tasks != NULL;
+}
+
+/*
+ * Check to see if it is now time to start boosting RCU readers blocking
+ * the current grace period, and, if so, tell the rcu_kthread_task to
+ * start boosting them. If there is an expedited boost in progress,
+ * we wait for it to complete.
+ *
+ * If there are no blocked readers blocking the current grace period,
+ * return 0 to let the caller know, otherwise return 1. Note that this
+ * return value is independent of whether or not boosting was done.
+ */
+static int rcu_initiate_boost(void)
+{
+ if (!rcu_preempt_blocked_readers_cgp()) {
+ RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++);
+ return 0;
+ }
+ if (rcu_preempt_ctrlblk.gp_tasks != NULL &&
+ rcu_preempt_ctrlblk.boost_tasks == NULL &&
+ rcu_preempt_ctrlblk.boosted_this_gp == 0 &&
+ ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) {
+ rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks;
+ invoke_rcu_kthread();
+ RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
+ } else
+ RCU_TRACE(rcu_initiate_boost_trace());
+ return 1;
+}
+
+/*
+ * Initiate boosting for an expedited grace period.
+ */
+static void rcu_initiate_expedited_boost(void)
+{
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) {
+ rcu_preempt_ctrlblk.boost_tasks =
+ rcu_preempt_ctrlblk.blkd_tasks.next;
+ rcu_preempt_ctrlblk.boosted_this_gp = -1;
+ invoke_rcu_kthread();
+ RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
+ } else
+ RCU_TRACE(rcu_initiate_exp_boost_trace());
+ raw_local_irq_restore(flags);
+}
+
+#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000);
+
+/*
+ * Do priority-boost accounting for the start of a new grace period.
+ */
+static void rcu_preempt_boost_start_gp(void)
+{
+ rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
+ if (rcu_preempt_ctrlblk.boosted_this_gp > 0)
+ rcu_preempt_ctrlblk.boosted_this_gp = 0;
+}
+
+#else /* #ifdef CONFIG_RCU_BOOST */
+
+/*
+ * If there is no RCU priority boosting, we don't boost.
+ */
+static int rcu_boost(void)
+{
+ return 0;
+}
+
+/*
+ * If there is no RCU priority boosting, we don't initiate boosting,
+ * but we do indicate whether there are blocked readers blocking the
+ * current grace period.
+ */
+static int rcu_initiate_boost(void)
+{
+ return rcu_preempt_blocked_readers_cgp();
+}
+
+/*
+ * If there is no RCU priority boosting, we don't initiate expedited boosting.
+ */
+static void rcu_initiate_expedited_boost(void)
+{
+}
+
+/*
+ * If there is no RCU priority boosting, nothing to do at grace-period start.
+ */
+static void rcu_preempt_boost_start_gp(void)
+{
+}
+
+#endif /* else #ifdef CONFIG_RCU_BOOST */
+
+/*
+ * Record a preemptible-RCU quiescent state for the specified CPU. Note
+ * that this just means that the task currently running on the CPU is
+ * in a quiescent state. There might be any number of tasks blocked
+ * while in an RCU read-side critical section.
+ *
+ * Unlike the other rcu_*_qs() functions, callers to this function
+ * must disable irqs in order to protect the assignment to
+ * ->rcu_read_unlock_special.
+ *
+ * Because this is a single-CPU implementation, the only way a grace
+ * period can end is if the CPU is in a quiescent state. The reason is
+ * that a blocked preemptible-RCU reader can exit its critical section
+ * only if the CPU is running it at the time. Therefore, when the
+ * last task blocking the current grace period exits its RCU read-side
+ * critical section, neither the CPU nor blocked tasks will be stopping
+ * the current grace period. (In contrast, SMP implementations
+ * might have CPUs running in RCU read-side critical sections that
+ * block later grace periods -- but this is not possible given only
+ * one CPU.)
+ */
+static void rcu_preempt_cpu_qs(void)
+{
+ /* Record both CPU and task as having responded to current GP. */
+ rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
+ current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
+
+ /* If there is no GP then there is nothing more to do. */
+ if (!rcu_preempt_gp_in_progress())
+ return;
+ /*
+ * Check up on boosting. If there are no readers blocking the
+ * current grace period, leave.
+ */
+ if (rcu_initiate_boost())
+ return;
+
+ /* Advance callbacks. */
+ rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum;
+ rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail;
+ rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail;
+
+ /* If there are no blocked readers, next GP is done instantly. */
+ if (!rcu_preempt_blocked_readers_any())
+ rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
+
+ /* If there are done callbacks, cause them to be invoked. */
+ if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
+ invoke_rcu_kthread();
+}
+
+/*
+ * Start a new RCU grace period if warranted. Hard irqs must be disabled.
+ */
+static void rcu_preempt_start_gp(void)
+{
+ if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) {
+
+ /* Official start of GP. */
+ rcu_preempt_ctrlblk.gpnum++;
+ RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
+
+ /* Any blocked RCU readers block new GP. */
+ if (rcu_preempt_blocked_readers_any())
+ rcu_preempt_ctrlblk.gp_tasks =
+ rcu_preempt_ctrlblk.blkd_tasks.next;
+
+ /* Set up for RCU priority boosting. */
+ rcu_preempt_boost_start_gp();
+
+ /* If there is no running reader, CPU is done with GP. */
+ if (!rcu_preempt_running_reader())
+ rcu_preempt_cpu_qs();
+ }
+}
+
+/*
+ * We have entered the scheduler, and the current task might soon be
+ * context-switched away from. If this task is in an RCU read-side
+ * critical section, we will no longer be able to rely on the CPU to
+ * record that fact, so we enqueue the task on the blkd_tasks list.
+ * If the task started after the current grace period began, as recorded
+ * by ->gpcpu, we enqueue at the beginning of the list. Otherwise
+ * before the element referenced by ->gp_tasks (or at the tail if
+ * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element.
+ * The task will dequeue itself when it exits the outermost enclosing
+ * RCU read-side critical section. Therefore, the current grace period
+ * cannot be permitted to complete until the ->gp_tasks pointer becomes
+ * NULL.
+ *
+ * Caller must disable preemption.
+ */
+void rcu_preempt_note_context_switch(void)
+{
+ struct task_struct *t = current;
+ unsigned long flags;
+
+ local_irq_save(flags); /* must exclude scheduler_tick(). */
+ if (rcu_preempt_running_reader() &&
+ (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
+
+ /* Possibly blocking in an RCU read-side critical section. */
+ t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
+
+ /*
+ * If this CPU has already checked in, then this task
+ * will hold up the next grace period rather than the
+ * current grace period. Queue the task accordingly.
+ * If the task is queued for the current grace period
+ * (i.e., this CPU has not yet passed through a quiescent
+ * state for the current grace period), then as long
+ * as that task remains queued, the current grace period
+ * cannot end.
+ */
+ list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks);
+ if (rcu_cpu_blocking_cur_gp())
+ rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry;
+ }
+
+ /*
+ * Either we were not in an RCU read-side critical section to
+ * begin with, or we have now recorded that critical section
+ * globally. Either way, we can now note a quiescent state
+ * for this CPU. Again, if we were in an RCU read-side critical
+ * section, and if that critical section was blocking the current
+ * grace period, then the fact that the task has been enqueued
+ * means that current grace period continues to be blocked.
+ */
+ rcu_preempt_cpu_qs();
+ local_irq_restore(flags);
+}
+
+/*
+ * Tiny-preemptible RCU implementation for rcu_read_lock().
+ * Just increment ->rcu_read_lock_nesting, shared state will be updated
+ * if we block.
+ */
+void __rcu_read_lock(void)
+{
+ current->rcu_read_lock_nesting++;
+ barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */
+}
+EXPORT_SYMBOL_GPL(__rcu_read_lock);
+
+/*
+ * Handle special cases during rcu_read_unlock(), such as needing to
+ * notify RCU core processing or task having blocked during the RCU
+ * read-side critical section.
+ */
+static void rcu_read_unlock_special(struct task_struct *t)
+{
+ int empty;
+ int empty_exp;
+ unsigned long flags;
+ struct list_head *np;
+ int special;
+
+ /*
+ * NMI handlers cannot block and cannot safely manipulate state.
+ * They therefore cannot possibly be special, so just leave.
+ */
+ if (in_nmi())
+ return;
+
+ local_irq_save(flags);
+
+ /*
+ * If RCU core is waiting for this CPU to exit critical section,
+ * let it know that we have done so.
+ */
+ special = t->rcu_read_unlock_special;
+ if (special & RCU_READ_UNLOCK_NEED_QS)
+ rcu_preempt_cpu_qs();
+
+ /* Hardware IRQ handlers cannot block. */
+ if (in_irq()) {
+ local_irq_restore(flags);
+ return;
+ }
+
+ /* Clean up if blocked during RCU read-side critical section. */
+ if (special & RCU_READ_UNLOCK_BLOCKED) {
+ t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
+
+ /*
+ * Remove this task from the ->blkd_tasks list and adjust
+ * any pointers that might have been referencing it.
+ */
+ empty = !rcu_preempt_blocked_readers_cgp();
+ empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
+ np = rcu_next_node_entry(t);
+ list_del(&t->rcu_node_entry);
+ if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
+ rcu_preempt_ctrlblk.gp_tasks = np;
+ if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
+ rcu_preempt_ctrlblk.exp_tasks = np;
+#ifdef CONFIG_RCU_BOOST
+ if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
+ rcu_preempt_ctrlblk.boost_tasks = np;
+#endif /* #ifdef CONFIG_RCU_BOOST */
+ INIT_LIST_HEAD(&t->rcu_node_entry);
+
+ /*
+ * If this was the last task on the current list, and if
+ * we aren't waiting on the CPU, report the quiescent state
+ * and start a new grace period if needed.
+ */
+ if (!empty && !rcu_preempt_blocked_readers_cgp()) {
+ rcu_preempt_cpu_qs();
+ rcu_preempt_start_gp();
+ }
+
+ /*
+ * If this was the last task on the expedited lists,
+ * then we need wake up the waiting task.
+ */
+ if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
+ rcu_report_exp_done();
+ }
+#ifdef CONFIG_RCU_BOOST
+ /* Unboost self if was boosted. */
+ if (special & RCU_READ_UNLOCK_BOOSTED) {
+ t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
+ rt_mutex_unlock(t->rcu_boost_mutex);
+ t->rcu_boost_mutex = NULL;
+ }
+#endif /* #ifdef CONFIG_RCU_BOOST */
+ local_irq_restore(flags);
+}
+
+/*
+ * Tiny-preemptible RCU implementation for rcu_read_unlock().
+ * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
+ * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
+ * invoke rcu_read_unlock_special() to clean up after a context switch
+ * in an RCU read-side critical section and other special cases.
+ */
+void __rcu_read_unlock(void)
+{
+ struct task_struct *t = current;
+
+ barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */
+ --t->rcu_read_lock_nesting;
+ barrier(); /* decrement before load of ->rcu_read_unlock_special */
+ if (t->rcu_read_lock_nesting == 0 &&
+ unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
+ rcu_read_unlock_special(t);
+#ifdef CONFIG_PROVE_LOCKING
+ WARN_ON_ONCE(t->rcu_read_lock_nesting < 0);
+#endif /* #ifdef CONFIG_PROVE_LOCKING */
+}
+EXPORT_SYMBOL_GPL(__rcu_read_unlock);
+
+/*
+ * Check for a quiescent state from the current CPU. When a task blocks,
+ * the task is recorded in the rcu_preempt_ctrlblk structure, which is
+ * checked elsewhere. This is called from the scheduling-clock interrupt.
+ *
+ * Caller must disable hard irqs.
+ */
+static void rcu_preempt_check_callbacks(void)
+{
+ struct task_struct *t = current;
+
+ if (rcu_preempt_gp_in_progress() &&
+ (!rcu_preempt_running_reader() ||
+ !rcu_cpu_blocking_cur_gp()))
+ rcu_preempt_cpu_qs();
+ if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
+ rcu_preempt_ctrlblk.rcb.donetail)
+ invoke_rcu_kthread();
+ if (rcu_preempt_gp_in_progress() &&
+ rcu_cpu_blocking_cur_gp() &&
+ rcu_preempt_running_reader())
+ t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
+}
+
+/*
+ * TINY_PREEMPT_RCU has an extra callback-list tail pointer to
+ * update, so this is invoked from rcu_process_callbacks() to
+ * handle that case. Of course, it is invoked for all flavors of
+ * RCU, but RCU callbacks can appear only on one of the lists, and
+ * neither ->nexttail nor ->donetail can possibly be NULL, so there
+ * is no need for an explicit check.
+ */
+static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
+{
+ if (rcu_preempt_ctrlblk.nexttail == rcp->donetail)
+ rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist;
+}
+
+/*
+ * Process callbacks for preemptible RCU.
+ */
+static void rcu_preempt_process_callbacks(void)
+{
+ rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
+}
+
+/*
+ * Queue a preemptible -RCU callback for invocation after a grace period.
+ */
+void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+ unsigned long flags;
+
+ debug_rcu_head_queue(head);
+ head->func = func;
+ head->next = NULL;
+
+ local_irq_save(flags);
+ *rcu_preempt_ctrlblk.nexttail = head;
+ rcu_preempt_ctrlblk.nexttail = &head->next;
+ RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++);
+ rcu_preempt_start_gp(); /* checks to see if GP needed. */
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
+
+void rcu_barrier(void)
+{
+ struct rcu_synchronize rcu;
+
+ init_rcu_head_on_stack(&rcu.head);
+ init_completion(&rcu.completion);
+ /* Will wake me after RCU finished. */
+ call_rcu(&rcu.head, wakeme_after_rcu);
+ /* Wait for it. */
+ wait_for_completion(&rcu.completion);
+ destroy_rcu_head_on_stack(&rcu.head);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier);
+
+/*
+ * synchronize_rcu - wait until a grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full grace
+ * period has elapsed, in other words after all currently executing RCU
+ * read-side critical sections have completed. RCU read-side critical
+ * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
+ * and may be nested.
+ */
+void synchronize_rcu(void)
+{
#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ if (!rcu_scheduler_active)
+ return;
+#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+ WARN_ON_ONCE(rcu_preempt_running_reader());
+ if (!rcu_preempt_blocked_readers_any())
+ return;
+
+ /* Once we get past the fastpath checks, same code as rcu_barrier(). */
+ rcu_barrier();
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu);
+
+static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
+static unsigned long sync_rcu_preempt_exp_count;
+static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
+
+/*
+ * Return non-zero if there are any tasks in RCU read-side critical
+ * sections blocking the current preemptible-RCU expedited grace period.
+ * If there is no preemptible-RCU expedited grace period currently in
+ * progress, returns zero unconditionally.
+ */
+static int rcu_preempted_readers_exp(void)
+{
+ return rcu_preempt_ctrlblk.exp_tasks != NULL;
+}
+
+/*
+ * Report the exit from RCU read-side critical section for the last task
+ * that queued itself during or before the current expedited preemptible-RCU
+ * grace period.
+ */
+static void rcu_report_exp_done(void)
+{
+ wake_up(&sync_rcu_preempt_exp_wq);
+}
+
+/*
+ * Wait for an rcu-preempt grace period, but expedite it. The basic idea
+ * is to rely in the fact that there is but one CPU, and that it is
+ * illegal for a task to invoke synchronize_rcu_expedited() while in a
+ * preemptible-RCU read-side critical section. Therefore, any such
+ * critical sections must correspond to blocked tasks, which must therefore
+ * be on the ->blkd_tasks list. So just record the current head of the
+ * list in the ->exp_tasks pointer, and wait for all tasks including and
+ * after the task pointed to by ->exp_tasks to drain.
+ */
+void synchronize_rcu_expedited(void)
+{
+ unsigned long flags;
+ struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk;
+ unsigned long snap;
+
+ barrier(); /* ensure prior action seen before grace period. */
+
+ WARN_ON_ONCE(rcu_preempt_running_reader());
+
+ /*
+ * Acquire lock so that there is only one preemptible RCU grace
+ * period in flight. Of course, if someone does the expedited
+ * grace period for us while we are acquiring the lock, just leave.
+ */
+ snap = sync_rcu_preempt_exp_count + 1;
+ mutex_lock(&sync_rcu_preempt_exp_mutex);
+ if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count))
+ goto unlock_mb_ret; /* Others did our work for us. */
+
+ local_irq_save(flags);
+
+ /*
+ * All RCU readers have to already be on blkd_tasks because
+ * we cannot legally be executing in an RCU read-side critical
+ * section.
+ */
+
+ /* Snapshot current head of ->blkd_tasks list. */
+ rpcp->exp_tasks = rpcp->blkd_tasks.next;
+ if (rpcp->exp_tasks == &rpcp->blkd_tasks)
+ rpcp->exp_tasks = NULL;
+ local_irq_restore(flags);
+
+ /* Wait for tail of ->blkd_tasks list to drain. */
+ if (rcu_preempted_readers_exp())
+ rcu_initiate_expedited_boost();
+ wait_event(sync_rcu_preempt_exp_wq,
+ !rcu_preempted_readers_exp());
+
+ /* Clean up and exit. */
+ barrier(); /* ensure expedited GP seen before counter increment. */
+ sync_rcu_preempt_exp_count++;
+unlock_mb_ret:
+ mutex_unlock(&sync_rcu_preempt_exp_mutex);
+ barrier(); /* ensure subsequent action seen after grace period. */
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+
+/*
+ * Does preemptible RCU need the CPU to stay out of dynticks mode?
+ */
+int rcu_preempt_needs_cpu(void)
+{
+ if (!rcu_preempt_running_reader())
+ rcu_preempt_cpu_qs();
+ return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
+}
+
+/*
+ * Check for a task exiting while in a preemptible -RCU read-side
+ * critical section, clean up if so. No need to issue warnings,
+ * as debug_check_no_locks_held() already does this if lockdep
+ * is enabled.
+ */
+void exit_rcu(void)
+{
+ struct task_struct *t = current;
+
+ if (t->rcu_read_lock_nesting == 0)
+ return;
+ t->rcu_read_lock_nesting = 1;
+ rcu_read_unlock();
+}
+
+#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
+
+#ifdef CONFIG_RCU_TRACE
+
+/*
+ * Because preemptible RCU does not exist, it is not necessary to
+ * dump out its statistics.
+ */
+static void show_tiny_preempt_stats(struct seq_file *m)
+{
+}
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
+
+/*
+ * Because preemptible RCU does not exist, it is never necessary to
+ * boost preempted RCU readers.
+ */
+static int rcu_boost(void)
+{
+ return 0;
+}
+
+/*
+ * Because preemptible RCU does not exist, it never has any callbacks
+ * to check.
+ */
+static void rcu_preempt_check_callbacks(void)
+{
+}
+
+/*
+ * Because preemptible RCU does not exist, it never has any callbacks
+ * to remove.
+ */
+static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
+{
+}
+
+/*
+ * Because preemptible RCU does not exist, it never has any callbacks
+ * to process.
+ */
+static void rcu_preempt_process_callbacks(void)
+{
+}
+
+#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
#include
/*
* During boot, we forgive RCU lockdep issues. After this function is
* invoked, we start taking RCU lockdep issues seriously.
*/
-void rcu_scheduler_starting(void)
+void __init rcu_scheduler_starting(void)
{
WARN_ON(nr_context_switches() > 0);
rcu_scheduler_active = 1;
}
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+#ifdef CONFIG_RCU_BOOST
+#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
+#else /* #ifdef CONFIG_RCU_BOOST */
+#define RCU_BOOST_PRIO 1
+#endif /* #else #ifdef CONFIG_RCU_BOOST */
+
+#ifdef CONFIG_RCU_TRACE
+
+#ifdef CONFIG_RCU_BOOST
+
+static void rcu_initiate_boost_trace(void)
+{
+ if (rcu_preempt_ctrlblk.gp_tasks == NULL)
+ rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++;
+ else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
+ rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++;
+ else if (rcu_preempt_ctrlblk.boosted_this_gp != 0)
+ rcu_preempt_ctrlblk.n_normal_balk_boosted++;
+ else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
+ rcu_preempt_ctrlblk.n_normal_balk_notyet++;
+ else
+ rcu_preempt_ctrlblk.n_normal_balk_nos++;
+}
+
+static void rcu_initiate_exp_boost_trace(void)
+{
+ if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
+ rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++;
+ else
+ rcu_preempt_ctrlblk.n_exp_balk_nos++;
+}
+
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
+static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
+{
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ rcp->qlen -= n;
+ raw_local_irq_restore(flags);
+}
+
+/*
+ * Dump statistics for TINY_RCU, such as they are.
+ */
+static int show_tiny_stats(struct seq_file *m, void *unused)
+{
+ show_tiny_preempt_stats(m);
+ seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen);
+ seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen);
+ return 0;
+}
+
+static int show_tiny_stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, show_tiny_stats, NULL);
+}
+
+static const struct file_operations show_tiny_stats_fops = {
+ .owner = THIS_MODULE,
+ .open = show_tiny_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static struct dentry *rcudir;
+
+static int __init rcutiny_trace_init(void)
+{
+ struct dentry *retval;
+
+ rcudir = debugfs_create_dir("rcu", NULL);
+ if (!rcudir)
+ goto free_out;
+ retval = debugfs_create_file("rcudata", 0444, rcudir,
+ NULL, &show_tiny_stats_fops);
+ if (!retval)
+ goto free_out;
+ return 0;
+free_out:
+ debugfs_remove_recursive(rcudir);
+ return 1;
+}
+
+static void __exit rcutiny_trace_cleanup(void)
+{
+ debugfs_remove_recursive(rcudir);
+}
+
+module_init(rcutiny_trace_init);
+module_exit(rcutiny_trace_cleanup);
+
+MODULE_AUTHOR("Paul E. McKenney");
+MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
+MODULE_LICENSE("GPL");
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 2e2726d..89613f9 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -47,6 +47,7 @@
#include
#include
#include
+#include
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney and "
@@ -64,6 +65,9 @@ static int irqreader = 1; /* RCU readers from irq (timers). */
static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */
static int fqs_holdoff = 0; /* Hold time within burst (us). */
static int fqs_stutter = 3; /* Wait time between bursts (s). */
+static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
+static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
+static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
static char *torture_type = "rcu"; /* What RCU implementation to torture. */
module_param(nreaders, int, 0444);
@@ -88,6 +92,12 @@ module_param(fqs_holdoff, int, 0444);
MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
module_param(fqs_stutter, int, 0444);
MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
+module_param(test_boost, int, 0444);
+MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
+module_param(test_boost_interval, int, 0444);
+MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
+module_param(test_boost_duration, int, 0444);
+MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds.");
module_param(torture_type, charp, 0444);
MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
@@ -109,6 +119,7 @@ static struct task_struct *stats_task;
static struct task_struct *shuffler_task;
static struct task_struct *stutter_task;
static struct task_struct *fqs_task;
+static struct task_struct *boost_tasks[NR_CPUS];
#define RCU_TORTURE_PIPE_LEN 10
@@ -120,7 +131,7 @@ struct rcu_torture {
};
static LIST_HEAD(rcu_torture_freelist);
-static struct rcu_torture *rcu_torture_current;
+static struct rcu_torture __rcu *rcu_torture_current;
static long rcu_torture_current_version;
static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
static DEFINE_SPINLOCK(rcu_torture_lock);
@@ -134,6 +145,12 @@ static atomic_t n_rcu_torture_alloc_fail;
static atomic_t n_rcu_torture_free;
static atomic_t n_rcu_torture_mberror;
static atomic_t n_rcu_torture_error;
+static long n_rcu_torture_boost_ktrerror;
+static long n_rcu_torture_boost_rterror;
+static long n_rcu_torture_boost_allocerror;
+static long n_rcu_torture_boost_afferror;
+static long n_rcu_torture_boost_failure;
+static long n_rcu_torture_boosts;
static long n_rcu_torture_timers;
static struct list_head rcu_torture_removed;
static cpumask_var_t shuffle_tmp_mask;
@@ -147,14 +164,26 @@ static int stutter_pause_test;
#endif
int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
+#ifdef CONFIG_RCU_BOOST
+#define rcu_can_boost() 1
+#else /* #ifdef CONFIG_RCU_BOOST */
+#define rcu_can_boost() 0
+#endif /* #else #ifdef CONFIG_RCU_BOOST */
+
+static unsigned long boost_starttime; /* jiffies of next boost test start. */
+DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
+ /* and boost task create/destroy. */
+
/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
#define FULLSTOP_DONTSTOP 0 /* Normal operation. */
#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */
#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */
static int fullstop = FULLSTOP_RMMOD;
-DEFINE_MUTEX(fullstop_mutex); /* Protect fullstop transitions and spawning */
- /* of kthreads. */
+/*
+ * Protect fullstop transitions and spawning of kthreads.
+ */
+static DEFINE_MUTEX(fullstop_mutex);
/*
* Detect and respond to a system shutdown.
@@ -275,6 +304,7 @@ struct rcu_torture_ops {
void (*fqs)(void);
int (*stats)(char *page);
int irq_capable;
+ int can_boost;
char *name;
};
@@ -303,6 +333,10 @@ static void rcu_read_delay(struct rcu_random_state *rrsp)
mdelay(longdelay_ms);
if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
udelay(shortdelay_us);
+#ifdef CONFIG_PREEMPT
+ if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000)))
+ preempt_schedule(); /* No QS if preempt_disable() in effect */
+#endif
}
static void rcu_torture_read_unlock(int idx) __releases(RCU)
@@ -360,6 +394,7 @@ static struct rcu_torture_ops rcu_ops = {
.fqs = rcu_force_quiescent_state,
.stats = NULL,
.irq_capable = 1,
+ .can_boost = rcu_can_boost(),
.name = "rcu"
};
@@ -402,6 +437,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
.fqs = rcu_force_quiescent_state,
.stats = NULL,
.irq_capable = 1,
+ .can_boost = rcu_can_boost(),
.name = "rcu_sync"
};
@@ -418,6 +454,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {
.fqs = rcu_force_quiescent_state,
.stats = NULL,
.irq_capable = 1,
+ .can_boost = rcu_can_boost(),
.name = "rcu_expedited"
};
@@ -536,6 +573,8 @@ static void srcu_read_delay(struct rcu_random_state *rrsp)
delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick);
if (!delay)
schedule_timeout_interruptible(longdelay);
+ else
+ rcu_read_delay(rrsp);
}
static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
@@ -676,6 +715,110 @@ static struct rcu_torture_ops sched_expedited_ops = {
};
/*
+ * RCU torture priority-boost testing. Runs one real-time thread per
+ * CPU for moderate bursts, repeatedly registering RCU callbacks and
+ * spinning waiting for them to be invoked. If a given callback takes
+ * too long to be invoked, we assume that priority inversion has occurred.
+ */
+
+struct rcu_boost_inflight {
+ struct rcu_head rcu;
+ int inflight;
+};
+
+static void rcu_torture_boost_cb(struct rcu_head *head)
+{
+ struct rcu_boost_inflight *rbip =
+ container_of(head, struct rcu_boost_inflight, rcu);
+
+ smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */
+ rbip->inflight = 0;
+}
+
+static int rcu_torture_boost(void *arg)
+{
+ unsigned long call_rcu_time;
+ unsigned long endtime;
+ unsigned long oldstarttime;
+ struct rcu_boost_inflight rbi = { .inflight = 0 };
+ struct sched_param sp;
+
+ VERBOSE_PRINTK_STRING("rcu_torture_boost started");
+
+ /* Set real-time priority. */
+ sp.sched_priority = 1;
+ if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
+ VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!");
+ n_rcu_torture_boost_rterror++;
+ }
+
+ /* Each pass through the following loop does one boost-test cycle. */
+ do {
+ /* Wait for the next test interval. */
+ oldstarttime = boost_starttime;
+ while (jiffies - oldstarttime > ULONG_MAX / 2) {
+ schedule_timeout_uninterruptible(1);
+ rcu_stutter_wait("rcu_torture_boost");
+ if (kthread_should_stop() ||
+ fullstop != FULLSTOP_DONTSTOP)
+ goto checkwait;
+ }
+
+ /* Do one boost-test interval. */
+ endtime = oldstarttime + test_boost_duration * HZ;
+ call_rcu_time = jiffies;
+ while (jiffies - endtime > ULONG_MAX / 2) {
+ /* If we don't have a callback in flight, post one. */
+ if (!rbi.inflight) {
+ smp_mb(); /* RCU core before ->inflight = 1. */
+ rbi.inflight = 1;
+ call_rcu(&rbi.rcu, rcu_torture_boost_cb);
+ if (jiffies - call_rcu_time >
+ test_boost_duration * HZ - HZ / 2) {
+ VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed");
+ n_rcu_torture_boost_failure++;
+ }
+ call_rcu_time = jiffies;
+ }
+ cond_resched();
+ rcu_stutter_wait("rcu_torture_boost");
+ if (kthread_should_stop() ||
+ fullstop != FULLSTOP_DONTSTOP)
+ goto checkwait;
+ }
+
+ /*
+ * Set the start time of the next test interval.
+ * Yes, this is vulnerable to long delays, but such
+ * delays simply cause a false negative for the next
+ * interval. Besides, we are running at RT priority,
+ * so delays should be relatively rare.
+ */
+ while (oldstarttime == boost_starttime) {
+ if (mutex_trylock(&boost_mutex)) {
+ boost_starttime = jiffies +
+ test_boost_interval * HZ;
+ n_rcu_torture_boosts++;
+ mutex_unlock(&boost_mutex);
+ break;
+ }
+ schedule_timeout_uninterruptible(1);
+ }
+
+ /* Go do the stutter. */
+checkwait: rcu_stutter_wait("rcu_torture_boost");
+ } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
+
+ /* Clean up and exit. */
+ VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
+ rcutorture_shutdown_absorb("rcu_torture_boost");
+ while (!kthread_should_stop() || rbi.inflight)
+ schedule_timeout_uninterruptible(1);
+ smp_mb(); /* order accesses to ->inflight before stack-frame death. */
+ return 0;
+}
+
+/*
* RCU torture force-quiescent-state kthread. Repeatedly induces
* bursts of calls to force_quiescent_state(), increasing the probability
* of occurrence of some important types of race conditions.
@@ -731,7 +874,8 @@ rcu_torture_writer(void *arg)
continue;
rp->rtort_pipe_count = 0;
udelay(rcu_random(&rand) & 0x3ff);
- old_rp = rcu_torture_current;
+ old_rp = rcu_dereference_check(rcu_torture_current,
+ current == writer_task);
rp->rtort_mbtest = 1;
rcu_assign_pointer(rcu_torture_current, rp);
smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */
@@ -924,7 +1068,8 @@ rcu_torture_printk(char *page)
cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
cnt += sprintf(&page[cnt],
"rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
- "rtmbe: %d nt: %ld",
+ "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld "
+ "rtbf: %ld rtb: %ld nt: %ld",
rcu_torture_current,
rcu_torture_current_version,
list_empty(&rcu_torture_freelist),
@@ -932,8 +1077,19 @@ rcu_torture_printk(char *page)
atomic_read(&n_rcu_torture_alloc_fail),
atomic_read(&n_rcu_torture_free),
atomic_read(&n_rcu_torture_mberror),
+ n_rcu_torture_boost_ktrerror,
+ n_rcu_torture_boost_rterror,
+ n_rcu_torture_boost_allocerror,
+ n_rcu_torture_boost_afferror,
+ n_rcu_torture_boost_failure,
+ n_rcu_torture_boosts,
n_rcu_torture_timers);
- if (atomic_read(&n_rcu_torture_mberror) != 0)
+ if (atomic_read(&n_rcu_torture_mberror) != 0 ||
+ n_rcu_torture_boost_ktrerror != 0 ||
+ n_rcu_torture_boost_rterror != 0 ||
+ n_rcu_torture_boost_allocerror != 0 ||
+ n_rcu_torture_boost_afferror != 0 ||
+ n_rcu_torture_boost_failure != 0)
cnt += sprintf(&page[cnt], " !!!");
cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
if (i > 1) {
@@ -1085,22 +1241,91 @@ rcu_torture_stutter(void *arg)
}
static inline void
-rcu_torture_print_module_parms(char *tag)
+rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
{
printk(KERN_ALERT "%s" TORTURE_FLAG
"--- %s: nreaders=%d nfakewriters=%d "
"stat_interval=%d verbose=%d test_no_idle_hz=%d "
"shuffle_interval=%d stutter=%d irqreader=%d "
- "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n",
+ "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
+ "test_boost=%d/%d test_boost_interval=%d "
+ "test_boost_duration=%d\n",
torture_type, tag, nrealreaders, nfakewriters,
stat_interval, verbose, test_no_idle_hz, shuffle_interval,
- stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter);
+ stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
+ test_boost, cur_ops->can_boost,
+ test_boost_interval, test_boost_duration);
}
-static struct notifier_block rcutorture_nb = {
+static struct notifier_block rcutorture_shutdown_nb = {
.notifier_call = rcutorture_shutdown_notify,
};
+static void rcutorture_booster_cleanup(int cpu)
+{
+ struct task_struct *t;
+
+ if (boost_tasks[cpu] == NULL)
+ return;
+ mutex_lock(&boost_mutex);
+ VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task");
+ t = boost_tasks[cpu];
+ boost_tasks[cpu] = NULL;
+ mutex_unlock(&boost_mutex);
+
+ /* This must be outside of the mutex, otherwise deadlock! */
+ kthread_stop(t);
+}
+
+static int rcutorture_booster_init(int cpu)
+{
+ int retval;
+
+ if (boost_tasks[cpu] != NULL)
+ return 0; /* Already created, nothing more to do. */
+
+ /* Don't allow time recalculation while creating a new task. */
+ mutex_lock(&boost_mutex);
+ VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
+ boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL,
+ "rcu_torture_boost");
+ if (IS_ERR(boost_tasks[cpu])) {
+ retval = PTR_ERR(boost_tasks[cpu]);
+ VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
+ n_rcu_torture_boost_ktrerror++;
+ boost_tasks[cpu] = NULL;
+ mutex_unlock(&boost_mutex);
+ return retval;
+ }
+ kthread_bind(boost_tasks[cpu], cpu);
+ wake_up_process(boost_tasks[cpu]);
+ mutex_unlock(&boost_mutex);
+ return 0;
+}
+
+static int rcutorture_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ long cpu = (long)hcpu;
+
+ switch (action) {
+ case CPU_ONLINE:
+ case CPU_DOWN_FAILED:
+ (void)rcutorture_booster_init(cpu);
+ break;
+ case CPU_DOWN_PREPARE:
+ rcutorture_booster_cleanup(cpu);
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block rcutorture_cpu_nb = {
+ .notifier_call = rcutorture_cpu_notify,
+};
+
static void
rcu_torture_cleanup(void)
{
@@ -1118,7 +1343,7 @@ rcu_torture_cleanup(void)
}
fullstop = FULLSTOP_RMMOD;
mutex_unlock(&fullstop_mutex);
- unregister_reboot_notifier(&rcutorture_nb);
+ unregister_reboot_notifier(&rcutorture_shutdown_nb);
if (stutter_task) {
VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
kthread_stop(stutter_task);
@@ -1175,6 +1400,12 @@ rcu_torture_cleanup(void)
kthread_stop(fqs_task);
}
fqs_task = NULL;
+ if ((test_boost == 1 && cur_ops->can_boost) ||
+ test_boost == 2) {
+ unregister_cpu_notifier(&rcutorture_cpu_nb);
+ for_each_possible_cpu(i)
+ rcutorture_booster_cleanup(i);
+ }
/* Wait for all RCU callbacks to fire. */
@@ -1186,9 +1417,9 @@ rcu_torture_cleanup(void)
if (cur_ops->cleanup)
cur_ops->cleanup();
if (atomic_read(&n_rcu_torture_error))
- rcu_torture_print_module_parms("End of test: FAILURE");
+ rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
else
- rcu_torture_print_module_parms("End of test: SUCCESS");
+ rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
}
static int __init
@@ -1233,7 +1464,7 @@ rcu_torture_init(void)
nrealreaders = nreaders;
else
nrealreaders = 2 * num_online_cpus();
- rcu_torture_print_module_parms("Start of test");
+ rcu_torture_print_module_parms(cur_ops, "Start of test");
fullstop = FULLSTOP_DONTSTOP;
/* Set up the freelist. */
@@ -1254,6 +1485,12 @@ rcu_torture_init(void)
atomic_set(&n_rcu_torture_free, 0);
atomic_set(&n_rcu_torture_mberror, 0);
atomic_set(&n_rcu_torture_error, 0);
+ n_rcu_torture_boost_ktrerror = 0;
+ n_rcu_torture_boost_rterror = 0;
+ n_rcu_torture_boost_allocerror = 0;
+ n_rcu_torture_boost_afferror = 0;
+ n_rcu_torture_boost_failure = 0;
+ n_rcu_torture_boosts = 0;
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
atomic_set(&rcu_torture_wcount[i], 0);
for_each_possible_cpu(cpu) {
@@ -1367,7 +1604,27 @@ rcu_torture_init(void)
goto unwind;
}
}
- register_reboot_notifier(&rcutorture_nb);
+ if (test_boost_interval < 1)
+ test_boost_interval = 1;
+ if (test_boost_duration < 2)
+ test_boost_duration = 2;
+ if ((test_boost == 1 && cur_ops->can_boost) ||
+ test_boost == 2) {
+ int retval;
+
+ boost_starttime = jiffies + test_boost_interval * HZ;
+ register_cpu_notifier(&rcutorture_cpu_nb);
+ for_each_possible_cpu(i) {
+ if (cpu_is_offline(i))
+ continue; /* Heuristic: CPU can go offline. */
+ retval = rcutorture_booster_init(i);
+ if (retval < 0) {
+ firsterr = retval;
+ goto unwind;
+ }
+ }
+ }
+ register_reboot_notifier(&rcutorture_shutdown_nb);
mutex_unlock(&fullstop_mutex);
return 0;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d5bc439..9213003 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -67,9 +67,6 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
.gpnum = -300, \
.completed = -300, \
.onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
- .orphan_cbs_list = NULL, \
- .orphan_cbs_tail = &structname.orphan_cbs_list, \
- .orphan_qlen = 0, \
.fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
.n_force_qs = 0, \
.n_force_qs_ngp = 0, \
@@ -131,7 +128,7 @@ void rcu_note_context_switch(int cpu)
#ifdef CONFIG_NO_HZ
DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
.dynticks_nesting = 1,
- .dynticks = 1,
+ .dynticks = ATOMIC_INIT(1),
};
#endif /* #ifdef CONFIG_NO_HZ */
@@ -143,6 +140,11 @@ module_param(blimit, int, 0);
module_param(qhimark, int, 0);
module_param(qlowmark, int, 0);
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT;
+module_param(rcu_cpu_stall_suppress, int, 0644);
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
static int rcu_pending(int cpu);
@@ -262,12 +264,14 @@ void rcu_enter_nohz(void)
unsigned long flags;
struct rcu_dynticks *rdtp;
- smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
local_irq_save(flags);
rdtp = &__get_cpu_var(rcu_dynticks);
- rdtp->dynticks++;
+ /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
+ smp_mb__before_atomic_inc(); /* See above. */
+ atomic_inc(&rdtp->dynticks);
+ smp_mb__after_atomic_inc(); /* Force delay to next write. */
rdtp->dynticks_nesting--;
- WARN_ON_ONCE(rdtp->dynticks & 0x1);
+ WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
local_irq_restore(flags);
}
@@ -284,11 +288,13 @@ void rcu_exit_nohz(void)
local_irq_save(flags);
rdtp = &__get_cpu_var(rcu_dynticks);
- rdtp->dynticks++;
+ smp_mb__before_atomic_inc(); /* Force delay from prior write. */
+ atomic_inc(&rdtp->dynticks);
+ /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
+ smp_mb__after_atomic_inc(); /* See above. */
rdtp->dynticks_nesting++;
- WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
+ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
local_irq_restore(flags);
- smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
}
/**
@@ -302,11 +308,15 @@ void rcu_nmi_enter(void)
{
struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
- if (rdtp->dynticks & 0x1)
+ if (rdtp->dynticks_nmi_nesting == 0 &&
+ (atomic_read(&rdtp->dynticks) & 0x1))
return;
- rdtp->dynticks_nmi++;
- WARN_ON_ONCE(!(rdtp->dynticks_nmi & 0x1));
- smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
+ rdtp->dynticks_nmi_nesting++;
+ smp_mb__before_atomic_inc(); /* Force delay from prior write. */
+ atomic_inc(&rdtp->dynticks);
+ /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
+ smp_mb__after_atomic_inc(); /* See above. */
+ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
}
/**
@@ -320,11 +330,14 @@ void rcu_nmi_exit(void)
{
struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
- if (rdtp->dynticks & 0x1)
+ if (rdtp->dynticks_nmi_nesting == 0 ||
+ --rdtp->dynticks_nmi_nesting != 0)
return;
- smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
- rdtp->dynticks_nmi++;
- WARN_ON_ONCE(rdtp->dynticks_nmi & 0x1);
+ /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
+ smp_mb__before_atomic_inc(); /* See above. */
+ atomic_inc(&rdtp->dynticks);
+ smp_mb__after_atomic_inc(); /* Force delay to next write. */
+ WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
}
/**
@@ -339,9 +352,11 @@ void rcu_irq_enter(void)
if (rdtp->dynticks_nesting++)
return;
- rdtp->dynticks++;
- WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
- smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
+ smp_mb__before_atomic_inc(); /* Force delay from prior write. */
+ atomic_inc(&rdtp->dynticks);
+ /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
+ smp_mb__after_atomic_inc(); /* See above. */
+ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
}
/**
@@ -357,13 +372,16 @@ void rcu_irq_exit(void)
if (--rdtp->dynticks_nesting)
return;
- smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
- rdtp->dynticks++;
- WARN_ON_ONCE(rdtp->dynticks & 0x1);
+ /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
+ smp_mb__before_atomic_inc(); /* See above. */
+ atomic_inc(&rdtp->dynticks);
+ smp_mb__after_atomic_inc(); /* Force delay to next write. */
+ WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
/* If the interrupt queued a callback, get out of dyntick mode. */
if (__get_cpu_var(rcu_sched_data).nxtlist ||
- __get_cpu_var(rcu_bh_data).nxtlist)
+ __get_cpu_var(rcu_bh_data).nxtlist ||
+ rcu_preempt_needs_cpu(smp_processor_id()))
set_need_resched();
}
@@ -376,19 +394,8 @@ void rcu_irq_exit(void)
*/
static int dyntick_save_progress_counter(struct rcu_data *rdp)
{
- int ret;
- int snap;
- int snap_nmi;
-
- snap = rdp->dynticks->dynticks;
- snap_nmi = rdp->dynticks->dynticks_nmi;
- smp_mb(); /* Order sampling of snap with end of grace period. */
- rdp->dynticks_snap = snap;
- rdp->dynticks_nmi_snap = snap_nmi;
- ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0);
- if (ret)
- rdp->dynticks_fqs++;
- return ret;
+ rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
+ return 0;
}
/*
@@ -399,16 +406,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
*/
static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
{
- long curr;
- long curr_nmi;
- long snap;
- long snap_nmi;
+ unsigned long curr;
+ unsigned long snap;
- curr = rdp->dynticks->dynticks;
- snap = rdp->dynticks_snap;
- curr_nmi = rdp->dynticks->dynticks_nmi;
- snap_nmi = rdp->dynticks_nmi_snap;
- smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
+ curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks);
+ snap = (unsigned long)rdp->dynticks_snap;
/*
* If the CPU passed through or entered a dynticks idle phase with
@@ -418,8 +420,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
* read-side critical section that started before the beginning
* of the current RCU grace period.
*/
- if ((curr != snap || (curr & 0x1) == 0) &&
- (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) {
+ if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) {
rdp->dynticks_fqs++;
return 1;
}
@@ -450,7 +451,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
-int rcu_cpu_stall_panicking __read_mostly;
+int rcu_cpu_stall_suppress __read_mostly;
static void record_gp_stall_check_time(struct rcu_state *rsp)
{
@@ -482,8 +483,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
rcu_print_task_stall(rnp);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
- /* OK, time to rat on our buddy... */
-
+ /*
+ * OK, time to rat on our buddy...
+ * See Documentation/RCU/stallwarn.txt for info on how to debug
+ * RCU CPU stall warnings.
+ */
printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {",
rsp->name);
rcu_for_each_leaf_node(rsp, rnp) {
@@ -512,6 +516,11 @@ static void print_cpu_stall(struct rcu_state *rsp)
unsigned long flags;
struct rcu_node *rnp = rcu_get_root(rsp);
+ /*
+ * OK, time to rat on ourselves...
+ * See Documentation/RCU/stallwarn.txt for info on how to debug
+ * RCU CPU stall warnings.
+ */
printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
trigger_all_cpu_backtrace();
@@ -530,11 +539,11 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
long delta;
struct rcu_node *rnp;
- if (rcu_cpu_stall_panicking)
+ if (rcu_cpu_stall_suppress)
return;
- delta = jiffies - rsp->jiffies_stall;
+ delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall);
rnp = rdp->mynode;
- if ((rnp->qsmask & rdp->grpmask) && delta >= 0) {
+ if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) {
/* We haven't checked in, so go dump stack. */
print_cpu_stall(rsp);
@@ -548,10 +557,26 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
{
- rcu_cpu_stall_panicking = 1;
+ rcu_cpu_stall_suppress = 1;
return NOTIFY_DONE;
}
+/**
+ * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
+ *
+ * Set the stall-warning timeout way off into the future, thus preventing
+ * any RCU CPU stall-warning messages from appearing in the current set of
+ * RCU grace periods.
+ *
+ * The caller must disable hard irqs.
+ */
+void rcu_cpu_stall_reset(void)
+{
+ rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2;
+ rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2;
+ rcu_preempt_stall_reset();
+}
+
static struct notifier_block rcu_panic_block = {
.notifier_call = rcu_panic,
};
@@ -571,6 +596,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
{
}
+void rcu_cpu_stall_reset(void)
+{
+}
+
static void __init check_cpu_stall_init(void)
{
}
@@ -712,7 +741,7 @@ static void
rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
__releases(rcu_get_root(rsp)->lock)
{
- struct rcu_data *rdp = rsp->rda[smp_processor_id()];
+ struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
struct rcu_node *rnp = rcu_get_root(rsp);
if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
@@ -809,6 +838,12 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
__releases(rcu_get_root(rsp)->lock)
{
WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
+
+ /*
+ * Ensure that all grace-period and pre-grace-period activity
+ * is seen before the assignment to rsp->completed.
+ */
+ smp_mb(); /* See above block comment. */
rsp->completed = rsp->gpnum;
rsp->signaled = RCU_GP_IDLE;
rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
@@ -951,51 +986,31 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
#ifdef CONFIG_HOTPLUG_CPU
/*
- * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the
- * specified flavor of RCU. The callbacks will be adopted by the next
- * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever
- * comes first. Because this is invoked from the CPU_DYING notifier,
- * irqs are already disabled.
+ * Move a dying CPU's RCU callbacks to online CPU's callback list.
+ * Synchronization is not required because this function executes
+ * in stop_machine() context.
*/
-static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
+static void rcu_send_cbs_to_online(struct rcu_state *rsp)
{
int i;
- struct rcu_data *rdp = rsp->rda[smp_processor_id()];
+ /* current DYING CPU is cleared in the cpu_online_mask */
+ int receive_cpu = cpumask_any(cpu_online_mask);
+ struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
+ struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
if (rdp->nxtlist == NULL)
return; /* irqs disabled, so comparison is stable. */
- raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
- *rsp->orphan_cbs_tail = rdp->nxtlist;
- rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL];
+
+ *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
+ receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+ receive_rdp->qlen += rdp->qlen;
+ receive_rdp->n_cbs_adopted += rdp->qlen;
+ rdp->n_cbs_orphaned += rdp->qlen;
+
rdp->nxtlist = NULL;
for (i = 0; i < RCU_NEXT_SIZE; i++)
rdp->nxttail[i] = &rdp->nxtlist;
- rsp->orphan_qlen += rdp->qlen;
rdp->qlen = 0;
- raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
-}
-
-/*
- * Adopt previously orphaned RCU callbacks.
- */
-static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
-{
- unsigned long flags;
- struct rcu_data *rdp;
-
- raw_spin_lock_irqsave(&rsp->onofflock, flags);
- rdp = rsp->rda[smp_processor_id()];
- if (rsp->orphan_cbs_list == NULL) {
- raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
- return;
- }
- *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
- rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
- rdp->qlen += rsp->orphan_qlen;
- rsp->orphan_cbs_list = NULL;
- rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
- rsp->orphan_qlen = 0;
- raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
}
/*
@@ -1007,7 +1022,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
unsigned long flags;
unsigned long mask;
int need_report = 0;
- struct rcu_data *rdp = rsp->rda[cpu];
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp;
/* Exclude any attempts to start a new grace period. */
@@ -1046,8 +1061,6 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
raw_spin_unlock_irqrestore(&rnp->lock, flags);
if (need_report & RCU_OFL_TASKS_EXP_GP)
rcu_report_exp_rnp(rsp, rnp);
-
- rcu_adopt_orphan_cbs(rsp);
}
/*
@@ -1065,11 +1078,7 @@ static void rcu_offline_cpu(int cpu)
#else /* #ifdef CONFIG_HOTPLUG_CPU */
-static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
-{
-}
-
-static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
+static void rcu_send_cbs_to_online(struct rcu_state *rsp)
{
}
@@ -1123,6 +1132,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
/* Update count, and requeue any remaining callbacks. */
rdp->qlen -= count;
+ rdp->n_cbs_invoked += count;
if (list != NULL) {
*tail = rdp->nxtlist;
rdp->nxtlist = list;
@@ -1226,7 +1236,8 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
cpu = rnp->grplo;
bit = 1;
for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
- if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu]))
+ if ((rnp->qsmask & bit) != 0 &&
+ f(per_cpu_ptr(rsp->rda, cpu)))
mask |= bit;
}
if (mask != 0) {
@@ -1359,25 +1370,11 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
*/
static void rcu_process_callbacks(struct softirq_action *unused)
{
- /*
- * Memory references from any prior RCU read-side critical sections
- * executed by the interrupted code must be seen before any RCU
- * grace-period manipulations below.
- */
- smp_mb(); /* See above block comment. */
-
__rcu_process_callbacks(&rcu_sched_state,
&__get_cpu_var(rcu_sched_data));
__rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
rcu_preempt_process_callbacks();
- /*
- * Memory references from any later RCU read-side critical sections
- * executed by the interrupted code must be seen after any RCU
- * grace-period manipulations above.
- */
- smp_mb(); /* See above block comment. */
-
/* If we are last CPU on way to dyntick-idle mode, accelerate it. */
rcu_needs_cpu_flush();
}
@@ -1402,7 +1399,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
* a quiescent state betweentimes.
*/
local_irq_save(flags);
- rdp = rsp->rda[smp_processor_id()];
+ rdp = this_cpu_ptr(rsp->rda);
rcu_process_gp_end(rsp, rdp);
check_for_new_grace_period(rsp, rdp);
@@ -1662,13 +1659,12 @@ static void _rcu_barrier(struct rcu_state *rsp,
* decrement rcu_barrier_cpu_count -- otherwise the first CPU
* might complete its grace period before all of the other CPUs
* did their increment, causing this function to return too
- * early.
+ * early. Note that on_each_cpu() disables irqs, which prevents
+ * any CPUs from coming online or going offline until each online
+ * CPU has queued its RCU-barrier callback.
*/
atomic_set(&rcu_barrier_cpu_count, 1);
- preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */
- rcu_adopt_orphan_cbs(rsp);
on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
- preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */
if (atomic_dec_and_test(&rcu_barrier_cpu_count))
complete(&rcu_barrier_completion);
wait_for_completion(&rcu_barrier_completion);
@@ -1701,7 +1697,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
{
unsigned long flags;
int i;
- struct rcu_data *rdp = rsp->rda[cpu];
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rcu_get_root(rsp);
/* Set up local state, ensuring consistent view of global state. */
@@ -1729,7 +1725,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
{
unsigned long flags;
unsigned long mask;
- struct rcu_data *rdp = rsp->rda[cpu];
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rcu_get_root(rsp);
/* Set up local state, ensuring consistent view of global state. */
@@ -1794,18 +1790,13 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
case CPU_DYING:
case CPU_DYING_FROZEN:
/*
- * preempt_disable() in _rcu_barrier() prevents stop_machine(),
- * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);"
- * returns, all online cpus have queued rcu_barrier_func().
- * The dying CPU clears its cpu_online_mask bit and
- * moves all of its RCU callbacks to ->orphan_cbs_list
- * in the context of stop_machine(), so subsequent calls
- * to _rcu_barrier() will adopt these callbacks and only
- * then queue rcu_barrier_func() on all remaining CPUs.
+ * The whole machine is "stopped" except this CPU, so we can
+ * touch any data without introducing corruption. We send the
+ * dying CPU's callbacks to an arbitrarily chosen online CPU.
*/
- rcu_send_cbs_to_orphanage(&rcu_bh_state);
- rcu_send_cbs_to_orphanage(&rcu_sched_state);
- rcu_preempt_send_cbs_to_orphanage();
+ rcu_send_cbs_to_online(&rcu_bh_state);
+ rcu_send_cbs_to_online(&rcu_sched_state);
+ rcu_preempt_send_cbs_to_online();
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
@@ -1865,7 +1856,8 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
/*
* Helper function for rcu_init() that initializes one rcu_state structure.
*/
-static void __init rcu_init_one(struct rcu_state *rsp)
+static void __init rcu_init_one(struct rcu_state *rsp,
+ struct rcu_data __percpu *rda)
{
static char *buf[] = { "rcu_node_level_0",
"rcu_node_level_1",
@@ -1918,37 +1910,23 @@ static void __init rcu_init_one(struct rcu_state *rsp)
}
}
+ rsp->rda = rda;
rnp = rsp->level[NUM_RCU_LVLS - 1];
for_each_possible_cpu(i) {
while (i > rnp->grphi)
rnp++;
- rsp->rda[i]->mynode = rnp;
+ per_cpu_ptr(rsp->rda, i)->mynode = rnp;
rcu_boot_init_percpu_data(i, rsp);
}
}
-/*
- * Helper macro for __rcu_init() and __rcu_init_preempt(). To be used
- * nowhere else! Assigns leaf node pointers into each CPU's rcu_data
- * structure.
- */
-#define RCU_INIT_FLAVOR(rsp, rcu_data) \
-do { \
- int i; \
- \
- for_each_possible_cpu(i) { \
- (rsp)->rda[i] = &per_cpu(rcu_data, i); \
- } \
- rcu_init_one(rsp); \
-} while (0)
-
void __init rcu_init(void)
{
int cpu;
rcu_bootup_announce();
- RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
- RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
+ rcu_init_one(&rcu_sched_state, &rcu_sched_data);
+ rcu_init_one(&rcu_bh_state, &rcu_bh_data);
__rcu_init_preempt();
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 14c040b..1677276 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -79,11 +79,9 @@
* Dynticks per-CPU state.
*/
struct rcu_dynticks {
- int dynticks_nesting; /* Track nesting level, sort of. */
- int dynticks; /* Even value for dynticks-idle, else odd. */
- int dynticks_nmi; /* Even value for either dynticks-idle or */
- /* not in nmi handler, else odd. So this */
- /* remains even for nmi from irq handler. */
+ int dynticks_nesting; /* Track irq/process nesting level. */
+ int dynticks_nmi_nesting; /* Track NMI nesting level. */
+ atomic_t dynticks; /* Even value for dynticks-idle, else odd. */
};
/*
@@ -202,6 +200,9 @@ struct rcu_data {
long qlen; /* # of queued callbacks */
long qlen_last_fqs_check;
/* qlen at last check for QS forcing */
+ unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
+ unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
+ unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
unsigned long n_force_qs_snap;
/* did other CPU force QS recently? */
long blimit; /* Upper limit on a processed batch */
@@ -210,7 +211,6 @@ struct rcu_data {
/* 3) dynticks interface. */
struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
int dynticks_snap; /* Per-GP tracking for dynticks. */
- int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */
#endif /* #ifdef CONFIG_NO_HZ */
/* 4) reasons this CPU needed to be kicked by force_quiescent_state */
@@ -254,19 +254,23 @@ struct rcu_data {
#define RCU_STALL_DELAY_DELTA 0
#endif
-#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA)
+#define RCU_SECONDS_TILL_STALL_CHECK (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \
+ RCU_STALL_DELAY_DELTA)
/* for rsp->jiffies_stall */
-#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA)
+#define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30)
/* for rsp->jiffies_stall */
#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
/* to take at least one */
/* scheduling clock irq */
/* before ratting on them. */
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE
+#define RCU_CPU_STALL_SUPPRESS_INIT 0
+#else
+#define RCU_CPU_STALL_SUPPRESS_INIT 1
+#endif
-#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
-#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
/*
* RCU global state, including node hierarchy. This hierarchy is
@@ -283,7 +287,7 @@ struct rcu_state {
struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */
u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */
- struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */
+ struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
/* The following fields are guarded by the root rcu_node's lock. */
@@ -302,15 +306,7 @@ struct rcu_state {
/* End of fields guarded by root rcu_node's lock. */
raw_spinlock_t onofflock; /* exclude on/offline and */
- /* starting new GP. Also */
- /* protects the following */
- /* orphan_cbs fields. */
- struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */
- /* orphaned by all CPUs in */
- /* a given leaf rcu_node */
- /* going offline. */
- struct rcu_head **orphan_cbs_tail; /* And tail pointer. */
- long orphan_qlen; /* Number of orphaned cbs. */
+ /* starting new GP. */
raw_spinlock_t fqslock; /* Only one task forcing */
/* quiescent states. */
unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -365,6 +361,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
static void rcu_print_detail_task_stall(struct rcu_state *rsp);
static void rcu_print_task_stall(struct rcu_node *rnp);
+static void rcu_preempt_stall_reset(void);
#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
@@ -382,7 +379,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
static int rcu_preempt_pending(int cpu);
static int rcu_preempt_needs_cpu(int cpu);
static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
-static void rcu_preempt_send_cbs_to_orphanage(void);
+static void rcu_preempt_send_cbs_to_online(void);
static void __init __rcu_init_preempt(void);
static void rcu_needs_cpu_flush(void);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 0e4f420..074dc22 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,6 +25,7 @@
*/
#include
+#include
/*
* Check the RCU kernel configuration parameters and print informative
@@ -57,7 +58,7 @@ static void __init rcu_bootup_announce_oddness(void)
printk(KERN_INFO
"\tRCU-based detection of stalled CPUs is disabled.\n");
#endif
-#ifndef CONFIG_RCU_CPU_STALL_VERBOSE
+#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
#endif
#if NUM_RCU_LVL_4 != 0
@@ -154,7 +155,7 @@ static void rcu_preempt_note_context_switch(int cpu)
(t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
/* Possibly blocking in an RCU read-side critical section. */
- rdp = rcu_preempt_state.rda[cpu];
+ rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
rnp = rdp->mynode;
raw_spin_lock_irqsave(&rnp->lock, flags);
t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
@@ -201,7 +202,7 @@ static void rcu_preempt_note_context_switch(int cpu)
*/
void __rcu_read_lock(void)
{
- ACCESS_ONCE(current->rcu_read_lock_nesting)++;
+ current->rcu_read_lock_nesting++;
barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */
}
EXPORT_SYMBOL_GPL(__rcu_read_lock);
@@ -344,7 +345,9 @@ void __rcu_read_unlock(void)
struct task_struct *t = current;
barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */
- if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 &&
+ --t->rcu_read_lock_nesting;
+ barrier(); /* decrement before load of ->rcu_read_unlock_special */
+ if (t->rcu_read_lock_nesting == 0 &&
unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
rcu_read_unlock_special(t);
#ifdef CONFIG_PROVE_LOCKING
@@ -417,6 +420,16 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
}
}
+/*
+ * Suppress preemptible RCU's CPU stall warnings by pushing the
+ * time of the next stall-warning message comfortably far into the
+ * future.
+ */
+static void rcu_preempt_stall_reset(void)
+{
+ rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
+}
+
#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
/*
@@ -546,9 +559,11 @@ EXPORT_SYMBOL_GPL(call_rcu);
*
* Control will return to the caller some time after a full grace
* period has elapsed, in other words after all currently executing RCU
- * read-side critical sections have completed. RCU read-side critical
- * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
- * and may be nested.
+ * read-side critical sections have completed. Note, however, that
+ * upon return from synchronize_rcu(), the caller might well be executing
+ * concurrently with new RCU read-side critical sections that began while
+ * synchronize_rcu() was waiting. RCU read-side critical sections are
+ * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
*/
void synchronize_rcu(void)
{
@@ -759,11 +774,11 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
}
/*
- * Move preemptable RCU's callbacks to ->orphan_cbs_list.
+ * Move preemptable RCU's callbacks from dying CPU to other online CPU.
*/
-static void rcu_preempt_send_cbs_to_orphanage(void)
+static void rcu_preempt_send_cbs_to_online(void)
{
- rcu_send_cbs_to_orphanage(&rcu_preempt_state);
+ rcu_send_cbs_to_online(&rcu_preempt_state);
}
/*
@@ -771,7 +786,7 @@ static void rcu_preempt_send_cbs_to_orphanage(void)
*/
static void __init __rcu_init_preempt(void)
{
- RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data);
+ rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
}
/*
@@ -865,6 +880,14 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
{
}
+/*
+ * Because preemptible RCU does not exist, there is no need to suppress
+ * its CPU stall warnings.
+ */
+static void rcu_preempt_stall_reset(void)
+{
+}
+
#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
/*
@@ -919,15 +942,6 @@ static void rcu_preempt_process_callbacks(void)
}
/*
- * In classic RCU, call_rcu() is just call_rcu_sched().
- */
-void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
-{
- call_rcu_sched(head, func);
-}
-EXPORT_SYMBOL_GPL(call_rcu);
-
-/*
* Wait for an rcu-preempt grace period, but make it happen quickly.
* But because preemptable RCU does not exist, map to rcu-sched.
*/
@@ -988,7 +1002,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
/*
* Because there is no preemptable RCU, there are no callbacks to move.
*/
-static void rcu_preempt_send_cbs_to_orphanage(void)
+static void rcu_preempt_send_cbs_to_online(void)
{
}
@@ -1001,6 +1015,132 @@ static void __init __rcu_init_preempt(void)
#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
+#ifndef CONFIG_SMP
+
+void synchronize_sched_expedited(void)
+{
+ cond_resched();
+}
+EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
+
+#else /* #ifndef CONFIG_SMP */
+
+static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
+static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
+
+static int synchronize_sched_expedited_cpu_stop(void *data)
+{
+ /*
+ * There must be a full memory barrier on each affected CPU
+ * between the time that try_stop_cpus() is called and the
+ * time that it returns.
+ *
+ * In the current initial implementation of cpu_stop, the
+ * above condition is already met when the control reaches
+ * this point and the following smp_mb() is not strictly
+ * necessary. Do smp_mb() anyway for documentation and
+ * robustness against future implementation changes.
+ */
+ smp_mb(); /* See above comment block. */
+ return 0;
+}
+
+/*
+ * Wait for an rcu-sched grace period to elapse, but use "big hammer"
+ * approach to force grace period to end quickly. This consumes
+ * significant time on all CPUs, and is thus not recommended for
+ * any sort of common-case code.
+ *
+ * Note that it is illegal to call this function while holding any
+ * lock that is acquired by a CPU-hotplug notifier. Failing to
+ * observe this restriction will result in deadlock.
+ *
+ * This implementation can be thought of as an application of ticket
+ * locking to RCU, with sync_sched_expedited_started and
+ * sync_sched_expedited_done taking on the roles of the halves
+ * of the ticket-lock word. Each task atomically increments
+ * sync_sched_expedited_started upon entry, snapshotting the old value,
+ * then attempts to stop all the CPUs. If this succeeds, then each
+ * CPU will have executed a context switch, resulting in an RCU-sched
+ * grace period. We are then done, so we use atomic_cmpxchg() to
+ * update sync_sched_expedited_done to match our snapshot -- but
+ * only if someone else has not already advanced past our snapshot.
+ *
+ * On the other hand, if try_stop_cpus() fails, we check the value
+ * of sync_sched_expedited_done. If it has advanced past our
+ * initial snapshot, then someone else must have forced a grace period
+ * some time after we took our snapshot. In this case, our work is
+ * done for us, and we can simply return. Otherwise, we try again,
+ * but keep our initial snapshot for purposes of checking for someone
+ * doing our work for us.
+ *
+ * If we fail too many times in a row, we fall back to synchronize_sched().
+ */
+void synchronize_sched_expedited(void)
+{
+ int firstsnap, s, snap, trycount = 0;
+
+ /* Note that atomic_inc_return() implies full memory barrier. */
+ firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
+ get_online_cpus();
+
+ /*
+ * Each pass through the following loop attempts to force a
+ * context switch on each CPU.
+ */
+ while (try_stop_cpus(cpu_online_mask,
+ synchronize_sched_expedited_cpu_stop,
+ NULL) == -EAGAIN) {
+ put_online_cpus();
+
+ /* No joy, try again later. Or just synchronize_sched(). */
+ if (trycount++ < 10)
+ udelay(trycount * num_online_cpus());
+ else {
+ synchronize_sched();
+ return;
+ }
+
+ /* Check to see if someone else did our work for us. */
+ s = atomic_read(&sync_sched_expedited_done);
+ if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
+ smp_mb(); /* ensure test happens before caller kfree */
+ return;
+ }
+
+ /*
+ * Refetching sync_sched_expedited_started allows later
+ * callers to piggyback on our grace period. We subtract
+ * 1 to get the same token that the last incrementer got.
+ * We retry after they started, so our grace period works
+ * for them, and they started after our first try, so their
+ * grace period works for us.
+ */
+ get_online_cpus();
+ snap = atomic_read(&sync_sched_expedited_started) - 1;
+ smp_mb(); /* ensure read is before try_stop_cpus(). */
+ }
+
+ /*
+ * Everyone up to our most recent fetch is covered by our grace
+ * period. Update the counter, but only if our work is still
+ * relevant -- which it won't be if someone who started later
+ * than we did beat us to the punch.
+ */
+ do {
+ s = atomic_read(&sync_sched_expedited_done);
+ if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
+ smp_mb(); /* ensure test happens before caller kfree */
+ break;
+ }
+ } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
+
+ put_online_cpus();
+}
+EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
+
+#endif /* #else #ifndef CONFIG_SMP */
+
#if !defined(CONFIG_RCU_FAST_NO_HZ)
/*
@@ -1054,7 +1194,6 @@ int rcu_needs_cpu(int cpu)
{
int c = 0;
int snap;
- int snap_nmi;
int thatcpu;
/* Check for being in the holdoff period. */
@@ -1065,10 +1204,10 @@ int rcu_needs_cpu(int cpu)
for_each_online_cpu(thatcpu) {
if (thatcpu == cpu)
continue;
- snap = per_cpu(rcu_dynticks, thatcpu).dynticks;
- snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi;
+ snap = atomic_add_return(0, &per_cpu(rcu_dynticks,
+ thatcpu).dynticks);
smp_mb(); /* Order sampling of snap with end of grace period. */
- if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) {
+ if ((snap & 0x1) != 0) {
per_cpu(rcu_dyntick_drain, cpu) = 0;
per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
return rcu_needs_cpu_quick_check(cpu);
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 36c95b4..4a21ca5 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -57,14 +57,16 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
rdp->passed_quiesc, rdp->passed_quiesc_completed,
rdp->qs_pending);
#ifdef CONFIG_NO_HZ
- seq_printf(m, " dt=%d/%d dn=%d df=%lu",
- rdp->dynticks->dynticks,
+ seq_printf(m, " dt=%d/%d/%d df=%lu",
+ atomic_read(&rdp->dynticks->dynticks),
rdp->dynticks->dynticks_nesting,
- rdp->dynticks->dynticks_nmi,
+ rdp->dynticks->dynticks_nmi_nesting,
rdp->dynticks_fqs);
#endif /* #ifdef CONFIG_NO_HZ */
seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
- seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit);
+ seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit);
+ seq_printf(m, " ci=%lu co=%lu ca=%lu\n",
+ rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
}
#define PRINT_RCU_DATA(name, func, m) \
@@ -113,22 +115,24 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
rdp->qs_pending);
#ifdef CONFIG_NO_HZ
seq_printf(m, ",%d,%d,%d,%lu",
- rdp->dynticks->dynticks,
+ atomic_read(&rdp->dynticks->dynticks),
rdp->dynticks->dynticks_nesting,
- rdp->dynticks->dynticks_nmi,
+ rdp->dynticks->dynticks_nmi_nesting,
rdp->dynticks_fqs);
#endif /* #ifdef CONFIG_NO_HZ */
seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
- seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit);
+ seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit);
+ seq_printf(m, ",%lu,%lu,%lu\n",
+ rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
}
static int show_rcudata_csv(struct seq_file *m, void *unused)
{
seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",");
#ifdef CONFIG_NO_HZ
- seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
+ seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
#endif /* #ifdef CONFIG_NO_HZ */
- seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n");
+ seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n");
#ifdef CONFIG_TREE_PREEMPT_RCU
seq_puts(m, "\"rcu_preempt:\"\n");
PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m);
@@ -162,13 +166,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
gpnum = rsp->gpnum;
seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
- "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n",
+ "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
rsp->completed, gpnum, rsp->signaled,
(long)(rsp->jiffies_force_qs - jiffies),
(int)(jiffies & 0xffff),
rsp->n_force_qs, rsp->n_force_qs_ngp,
rsp->n_force_qs - rsp->n_force_qs_ngp,
- rsp->n_force_qs_lh, rsp->orphan_qlen);
+ rsp->n_force_qs_lh);
for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
if (rnp->level != level) {
seq_puts(m, "\n");
@@ -262,7 +266,7 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
struct rcu_data *rdp;
for_each_possible_cpu(cpu) {
- rdp = rsp->rda[cpu];
+ rdp = per_cpu_ptr(rsp->rda, cpu);
if (rdp->beenonline)
print_one_rcu_pending(m, rdp);
}
@@ -296,7 +300,7 @@ static const struct file_operations rcu_pending_fops = {
static struct dentry *rcudir;
-static int __init rcuclassic_trace_init(void)
+static int __init rcutree_trace_init(void)
{
struct dentry *retval;
@@ -333,14 +337,14 @@ free_out:
return 1;
}
-static void __exit rcuclassic_trace_cleanup(void)
+static void __exit rcutree_trace_cleanup(void)
{
debugfs_remove_recursive(rcudir);
}
-module_init(rcuclassic_trace_init);
-module_exit(rcuclassic_trace_cleanup);
+module_init(rcutree_trace_init);
+module_exit(rcutree_trace_cleanup);
MODULE_AUTHOR("Paul E. McKenney");
MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
diff --git a/kernel/sched.c b/kernel/sched.c
index dc85ceb..d1e8889 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5337,7 +5337,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
idle->se.exec_start = sched_clock();
cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
+ /*
+ * We're having a chicken and egg problem, even though we are
+ * holding rq->lock, the cpu isn't yet set to this cpu so the
+ * lockdep check in task_group() will fail.
+ *
+ * Similar case to sched_fork(). / Alternatively we could
+ * use task_rq_lock() here and obtain the other rq->lock.
+ *
+ * Silence PROVE_RCU
+ */
+ rcu_read_lock();
__set_task_cpu(idle, cpu);
+ rcu_read_unlock();
rq->curr = rq->idle = idle;
#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
@@ -9119,72 +9131,3 @@ struct cgroup_subsys cpuacct_subsys = {
};
#endif /* CONFIG_CGROUP_CPUACCT */
-#ifndef CONFIG_SMP
-
-void synchronize_sched_expedited(void)
-{
- barrier();
-}
-EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-
-#else /* #ifndef CONFIG_SMP */
-
-static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
-
-static int synchronize_sched_expedited_cpu_stop(void *data)
-{
- /*
- * There must be a full memory barrier on each affected CPU
- * between the time that try_stop_cpus() is called and the
- * time that it returns.
- *
- * In the current initial implementation of cpu_stop, the
- * above condition is already met when the control reaches
- * this point and the following smp_mb() is not strictly
- * necessary. Do smp_mb() anyway for documentation and
- * robustness against future implementation changes.
- */
- smp_mb(); /* See above comment block. */
- return 0;
-}
-
-/*
- * Wait for an rcu-sched grace period to elapse, but use "big hammer"
- * approach to force grace period to end quickly. This consumes
- * significant time on all CPUs, and is thus not recommended for
- * any sort of common-case code.
- *
- * Note that it is illegal to call this function while holding any
- * lock that is acquired by a CPU-hotplug notifier. Failing to
- * observe this restriction will result in deadlock.
- */
-void synchronize_sched_expedited(void)
-{
- int snap, trycount = 0;
-
- smp_mb(); /* ensure prior mod happens before capturing snap. */
- snap = atomic_read(&synchronize_sched_expedited_count) + 1;
- get_online_cpus();
- while (try_stop_cpus(cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
- put_online_cpus();
- if (trycount++ < 10)
- udelay(trycount * num_online_cpus());
- else {
- synchronize_sched();
- return;
- }
- if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
- smp_mb(); /* ensure test happens before caller kfree */
- return;
- }
- get_online_cpus();
- }
- atomic_inc(&synchronize_sched_expedited_count);
- smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
- put_online_cpus();
-}
-EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-
-#endif /* #else #ifndef CONFIG_SMP */
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index db3f674..5f996d3 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3751,8 +3751,11 @@ static void task_fork_fair(struct task_struct *p)
update_rq_clock(rq);
- if (unlikely(task_cpu(p) != this_cpu))
+ if (unlikely(task_cpu(p) != this_cpu)) {
+ rcu_read_lock();
__set_task_cpu(p, this_cpu);
+ rcu_read_unlock();
+ }
update_curr(cfs_rq);
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 2980da3..a48caa7 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -31,6 +31,7 @@
#include
#include
#include
+#include
#include
static int init_srcu_struct_fields(struct srcu_struct *sp)
@@ -46,11 +47,9 @@ static int init_srcu_struct_fields(struct srcu_struct *sp)
int __init_srcu_struct(struct srcu_struct *sp, const char *name,
struct lock_class_key *key)
{
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
/* Don't re-initialize a lock while it is held. */
debug_check_no_locks_freed((void *)sp, sizeof(*sp));
lockdep_init_map(&sp->dep_map, name, key, 0);
-#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
return init_srcu_struct_fields(sp);
}
EXPORT_SYMBOL_GPL(__init_srcu_struct);
@@ -205,9 +204,14 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
* all srcu_read_lock() calls using the old counters have completed.
* Their corresponding critical sections might well be still
* executing, but the srcu_read_lock() primitives themselves
- * will have finished executing.
+ * will have finished executing. We initially give readers
+ * an arbitrarily chosen 10 microseconds to get out of their
+ * SRCU read-side critical sections, then loop waiting 1/HZ
+ * seconds per iteration.
*/
+ if (srcu_readers_active_idx(sp, idx))
+ udelay(10);
while (srcu_readers_active_idx(sp, idx))
schedule_timeout_interruptible(1);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 1b4afd2..9886cf5 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -539,6 +539,23 @@ config PROVE_RCU_REPEATEDLY
disabling, allowing multiple RCU-lockdep warnings to be printed
on a single reboot.
+ Say Y to allow multiple RCU-lockdep warnings per boot.
+
+ Say N if you are unsure.
+
+config SPARSE_RCU_POINTER
+ bool "RCU debugging: sparse-based checks for pointer usage"
+ default n
+ help
+ This feature enables the __rcu sparse annotation for
+ RCU-protected pointers. This annotation will cause sparse
+ to flag any non-RCU used of annotated pointers. This can be
+ helpful when debugging RCU usage. Please note that this feature
+ is not intended to enforce code cleanliness; it is instead merely
+ a debugging aid.
+
+ Say Y to make sparse flag questionable use of RCU-protected pointers
+
Say N if you are unsure.
config LOCKDEP
@@ -832,6 +849,30 @@ config RCU_CPU_STALL_DETECTOR
Say Y if you are unsure.
+config RCU_CPU_STALL_TIMEOUT
+ int "RCU CPU stall timeout in seconds"
+ depends on RCU_CPU_STALL_DETECTOR
+ range 3 300
+ default 60
+ help
+ If a given RCU grace period extends more than the specified
+ number of seconds, a CPU stall warning is printed. If the
+ RCU grace period persists, additional CPU stall warnings are
+ printed at more widely spaced intervals.
+
+config RCU_CPU_STALL_DETECTOR_RUNNABLE
+ bool "RCU CPU stall checking starts automatically at boot"
+ depends on RCU_CPU_STALL_DETECTOR
+ default y
+ help
+ If set, start checking for RCU CPU stalls immediately on
+ boot. Otherwise, RCU CPU stall checking must be manually
+ enabled.
+
+ Say Y if you are unsure.
+
+ Say N if you wish to suppress RCU CPU stall checking during boot.
+
config RCU_CPU_STALL_VERBOSE
bool "Print additional per-task information for RCU_CPU_STALL_DETECTOR"
depends on RCU_CPU_STALL_DETECTOR && TREE_PREEMPT_RCU
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index efd16fa..6f412ab 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -49,7 +49,7 @@ struct radix_tree_node {
unsigned int height; /* Height from the bottom */
unsigned int count;
struct rcu_head rcu_head;
- void *slots[RADIX_TREE_MAP_SIZE];
+ void __rcu *slots[RADIX_TREE_MAP_SIZE];
unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
};
diff --git a/net/core/sock.c b/net/core/sock.c
index ef30e9d..7d99e13 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1078,8 +1078,11 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
#ifdef CONFIG_CGROUPS
void sock_update_classid(struct sock *sk)
{
- u32 classid = task_cls_classid(current);
+ u32 classid;
+ rcu_read_lock(); /* doing current task, which cannot vanish. */
+ classid = task_cls_classid(current);
+ rcu_read_unlock();
if (classid && classid != sk->sk_classid)
sk->sk_classid = classid;
}
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 8c8632d..957c924 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -38,7 +38,7 @@ static DEFINE_SPINLOCK(nf_nat_lock);
static struct nf_conntrack_l3proto *l3proto __read_mostly;
#define MAX_IP_NAT_PROTO 256
-static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO]
+static const struct nf_nat_protocol __rcu *nf_nat_protos[MAX_IP_NAT_PROTO]
__read_mostly;
static inline const struct nf_nat_protocol *
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 78b505d..fdaec7d 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -27,7 +27,7 @@
static DEFINE_MUTEX(afinfo_mutex);
-const struct nf_afinfo *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
+const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
EXPORT_SYMBOL(nf_afinfo);
int nf_register_afinfo(const struct nf_afinfo *afinfo)
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index cdcc764..5702de3 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -26,10 +26,10 @@
static DEFINE_MUTEX(nf_ct_ecache_mutex);
-struct nf_ct_event_notifier *nf_conntrack_event_cb __read_mostly;
+struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_event_cb);
-struct nf_exp_event_notifier *nf_expect_event_cb __read_mostly;
+struct nf_exp_event_notifier __rcu *nf_expect_event_cb __read_mostly;
EXPORT_SYMBOL_GPL(nf_expect_event_cb);
/* deliver cached events and clear cache entry - must be called with locally
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
index 8d9e4c9..bd82450 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -16,7 +16,7 @@
#include
#include
-static struct nf_ct_ext_type *nf_ct_ext_types[NF_CT_EXT_NUM];
+static struct nf_ct_ext_type __rcu *nf_ct_ext_types[NF_CT_EXT_NUM];
static DEFINE_MUTEX(nf_ct_ext_type_mutex);
void __nf_ct_ext_destroy(struct nf_conn *ct)
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 5886ba1..ed6d929 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -28,8 +28,8 @@
#include
#include
-static struct nf_conntrack_l4proto **nf_ct_protos[PF_MAX] __read_mostly;
-struct nf_conntrack_l3proto *nf_ct_l3protos[AF_MAX] __read_mostly;
+static struct nf_conntrack_l4proto __rcu **nf_ct_protos[PF_MAX] __read_mostly;
+struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[AF_MAX] __read_mostly;
EXPORT_SYMBOL_GPL(nf_ct_l3protos);
static DEFINE_MUTEX(nf_ct_proto_mutex);
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 7df37fd..b07393e 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -16,7 +16,7 @@
#define NF_LOG_PREFIXLEN 128
#define NFLOGGER_NAME_LEN 64
-static const struct nf_logger *nf_loggers[NFPROTO_NUMPROTO] __read_mostly;
+static const struct nf_logger __rcu *nf_loggers[NFPROTO_NUMPROTO] __read_mostly;
static struct list_head nf_loggers_l[NFPROTO_NUMPROTO] __read_mostly;
static DEFINE_MUTEX(nf_log_mutex);
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 78b3cf9..74aebed 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -18,7 +18,7 @@
* long term mutex. The handler must provide an an outfn() to accept packets
* for queueing and must reinject all packets it receives, no matter what.
*/
-static const struct nf_queue_handler *queue_handler[NFPROTO_NUMPROTO] __read_mostly;
+static const struct nf_queue_handler __rcu *queue_handler[NFPROTO_NUMPROTO] __read_mostly;
static DEFINE_MUTEX(queue_handler_mutex);