1 | /* $NetBSD: kern_physio.c,v 1.93 2015/04/21 10:54:52 pooka Exp $ */ |
2 | |
3 | /*- |
4 | * Copyright (c) 1982, 1986, 1990, 1993 |
5 | * The Regents of the University of California. All rights reserved. |
6 | * (c) UNIX System Laboratories, Inc. |
7 | * All or some portions of this file are derived from material licensed |
8 | * to the University of California by American Telephone and Telegraph |
9 | * Co. or Unix System Laboratories, Inc. and are reproduced herein with |
10 | * the permission of UNIX System Laboratories, Inc. |
11 | * |
12 | * Redistribution and use in source and binary forms, with or without |
13 | * modification, are permitted provided that the following conditions |
14 | * are met: |
15 | * 1. Redistributions of source code must retain the above copyright |
16 | * notice, this list of conditions and the following disclaimer. |
17 | * 2. Redistributions in binary form must reproduce the above copyright |
18 | * notice, this list of conditions and the following disclaimer in the |
19 | * documentation and/or other materials provided with the distribution. |
20 | * 3. Neither the name of the University nor the names of its contributors |
21 | * may be used to endorse or promote products derived from this software |
22 | * without specific prior written permission. |
23 | * |
24 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
25 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
26 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
27 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
28 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
29 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
30 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
31 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
32 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
33 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
34 | * SUCH DAMAGE. |
35 | * |
36 | * @(#)kern_physio.c 8.1 (Berkeley) 6/10/93 |
37 | */ |
38 | |
39 | /*- |
40 | * Copyright (c) 1994 Christopher G. Demetriou |
41 | * |
42 | * Redistribution and use in source and binary forms, with or without |
43 | * modification, are permitted provided that the following conditions |
44 | * are met: |
45 | * 1. Redistributions of source code must retain the above copyright |
46 | * notice, this list of conditions and the following disclaimer. |
47 | * 2. Redistributions in binary form must reproduce the above copyright |
48 | * notice, this list of conditions and the following disclaimer in the |
49 | * documentation and/or other materials provided with the distribution. |
50 | * 3. All advertising materials mentioning features or use of this software |
51 | * must display the following acknowledgement: |
52 | * This product includes software developed by the University of |
53 | * California, Berkeley and its contributors. |
54 | * 4. Neither the name of the University nor the names of its contributors |
55 | * may be used to endorse or promote products derived from this software |
56 | * without specific prior written permission. |
57 | * |
58 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
59 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
60 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
61 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
62 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
63 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
64 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
65 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
66 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
67 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
68 | * SUCH DAMAGE. |
69 | * |
70 | * @(#)kern_physio.c 8.1 (Berkeley) 6/10/93 |
71 | */ |
72 | |
73 | #include <sys/cdefs.h> |
74 | __KERNEL_RCSID(0, "$NetBSD: kern_physio.c,v 1.93 2015/04/21 10:54:52 pooka Exp $" ); |
75 | |
76 | #include <sys/param.h> |
77 | #include <sys/systm.h> |
78 | #include <sys/buf.h> |
79 | #include <sys/proc.h> |
80 | #include <sys/once.h> |
81 | #include <sys/workqueue.h> |
82 | #include <sys/kmem.h> |
83 | |
84 | #include <uvm/uvm_extern.h> |
85 | |
/* Run-once guard; physio() passes it to RUN_ONCE() together with physio_init(). */
ONCE_DECL(physio_initialized);

/* Work queue created by physio_init(); its worker function is physio_done(). */
struct workqueue *physio_workqueue;

/*
 * Tunable from which physio() derives how many transfers it lets run
 * before waiting: it uses (physio_concurrency - 1) as the threshold
 * passed to physio_wait() when no caller-supplied buffer is in use.
 */
int physio_concurrency = 16;

/*
 * Debug tracing.  DPRINTF takes a parenthesized printf argument list,
 * e.g. DPRINTF(("x=%d\n", x)), and expands to nothing unless
 * PHYSIO_DEBUG is defined.
 */
/* #define PHYSIO_DEBUG */
#ifndef PHYSIO_DEBUG
#define	DPRINTF(a)	/* nothing */
#else /* PHYSIO_DEBUG */
#define	DPRINTF(a)	printf a
#endif /* PHYSIO_DEBUG */
97 | |
98 | struct physio_stat { |
99 | int ps_running; |
100 | int ps_error; |
101 | int ps_failed; |
102 | off_t ps_endoffset; |
103 | buf_t *ps_orig_bp; |
104 | kmutex_t ps_lock; |
105 | kcondvar_t ps_cv; |
106 | }; |
107 | |
108 | static void |
109 | physio_done(struct work *wk, void *dummy) |
110 | { |
111 | struct buf *bp = (void *)wk; |
112 | size_t todo = bp->b_bufsize; |
113 | size_t done = bp->b_bcount - bp->b_resid; |
114 | struct physio_stat *ps = bp->b_private; |
115 | bool is_iobuf; |
116 | |
117 | KASSERT(&bp->b_work == wk); |
118 | KASSERT(bp->b_bcount <= todo); |
119 | KASSERT(bp->b_resid <= bp->b_bcount); |
120 | KASSERT((bp->b_flags & B_PHYS) != 0); |
121 | KASSERT(dummy == NULL); |
122 | |
123 | vunmapbuf(bp, todo); |
124 | uvm_vsunlock(bp->b_proc->p_vmspace, bp->b_data, todo); |
125 | |
126 | mutex_enter(&ps->ps_lock); |
127 | is_iobuf = (bp != ps->ps_orig_bp); |
128 | if (__predict_false(done != todo)) { |
129 | off_t endoffset = dbtob(bp->b_blkno) + done; |
130 | |
131 | /* |
132 | * we got an error or hit EOM. |
133 | * |
134 | * we only care about the first one. |
135 | * ie. the one at the lowest offset. |
136 | */ |
137 | |
138 | KASSERT(ps->ps_endoffset != endoffset); |
139 | DPRINTF(("%s: error=%d at %" PRIu64 " - %" PRIu64 |
140 | ", blkno=%" PRIu64 ", bcount=%d, flags=0x%x\n" , |
141 | __func__, bp->b_error, dbtob(bp->b_blkno), endoffset, |
142 | bp->b_blkno, bp->b_bcount, bp->b_flags)); |
143 | |
144 | if (ps->ps_endoffset == -1 || endoffset < ps->ps_endoffset) { |
145 | DPRINTF(("%s: ps=%p, error %d -> %d, endoff %" PRIu64 |
146 | " -> %" PRIu64 "\n" , |
147 | __func__, ps, |
148 | ps->ps_error, bp->b_error, |
149 | ps->ps_endoffset, endoffset)); |
150 | |
151 | ps->ps_endoffset = endoffset; |
152 | ps->ps_error = bp->b_error; |
153 | } |
154 | ps->ps_failed++; |
155 | } else { |
156 | KASSERT(bp->b_error == 0); |
157 | } |
158 | |
159 | ps->ps_running--; |
160 | cv_signal(&ps->ps_cv); |
161 | mutex_exit(&ps->ps_lock); |
162 | |
163 | if (is_iobuf) |
164 | putiobuf(bp); |
165 | } |
166 | |
167 | static void |
168 | physio_biodone(struct buf *bp) |
169 | { |
170 | #if defined(DIAGNOSTIC) |
171 | struct physio_stat *ps = bp->b_private; |
172 | size_t todo = bp->b_bufsize; |
173 | size_t done = bp->b_bcount - bp->b_resid; |
174 | |
175 | KASSERT(ps->ps_running > 0); |
176 | KASSERT(bp->b_bcount <= todo); |
177 | KASSERT(bp->b_resid <= bp->b_bcount); |
178 | if (done == todo) |
179 | KASSERT(bp->b_error == 0); |
180 | #endif /* defined(DIAGNOSTIC) */ |
181 | |
182 | workqueue_enqueue(physio_workqueue, &bp->b_work, NULL); |
183 | } |
184 | |
185 | static void |
186 | physio_wait(struct physio_stat *ps, int n) |
187 | { |
188 | |
189 | KASSERT(mutex_owned(&ps->ps_lock)); |
190 | |
191 | while (ps->ps_running > n) |
192 | cv_wait(&ps->ps_cv, &ps->ps_lock); |
193 | } |
194 | |
195 | static int |
196 | physio_init(void) |
197 | { |
198 | int error; |
199 | |
200 | KASSERT(physio_workqueue == NULL); |
201 | |
202 | error = workqueue_create(&physio_workqueue, "physiod" , |
203 | physio_done, NULL, PRI_BIO, IPL_BIO, WQ_MPSAFE); |
204 | |
205 | return error; |
206 | } |
207 | |
208 | /* |
209 | * Do "physical I/O" on behalf of a user. "Physical I/O" is I/O directly |
210 | * from the raw device to user buffers, and bypasses the buffer cache. |
211 | */ |
212 | int |
213 | physio(void (*strategy)(struct buf *), struct buf *obp, dev_t dev, int flags, |
214 | void (*min_phys)(struct buf *), struct uio *uio) |
215 | { |
216 | struct iovec *iovp; |
217 | struct lwp *l = curlwp; |
218 | struct proc *p = l->l_proc; |
219 | int i, error; |
220 | struct buf *bp = NULL; |
221 | struct physio_stat *ps; |
222 | int concurrency = physio_concurrency - 1; |
223 | |
224 | error = RUN_ONCE(&physio_initialized, physio_init); |
225 | if (__predict_false(error != 0)) { |
226 | return error; |
227 | } |
228 | |
229 | DPRINTF(("%s: called: off=%" PRIu64 ", resid=%zu\n" , |
230 | __func__, uio->uio_offset, uio->uio_resid)); |
231 | |
232 | flags &= B_READ | B_WRITE; |
233 | |
234 | ps = kmem_zalloc(sizeof(*ps), KM_SLEEP); |
235 | /* ps->ps_running = 0; */ |
236 | /* ps->ps_error = 0; */ |
237 | /* ps->ps_failed = 0; */ |
238 | ps->ps_orig_bp = obp; |
239 | ps->ps_endoffset = -1; |
240 | mutex_init(&ps->ps_lock, MUTEX_DEFAULT, IPL_NONE); |
241 | cv_init(&ps->ps_cv, "physio" ); |
242 | |
243 | /* Make sure we have a buffer, creating one if necessary. */ |
244 | if (obp != NULL) { |
245 | mutex_enter(&bufcache_lock); |
246 | /* Mark it busy, so nobody else will use it. */ |
247 | while (bbusy(obp, false, 0, NULL) == EPASSTHROUGH) |
248 | ; |
249 | mutex_exit(&bufcache_lock); |
250 | concurrency = 0; /* see "XXXkludge" comment below */ |
251 | } |
252 | |
253 | for (i = 0; i < uio->uio_iovcnt; i++) { |
254 | bool sync = true; |
255 | |
256 | iovp = &uio->uio_iov[i]; |
257 | while (iovp->iov_len > 0) { |
258 | size_t todo; |
259 | vaddr_t endp; |
260 | |
261 | mutex_enter(&ps->ps_lock); |
262 | if (ps->ps_failed != 0) { |
263 | goto done_locked; |
264 | } |
265 | physio_wait(ps, sync ? 0 : concurrency); |
266 | mutex_exit(&ps->ps_lock); |
267 | if (obp != NULL) { |
268 | /* |
269 | * XXXkludge |
270 | * some drivers use "obp" as an identifier. |
271 | */ |
272 | bp = obp; |
273 | } else { |
274 | bp = getiobuf(NULL, true); |
275 | bp->b_cflags = BC_BUSY; |
276 | } |
277 | bp->b_dev = dev; |
278 | bp->b_proc = p; |
279 | bp->b_private = ps; |
280 | |
281 | /* |
282 | * Mrk the buffer busy for physical I/O. Also set |
283 | * B_PHYS because it's an I/O to user memory, and |
284 | * B_RAW because B_RAW is to be "set by physio for |
285 | * raw transfers". |
286 | */ |
287 | bp->b_oflags = 0; |
288 | bp->b_cflags = BC_BUSY; |
289 | bp->b_flags = flags | B_PHYS | B_RAW; |
290 | bp->b_iodone = physio_biodone; |
291 | |
292 | /* Set up the buffer for a maximum-sized transfer. */ |
293 | bp->b_blkno = btodb(uio->uio_offset); |
294 | if (dbtob(bp->b_blkno) != uio->uio_offset) { |
295 | error = EINVAL; |
296 | goto done; |
297 | } |
298 | bp->b_bcount = MIN(MAXPHYS, iovp->iov_len); |
299 | bp->b_data = iovp->iov_base; |
300 | |
301 | /* |
302 | * Call minphys to bound the transfer size, |
303 | * and remember the amount of data to transfer, |
304 | * for later comparison. |
305 | */ |
306 | (*min_phys)(bp); |
307 | todo = bp->b_bufsize = bp->b_bcount; |
308 | #if defined(DIAGNOSTIC) |
309 | if (todo > MAXPHYS) |
310 | panic("todo(%zu) > MAXPHYS; minphys broken" , |
311 | todo); |
312 | #endif /* defined(DIAGNOSTIC) */ |
313 | |
314 | sync = false; |
315 | endp = (vaddr_t)bp->b_data + todo; |
316 | if (trunc_page(endp) != endp) { |
317 | /* |
318 | * Following requests can overlap. |
319 | * note that uvm_vslock does round_page. |
320 | */ |
321 | sync = true; |
322 | } |
323 | |
324 | /* |
325 | * Lock the part of the user address space involved |
326 | * in the transfer. |
327 | */ |
328 | error = uvm_vslock(p->p_vmspace, bp->b_data, todo, |
329 | (flags & B_READ) ? VM_PROT_WRITE : VM_PROT_READ); |
330 | if (error) { |
331 | goto done; |
332 | } |
333 | |
334 | /* |
335 | * Beware vmapbuf(); if succesful it clobbers |
336 | * b_data and saves it in b_saveaddr. |
337 | * However, vunmapbuf() restores b_data. |
338 | */ |
339 | if ((error = vmapbuf(bp, todo)) != 0) { |
340 | uvm_vsunlock(p->p_vmspace, bp->b_data, todo); |
341 | goto done; |
342 | } |
343 | |
344 | BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); |
345 | |
346 | mutex_enter(&ps->ps_lock); |
347 | ps->ps_running++; |
348 | mutex_exit(&ps->ps_lock); |
349 | |
350 | /* Call strategy to start the transfer. */ |
351 | (*strategy)(bp); |
352 | bp = NULL; |
353 | |
354 | iovp->iov_len -= todo; |
355 | iovp->iov_base = (char *)iovp->iov_base + todo; |
356 | uio->uio_offset += todo; |
357 | uio->uio_resid -= todo; |
358 | } |
359 | } |
360 | |
361 | done: |
362 | mutex_enter(&ps->ps_lock); |
363 | done_locked: |
364 | physio_wait(ps, 0); |
365 | mutex_exit(&ps->ps_lock); |
366 | |
367 | if (ps->ps_failed != 0) { |
368 | off_t delta; |
369 | |
370 | delta = uio->uio_offset - ps->ps_endoffset; |
371 | KASSERT(delta > 0); |
372 | uio->uio_resid += delta; |
373 | /* uio->uio_offset = ps->ps_endoffset; */ |
374 | } else { |
375 | KASSERT(ps->ps_endoffset == -1); |
376 | } |
377 | if (bp != NULL && bp != obp) { |
378 | putiobuf(bp); |
379 | } |
380 | if (error == 0) { |
381 | error = ps->ps_error; |
382 | } |
383 | mutex_destroy(&ps->ps_lock); |
384 | cv_destroy(&ps->ps_cv); |
385 | kmem_free(ps, sizeof(*ps)); |
386 | |
387 | /* |
388 | * Clean up the state of the buffer. Remember if somebody wants |
389 | * it, so we can wake them up below. Also, if we had to steal it, |
390 | * give it back. |
391 | */ |
392 | if (obp != NULL) { |
393 | KASSERT((obp->b_cflags & BC_BUSY) != 0); |
394 | |
395 | /* |
396 | * If another process is waiting for the raw I/O buffer, |
397 | * wake up processes waiting to do physical I/O; |
398 | */ |
399 | mutex_enter(&bufcache_lock); |
400 | obp->b_cflags &= ~(BC_BUSY | BC_WANTED); |
401 | obp->b_flags &= ~(B_PHYS | B_RAW); |
402 | obp->b_iodone = NULL; |
403 | cv_broadcast(&obp->b_busy); |
404 | mutex_exit(&bufcache_lock); |
405 | } |
406 | |
407 | DPRINTF(("%s: done: off=%" PRIu64 ", resid=%zu\n" , |
408 | __func__, uio->uio_offset, uio->uio_resid)); |
409 | |
410 | return error; |
411 | } |
412 | |
413 | /* |
414 | * A minphys() routine is called by physio() to adjust the size of each |
415 | * I/O transfer before the latter is passed to the strategy routine. |
416 | * |
417 | * This minphys() is a default that must be called to enforce limits |
418 | * that are applicable to all devices, because of limitations in the |
419 | * kernel or the hardware platform. |
420 | */ |
421 | void |
422 | minphys(struct buf *bp) |
423 | { |
424 | |
425 | if (bp->b_bcount > MAXPHYS) |
426 | bp->b_bcount = MAXPHYS; |
427 | } |
428 | |