1 | /* $NetBSD: ip_encap.c,v 1.61 2016/07/04 04:40:13 knakahara Exp $ */ |
2 | /* $KAME: ip_encap.c,v 1.73 2001/10/02 08:30:58 itojun Exp $ */ |
3 | |
4 | /* |
5 | * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. |
6 | * All rights reserved. |
7 | * |
8 | * Redistribution and use in source and binary forms, with or without |
9 | * modification, are permitted provided that the following conditions |
10 | * are met: |
11 | * 1. Redistributions of source code must retain the above copyright |
12 | * notice, this list of conditions and the following disclaimer. |
13 | * 2. Redistributions in binary form must reproduce the above copyright |
14 | * notice, this list of conditions and the following disclaimer in the |
15 | * documentation and/or other materials provided with the distribution. |
16 | * 3. Neither the name of the project nor the names of its contributors |
17 | * may be used to endorse or promote products derived from this software |
18 | * without specific prior written permission. |
19 | * |
20 | * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND |
21 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
22 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
23 | * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE |
24 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
25 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
26 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
27 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
28 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
29 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
30 | * SUCH DAMAGE. |
31 | */ |
32 | /* |
33 | * My grandfather said that there's a devil inside tunnelling technology... |
34 | * |
35 | * We have surprisingly many protocols that want packets with IP protocol |
36 | * #4 or #41. Here's a list of protocols that want protocol #41: |
37 | * RFC1933 configured tunnel |
38 | * RFC1933 automatic tunnel |
39 | * RFC2401 IPsec tunnel |
40 | * RFC2473 IPv6 generic packet tunnelling |
41 | * RFC2529 6over4 tunnel |
42 | * RFC3056 6to4 tunnel |
43 | * isatap tunnel |
44 | * mobile-ip6 (uses RFC2473) |
 * Here's a list of protocols that want protocol #4:
46 | * RFC1853 IPv4-in-IPv4 tunnelling |
47 | * RFC2003 IPv4 encapsulation within IPv4 |
48 | * RFC2344 reverse tunnelling for mobile-ip4 |
49 | * RFC2401 IPsec tunnel |
50 | * Well, what can I say. They impose different en/decapsulation mechanism |
51 | * from each other, so they need separate protocol handler. The only one |
52 | * we can easily determine by protocol # is IPsec, which always has |
53 | * AH/ESP/IPComp header right after outer IP header. |
54 | * |
55 | * So, clearly good old protosw does not work for protocol #4 and #41. |
56 | * The code will let you match protocol via src/dst address pair. |
57 | */ |
58 | /* XXX is M_NETADDR correct? */ |
59 | |
60 | /* |
 * With USE_RADIX the code will use a radix table for tunnel lookup, for
 * tunnels registered with encap_attach() with an addr/mask pair.
 * Faster on machines with thousands of tunnel registrations (= interfaces).
64 | * |
65 | * The code assumes that radix table code can handle non-continuous netmask, |
66 | * as it will pass radix table memory region with (src + dst) sockaddr pair. |
67 | */ |
68 | #define USE_RADIX |
69 | |
70 | #include <sys/cdefs.h> |
71 | __KERNEL_RCSID(0, "$NetBSD: ip_encap.c,v 1.61 2016/07/04 04:40:13 knakahara Exp $" ); |
72 | |
73 | #ifdef _KERNEL_OPT |
74 | #include "opt_mrouting.h" |
75 | #include "opt_inet.h" |
76 | #include "opt_net_mpsafe.h" |
77 | #endif |
78 | |
79 | #include <sys/param.h> |
80 | #include <sys/systm.h> |
81 | #include <sys/socket.h> |
82 | #include <sys/sockio.h> |
83 | #include <sys/mbuf.h> |
84 | #include <sys/errno.h> |
85 | #include <sys/queue.h> |
86 | #include <sys/kmem.h> |
87 | #include <sys/mutex.h> |
88 | #include <sys/condvar.h> |
89 | #include <sys/psref.h> |
90 | #include <sys/pslist.h> |
91 | |
92 | #include <net/if.h> |
93 | |
94 | #include <netinet/in.h> |
95 | #include <netinet/in_systm.h> |
96 | #include <netinet/ip.h> |
97 | #include <netinet/ip_var.h> |
98 | #include <netinet/ip_encap.h> |
99 | #ifdef MROUTING |
100 | #include <netinet/ip_mroute.h> |
101 | #endif /* MROUTING */ |
102 | |
103 | #ifdef INET6 |
104 | #include <netinet/ip6.h> |
105 | #include <netinet6/ip6_var.h> |
106 | #include <netinet6/ip6protosw.h> /* for struct ip6ctlparam */ |
107 | #include <netinet6/in6_var.h> |
108 | #include <netinet6/in6_pcb.h> |
109 | #include <netinet/icmp6.h> |
110 | #endif |
111 | |
112 | #include <net/net_osdep.h> |
113 | |
114 | #ifdef NET_MPSAFE |
115 | #define ENCAP_MPSAFE 1 |
116 | #endif |
117 | |
/*
 * Direction of the packet being classified.  The lookup routines use it to
 * decide which of the header's source/destination addresses is "mine" and
 * which is "yours": INBOUND maps the destination to "mine", OUTBOUND maps
 * the source to "mine".
 */
enum direction { INBOUND, OUTBOUND };

/* Per-address-family lookup of the best matching entry. */
#ifdef INET
static struct encaptab *encap4_lookup(struct mbuf *, int, int, enum direction,
    struct psref *);
#endif
#ifdef INET6
static struct encaptab *encap6_lookup(struct mbuf *, int, int, enum direction,
    struct psref *);
#endif
/* Insertion into / removal from the list and (with USE_RADIX) radix tree. */
static int encap_add(struct encaptab *);
static int encap_remove(struct encaptab *);
/* Argument validation shared by the attach routines. */
static int encap_afcheck(int, const struct sockaddr *, const struct sockaddr *);
#ifdef USE_RADIX
static struct radix_node_head *encap_rnh(int);
static int mask_matchlen(const struct sockaddr *);
#else
static int mask_match(const struct encaptab *, const struct sockaddr *,
    const struct sockaddr *);
#endif
/* Attaches the matched entry's arg to the mbuf as a packet tag. */
static void encap_fillarg(struct mbuf *, const struct encaptab *);

/*
 * In encap[46]_lookup(), ep->func can sleep (e.g. rtalloc1) while walking
 * encap_table.  The walk therefore cannot stay inside a single
 * pserialize read section: the lookups take a psref on the element,
 * leave the read section around the ep->func call, and re-enter it
 * afterwards.
 */
static struct {
	struct pslist_head	list;		/* all registered entries */
	pserialize_t		psz;		/* created in encapinit() */
	struct psref_class	*elem_class;	/* psref class of list elements */
} encaptab __cacheline_aligned = {
	.list = PSLIST_INITIALIZER,
};
#define encap_table	encaptab.list

/*
 * State behind encap_lock_enter()/encap_lock_exit(): "busy" records the
 * lwp that currently owns the encap lock (NULL when free); "lock" and "cv"
 * protect and signal changes to it.
 */
static struct {
	kmutex_t	lock;
	kcondvar_t	cv;
	struct lwp	*busy;
} encap_whole __cacheline_aligned;

#ifdef USE_RADIX
struct radix_node_head *encap_head[2];	/* 0 for AF_INET, 1 for AF_INET6 */
/* Set while encap_add()/encap_remove() modify a radix tree; lookups bail out. */
static bool encap_head_updating = false;
#endif
163 | |
164 | /* |
165 | * must be done before other encap interfaces initialization. |
166 | */ |
167 | void |
168 | encapinit(void) |
169 | { |
170 | |
171 | encaptab.psz = pserialize_create(); |
172 | encaptab.elem_class = psref_class_create("encapelem" , IPL_SOFTNET); |
173 | if (encaptab.elem_class == NULL) |
174 | panic("encaptab.elem_class cannot be allocated.\n" ); |
175 | |
176 | mutex_init(&encap_whole.lock, MUTEX_DEFAULT, IPL_NONE); |
177 | cv_init(&encap_whole.cv, "ip_encap cv" ); |
178 | encap_whole.busy = NULL; |
179 | } |
180 | |
/*
 * Per-address-family initialization hook.  Only the first call does any
 * work; later calls return immediately.
 */
void
encap_init(void)
{
	static int initialized = 0;

	if (!initialized) {
		initialized = 1;

		/*
		 * encap_table is deliberately NOT (re)initialized here.
		 * Drivers may call encap_attach() on driver attach, and
		 * encap_init() is called on AF_INET{,6} initialization,
		 * which happens after driver initialization - resetting the
		 * list here could nuke those earlier registrations.  The
		 * list is statically initialized with PSLIST_INITIALIZER
		 * instead.
		 */

#ifdef USE_RADIX
		/*
		 * initialize radix lookup table when the radix subsystem
		 * is inited.
		 */
		rn_delayedinit((void *)&encap_head[0],
		    sizeof(struct sockaddr_pack) << 3);
#ifdef INET6
		rn_delayedinit((void *)&encap_head[1],
		    sizeof(struct sockaddr_pack) << 3);
#endif
#endif
	}
}
212 | |
213 | #ifdef INET |
/*
 * Find the best-matching IPv4 encap entry for the packet in m.
 *
 * m		mbuf whose first m_len bytes hold at least a struct ip
 * off, proto	handed unchanged to each entry's match function; proto is
 *		also compared against the entry's own proto
 * dir		INBOUND: "mine" is ip_dst and "yours" is ip_src;
 *		OUTBOUND: the reverse
 * match_psref	caller-provided psref; on a non-NULL return it holds a
 *		passive reference on the returned entry, which the caller
 *		must drop with psref_release()
 *
 * Returns the entry with the highest priority, or NULL when nothing
 * matched or (USE_RADIX) when a radix tree update is in progress.
 */
static struct encaptab *
encap4_lookup(struct mbuf *m, int off, int proto, enum direction dir,
    struct psref *match_psref)
{
	struct ip *ip;
	struct ip_pack4 pack;
	struct encaptab *ep, *match;
	int prio, matchprio;
	int s;
#ifdef USE_RADIX
	/* NOTE(review): rnh is dereferenced below without a NULL check. */
	struct radix_node_head *rnh = encap_rnh(AF_INET);
	struct radix_node *rn;
#endif

	KASSERT(m->m_len >= sizeof(*ip));

	ip = mtod(m, struct ip *);

	/* Build the (mine, yours) sockaddr pair used as the lookup key. */
	memset(&pack, 0, sizeof(pack));
	pack.p.sp_len = sizeof(pack);
	pack.mine.sin_family = pack.yours.sin_family = AF_INET;
	pack.mine.sin_len = pack.yours.sin_len = sizeof(struct sockaddr_in);
	if (dir == INBOUND) {
		pack.mine.sin_addr = ip->ip_dst;
		pack.yours.sin_addr = ip->ip_src;
	} else {
		pack.mine.sin_addr = ip->ip_src;
		pack.yours.sin_addr = ip->ip_dst;
	}

	match = NULL;
	matchprio = 0;

	s = pserialize_read_enter();
#ifdef USE_RADIX
	if (encap_head_updating) {
		/*
		 * Update in progress. Do nothing.
		 */
		pserialize_read_exit(s);
		return NULL;
	}

	/*
	 * Address/mask entries are looked up in the radix tree first; a
	 * hit becomes the initial candidate, pinned with match_psref.
	 */
	rn = rnh->rnh_matchaddr((void *)&pack, rnh);
	if (rn && (rn->rn_flags & RNF_ROOT) == 0) {
		/* presumably the radix nodes sit at the start of encaptab */
		struct encaptab *encapp = (struct encaptab *)rn;

		psref_acquire(match_psref, &encapp->psref,
		    encaptab.elem_class);
		match = encapp;
		matchprio = mask_matchlen(match->srcmask) +
		    mask_matchlen(match->dstmask);
	}
#endif
	PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) {
		struct psref elem_psref;

		membar_datadep_consumer();

		if (ep->af != AF_INET)
			continue;
		/* a negative ep->proto matches any protocol */
		if (ep->proto >= 0 && ep->proto != proto)
			continue;

		/* Pin ep so it stays valid while we are outside the read section. */
		psref_acquire(&elem_psref, &ep->psref,
		    encaptab.elem_class);
		if (ep->func) {
			pserialize_read_exit(s);
			/* ep->func is sleepable. e.g. rtalloc1 */
			prio = (*ep->func)(m, off, proto, ep->arg);
			s = pserialize_read_enter();
		} else {
#ifdef USE_RADIX
			/* address/mask entries were handled by the radix lookup */
			psref_release(&elem_psref, &ep->psref,
			    encaptab.elem_class);
			continue;
#else
			prio = mask_match(ep, (struct sockaddr *)&pack.mine,
			    (struct sockaddr *)&pack.yours);
#endif
		}

		/*
		 * We prioritize the matches by using bit length of the
		 * matches.  mask_match() and user-supplied matching function
		 * should return the bit length of the matches (for example,
		 * if both src/dst are matched for IPv4, 64 should be returned).
		 * 0 or negative return value means "it did not match".
		 *
		 * The question is, since we have two "mask" portion, we
		 * cannot really define total order between entries.
		 * For example, which of these should be preferred?
		 * mask_match() returns 48 (32 + 16) for both of them.
		 *	src=3ffe::/16, dst=3ffe:501::/32
		 *	src=3ffe:501::/32, dst=3ffe::/16
		 *
		 * We need to loop through all the possible candidates
		 * to get the best match - the search takes O(n) for
		 * n attachments (i.e. interfaces).
		 *
		 * For radix-based lookup, I guess source takes precedence.
		 * See rn_{refines,lexobetter} for the correct answer.
		 */
		if (prio <= 0) {
			psref_release(&elem_psref, &ep->psref,
			    encaptab.elem_class);
			continue;
		}
		if (prio > matchprio) {
			/* release last matched ep */
			if (match != NULL)
				psref_release(match_psref, &match->psref,
				    encaptab.elem_class);

			/* hand the reference on ep over to the caller's psref */
			psref_copy(match_psref, &elem_psref,
			    encaptab.elem_class);
			matchprio = prio;
			match = ep;
		}
		KASSERTMSG((match == NULL) || psref_held(&match->psref,
			encaptab.elem_class),
		    "current match = %p, but not hold its psref", match);

		/* the per-iteration reference is always dropped here */
		psref_release(&elem_psref, &ep->psref,
		    encaptab.elem_class);
	}
	pserialize_read_exit(s);

	return match;
}
344 | |
345 | void |
346 | encap4_input(struct mbuf *m, ...) |
347 | { |
348 | int off, proto; |
349 | va_list ap; |
350 | const struct encapsw *esw; |
351 | struct encaptab *match; |
352 | struct psref match_psref; |
353 | |
354 | va_start(ap, m); |
355 | off = va_arg(ap, int); |
356 | proto = va_arg(ap, int); |
357 | va_end(ap); |
358 | |
359 | match = encap4_lookup(m, off, proto, INBOUND, &match_psref); |
360 | if (match) { |
361 | /* found a match, "match" has the best one */ |
362 | esw = match->esw; |
363 | if (esw && esw->encapsw4.pr_input) { |
364 | encap_fillarg(m, match); |
365 | (*esw->encapsw4.pr_input)(m, off, proto); |
366 | psref_release(&match_psref, &match->psref, |
367 | encaptab.elem_class); |
368 | } else { |
369 | psref_release(&match_psref, &match->psref, |
370 | encaptab.elem_class); |
371 | m_freem(m); |
372 | } |
373 | return; |
374 | } |
375 | |
376 | /* last resort: inject to raw socket */ |
377 | rip_input(m, off, proto); |
378 | } |
379 | #endif |
380 | |
381 | #ifdef INET6 |
/*
 * Find the best-matching IPv6 encap entry for the packet in m.
 *
 * m		mbuf whose first m_len bytes hold at least a struct ip6_hdr
 * off, proto	handed unchanged to each entry's match function; proto is
 *		also compared against the entry's own proto
 * dir		INBOUND: "mine" is ip6_dst and "yours" is ip6_src;
 *		OUTBOUND: the reverse
 * match_psref	caller-provided psref; on a non-NULL return it holds a
 *		passive reference on the returned entry, which the caller
 *		must drop with psref_release()
 *
 * Returns the entry with the highest priority, or NULL when nothing
 * matched or (USE_RADIX) when a radix tree update is in progress.
 */
static struct encaptab *
encap6_lookup(struct mbuf *m, int off, int proto, enum direction dir,
    struct psref *match_psref)
{
	struct ip6_hdr *ip6;
	struct ip_pack6 pack;
	int prio, matchprio;
	int s;
	struct encaptab *ep, *match;
#ifdef USE_RADIX
	/* NOTE(review): rnh is dereferenced below without a NULL check. */
	struct radix_node_head *rnh = encap_rnh(AF_INET6);
	struct radix_node *rn;
#endif

	KASSERT(m->m_len >= sizeof(*ip6));

	ip6 = mtod(m, struct ip6_hdr *);

	/* Build the (mine, yours) sockaddr pair used as the lookup key. */
	memset(&pack, 0, sizeof(pack));
	pack.p.sp_len = sizeof(pack);
	pack.mine.sin6_family = pack.yours.sin6_family = AF_INET6;
	pack.mine.sin6_len = pack.yours.sin6_len = sizeof(struct sockaddr_in6);
	if (dir == INBOUND) {
		pack.mine.sin6_addr = ip6->ip6_dst;
		pack.yours.sin6_addr = ip6->ip6_src;
	} else {
		pack.mine.sin6_addr = ip6->ip6_src;
		pack.yours.sin6_addr = ip6->ip6_dst;
	}

	match = NULL;
	matchprio = 0;

	s = pserialize_read_enter();
#ifdef USE_RADIX
	if (encap_head_updating) {
		/*
		 * Update in progress. Do nothing.
		 */
		pserialize_read_exit(s);
		return NULL;
	}

	/*
	 * Address/mask entries are looked up in the radix tree first; a
	 * hit becomes the initial candidate, pinned with match_psref.
	 */
	rn = rnh->rnh_matchaddr((void *)&pack, rnh);
	if (rn && (rn->rn_flags & RNF_ROOT) == 0) {
		/* presumably the radix nodes sit at the start of encaptab */
		struct encaptab *encapp = (struct encaptab *)rn;

		psref_acquire(match_psref, &encapp->psref,
		    encaptab.elem_class);
		match = encapp;
		matchprio = mask_matchlen(match->srcmask) +
		    mask_matchlen(match->dstmask);
	}
#endif
	PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) {
		struct psref elem_psref;

		membar_datadep_consumer();

		if (ep->af != AF_INET6)
			continue;
		/* a negative ep->proto matches any protocol */
		if (ep->proto >= 0 && ep->proto != proto)
			continue;

		/* Pin ep so it stays valid while we are outside the read section. */
		psref_acquire(&elem_psref, &ep->psref,
		    encaptab.elem_class);

		if (ep->func) {
			pserialize_read_exit(s);
			/* ep->func is sleepable. e.g. rtalloc1 */
			prio = (*ep->func)(m, off, proto, ep->arg);
			s = pserialize_read_enter();
		} else {
#ifdef USE_RADIX
			/* address/mask entries were handled by the radix lookup */
			psref_release(&elem_psref, &ep->psref,
			    encaptab.elem_class);
			continue;
#else
			prio = mask_match(ep, (struct sockaddr *)&pack.mine,
			    (struct sockaddr *)&pack.yours);
#endif
		}

		/* see encap4_lookup() for issues here */
		if (prio <= 0) {
			psref_release(&elem_psref, &ep->psref,
			    encaptab.elem_class);
			continue;
		}
		if (prio > matchprio) {
			/* release last matched ep */
			if (match != NULL)
				psref_release(match_psref, &match->psref,
				    encaptab.elem_class);

			/* hand the reference on ep over to the caller's psref */
			psref_copy(match_psref, &elem_psref,
			    encaptab.elem_class);
			matchprio = prio;
			match = ep;
		}
		KASSERTMSG((match == NULL) || psref_held(&match->psref,
			encaptab.elem_class),
		    "current match = %p, but not hold its psref", match);

		/* the per-iteration reference is always dropped here */
		psref_release(&elem_psref, &ep->psref,
		    encaptab.elem_class);
	}
	pserialize_read_exit(s);

	return match;
}
493 | |
494 | int |
495 | encap6_input(struct mbuf **mp, int *offp, int proto) |
496 | { |
497 | struct mbuf *m = *mp; |
498 | const struct encapsw *esw; |
499 | struct encaptab *match; |
500 | struct psref match_psref; |
501 | |
502 | match = encap6_lookup(m, *offp, proto, INBOUND, &match_psref); |
503 | |
504 | if (match) { |
505 | /* found a match */ |
506 | esw = match->esw; |
507 | if (esw && esw->encapsw6.pr_input) { |
508 | int ret; |
509 | encap_fillarg(m, match); |
510 | ret = (*esw->encapsw6.pr_input)(mp, offp, proto); |
511 | psref_release(&match_psref, &match->psref, |
512 | encaptab.elem_class); |
513 | return ret; |
514 | } else { |
515 | psref_release(&match_psref, &match->psref, |
516 | encaptab.elem_class); |
517 | m_freem(m); |
518 | return IPPROTO_DONE; |
519 | } |
520 | } |
521 | |
522 | /* last resort: inject to raw socket */ |
523 | return rip6_input(mp, offp, proto); |
524 | } |
525 | #endif |
526 | |
/*
 * Insert ep into the lookup structures.  The caller must hold the encap
 * lock (asserted via encap_lock_held()).
 *
 * With USE_RADIX, an entry without a match function is first added to the
 * radix tree of its address family (when that tree exists); every entry is
 * then put at the head of encap_table.
 *
 * Returns 0 on success, or EEXIST when rnh_addaddr() fails, in which case
 * the entry is not put on encap_table either.
 *
 * XXX
 * The encaptab list and the rnh radix tree must be manipulated atomically.
 */
static int
encap_add(struct encaptab *ep)
{
#ifdef USE_RADIX
	struct radix_node_head *rnh = encap_rnh(ep->af);
#endif

	KASSERT(encap_lock_held());

#ifdef USE_RADIX
	if (!ep->func && rnh) {
		/* Disable access to the radix tree for reader. */
		encap_head_updating = true;
		/* Wait for all readers to drain. */
		pserialize_perform(encaptab.psz);

		if (!rnh->rnh_addaddr((void *)ep->addrpack,
		    (void *)ep->maskpack, rnh, ep->nodes)) {
			/* re-open the tree to readers before reporting failure */
			encap_head_updating = false;
			return EEXIST;
		}

		/*
		 * The ep added to the radix tree must be skipped while
		 * encap[46]_lookup walks encaptab list. In other words,
		 * encap_add() does not need to care whether the ep has
		 * been added encaptab list or not yet.
		 * So, we can re-enable access to the radix tree for now.
		 */
		encap_head_updating = false;
	}
#endif
	PSLIST_WRITER_INSERT_HEAD(&encap_table, ep, chain);

	return 0;
}
567 | |
/*
 * Take ep out of the lookup structures.  The caller must hold the encap
 * lock (asserted via encap_lock_held()).
 *
 * With USE_RADIX, an entry without a match function is first deleted from
 * the radix tree of its address family (when that tree exists).  The entry
 * is then unlinked from encap_table in every case.
 *
 * Returns 0 on success, or ESRCH when rnh_deladdr() fails.
 * NOTE(review): on ESRCH the entry has still been unlinked from
 * encap_table by the time this returns.
 *
 * XXX
 * The encaptab list and the rnh radix tree must be manipulated atomically.
 */
static int
encap_remove(struct encaptab *ep)
{
#ifdef USE_RADIX
	struct radix_node_head *rnh = encap_rnh(ep->af);
#endif
	int error = 0;

	KASSERT(encap_lock_held());

#ifdef USE_RADIX
	if (!ep->func && rnh) {
		/* Disable access to the radix tree for reader. */
		encap_head_updating = true;
		/* Wait for all readers to drain. */
		pserialize_perform(encaptab.psz);

		if (!rnh->rnh_deladdr((void *)ep->addrpack,
		    (void *)ep->maskpack, rnh))
			error = ESRCH;

		/*
		 * An ep that lives in the radix tree is skipped while
		 * encap[46]_lookup walks the encaptab list.  In other words,
		 * encap_remove() does not need to care whether the ep has
		 * already been taken off the encaptab list or not.
		 * So, we can re-enable access to the radix tree for now.
		 */
		encap_head_updating = false;
	}
#endif
	PSLIST_WRITER_REMOVE(ep, chain);

	return error;
}
607 | |
608 | static int |
609 | encap_afcheck(int af, const struct sockaddr *sp, const struct sockaddr *dp) |
610 | { |
611 | if (sp && dp) { |
612 | if (sp->sa_len != dp->sa_len) |
613 | return EINVAL; |
614 | if (af != sp->sa_family || af != dp->sa_family) |
615 | return EINVAL; |
616 | } else if (!sp && !dp) |
617 | ; |
618 | else |
619 | return EINVAL; |
620 | |
621 | switch (af) { |
622 | case AF_INET: |
623 | if (sp && sp->sa_len != sizeof(struct sockaddr_in)) |
624 | return EINVAL; |
625 | if (dp && dp->sa_len != sizeof(struct sockaddr_in)) |
626 | return EINVAL; |
627 | break; |
628 | #ifdef INET6 |
629 | case AF_INET6: |
630 | if (sp && sp->sa_len != sizeof(struct sockaddr_in6)) |
631 | return EINVAL; |
632 | if (dp && dp->sa_len != sizeof(struct sockaddr_in6)) |
633 | return EINVAL; |
634 | break; |
635 | #endif |
636 | default: |
637 | return EAFNOSUPPORT; |
638 | } |
639 | |
640 | return 0; |
641 | } |
642 | |
643 | /* |
644 | * sp (src ptr) is always my side, and dp (dst ptr) is always remote side. |
645 | * length of mask (sm and dm) is assumed to be same as sp/dp. |
646 | * Return value will be necessary as input (cookie) for encap_detach(). |
647 | */ |
648 | const struct encaptab * |
649 | encap_attach(int af, int proto, |
650 | const struct sockaddr *sp, const struct sockaddr *sm, |
651 | const struct sockaddr *dp, const struct sockaddr *dm, |
652 | const struct encapsw *esw, void *arg) |
653 | { |
654 | struct encaptab *ep; |
655 | int error; |
656 | int pss; |
657 | size_t l; |
658 | struct ip_pack4 *pack4; |
659 | #ifdef INET6 |
660 | struct ip_pack6 *pack6; |
661 | #endif |
662 | #ifndef ENCAP_MPSAFE |
663 | int s; |
664 | |
665 | s = splsoftnet(); |
666 | #endif |
667 | /* sanity check on args */ |
668 | error = encap_afcheck(af, sp, dp); |
669 | if (error) |
670 | goto fail; |
671 | |
672 | /* check if anyone have already attached with exactly same config */ |
673 | pss = pserialize_read_enter(); |
674 | PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) { |
675 | membar_datadep_consumer(); |
676 | |
677 | if (ep->af != af) |
678 | continue; |
679 | if (ep->proto != proto) |
680 | continue; |
681 | if (ep->func) |
682 | continue; |
683 | |
684 | KASSERT(ep->src != NULL); |
685 | KASSERT(ep->dst != NULL); |
686 | KASSERT(ep->srcmask != NULL); |
687 | KASSERT(ep->dstmask != NULL); |
688 | |
689 | if (ep->src->sa_len != sp->sa_len || |
690 | memcmp(ep->src, sp, sp->sa_len) != 0 || |
691 | memcmp(ep->srcmask, sm, sp->sa_len) != 0) |
692 | continue; |
693 | if (ep->dst->sa_len != dp->sa_len || |
694 | memcmp(ep->dst, dp, dp->sa_len) != 0 || |
695 | memcmp(ep->dstmask, dm, dp->sa_len) != 0) |
696 | continue; |
697 | |
698 | error = EEXIST; |
699 | pserialize_read_exit(pss); |
700 | goto fail; |
701 | } |
702 | pserialize_read_exit(pss); |
703 | |
704 | switch (af) { |
705 | case AF_INET: |
706 | l = sizeof(*pack4); |
707 | break; |
708 | #ifdef INET6 |
709 | case AF_INET6: |
710 | l = sizeof(*pack6); |
711 | break; |
712 | #endif |
713 | default: |
714 | goto fail; |
715 | } |
716 | |
717 | /* M_NETADDR ok? */ |
718 | ep = kmem_zalloc(sizeof(*ep), KM_NOSLEEP); |
719 | if (ep == NULL) { |
720 | error = ENOBUFS; |
721 | goto fail; |
722 | } |
723 | ep->addrpack = kmem_zalloc(l, KM_NOSLEEP); |
724 | if (ep->addrpack == NULL) { |
725 | error = ENOBUFS; |
726 | goto gc; |
727 | } |
728 | ep->maskpack = kmem_zalloc(l, KM_NOSLEEP); |
729 | if (ep->maskpack == NULL) { |
730 | error = ENOBUFS; |
731 | goto gc; |
732 | } |
733 | |
734 | ep->af = af; |
735 | ep->proto = proto; |
736 | ep->addrpack->sa_len = l & 0xff; |
737 | ep->maskpack->sa_len = l & 0xff; |
738 | switch (af) { |
739 | case AF_INET: |
740 | pack4 = (struct ip_pack4 *)ep->addrpack; |
741 | ep->src = (struct sockaddr *)&pack4->mine; |
742 | ep->dst = (struct sockaddr *)&pack4->yours; |
743 | pack4 = (struct ip_pack4 *)ep->maskpack; |
744 | ep->srcmask = (struct sockaddr *)&pack4->mine; |
745 | ep->dstmask = (struct sockaddr *)&pack4->yours; |
746 | break; |
747 | #ifdef INET6 |
748 | case AF_INET6: |
749 | pack6 = (struct ip_pack6 *)ep->addrpack; |
750 | ep->src = (struct sockaddr *)&pack6->mine; |
751 | ep->dst = (struct sockaddr *)&pack6->yours; |
752 | pack6 = (struct ip_pack6 *)ep->maskpack; |
753 | ep->srcmask = (struct sockaddr *)&pack6->mine; |
754 | ep->dstmask = (struct sockaddr *)&pack6->yours; |
755 | break; |
756 | #endif |
757 | } |
758 | |
759 | memcpy(ep->src, sp, sp->sa_len); |
760 | memcpy(ep->srcmask, sm, sp->sa_len); |
761 | memcpy(ep->dst, dp, dp->sa_len); |
762 | memcpy(ep->dstmask, dm, dp->sa_len); |
763 | ep->esw = esw; |
764 | ep->arg = arg; |
765 | psref_target_init(&ep->psref, encaptab.elem_class); |
766 | |
767 | error = encap_add(ep); |
768 | if (error) |
769 | goto gc; |
770 | |
771 | error = 0; |
772 | #ifndef ENCAP_MPSAFE |
773 | splx(s); |
774 | #endif |
775 | return ep; |
776 | |
777 | gc: |
778 | if (ep->addrpack) |
779 | kmem_free(ep->addrpack, l); |
780 | if (ep->maskpack) |
781 | kmem_free(ep->maskpack, l); |
782 | if (ep) |
783 | kmem_free(ep, sizeof(*ep)); |
784 | fail: |
785 | #ifndef ENCAP_MPSAFE |
786 | splx(s); |
787 | #endif |
788 | return NULL; |
789 | } |
790 | |
791 | const struct encaptab * |
792 | encap_attach_func(int af, int proto, |
793 | int (*func)(struct mbuf *, int, int, void *), |
794 | const struct encapsw *esw, void *arg) |
795 | { |
796 | struct encaptab *ep; |
797 | int error; |
798 | #ifndef ENCAP_MPSAFE |
799 | int s; |
800 | |
801 | s = splsoftnet(); |
802 | #endif |
803 | /* sanity check on args */ |
804 | if (!func) { |
805 | error = EINVAL; |
806 | goto fail; |
807 | } |
808 | |
809 | error = encap_afcheck(af, NULL, NULL); |
810 | if (error) |
811 | goto fail; |
812 | |
813 | ep = kmem_alloc(sizeof(*ep), KM_NOSLEEP); /*XXX*/ |
814 | if (ep == NULL) { |
815 | error = ENOBUFS; |
816 | goto fail; |
817 | } |
818 | memset(ep, 0, sizeof(*ep)); |
819 | |
820 | ep->af = af; |
821 | ep->proto = proto; |
822 | ep->func = func; |
823 | ep->esw = esw; |
824 | ep->arg = arg; |
825 | psref_target_init(&ep->psref, encaptab.elem_class); |
826 | |
827 | error = encap_add(ep); |
828 | if (error) |
829 | goto fail; |
830 | |
831 | error = 0; |
832 | #ifndef ENCAP_MPSAFE |
833 | splx(s); |
834 | #endif |
835 | return ep; |
836 | |
837 | fail: |
838 | #ifndef ENCAP_MPSAFE |
839 | splx(s); |
840 | #endif |
841 | return NULL; |
842 | } |
843 | |
844 | /* XXX encap4_ctlinput() is necessary if we set DF=1 on outer IPv4 header */ |
845 | |
846 | #ifdef INET6 |
847 | void * |
848 | encap6_ctlinput(int cmd, const struct sockaddr *sa, void *d0) |
849 | { |
850 | void *d = d0; |
851 | struct ip6_hdr *ip6; |
852 | struct mbuf *m; |
853 | int off; |
854 | struct ip6ctlparam *ip6cp = NULL; |
855 | int nxt; |
856 | int s; |
857 | struct encaptab *ep; |
858 | const struct encapsw *esw; |
859 | |
860 | if (sa->sa_family != AF_INET6 || |
861 | sa->sa_len != sizeof(struct sockaddr_in6)) |
862 | return NULL; |
863 | |
864 | if ((unsigned)cmd >= PRC_NCMDS) |
865 | return NULL; |
866 | if (cmd == PRC_HOSTDEAD) |
867 | d = NULL; |
868 | else if (cmd == PRC_MSGSIZE) |
869 | ; /* special code is present, see below */ |
870 | else if (inet6ctlerrmap[cmd] == 0) |
871 | return NULL; |
872 | |
873 | /* if the parameter is from icmp6, decode it. */ |
874 | if (d != NULL) { |
875 | ip6cp = (struct ip6ctlparam *)d; |
876 | m = ip6cp->ip6c_m; |
877 | ip6 = ip6cp->ip6c_ip6; |
878 | off = ip6cp->ip6c_off; |
879 | nxt = ip6cp->ip6c_nxt; |
880 | |
881 | if (ip6 && cmd == PRC_MSGSIZE) { |
882 | int valid = 0; |
883 | struct encaptab *match; |
884 | struct psref elem_psref; |
885 | |
886 | /* |
887 | * Check to see if we have a valid encap configuration. |
888 | */ |
889 | match = encap6_lookup(m, off, nxt, OUTBOUND, |
890 | &elem_psref); |
891 | if (match) |
892 | valid++; |
893 | psref_release(&elem_psref, &match->psref, |
894 | encaptab.elem_class); |
895 | |
896 | /* |
897 | * Depending on the value of "valid" and routing table |
898 | * size (mtudisc_{hi,lo}wat), we will: |
899 | * - recalcurate the new MTU and create the |
900 | * corresponding routing entry, or |
901 | * - ignore the MTU change notification. |
902 | */ |
903 | icmp6_mtudisc_update((struct ip6ctlparam *)d, valid); |
904 | } |
905 | } else { |
906 | m = NULL; |
907 | ip6 = NULL; |
908 | nxt = -1; |
909 | } |
910 | |
911 | /* inform all listeners */ |
912 | |
913 | s = pserialize_read_enter(); |
914 | PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) { |
915 | struct psref elem_psref; |
916 | |
917 | membar_datadep_consumer(); |
918 | |
919 | if (ep->af != AF_INET6) |
920 | continue; |
921 | if (ep->proto >= 0 && ep->proto != nxt) |
922 | continue; |
923 | |
924 | /* should optimize by looking at address pairs */ |
925 | |
926 | /* XXX need to pass ep->arg or ep itself to listeners */ |
927 | psref_acquire(&elem_psref, &ep->psref, |
928 | encaptab.elem_class); |
929 | esw = ep->esw; |
930 | if (esw && esw->encapsw6.pr_ctlinput) { |
931 | pserialize_read_exit(s); |
932 | /* pr_ctlinput is sleepable. e.g. rtcache_free */ |
933 | (*esw->encapsw6.pr_ctlinput)(cmd, sa, d, ep->arg); |
934 | s = pserialize_read_enter(); |
935 | } |
936 | psref_release(&elem_psref, &ep->psref, |
937 | encaptab.elem_class); |
938 | } |
939 | pserialize_read_exit(s); |
940 | |
941 | rip6_ctlinput(cmd, sa, d0); |
942 | return NULL; |
943 | } |
944 | #endif |
945 | |
946 | int |
947 | encap_detach(const struct encaptab *cookie) |
948 | { |
949 | const struct encaptab *ep = cookie; |
950 | struct encaptab *p; |
951 | int error; |
952 | |
953 | KASSERT(encap_lock_held()); |
954 | |
955 | PSLIST_WRITER_FOREACH(p, &encap_table, struct encaptab, chain) { |
956 | membar_datadep_consumer(); |
957 | |
958 | if (p == ep) { |
959 | error = encap_remove(p); |
960 | if (error) |
961 | return error; |
962 | else |
963 | break; |
964 | } |
965 | } |
966 | if (p == NULL) |
967 | return ENOENT; |
968 | |
969 | #ifndef USE_RADIX |
970 | /* |
971 | * pserialize_perform(encaptab.psz) is already done in encap_remove(). |
972 | */ |
973 | pserialize_perform(encaptab.psz); |
974 | #endif |
975 | psref_target_destroy(&p->psref, |
976 | encaptab.elem_class); |
977 | if (!ep->func) { |
978 | kmem_free(p->addrpack, ep->addrpack->sa_len); |
979 | kmem_free(p->maskpack, ep->maskpack->sa_len); |
980 | } |
981 | kmem_free(p, sizeof(*p)); |
982 | |
983 | return 0; |
984 | } |
985 | |
986 | #ifdef USE_RADIX |
987 | static struct radix_node_head * |
988 | encap_rnh(int af) |
989 | { |
990 | |
991 | switch (af) { |
992 | case AF_INET: |
993 | return encap_head[0]; |
994 | #ifdef INET6 |
995 | case AF_INET6: |
996 | return encap_head[1]; |
997 | #endif |
998 | default: |
999 | return NULL; |
1000 | } |
1001 | } |
1002 | |
1003 | static int |
1004 | mask_matchlen(const struct sockaddr *sa) |
1005 | { |
1006 | const char *p, *ep; |
1007 | int l; |
1008 | |
1009 | p = (const char *)sa; |
1010 | ep = p + sa->sa_len; |
1011 | p += 2; /* sa_len + sa_family */ |
1012 | |
1013 | l = 0; |
1014 | while (p < ep) { |
1015 | l += (*p ? 8 : 0); /* estimate */ |
1016 | p++; |
1017 | } |
1018 | return l; |
1019 | } |
1020 | #endif |
1021 | |
1022 | #ifndef USE_RADIX |
1023 | static int |
1024 | mask_match(const struct encaptab *ep, |
1025 | const struct sockaddr *sp, |
1026 | const struct sockaddr *dp) |
1027 | { |
1028 | struct sockaddr_storage s; |
1029 | struct sockaddr_storage d; |
1030 | int i; |
1031 | const u_int8_t *p, *q; |
1032 | u_int8_t *r; |
1033 | int matchlen; |
1034 | |
1035 | KASSERTMSG(ep->func == NULL, "wrong encaptab passed to mask_match" ); |
1036 | |
1037 | if (sp->sa_len > sizeof(s) || dp->sa_len > sizeof(d)) |
1038 | return 0; |
1039 | if (sp->sa_family != ep->af || dp->sa_family != ep->af) |
1040 | return 0; |
1041 | if (sp->sa_len != ep->src->sa_len || dp->sa_len != ep->dst->sa_len) |
1042 | return 0; |
1043 | |
1044 | matchlen = 0; |
1045 | |
1046 | p = (const u_int8_t *)sp; |
1047 | q = (const u_int8_t *)ep->srcmask; |
1048 | r = (u_int8_t *)&s; |
1049 | for (i = 0 ; i < sp->sa_len; i++) { |
1050 | r[i] = p[i] & q[i]; |
1051 | /* XXX estimate */ |
1052 | matchlen += (q[i] ? 8 : 0); |
1053 | } |
1054 | |
1055 | p = (const u_int8_t *)dp; |
1056 | q = (const u_int8_t *)ep->dstmask; |
1057 | r = (u_int8_t *)&d; |
1058 | for (i = 0 ; i < dp->sa_len; i++) { |
1059 | r[i] = p[i] & q[i]; |
1060 | /* XXX rough estimate */ |
1061 | matchlen += (q[i] ? 8 : 0); |
1062 | } |
1063 | |
1064 | /* need to overwrite len/family portion as we don't compare them */ |
1065 | s.ss_len = sp->sa_len; |
1066 | s.ss_family = sp->sa_family; |
1067 | d.ss_len = dp->sa_len; |
1068 | d.ss_family = dp->sa_family; |
1069 | |
1070 | if (memcmp(&s, ep->src, ep->src->sa_len) == 0 && |
1071 | memcmp(&d, ep->dst, ep->dst->sa_len) == 0) { |
1072 | return matchlen; |
1073 | } else |
1074 | return 0; |
1075 | } |
1076 | #endif |
1077 | |
1078 | static void |
1079 | encap_fillarg(struct mbuf *m, const struct encaptab *ep) |
1080 | { |
1081 | struct m_tag *mtag; |
1082 | |
1083 | mtag = m_tag_get(PACKET_TAG_ENCAP, sizeof(void *), M_NOWAIT); |
1084 | if (mtag) { |
1085 | *(void **)(mtag + 1) = ep->arg; |
1086 | m_tag_prepend(m, mtag); |
1087 | } |
1088 | } |
1089 | |
1090 | void * |
1091 | encap_getarg(struct mbuf *m) |
1092 | { |
1093 | void *p; |
1094 | struct m_tag *mtag; |
1095 | |
1096 | p = NULL; |
1097 | mtag = m_tag_find(m, PACKET_TAG_ENCAP, NULL); |
1098 | if (mtag != NULL) { |
1099 | p = *(void **)(mtag + 1); |
1100 | m_tag_delete(m, mtag); |
1101 | } |
1102 | return p; |
1103 | } |
1104 | |
1105 | int |
1106 | encap_lock_enter(void) |
1107 | { |
1108 | int error; |
1109 | |
1110 | mutex_enter(&encap_whole.lock); |
1111 | while (encap_whole.busy != NULL) { |
1112 | error = cv_wait_sig(&encap_whole.cv, &encap_whole.lock); |
1113 | if (error) { |
1114 | mutex_exit(&encap_whole.lock); |
1115 | return error; |
1116 | } |
1117 | } |
1118 | KASSERT(encap_whole.busy == NULL); |
1119 | encap_whole.busy = curlwp; |
1120 | mutex_exit(&encap_whole.lock); |
1121 | |
1122 | return 0; |
1123 | } |
1124 | |
1125 | void |
1126 | encap_lock_exit(void) |
1127 | { |
1128 | |
1129 | mutex_enter(&encap_whole.lock); |
1130 | KASSERT(encap_whole.busy == curlwp); |
1131 | encap_whole.busy = NULL; |
1132 | cv_broadcast(&encap_whole.cv); |
1133 | mutex_exit(&encap_whole.lock); |
1134 | } |
1135 | |
1136 | bool |
1137 | encap_lock_held(void) |
1138 | { |
1139 | |
1140 | return (encap_whole.busy == curlwp); |
1141 | } |
1142 | |