unicode.h source code [src/src/sys/fs/unicode.h]

1	/*	$NetBSD: unicode.h,v 1.7 2014/04/06 19:25:22 jakllsch Exp $	*/
2
3	/*-
4	* Copyright (c) 2001, 2004 The NetBSD Foundation, Inc.
5	* All rights reserved.
6	*
7	* Redistribution and use in source and binary forms, with or without
8	* modification, are permitted provided that the following conditions
9	* are met:
10	* 1. Redistributions of source code must retain the above copyright
11	* notice, this list of conditions and the following disclaimer.
12	* 2. Redistributions in binary form must reproduce the above copyright
13	* notice, this list of conditions and the following disclaimer in the
14	* documentation and/or other materials provided with the distribution.
15	*
16	* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18	* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20	* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26	* POSSIBILITY OF SUCH DAMAGE.
27	*/
28
29	/*-
30	* Copyright (c) 1993
31	* The Regents of the University of California. All rights reserved.
32	*
33	* This code is derived from software contributed to Berkeley by
34	* Paul Borman at Krystal Technologies.
35	*
36	* Redistribution and use in source and binary forms, with or without
37	* modification, are permitted provided that the following conditions
38	* are met:
39	* 1. Redistributions of source code must retain the above copyright
40	* notice, this list of conditions and the following disclaimer.
41	* 2. Redistributions in binary form must reproduce the above copyright
42	* notice, this list of conditions and the following disclaimer in the
43	* documentation and/or other materials provided with the distribution.
44	* 3. All advertising materials mentioning features or use of this software
45	* must display the following acknowledgement:
46	* This product includes software developed by the University of
47	* California, Berkeley and its contributors.
48	* 4. Neither the name of the University nor the names of its contributors
49	* may be used to endorse or promote products derived from this software
50	* without specific prior written permission.
51	*
52	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62	* SUCH DAMAGE.
63	*/
64
65	/*
66	* Routines for handling Unicode encoded in UTF-8 form, code derived from
67	* src/lib/libc/locale/utf2.c.
68	*/
69	static u_int16_t wget_utf8(const char *, size_t ) __unused;
70	static int wput_utf8(char *, size_t, u_int16_t) __unused;
71
72	/*
73	* Read one UTF8-encoded character off the string, shift the string pointer
74	* and return the character.
75	*/
76	static u_int16_t
77	wget_utf8(const char *str, size_t sz)
78	{
79	size_t c;
80	u_int16_t rune = `0`;
81	const char s = str;
82	static const int _utf_count[`16`] = {
83	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
84	`0`, `0`, `0`, `0`, `2`, `2`, `3`, `0`,
85	};
86
87	/ must be called with at least one byte remaining /
88	KASSERT(*sz > `0`);
89
90	c = _utf_count[(s[`0`] & `0xf0`) >> `4`];
91	if (c == `0` \|\| c > *sz) {
92	decoding_error:
93	/*
94	* The first character is in range 128-255 and doesn't
95	* mark valid a valid UTF-8 sequence. There is not much
96	* we can do with this, so handle by returning
97	* the first character as if it would be a correctly
98	* encoded ISO-8859-1 character.
99	*/
100	c = `1`;
101	}
102
103	switch (c) {
104	case `1`:
105	rune = s[`0`] & `0xff`;
106	break;
107	case `2`:
108	if ((s[`1`] & `0xc0`) != `0x80`)
109	goto decoding_error;
110	rune = ((s[`0`] & `0x1F`) << `6`) \| (s[`1`] & `0x3F`);
111	break;
112	case `3`:
113	if ((s[`1`] & `0xC0`) != `0x80` \|\| (s[`2`] & `0xC0`) != `0x80`)
114	goto decoding_error;
115	rune = ((s[`0`] & `0x0F`) << `12`) \| ((s[`1`] & `0x3F`) << `6`)
116	\| (s[`2`] & `0x3F`);
117	break;
118	}
119
120	*str += c;
121	*sz -= c;
122	return rune;
123	}
124
/*
 * Encode the 16-bit character 'wc' as UTF-8 and store it at 's'.
 *
 * s	destination buffer.
 * n	number of bytes of space remaining in 's'.
 * wc	character to encode.
 *
 * Values below 0x80 take one byte, values below 0x800 take two bytes
 * and everything else takes three bytes.  Returns the number of bytes
 * written, or 0 (writing nothing) when 'n' is too small to hold the
 * whole sequence.  No terminating NUL is written.
 */
static int
wput_utf8(char *s, size_t n, u_int16_t wc)
{
	if (wc & 0xf800) {
		/* Any of bits 11-15 set: three-byte sequence. */
		if (n < 3) {
			/* bound check failure */
			return 0;
		}

		s[0] = (char)(0xE0 | (wc >> 12));
		s[1] = (char)(0x80 | ((wc >> 6) & 0x3F));
		s[2] = (char)(0x80 | (wc & 0x3F));
		return 3;
	} else if (wc & 0x0780) {
		/* Any of bits 7-10 set: two-byte sequence. */
		if (n < 2) {
			/* bound check failure */
			return 0;
		}

		s[0] = (char)(0xC0 | (wc >> 6));
		s[1] = (char)(0x80 | (wc & 0x3F));
		return 2;
	} else {
		/* Seven-bit value: stored as a single byte. */
		if (n < 1) {
			/* bound check failure */
			return 0;
		}

		s[0] = (char)wc;
		return 1;
	}
}
162

Browse the source code of src/src/sys/fs/unicode.h