diff options
Diffstat (limited to 'rubbos/app/httpd-2.0.64/srclib/apr/misc/win32/utf8.c')
-rw-r--r-- | rubbos/app/httpd-2.0.64/srclib/apr/misc/win32/utf8.c | 254 |
1 files changed, 0 insertions, 254 deletions
diff --git a/rubbos/app/httpd-2.0.64/srclib/apr/misc/win32/utf8.c b/rubbos/app/httpd-2.0.64/srclib/apr/misc/win32/utf8.c deleted file mode 100644 index b37dba44..00000000 --- a/rubbos/app/httpd-2.0.64/srclib/apr/misc/win32/utf8.c +++ /dev/null @@ -1,254 +0,0 @@ -/* Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "apr.h" -#include "apr_private.h" -#include "apr_errno.h" -#include "apr_arch_utf8.h" - -/* Implement the design principle specified by RFC 2718 2.2.5 - * Guidelines for new URL Schemes - within the APR. - * - * Since many architectures support unicode, and UCS2 is the most - * efficient storage used by those architectures, these functions - * exist to validate a UCS string. It is up to the operating system - * to determine the validity of the string in the context of its - * native language support. File systems that support filename - * characters of 0x80-0xff but have no support of Unicode will find - * this function useful only for validating the character sequences - * and rejecting poorly encoded strings, if RFC 2718 2.2.5 naming is - * desired. - * - * from RFC 2279 UTF-8, a transformation format of ISO 10646 - * - * UCS-4 range (hex.) 
UTF-8 octet sequence (binary) - * 1:2 0000 0000-0000 007F 0xxxxxxx - * 2:2 0000 0080-0000 07FF 110XXXXx 10xxxxxx - * 3:2 0000 0800-0000 FFFF 1110XXXX 10Xxxxxx 10xxxxxx - * 4:4 0001 0000-001F FFFF 11110zXX 10XXxxxx 10xxxxxx 10xxxxxx - * inv 0020 0000-03FF FFFF 111110XX 10XXXxxx 10xxxxxx 10xxxxxx 10xxxxxx - * inv 0400 0000-7FFF FFFF 1111110X 10XXXXxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - * - * One of the X values must be one for the encoding length to be legit. - * Neither the z bit, nor the final two forms, are used for ucs-2 - * - * "Pairs of UCS-2 values between D800 and DFFF (surrogate pairs in - * Unicode parlance), being actually UCS-4 characters transformed - * through UTF-16, need special treatment: the UTF-16 transformation - * must be undone, yielding a UCS-4 character that is then transformed - * as above." - * - * from RFC2781 UTF-16: the compressed ISO 10646 encoding bitmask - * - * U' = U - 0x10000 - * U' = 000000000000yyyyyyyyyyxxxxxxxxxx - * W1 = 110110yyyyyyyyyy - * W2 = 110111xxxxxxxxxx - * - * apr_conv_utf8_to_ucs2 out bytes:sizeof(in) * 1 <= Req <= sizeof(in) * 2 - * - * apr_conv_ucs2_to_utf8 out words:sizeof(in) / 2 <= Req <= sizeof(in) * 3 / 2 - * - * Both converters subtract what they consumed and produced from the - * caller's counters, so the caller can see how far conversion got. - * Running out of output room ends the loop with APR_SUCCESS; malformed - * input returns APR_EINVAL and input that stops in the middle of a - * character returns APR_INCOMPLETE, in both cases before the counters - * are adjusted for the offending character. - */ - -APR_DECLARE(apr_status_t) apr_conv_utf8_to_ucs2(const char *in, - apr_size_t *inbytes, - apr_wchar_t *out, - apr_size_t *outwords) -{ - apr_int64_t newch, mask; - apr_size_t expect, eating; - int ch; - - while (*inbytes && *outwords) - { - ch = (unsigned char)(*in++); - if (!(ch & 0200)) { - /* US-ASCII-7 plain text - */ - --*inbytes; - --*outwords; - *(out++) = ch; - } - else - { - if ((ch & 0300) != 0300) { - /* Multibyte Continuation is out of place - */ - return APR_EINVAL; - } - else - { - /* Multibyte Sequence Lead Character - * - * Compute the expected bytes while adjusting - * the lead byte and leading zeros mask. 
- */ - mask = 0340; - expect = 1; - while ((ch & mask) == mask) { - mask |= mask >> 1; - if (++expect > 3) /* (truly 5 for ucs-4) */ - return APR_EINVAL; - } - newch = ch & ~mask; - eating = expect + 1; /* lead byte plus the continuation bytes still to be read */ - if (*inbytes <= expect) - return APR_INCOMPLETE; - /* Reject values of excessive leading 0 bits - * utf-8 _demands_ the shortest possible byte length - */ - if (expect == 1) { - if (!(newch & 0036)) - return APR_EINVAL; - } - else { - /* Reject values of excessive leading 0 bits - */ - if (!newch && !((unsigned char)*in & 0077 & (mask << 1))) - return APR_EINVAL; - if (expect == 2) { - /* Reject values D800-DFFF when not utf16 encoded - * (may not be an appropriate restriction for ucs-4) - */ - if (newch == 0015 && ((unsigned char)*in & 0040)) - return APR_EINVAL; - } - else if (expect == 3) { - /* Short circuit values > 10FFFF - */ - if (newch > 4) - return APR_EINVAL; - if (newch == 4 && ((unsigned char)*in & 0060)) - return APR_EINVAL; - } - } - /* Where the boolean (expect > 2) is true, we will need - * an extra word for the output. 
- */ - if (*outwords < (apr_size_t)(expect > 2) + 1) - break; /* buffer full */ - while (expect--) - { - /* Multibyte Continuation must be legal */ - if (((ch = (unsigned char)*(in++)) & 0300) != 0200) - return APR_EINVAL; - newch <<= 6; - newch |= (ch & 0077); - } - *inbytes -= eating; - /* newch is now a true ucs-4 character - * - * now we need to fold to ucs-2 - */ - if (newch < 0x10000) - { - --*outwords; - *(out++) = (apr_wchar_t) newch; - } - else - { - *outwords -= 2; - newch -= 0x10000; - *(out++) = (apr_wchar_t) (0xD800 | (newch >> 10)); - *(out++) = (apr_wchar_t) (0xDC00 | (newch & 0x03FF)); - } - } - } - } - /* Buffer full 'errors' aren't errors, the client must inspect both - * the inbytes and outwords values - */ - return APR_SUCCESS; -} - -APR_DECLARE(apr_status_t) apr_conv_ucs2_to_utf8(const apr_wchar_t *in, - apr_size_t *inwords, - char *out, - apr_size_t *outbytes) -{ - apr_int64_t newch, require; - apr_size_t need; - char *invout; - int ch; - - while (*inwords && *outbytes) - { - ch = (unsigned short)(*in++); - if (ch < 0x80) - { - --*inwords; - --*outbytes; - *(out++) = (unsigned char) ch; - } - else - { - if ((ch & 0xFC00) == 0xDC00) { - /* Invalid Leading ucs-2 Multiword Continuation Character - */ - return APR_EINVAL; - } - if ((ch & 0xFC00) == 0xD800) { - /* Leading ucs-2 Multiword Character - */ - if (*inwords < 2) { - /* Missing ucs-2 Multiword Continuation Character - */ - return APR_INCOMPLETE; - } - if (((unsigned short)(*in) & 0xFC00) != 0xDC00) { - /* Invalid ucs-2 Multiword Continuation Character - */ - return APR_EINVAL; - } - newch = (ch & 0x03FF) << 10 | ((unsigned short)(*in++) & 0x03FF); - newch += 0x10000; - } - else { - /* ucs-2 Single Word Character - */ - newch = ch; - } - /* Determine the absolute minimum utf-8 bytes required - * (need counts continuation bytes; the lead byte is the extra one) - */ - require = newch >> 11; - need = 1; - while (require) - require >>= 5, ++need; - if (need >= *outbytes) - break; /* Insufficient buffer */ - *inwords -= (need > 2) + 1; - *outbytes -= need + 1; - 
/* Compute the utf-8 characters in last to first order, - * calculating the lead character length bits along the way. - */ - ch = 0200; - out += need + 1; - invout = out; - while (need--) { - ch |= ch >> 1; - *(--invout) = (unsigned char)(0200 | (newch & 0077)); - newch >>= 6; - } - /* Compute the lead utf-8 character and move the dest offset - */ - *(--invout) = (unsigned char)(ch | newch); - } - } - /* Buffer full 'errors' aren't errors, the client must inspect both - * the inwords and outbytes values - */ - return APR_SUCCESS; -} |