webengine/osswebengine/JavaScriptCore/pcre/ucpinternal.h
changeset 0 dd21522fd290
child 25 0ed94ceaa377
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/webengine/osswebengine/JavaScriptCore/pcre/ucpinternal.h	Mon Mar 30 12:54:55 2009 +0300
@@ -0,0 +1,91 @@
+/*************************************************
+*     libucp - Unicode Property Table handler    *
+*************************************************/
+
+/* Internal header file defining the layout of compact nodes in the tree. */
+
+typedef struct cnode {
+  unsigned short int f0;
+  unsigned short int f1;
+  unsigned short int f2;
+} cnode;
+
+/* Things for the f0 field */
+
+#define f0_leftexists   0x8000    /* Left child exists */
+#define f0_typemask     0x3f00    /* Type bits */
+#define f0_typeshift         8    /* Type shift */
+#define f0_chhmask      0x00ff    /* Character high bits */
+
+/* Things for the f2 field */
+
+#define f2_rightmask    0xf000    /* Mask for right offset bits */
+#define f2_rightshift       12    /* Shift for right offset */
+#define f2_casemask     0x0fff    /* Mask for case offset */
+
+/* The tree consists of a vector of structures of type cnode, with the root
+node as the first element. The three short ints (16-bits) are used as follows:
+
+(f0) (1) The 0x8000 bit of f0 is set if a left child exists. The child's node
+         is the next node in the vector.
+     (2) The 0x4000 bits of f0 is spare.
+     (3) The 0x3f00 bits of f0 contain the character type; this is a number
+         defined by the enumeration in ucp.h (e.g. ucp_Lu).
+     (4) The bottom 8 bits of f0 contain the most significant byte of the
+         character's 24-bit codepoint.
+
+(f1) (1) The f1 field contains the two least significant bytes of the
+         codepoint.
+
+(f2) (1) The 0xf000 bits of f2 contain zero if there is no right child of this
+         node. Otherwise, they contain one plus the exponent of the power of
+         two of the offset to the right node (e.g. a value of 3 means 8). The
+         units of the offset are node items.
+
+     (2) The 0x0fff bits of f2 contain the signed offset from this character to
+         its alternate cased value. They are zero if there is no such
+         character.
+
+
+-----------------------------------------------------------------------------
+||.|.| type (6) | ms char (8) ||  ls char (16)  ||....|  case offset (12)  ||
+-----------------------------------------------------------------------------
+  | |                                              |
+  | |-> spare                                      |
+  |                                        exponent of right
+  |-> left child exists                       child offset
+
+
+The upper/lower casing information is set only for characters that come in
+pairs. There are (at present) four non-one-to-one mappings in the Unicode data.
+These are ignored. They are:
+
+  1FBE Greek Prosgegrammeni (lower, with upper -> capital iota)
+  2126 Ohm
+  212A Kelvin
+  212B Angstrom
+
+Certainly for the last three, having an alternate case would seem to be a
+mistake. I don't know any Greek, so cannot comment on the first one.
+
+
+When searching the tree, proceed as follows:
+
+(1) Start at the first node.
+
+(2) Extract the character value from f1 and the bottom 8 bits of f0;
+
+(3) Compare with the character being sought. If equal, we are done.
+
+(4) If the test character is smaller, inspect the f0_leftexists flag. If it is
+    not set, the character is not in the tree. If it is set, move to the next
+    node, and go to (2).
+
+(5) If the test character is bigger, extract the f2_rightmask bits from f2, and
+    shift them right by f2_rightshift. If the result is zero, the character is
+    not in the tree. Otherwise, calculate the number of nodes to skip by
+    shifting the value 1 left by this number minus one. Go to (2).
+*/
+
+
+/* End of internal.h */