|
1 /* |
|
2 ** 2007 May 6 |
|
3 ** |
|
4 ** The author disclaims copyright to this source code. In place of |
|
5 ** a legal notice, here is a blessing: |
|
6 ** |
|
7 ** May you do good and not evil. |
|
8 ** May you find forgiveness for yourself and forgive others. |
|
9 ** May you share freely, never taking more than you give. |
|
10 ** |
|
11 ************************************************************************* |
|
12 ** $Id: icu.c,v 1.7 2007/12/13 21:54:11 drh Exp $ |
|
13 ** |
|
14 ** This file implements an integration between the ICU library |
|
15 ** ("International Components for Unicode", an open-source library |
|
16 ** for handling unicode data) and SQLite. The integration uses |
|
17 ** ICU to provide the following to SQLite: |
|
18 ** |
|
19 ** * An implementation of the SQL regexp() function (and hence REGEXP |
|
20 ** operator) using the ICU uregex_XX() APIs. |
|
21 ** |
|
22 ** * Implementations of the SQL scalar upper() and lower() functions |
|
23 ** for case mapping. |
|
24 ** |
|
25 ** * Integration of ICU and SQLite collation sequences. |
|
26 ** |
|
27 ** * An implementation of the LIKE operator that uses ICU to |
|
28 ** provide case-independent matching. |
|
29 */ |
|
30 |
|
31 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ICU) |
|
32 |
|
33 /* Include ICU headers */ |
|
34 #include <unicode/utypes.h> |
|
35 #include <unicode/uregex.h> |
|
36 #include <unicode/ustring.h> |
|
37 #include <unicode/ucol.h> |
|
38 |
|
39 #include <assert.h> |
|
40 |
|
41 #ifndef SQLITE_CORE |
|
42 #include "sqlite3ext.h" |
|
43 SQLITE_EXTENSION_INIT1 |
|
44 #else |
|
45 #include "sqlite3.h" |
|
46 #endif |
|
47 |
|
/*
** Upper bound, in bytes, on the length of the pattern accepted by a
** LIKE or GLOB operator.  The default below applies only when the
** build has not already supplied its own value for the macro.
*/
#if !defined(SQLITE_MAX_LIKE_PATTERN_LENGTH)
#define SQLITE_MAX_LIKE_PATTERN_LENGTH 50000
#endif
|
55 |
|
/*
** Thin wrapper around sqlite3_free() that is guaranteed to be a real
** function rather than a macro, so that its address can be handed to
** SQLite as a destructor callback.
*/
static void xFree(void *p)
{
  sqlite3_free(p);
}
|
62 |
|
63 /* |
|
64 ** Compare two UTF-8 strings for equality where the first string is |
|
65 ** a "LIKE" expression. Return true (1) if they are the same and |
|
66 ** false (0) if they are different. |
|
67 */ |
|
68 static int icuLikeCompare( |
|
69 const uint8_t *zPattern, /* LIKE pattern */ |
|
70 const uint8_t *zString, /* The UTF-8 string to compare against */ |
|
71 const UChar32 uEsc /* The escape character */ |
|
72 ){ |
|
73 static const int MATCH_ONE = (UChar32)'_'; |
|
74 static const int MATCH_ALL = (UChar32)'%'; |
|
75 |
|
76 int iPattern = 0; /* Current byte index in zPattern */ |
|
77 int iString = 0; /* Current byte index in zString */ |
|
78 |
|
79 int prevEscape = 0; /* True if the previous character was uEsc */ |
|
80 |
|
81 while( zPattern[iPattern]!=0 ){ |
|
82 |
|
83 /* Read (and consume) the next character from the input pattern. */ |
|
84 UChar32 uPattern; |
|
85 U8_NEXT_UNSAFE(zPattern, iPattern, uPattern); |
|
86 assert(uPattern!=0); |
|
87 |
|
88 /* There are now 4 possibilities: |
|
89 ** |
|
90 ** 1. uPattern is an unescaped match-all character "%", |
|
91 ** 2. uPattern is an unescaped match-one character "_", |
|
92 ** 3. uPattern is an unescaped escape character, or |
|
93 ** 4. uPattern is to be handled as an ordinary character |
|
94 */ |
|
95 if( !prevEscape && uPattern==MATCH_ALL ){ |
|
96 /* Case 1. */ |
|
97 uint8_t c; |
|
98 |
|
99 /* Skip any MATCH_ALL or MATCH_ONE characters that follow a |
|
100 ** MATCH_ALL. For each MATCH_ONE, skip one character in the |
|
101 ** test string. |
|
102 */ |
|
103 while( (c=zPattern[iPattern]) == MATCH_ALL || c == MATCH_ONE ){ |
|
104 if( c==MATCH_ONE ){ |
|
105 if( zString[iString]==0 ) return 0; |
|
106 U8_FWD_1_UNSAFE(zString, iString); |
|
107 } |
|
108 iPattern++; |
|
109 } |
|
110 |
|
111 if( zPattern[iPattern]==0 ) return 1; |
|
112 |
|
113 while( zString[iString] ){ |
|
114 if( icuLikeCompare(&zPattern[iPattern], &zString[iString], uEsc) ){ |
|
115 return 1; |
|
116 } |
|
117 U8_FWD_1_UNSAFE(zString, iString); |
|
118 } |
|
119 return 0; |
|
120 |
|
121 }else if( !prevEscape && uPattern==MATCH_ONE ){ |
|
122 /* Case 2. */ |
|
123 if( zString[iString]==0 ) return 0; |
|
124 U8_FWD_1_UNSAFE(zString, iString); |
|
125 |
|
126 }else if( !prevEscape && uPattern==uEsc){ |
|
127 /* Case 3. */ |
|
128 prevEscape = 1; |
|
129 |
|
130 }else{ |
|
131 /* Case 4. */ |
|
132 UChar32 uString; |
|
133 U8_NEXT_UNSAFE(zString, iString, uString); |
|
134 uString = u_foldCase(uString, U_FOLD_CASE_DEFAULT); |
|
135 uPattern = u_foldCase(uPattern, U_FOLD_CASE_DEFAULT); |
|
136 if( uString!=uPattern ){ |
|
137 return 0; |
|
138 } |
|
139 prevEscape = 0; |
|
140 } |
|
141 } |
|
142 |
|
143 return zString[iString]==0; |
|
144 } |
|
145 |
|
146 /* |
|
147 ** Implementation of the like() SQL function. This function implements |
|
148 ** the build-in LIKE operator. The first argument to the function is the |
|
149 ** pattern and the second argument is the string. So, the SQL statements: |
|
150 ** |
|
151 ** A LIKE B |
|
152 ** |
|
153 ** is implemented as like(B, A). If there is an escape character E, |
|
154 ** |
|
155 ** A LIKE B ESCAPE E |
|
156 ** |
|
157 ** is mapped to like(B, A, E). |
|
158 */ |
|
159 static void icuLikeFunc( |
|
160 sqlite3_context *context, |
|
161 int argc, |
|
162 sqlite3_value **argv |
|
163 ){ |
|
164 const unsigned char *zA = sqlite3_value_text(argv[0]); |
|
165 const unsigned char *zB = sqlite3_value_text(argv[1]); |
|
166 UChar32 uEsc = 0; |
|
167 |
|
168 /* Limit the length of the LIKE or GLOB pattern to avoid problems |
|
169 ** of deep recursion and N*N behavior in patternCompare(). |
|
170 */ |
|
171 if( sqlite3_value_bytes(argv[0])>SQLITE_MAX_LIKE_PATTERN_LENGTH ){ |
|
172 sqlite3_result_error(context, "LIKE or GLOB pattern too complex", -1); |
|
173 return; |
|
174 } |
|
175 |
|
176 |
|
177 if( argc==3 ){ |
|
178 /* The escape character string must consist of a single UTF-8 character. |
|
179 ** Otherwise, return an error. |
|
180 */ |
|
181 int nE= sqlite3_value_bytes(argv[2]); |
|
182 const unsigned char *zE = sqlite3_value_text(argv[2]); |
|
183 int i = 0; |
|
184 if( zE==0 ) return; |
|
185 U8_NEXT(zE, i, nE, uEsc); |
|
186 if( i!=nE){ |
|
187 sqlite3_result_error(context, |
|
188 "ESCAPE expression must be a single character", -1); |
|
189 return; |
|
190 } |
|
191 } |
|
192 |
|
193 if( zA && zB ){ |
|
194 sqlite3_result_int(context, icuLikeCompare(zA, zB, uEsc)); |
|
195 } |
|
196 } |
|
197 |
|
198 /* |
|
199 ** This function is called when an ICU function called from within |
|
200 ** the implementation of an SQL scalar function returns an error. |
|
201 ** |
|
202 ** The scalar function context passed as the first argument is |
|
203 ** loaded with an error message based on the following two args. |
|
204 */ |
|
205 static void icuFunctionError( |
|
206 sqlite3_context *pCtx, /* SQLite scalar function context */ |
|
207 const char *zName, /* Name of ICU function that failed */ |
|
208 UErrorCode e /* Error code returned by ICU function */ |
|
209 ){ |
|
210 char zBuf[128]; |
|
211 sqlite3_snprintf(128, zBuf, "ICU error: %s(): %s", zName, u_errorName(e)); |
|
212 zBuf[127] = '\0'; |
|
213 sqlite3_result_error(pCtx, zBuf, -1); |
|
214 } |
|
215 |
|
216 /* |
|
217 ** Function to delete compiled regexp objects. Registered as |
|
218 ** a destructor function with sqlite3_set_auxdata(). |
|
219 */ |
|
220 static void icuRegexpDelete(void *p){ |
|
221 URegularExpression *pExpr = (URegularExpression *)p; |
|
222 uregex_close(pExpr); |
|
223 } |
|
224 |
|
225 /* |
|
226 ** Implementation of SQLite REGEXP operator. This scalar function takes |
|
227 ** two arguments. The first is a regular expression pattern to compile |
|
228 ** the second is a string to match against that pattern. If either |
|
229 ** argument is an SQL NULL, then NULL Is returned. Otherwise, the result |
|
230 ** is 1 if the string matches the pattern, or 0 otherwise. |
|
231 ** |
|
232 ** SQLite maps the regexp() function to the regexp() operator such |
|
233 ** that the following two are equivalent: |
|
234 ** |
|
235 ** zString REGEXP zPattern |
|
236 ** regexp(zPattern, zString) |
|
237 ** |
|
238 ** Uses the following ICU regexp APIs: |
|
239 ** |
|
240 ** uregex_open() |
|
241 ** uregex_matches() |
|
242 ** uregex_close() |
|
243 */ |
|
244 static void icuRegexpFunc(sqlite3_context *p, int nArg, sqlite3_value **apArg){ |
|
245 UErrorCode status = U_ZERO_ERROR; |
|
246 URegularExpression *pExpr; |
|
247 UBool res; |
|
248 const UChar *zString = sqlite3_value_text16(apArg[1]); |
|
249 |
|
250 /* If the left hand side of the regexp operator is NULL, |
|
251 ** then the result is also NULL. |
|
252 */ |
|
253 if( !zString ){ |
|
254 return; |
|
255 } |
|
256 |
|
257 pExpr = sqlite3_get_auxdata(p, 0); |
|
258 if( !pExpr ){ |
|
259 const UChar *zPattern = sqlite3_value_text16(apArg[0]); |
|
260 if( !zPattern ){ |
|
261 return; |
|
262 } |
|
263 pExpr = uregex_open(zPattern, -1, 0, 0, &status); |
|
264 |
|
265 if( U_SUCCESS(status) ){ |
|
266 sqlite3_set_auxdata(p, 0, pExpr, icuRegexpDelete); |
|
267 }else{ |
|
268 assert(!pExpr); |
|
269 icuFunctionError(p, "uregex_open", status); |
|
270 return; |
|
271 } |
|
272 } |
|
273 |
|
274 /* Configure the text that the regular expression operates on. */ |
|
275 uregex_setText(pExpr, zString, -1, &status); |
|
276 if( !U_SUCCESS(status) ){ |
|
277 icuFunctionError(p, "uregex_setText", status); |
|
278 return; |
|
279 } |
|
280 |
|
281 /* Attempt the match */ |
|
282 res = uregex_matches(pExpr, 0, &status); |
|
283 if( !U_SUCCESS(status) ){ |
|
284 icuFunctionError(p, "uregex_matches", status); |
|
285 return; |
|
286 } |
|
287 |
|
288 /* Set the text that the regular expression operates on to a NULL |
|
289 ** pointer. This is not really necessary, but it is tidier than |
|
290 ** leaving the regular expression object configured with an invalid |
|
291 ** pointer after this function returns. |
|
292 */ |
|
293 uregex_setText(pExpr, 0, 0, &status); |
|
294 |
|
295 /* Return 1 or 0. */ |
|
296 sqlite3_result_int(p, res ? 1 : 0); |
|
297 } |
|
298 |
|
299 /* |
|
300 ** Implementations of scalar functions for case mapping - upper() and |
|
301 ** lower(). Function upper() converts its input to upper-case (ABC). |
|
302 ** Function lower() converts to lower-case (abc). |
|
303 ** |
|
304 ** ICU provides two types of case mapping, "general" case mapping and |
|
305 ** "language specific". Refer to ICU documentation for the differences |
|
306 ** between the two. |
|
307 ** |
|
308 ** To utilise "general" case mapping, the upper() or lower() scalar |
|
309 ** functions are invoked with one argument: |
|
310 ** |
|
311 ** upper('ABC') -> 'abc' |
|
312 ** lower('abc') -> 'ABC' |
|
313 ** |
|
314 ** To access ICU "language specific" case mapping, upper() or lower() |
|
315 ** should be invoked with two arguments. The second argument is the name |
|
316 ** of the locale to use. Passing an empty string ("") or SQL NULL value |
|
317 ** as the second argument is the same as invoking the 1 argument version |
|
318 ** of upper() or lower(). |
|
319 ** |
|
320 ** lower('I', 'en_us') -> 'i' |
|
321 ** lower('I', 'tr_tr') -> 'ı' (small dotless i) |
|
322 ** |
|
323 ** http://www.icu-project.org/userguide/posix.html#case_mappings |
|
324 */ |
|
325 static void icuCaseFunc16(sqlite3_context *p, int nArg, sqlite3_value **apArg){ |
|
326 const UChar *zInput; |
|
327 UChar *zOutput; |
|
328 int nInput; |
|
329 int nOutput; |
|
330 |
|
331 UErrorCode status = U_ZERO_ERROR; |
|
332 const char *zLocale = 0; |
|
333 |
|
334 assert(nArg==1 || nArg==2); |
|
335 if( nArg==2 ){ |
|
336 zLocale = (const char *)sqlite3_value_text(apArg[1]); |
|
337 } |
|
338 |
|
339 zInput = sqlite3_value_text16(apArg[0]); |
|
340 if( !zInput ){ |
|
341 return; |
|
342 } |
|
343 nInput = sqlite3_value_bytes16(apArg[0]); |
|
344 |
|
345 nOutput = nInput * 2 + 2; |
|
346 zOutput = sqlite3_malloc(nOutput); |
|
347 if( !zOutput ){ |
|
348 return; |
|
349 } |
|
350 |
|
351 if( sqlite3_user_data(p) ){ |
|
352 u_strToUpper(zOutput, nOutput/2, zInput, nInput/2, zLocale, &status); |
|
353 }else{ |
|
354 u_strToLower(zOutput, nOutput/2, zInput, nInput/2, zLocale, &status); |
|
355 } |
|
356 |
|
357 if( !U_SUCCESS(status) ){ |
|
358 icuFunctionError(p, "u_strToLower()/u_strToUpper", status); |
|
359 return; |
|
360 } |
|
361 |
|
362 sqlite3_result_text16(p, zOutput, -1, xFree); |
|
363 } |
|
364 |
|
365 /* |
|
366 ** Collation sequence destructor function. The pCtx argument points to |
|
367 ** a UCollator structure previously allocated using ucol_open(). |
|
368 */ |
|
369 static void icuCollationDel(void *pCtx){ |
|
370 UCollator *p = (UCollator *)pCtx; |
|
371 ucol_close(p); |
|
372 } |
|
373 |
|
374 /* |
|
375 ** Collation sequence comparison function. The pCtx argument points to |
|
376 ** a UCollator structure previously allocated using ucol_open(). |
|
377 */ |
|
378 static int icuCollationColl( |
|
379 void *pCtx, |
|
380 int nLeft, |
|
381 const void *zLeft, |
|
382 int nRight, |
|
383 const void *zRight |
|
384 ){ |
|
385 UCollationResult res; |
|
386 UCollator *p = (UCollator *)pCtx; |
|
387 res = ucol_strcoll(p, (UChar *)zLeft, nLeft/2, (UChar *)zRight, nRight/2); |
|
388 switch( res ){ |
|
389 case UCOL_LESS: return -1; |
|
390 case UCOL_GREATER: return +1; |
|
391 case UCOL_EQUAL: return 0; |
|
392 } |
|
393 assert(!"Unexpected return value from ucol_strcoll()"); |
|
394 return 0; |
|
395 } |
|
396 |
|
397 /* |
|
398 ** Implementation of the scalar function icu_load_collation(). |
|
399 ** |
|
400 ** This scalar function is used to add ICU collation based collation |
|
401 ** types to an SQLite database connection. It is intended to be called |
|
402 ** as follows: |
|
403 ** |
|
404 ** SELECT icu_load_collation(<locale>, <collation-name>); |
|
405 ** |
|
406 ** Where <locale> is a string containing an ICU locale identifier (i.e. |
|
407 ** "en_AU", "tr_TR" etc.) and <collation-name> is the name of the |
|
408 ** collation sequence to create. |
|
409 */ |
|
410 static void icuLoadCollation( |
|
411 sqlite3_context *p, |
|
412 int nArg, |
|
413 sqlite3_value **apArg |
|
414 ){ |
|
415 sqlite3 *db = (sqlite3 *)sqlite3_user_data(p); |
|
416 UErrorCode status = U_ZERO_ERROR; |
|
417 const char *zLocale; /* Locale identifier - (eg. "jp_JP") */ |
|
418 const char *zName; /* SQL Collation sequence name (eg. "japanese") */ |
|
419 UCollator *pUCollator; /* ICU library collation object */ |
|
420 int rc; /* Return code from sqlite3_create_collation_x() */ |
|
421 |
|
422 assert(nArg==2); |
|
423 zLocale = (const char *)sqlite3_value_text(apArg[0]); |
|
424 zName = (const char *)sqlite3_value_text(apArg[1]); |
|
425 |
|
426 if( !zLocale || !zName ){ |
|
427 return; |
|
428 } |
|
429 |
|
430 pUCollator = ucol_open(zLocale, &status); |
|
431 if( !U_SUCCESS(status) ){ |
|
432 icuFunctionError(p, "ucol_open", status); |
|
433 return; |
|
434 } |
|
435 assert(p); |
|
436 |
|
437 rc = sqlite3_create_collation_v2(db, zName, SQLITE_UTF16, (void *)pUCollator, |
|
438 icuCollationColl, icuCollationDel |
|
439 ); |
|
440 if( rc!=SQLITE_OK ){ |
|
441 ucol_close(pUCollator); |
|
442 sqlite3_result_error(p, "Error registering collation function", -1); |
|
443 } |
|
444 } |
|
445 |
|
446 /* |
|
447 ** Register the ICU extension functions with database db. |
|
448 */ |
|
449 int sqlite3IcuInit(sqlite3 *db){ |
|
450 struct IcuScalar { |
|
451 const char *zName; /* Function name */ |
|
452 int nArg; /* Number of arguments */ |
|
453 int enc; /* Optimal text encoding */ |
|
454 void *pContext; /* sqlite3_user_data() context */ |
|
455 void (*xFunc)(sqlite3_context*,int,sqlite3_value**); |
|
456 } scalars[] = { |
|
457 {"regexp",-1, SQLITE_ANY, 0, icuRegexpFunc}, |
|
458 |
|
459 {"lower", 1, SQLITE_UTF16, 0, icuCaseFunc16}, |
|
460 {"lower", 2, SQLITE_UTF16, 0, icuCaseFunc16}, |
|
461 {"upper", 1, SQLITE_UTF16, (void*)1, icuCaseFunc16}, |
|
462 {"upper", 2, SQLITE_UTF16, (void*)1, icuCaseFunc16}, |
|
463 |
|
464 {"lower", 1, SQLITE_UTF8, 0, icuCaseFunc16}, |
|
465 {"lower", 2, SQLITE_UTF8, 0, icuCaseFunc16}, |
|
466 {"upper", 1, SQLITE_UTF8, (void*)1, icuCaseFunc16}, |
|
467 {"upper", 2, SQLITE_UTF8, (void*)1, icuCaseFunc16}, |
|
468 |
|
469 {"like", 2, SQLITE_UTF8, 0, icuLikeFunc}, |
|
470 {"like", 3, SQLITE_UTF8, 0, icuLikeFunc}, |
|
471 |
|
472 {"icu_load_collation", 2, SQLITE_UTF8, (void*)db, icuLoadCollation}, |
|
473 }; |
|
474 |
|
475 int rc = SQLITE_OK; |
|
476 int i; |
|
477 |
|
478 for(i=0; rc==SQLITE_OK && i<(sizeof(scalars)/sizeof(struct IcuScalar)); i++){ |
|
479 struct IcuScalar *p = &scalars[i]; |
|
480 rc = sqlite3_create_function( |
|
481 db, p->zName, p->nArg, p->enc, p->pContext, p->xFunc, 0, 0 |
|
482 ); |
|
483 } |
|
484 |
|
485 return rc; |
|
486 } |
|
487 |
|
488 #if !SQLITE_CORE |
|
489 int sqlite3_extension_init( |
|
490 sqlite3 *db, |
|
491 char **pzErrMsg, |
|
492 const sqlite3_api_routines *pApi |
|
493 ){ |
|
494 SQLITE_EXTENSION_INIT2(pApi) |
|
495 return sqlite3IcuInit(db); |
|
496 } |
|
497 #endif |
|
498 |
|
499 #endif |