author | Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com> |
Fri, 12 Mar 2010 15:51:09 +0200 | |
branch | RCL_3 |
changeset 11 | 6971d1c87c9a |
parent 0 | 1fb32624e06b |
permissions | -rw-r--r-- |
0
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
1 |
{\rtf1\ansi\ansicpg1252\uc1 \deff0\deflang1033\deflangfe1033{\fonttbl{\f0\froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\f1\fswiss\fcharset0\fprq2{\*\panose 020b0604020202020204}Arial;} |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
2 |
{\f79\froman\fcharset238\fprq2 Times New Roman CE;}{\f80\froman\fcharset204\fprq2 Times New Roman Cyr;}{\f82\froman\fcharset161\fprq2 Times New Roman Greek;}{\f83\froman\fcharset162\fprq2 Times New Roman Tur;} |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
3 |
{\f84\froman\fcharset186\fprq2 Times New Roman Baltic;}{\f85\fswiss\fcharset238\fprq2 Arial CE;}{\f86\fswiss\fcharset204\fprq2 Arial Cyr;}{\f88\fswiss\fcharset161\fprq2 Arial Greek;}{\f89\fswiss\fcharset162\fprq2 Arial Tur;} |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
4 |
{\f90\fswiss\fcharset186\fprq2 Arial Baltic;}}{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128; |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
5 |
\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;}{\stylesheet{\widctlpar\adjustright \fs20\lang2057\cgrid \snext0 Normal;}{ |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
6 |
\s2\sb240\sa60\keepn\widctlpar\adjustright \b\i\f1\lang2057\cgrid \sbasedon0 \snext0 heading 2;}{\*\cs10 \additive Default Paragraph Font;}}{\info{\title EPOC Charconv - Autodetection }{\author Arunsakh Sachamuneewongse}{\operator Symbian} |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
7 |
{\creatim\yr2000\mo12\dy22\hr11\min55}{\revtim\yr2001\mo1\dy4\hr11\min1}{\version5}{\edmins47}{\nofpages3}{\nofwords1234}{\nofchars7039}{\*\company SYMBIAN}{\nofcharsws0}{\vern113}}\paperw11906\paperh16838 |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
8 |
\widowctrl\ftnbj\aenddoc\lytprtmet\hyphcaps0\formshade\viewkind1\viewscale100\pgbrdrhead\pgbrdrfoot \fet0\sectd \linex0\endnhere\sectdefaultcl {\*\pnseclvl1\pnucrm\pnstart1\pnindent720\pnhang{\pntxta .}}{\*\pnseclvl2\pnucltr\pnstart1\pnindent720\pnhang |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
9 |
{\pntxta .}}{\*\pnseclvl3\pndec\pnstart1\pnindent720\pnhang{\pntxta .}}{\*\pnseclvl4\pnlcltr\pnstart1\pnindent720\pnhang{\pntxta )}}{\*\pnseclvl5\pndec\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl6\pnlcltr\pnstart1\pnindent720\pnhang |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
10 |
{\pntxtb (}{\pntxta )}}{\*\pnseclvl7\pnlcrm\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl8\pnlcltr\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl9\pnlcrm\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}\pard\plain |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
11 |
\s2\sb240\sa60\keepn\widctlpar\outlinelevel1\adjustright \b\i\f1\lang2057\cgrid {\i0 EPOC Charconv - Autodetection |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
12 |
\par }\pard\plain \widctlpar\adjustright \fs20\lang2057\cgrid { |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
13 |
\par Author: Arunsakh Sachamuneewongse |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
14 |
\par Contributors: David Batchelor. |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
15 |
\par Date: 19th October 2000 |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
16 |
\par |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
17 |
\par }\pard\plain \s2\sb240\sa60\keepn\widctlpar\outlinelevel1\adjustright \b\i\f1\lang2057\cgrid {\i0 Introduction |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
18 |
\par }\pard\plain \widctlpar\adjustright \fs20\lang2057\cgrid { |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
19 |
\par }\pard\plain \s2\sb240\sa60\keepn\widctlpar\outlinelevel1\adjustright \b\i\f1\lang2057\cgrid {\i0 Charconv |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
20 |
\par }\pard\plain \widctlpar\adjustright \fs20\lang2057\cgrid {Charconv seemed the ideal place to implement the new Autodetect functionality, as autodetect can use Charconv\rquote |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
21 |
s information regarding all the different character sets. Since autodetect is implemented in Charconv, the autodetect framework is able to detect Charconv\rquote |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
22 |
s built-in character sets, which are ASCII, SMS7Bit, CP1252, ISO 8859-1, UTF7 & UTF8. Besides these character sets it also provides a support framework for Charconv\rquote |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
23 |
s plug-in character sets such as GBK, SHIFT-JIS, ISO2022JP and ISO 8859-2 to 8859-9, although only the Japanese and the Chinese character sets have been implemented in autodetect for 6.1. |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
24 |
\par |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
25 |
\par |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
26 |
\par The following static function has been added to }{\cf2\lang1033 CCnvCharacterSetConverter |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
27 |
\par |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
28 |
\par }\pard \sl240\slmult0\widctlpar\adjustright {\cf2\lang1033 IMPORT_C static void CCnvCharacterSetConverter}{\b\cf2\lang1033 ::}{\cf2\lang1033 AutoDetectCharacterSetL( |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
29 |
\par \tab TInt& aConfidenceLevel, |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
30 |
\par \tab TUint& aCharacterSetIdentifier, |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
31 |
\par \tab const CArrayFix<SCharacterSet>& aArrayOfCharacterSetsAvailable, |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
32 |
\par }\pard \widctlpar\adjustright {\cf2\lang1033 \tab const TDesC8& aSample)}{ |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
33 |
\par |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
34 |
\par }{\cf1\lang1033 The first two parameters are both output-only parameters: }{\cf2\lang1033 aConfidenceLevel}{\cf1\lang1033 is set by the function to a value between 0 and 100 (inclusive), where 0 means "I have no idea what character set }{\cf2\lang1033 |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
35 |
aSample}{\cf1\lang1033 is in" (in which case }{\cf2\lang1033 aCharacterSetIdentifier}{\cf1\lang1033 is undefined), and 100 means "I have total confidence that }{\cf2\lang1033 aCharacterSetIdentifier}{\cf1\lang1033 is the character-set of }{ |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
36 |
\cf2\lang1033 aSample}{\cf1\lang1033 ". The }{\cf2\lang1033 aArrayOfCharacterSetsAvailable}{\cf1\lang1033 parameter is included for efficiency - }{\cf2\lang1033 CCnvCharacterSetConverter::AutoDetectCharacterSetL}{\cf1\lang1033 |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
37 |
could itself find the list of character-sets available but this is a very time-consuming task and the client may already have this information to hand.}{ |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
38 |
\par |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
39 |
\par }\pard \sl240\slmult0\widctlpar\adjustright {The Charconv framework provides support for a few character sets built into it. These character sets |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
40 |
are ASCII, SMS7Bit, CP1252, ISO 8859-1, UTF7 & UTF8. The framework also provides support for many other character sets. This support is provided by plug-in DLLs. At the CHARCONV }{\cf1\lang1033 plug-in DLL interface,}{ }{\cf1\lang1033 |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
41 |
the first reserved function (at ordinal 4) will be replaced with a new exported function as follows: |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
42 |
\par |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
43 |
\par }{\cf2\lang1033 IMPORT_C TBool IsInThisCharacterSetL( |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
44 |
\par \tab TBool& aSetToTrue, |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
45 |
\par \tab TInt& aConfidenceLevel, |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
46 |
\par \tab const TDesC8& aSample) |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
47 |
\par |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
48 |
\par }\pard \widctlpar\adjustright {\cf1\lang1033 The }{\cf2\lang1033 aSetToTrue}{\cf1\lang1033 parameter is just to indicate to }{\cf2\lang1033 CCnvCharacterSetConverter::AutoDetectCharacterSetL}{\cf1\lang1033 th |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
49 |
at the plug-in DLL is actually implementing a function of this signature and not the old first reserved function. The meanings of the remaining parameters are the same as the meanings of the corresponding parameters in }{\cf2\lang1033 |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
50 |
CCnvCharacterSetConverter::AutoDetectCharacterSetL}{\cf1\lang1033 .}{ |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
51 |
\par |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
52 |
\par |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
53 |
\par }\pard\plain \s2\sb240\sa60\keepn\widctlpar\outlinelevel1\adjustright \b\i\f1\lang2057\cgrid {\i0\cf1\lang1033 CCnvCharacterSetConverter::AutoDetectCharacterSetL |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
54 |
\par }\pard\plain \widctlpar\adjustright \fs20\lang2057\cgrid { |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
55 |
\par Internally AutoDetectCharacterSetL loops through the available }{\i\cf1 a}{\i\cf1\lang1033 ArrayOfCharacterSetsAvailable }{\cf1\lang1033 and checks whether that character set is a plug-in supported one. If it is, then it loads the |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
56 |
corresponding plug-in DLL and calls the DLL's exported function at ordinal 4 (checking }{\cf2\lang1033 aSetToTrue}{\cf1\lang1033 to verify that the plug-in DLL is actually implementing this function as }{\cf2\lang1033 IsInThisCharacterSetL}{ |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
57 |
\cf1\lang1033 - it may still be implementing it as }{\cf2\lang1033 Reserved_1}{\cf1\lang1033 which is perfectly legitimate).}{\i\cf1\lang1033 IsInThisCharacterSetL}{\cf1\lang1033 then sets a }{\i\cf1\lang1033 confidence level}{\cf1\lang1033 |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
58 |
for that character set for the sample. If the character set is not one that is supported by the plug-in libraries, it then loops through the list of character sets that are supported inte |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
59 |
rnally by Charconv and calls their character-set-specific functions, e.g. IsCharacterSetAscii, IsCharacterSetSMS7Bit etc. These functions also set a confidence level for that character set for the sample. Once it has looped through the entire array of }{ |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
60 |
\i\cf1 a}{\i\cf1\lang1033 ArrayOfCharacterSetsAvailable }{\cf1\lang1033 the character set with the highest confidence level is returned as the character set encoding the sample text is encoded in. |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
61 |
\par |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
62 |
\par If the maximum confidence-level encountered in this loop is less than some threshold (say, 25), then }{\cf2\lang1033 CCnvCharacterSetConverter::AutoDetectCharacterSetL}{\cf1\lang1033 resorts to attempting to convert }{\cf2\lang1033 aSample}{ |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
63 |
\cf1\lang1033 to each of the character sets in }{\cf2\lang1033 aArrayOfCharacterSetsAvailable}{\cf1\lang1033 . The character set that generates fewest }{\cf2\lang1033 0xfffd}{\cf1\lang1033 Unicode "REPLACEMENT CHARACTER"-s wins, with }{\cf2\lang1033 |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
64 |
aConfidenceLevel}{\cf1\lang1033 being set according to some formula based on the difference between the number of }{\cf2\lang1033 0xfffd}{\cf1\lang1033 |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
65 |
characters generated by that character set and its nearest rival. If the maximum confidence level encountered (in the loop mentioned above) is shared by more than one character set, it will make the best guess possible. |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
66 |
\par |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
67 |
\par If there is more than one character set encoding with the same confidence level, }{\cf2\lang1033 CCnvCharacterSetConverter::AutoDetectCharacterSetL}{\cf1\lang1033 will convert the sample text }{\cf2\lang1033 aSample}{\cf1\lang1033 |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
68 |
in each character set encoding with the same confidence level and the one with the least number of Unicode replacement characters wins. |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
69 |
\par |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
70 |
\par }\pard\plain \s2\sb240\sa60\keepn\widctlpar\outlinelevel1\adjustright \b\i\f1\lang2057\cgrid {\i0\cf1\lang1033 IsInThisCharacterSetL |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
71 |
\par }\pard\plain \widctlpar\adjustright \fs20\lang2057\cgrid { |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
72 |
\par Each of the character sets supported by the plug-ins has its own implementation of }{\i\cf2\lang1033 IsInThisCharacterSetL.}{\i\cf1\lang1033 }{\cf1\lang1033 This is |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
73 |
achieved by using signature sequences associated with a language/encoding, which is the same technique employed in virus detection. One such example would be if the encoding was a Modal Encoding - where escape characters/sequences or special characters are |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
74 |
used to switch between different character modes (1-byte or 2-byte) or to another character set. For example, ISO2022JP is a modal encoding. The following are the escape sequences and their meanings. |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
75 |
\par |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
76 |
\par Hex values of keys that make up the escape sequence. ISO2022JP is a multi-byte encoding. The range for the 1}{\cf1\lang1033\super st}{\cf1\lang1033 byte and the 2}{\cf1\lang1033\super nd}{\cf1\lang1033 |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
77 |
byte are the same, i.e. between the hex value 0x21 & 0x7E. |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
78 |
\par |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
79 |
\par }{\lang1033 0x1B 0x28 0x42 - ASCII escape sequence. |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
80 |
\par 0x1B 0x28 0x4A - JIS-Roman escape sequence. |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
81 |
\par 0x1B 0x24 0x40 - JIS C 6226-1978. |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
82 |
\par \'85 etc. |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
83 |
\par |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
84 |
\par The escape sequences tell which character set the text following the escape sequence is encoded in. |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
85 |
\par |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
86 |
\par So in }{\cf2\lang1033 AutoDetectCharacterSetL}{\lang1033 when ISO2022 is the character set in }{\cf2\lang1033 aArrayOfCharacterSetsAvailable, }{\cf1\lang1033 the }{\cf2\lang1033 IsInThisCharacterSetL}{\i\cf2\lang1033 }{\cf1\lang1033 |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
87 |
implementation for the ISO2022JP\rquote s p |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
88 |
lug-in is called. In that function, it is checked that all the characters that make up the sample text are in range, if it is not in range, i.e. if there exist characters whose values are greater than 0x7e then the sample text is not ISO2022JP. So we can |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
89 |
g |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
90 |
ive it a confidence level of 0. Then it looks for the escape sequences. The presence of the escape sequences that are unique (not completely unique though. For example ISO2022JP1 is exactly the same as ISO2022JP + an extra escape sequence) to this charac |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
91 |
ter set increases the possibility of the sample text being encoded in this character set encoding. |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
92 |
\par |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
93 |
\par EUCJP encoding uses shift characters. 0x8E and 0x8F are the single shift characters. The range of the encoded characters + the single shift character tells whether that byte is part of a 2-byte character. Here are the ranges of EUCJP |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
94 |
\par |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
95 |
\par 0x00 - 0x7F |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
96 |
\par [0xA1 - 0xFE] [0xA1 - 0xFE] 2 byte character with the range of each byte shown in brackets. |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
97 |
\par 0x8E [0xA0 - 0xDF] single Shift Character 0x8E |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
98 |
\par 0x8F [0xA1 - 0xFE] [0xA1 - 0xFE] single Shift Character 0x8F |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
99 |
\par |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
100 |
\par The sample text provided affects the outcome of the function. If the sample text is very small, there might be only one escape sequence which is not enough to give a 100% accurate answer. In order to gain more knowledge of different Character |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
101 |
set encoding, read chapter 4 of the following |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
102 |
\par |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
103 |
\par \ldblquote CJKV Information Processing\rdblquote - by Ken Lunde. |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
104 |
\par |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
105 |
\par }\pard\plain \s2\sb240\sa60\keepn\widctlpar\outlinelevel1\adjustright \b\i\f1\lang2057\cgrid {\i0\lang1033 Sample Code |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
106 |
\par }\pard\plain \widctlpar\adjustright \fs20\lang2057\cgrid { |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
107 |
\par Here are a few working samples of Autodetection. |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
108 |
\par |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
109 |
\par As mentioned earlier }{\cf1\lang1033 the }{\cf2\lang1033 aArrayOfCharacterSetsAvailable}{\cf1\lang1033 parameter is included for efficiency - }{\cf2\lang1033 CCnvCharacterSetConverter::AutoDetectCharacterSetL}{\cf1\lang1033 |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
110 |
could itself find the list of character-sets available but this is a very time-consuming task and the client may already have this information to hand. If the client does not have that information then the following creates an array of character sets |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
111 |
|
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
112 |
\par |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
113 |
\par }{\cf2 RFs fileServerSession; |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
114 |
\par CleanupClosePushL(fileServerSession); |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
115 |
\par User::LeaveIfError(fileServerSession.Connect()); |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
116 |
\par CCnvCharacterSetConverter* characterSetConverter=CCnvCharacterSetConverter::NewLC(); |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
117 |
\par CArrayFix<CCnvCharacterSetConverter::SCharacterSet>* arrayOfCharacterSetsAvailable=characterSetConverter-> CreateArrayOfCharacterSetsAvailableLC(fileServerSession); |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
118 |
\par |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
119 |
\par }{\cf1 then the }{\cf1\lang1033 AutoDetectCharacterSetL}{\cf1 function can be used |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
120 |
\par |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
121 |
\par }{\cf2 _LIT8(KASCII, "The result I am expecting is that this is recognised as ASCII!"); |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
122 |
\par TInt Confidence = 0; |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
123 |
\par TUint Character = 0; |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
124 |
\par CCnvCharacterSetConverter::AutoDetectCharacterSetL (Confidence, Character, *arrayOfCharacterSetsAvailable, KASCII); |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
125 |
\par |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
126 |
\par }{\cf1 The sample text is ASCII encoded plain text. Confidence and Character are where the confidence level and the winning Character set are returned. |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
127 |
\par }{\cf2 |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
128 |
\par }{ |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
129 |
\par |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
130 |
\par |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
131 |
\par }{\cf1\lang1033 |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
132 |
\par }{ |
1fb32624e06b
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
133 |
\par }} |