diff -r 56cd22a7a1cb -r f2f7b3284356 charconvfw/charconv_fw/documentation/autodetect.rtf --- a/charconvfw/charconv_fw/documentation/autodetect.rtf Fri Apr 16 16:55:07 2010 +0300 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,133 +0,0 @@ -{\rtf1\ansi\ansicpg1252\uc1 \deff0\deflang1033\deflangfe1033{\fonttbl{\f0\froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\f1\fswiss\fcharset0\fprq2{\*\panose 020b0604020202020204}Arial;} -{\f79\froman\fcharset238\fprq2 Times New Roman CE;}{\f80\froman\fcharset204\fprq2 Times New Roman Cyr;}{\f82\froman\fcharset161\fprq2 Times New Roman Greek;}{\f83\froman\fcharset162\fprq2 Times New Roman Tur;} -{\f84\froman\fcharset186\fprq2 Times New Roman Baltic;}{\f85\fswiss\fcharset238\fprq2 Arial CE;}{\f86\fswiss\fcharset204\fprq2 Arial Cyr;}{\f88\fswiss\fcharset161\fprq2 Arial Greek;}{\f89\fswiss\fcharset162\fprq2 Arial Tur;} -{\f90\fswiss\fcharset186\fprq2 Arial Baltic;}}{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128; -\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;}{\stylesheet{\widctlpar\adjustright \fs20\lang2057\cgrid \snext0 Normal;}{ -\s2\sb240\sa60\keepn\widctlpar\adjustright \b\i\f1\lang2057\cgrid \sbasedon0 \snext0 heading 2;}{\*\cs10 \additive Default Paragraph Font;}}{\info{\title EPOC Charconv - Autodetection }{\author Arunsakh Sachamuneewongse}{\operator Symbian} -{\creatim\yr2000\mo12\dy22\hr11\min55}{\revtim\yr2001\mo1\dy4\hr11\min1}{\version5}{\edmins47}{\nofpages3}{\nofwords1234}{\nofchars7039}{\*\company SYMBIAN}{\nofcharsws0}{\vern113}}\paperw11906\paperh16838 -\widowctrl\ftnbj\aenddoc\lytprtmet\hyphcaps0\formshade\viewkind1\viewscale100\pgbrdrhead\pgbrdrfoot \fet0\sectd \linex0\endnhere\sectdefaultcl 
{\*\pnseclvl1\pnucrm\pnstart1\pnindent720\pnhang{\pntxta .}}{\*\pnseclvl2\pnucltr\pnstart1\pnindent720\pnhang -{\pntxta .}}{\*\pnseclvl3\pndec\pnstart1\pnindent720\pnhang{\pntxta .}}{\*\pnseclvl4\pnlcltr\pnstart1\pnindent720\pnhang{\pntxta )}}{\*\pnseclvl5\pndec\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl6\pnlcltr\pnstart1\pnindent720\pnhang -{\pntxtb (}{\pntxta )}}{\*\pnseclvl7\pnlcrm\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl8\pnlcltr\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl9\pnlcrm\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}\pard\plain -\s2\sb240\sa60\keepn\widctlpar\outlinelevel1\adjustright \b\i\f1\lang2057\cgrid {\i0 EPOC Charconv - Autodetection -\par }\pard\plain \widctlpar\adjustright \fs20\lang2057\cgrid { -\par Author: Arunsakh Sachamuneewongse -\par Contributors: David Batchelor. -\par Date: 19th October 2000 -\par -\par }\pard\plain \s2\sb240\sa60\keepn\widctlpar\outlinelevel1\adjustright \b\i\f1\lang2057\cgrid {\i0 Introduction -\par }\pard\plain \widctlpar\adjustright \fs20\lang2057\cgrid { -\par }\pard\plain \s2\sb240\sa60\keepn\widctlpar\outlinelevel1\adjustright \b\i\f1\lang2057\cgrid {\i0 Charconv -\par }\pard\plain \widctlpar\adjustright \fs20\lang2057\cgrid {Charconv seemed the ideal place to implement the new Autodetect functionality, as autodetect can use Charconv\rquote -s information regarding all the different character sets. Since autodetect is implemented in Charconv, the autodetect framework is able to detect Charconv\rquote -s built-in character sets, which are ASCII, SMS7Bit, CP1252, ISO 88591, UTF7 & UTF8. Besides these character sets it also provides support framework for Charconv\rquote -s plug-ins character sets such as GBK, SHIFT-JIS, ISO2022JP, ISO 88592 - 99, although only the Japanese and the Chinese character sets have been implemented in autodetect for 6.1. 
-\par -\par -\par The following static function has been added to }{\cf2\lang1033 CCnvCharacterSetConverter -\par -\par }\pard \sl240\slmult0\widctlpar\adjustright {\cf2\lang1033 IMPORT_C static void CCnvCharacterSetConverter}{\b\cf2\lang1033 ::}{\cf2\lang1033 AutoDetectCharacterSetL( -\par \tab TInt& aConfidenceLevel, -\par \tab TUint& aCharacterSetIdentifier, -\par \tab const CArrayFix<SCharacterSet>& aArrayOfCharacterSetsAvailable, -\par }\pard \widctlpar\adjustright {\cf2\lang1033 \tab const TDesC8& aSample)}{ -\par -\par }{\cf1\lang1033 The first two parameters are both output-only parameters: }{\cf2\lang1033 aConfidenceLevel}{\cf1\lang1033 is set by the function to a value between 0 and 100 (inclusive), where 0 means "I have no idea what character set }{\cf2\lang1033 -aSample}{\cf1\lang1033 is in" (in which case }{\cf2\lang1033 aCharacterSetIdentifier}{\cf1\lang1033 is undefined), and 100 means "I have total confidence that }{\cf2\lang1033 aCharacterSetIdentifier}{\cf1\lang1033 is the character-set of }{ -\cf2\lang1033 aSample}{\cf1\lang1033 ". The }{\cf2\lang1033 aArrayOfCharacterSetsAvailable}{\cf1\lang1033 parameter is included for efficiency - }{\cf2\lang1033 CCnvCharacterSetConverter::AutoDetectCharacterSetL}{\cf1\lang1033 - could itself find the list of character-sets available but this is a very time-consuming task and the client may already have this information to hand.}{ -\par -\par }\pard \sl240\slmult0\widctlpar\adjustright {The Charconv framework provides support for a few character sets built into it. These character sets - are ASCII, SMS7Bit, CP1252, ISO 88591, UTF7 & UTF8. The framework also provides support for many other character sets. This support is provided by Plug-in DLLs. 
At the CHARCONV }{\cf1\lang1033 plug-in DLL interface,}{ }{\cf1\lang1033 -the first reserved function (at ordinal 4) will be replaced with a new exported function as follows: -\par -\par }{\cf2\lang1033 IMPORT_C TBool IsInThisCharacterSetL( -\par \tab TBool& aSetToTrue, -\par \tab TInt& aConfidenceLevel, -\par \tab const TDesC8& aSample) -\par -\par }\pard \widctlpar\adjustright {\cf1\lang1033 The }{\cf2\lang1033 aSetToTrue}{\cf1\lang1033 parameter is just to indicate to }{\cf2\lang1033 CCnvCharacterSetConverter::AutoDetectCharacterSetL}{\cf1\lang1033 th -at the plug-in DLL is actually implementing a function of this signature and not the old first reserved function. The meanings of the remaining parameters are the same as the meanings of the corresponding parameters in }{\cf2\lang1033 -CCnvCharacterSetConverter::AutoDetectCharacterSetL}{\cf1\lang1033 .}{ -\par -\par -\par }\pard\plain \s2\sb240\sa60\keepn\widctlpar\outlinelevel1\adjustright \b\i\f1\lang2057\cgrid {\i0\cf1\lang1033 CCnvCharacterSetConverter::AutoDetectCharacterSetL -\par }\pard\plain \widctlpar\adjustright \fs20\lang2057\cgrid { -\par Internally AutoDetectCharacterSetL loops through the available }{\i\cf1 a}{\i\cf1\lang1033 ArrayOfCharacterSetsAvailable }{\cf1\lang1033 and checks whether that character set is a plug-in supported one. If it is, then it loads the -corresponding plug-in DLL and calls the DLL's exported function at ordinal 4 (checking }{\cf2\lang1033 aSetToTrue}{\cf1\lang1033 to verify that the plug-in DLL is actually implementing this function as }{\cf2\lang1033 IsInThisCharacterSetL}{ -\cf1\lang1033 - it may still be implementing it as }{\cf2\lang1033 Reserved_1}{\cf1\lang1033 which is perfectly legitimate).}{\i\cf1\lang1033 IsInThisCharacterSetL}{\cf1\lang1033 then sets a }{\i\cf1\lang1033 confidence level}{\cf1\lang1033 - for that character set for the sample. 
If the character set is not one that is supported by the plug-in libraries, it then loops through the list of character sets that are supported inte -rnally by Charconv and calls their character specific functions, e.g. IsCharacterSetAscii, IsCharacterSetSMS7Bit etc. These functions also set a confidence level for that character set for the sample. Once it has looped through the entire array of }{ -\i\cf1 a}{\i\cf1\lang1033 ArrayOfCharacterSetsAvailable }{\cf1\lang1033 the character set with the highest confidence level is returned as the character set encoding the sample text is encoded in. -\par -\par If the maximum confidence-level encountered in this loop is less than some threshold (say, 25), then }{\cf2\lang1033 CCnvCharacterSetConverter::AutoDetectCharacterSetL}{\cf1\lang1033 resorts to attempting to convert }{\cf2\lang1033 aSample}{ -\cf1\lang1033 to each of the character sets in }{\cf2\lang1033 aArrayOfCharacterSetsAvailable}{\cf1\lang1033 . The character set that generates fewest }{\cf2\lang1033 0xfffd}{\cf1\lang1033 Unicode "REPLACEMENT CHARACTER"-s wins, with }{\cf2\lang1033 -aConfidenceLevel}{\cf1\lang1033 being set according to some formula based on the difference between the number of }{\cf2\lang1033 0xfffd}{\cf1\lang1033 - characters generated by that character set and its nearest rival. If the maximum confidence level encountered (in the loop mentioned above) is shared by more than one character set, it will make the best guess possible. -\par -\par If more than one character set encoding has the same confidence level, }{\cf2\lang1033 CCnvCharacterSetConverter::AutoDetectCharacterSetL}{\cf1\lang1033 will convert the sample text }{\cf2\lang1033 aSample}{\cf1\lang1033 - in each character set encoding with the same confidence level and the one with the least number of Unicode replacement characters wins. 
-\par -\par }\pard\plain \s2\sb240\sa60\keepn\widctlpar\outlinelevel1\adjustright \b\i\f1\lang2057\cgrid {\i0\cf1\lang1033 IsInThisCharacterSetL -\par }\pard\plain \widctlpar\adjustright \fs20\lang2057\cgrid { -\par Each of the character sets supported by the plug-ins has its own implementation of }{\i\cf2\lang1033 IsInThisCharacterSetL.}{\i\cf1\lang1033 }{\cf1\lang1033 This is -achieved by using signature sequences associated to a language/ encoding which is the same technique employed in virus detection. One such example would be if the encoding was a Modal Encoding - where escape characters/sequences or special characters are - used to switch between different character modes 1-byte, 2-byte or other character set. For example, ISO-2022JP is a modal encoding. The following are the escape sequences and their meaning. -\par -\par Hex values of keys that make up the escape sequence. ISO2022JP is a multi-byte encoding. The range for the 1}{\cf1\lang1033\super st}{\cf1\lang1033 byte and the 2}{\cf1\lang1033\super nd}{\cf1\lang1033 - byte are the same, i.e. between the hex value 0x21 & 0x7E. -\par -\par }{\lang1033 0x1B 0x28 0x42 - ASCII escape sequence. -\par 0x1B 0x28 0x4A - JIS-Roman escape sequence. -\par 0x1B 0x24 0x40 - JIS C 6226-1978. -\par \'85 etc. -\par -\par The escape sequences tell what character set the text following the escape sequence is encoded in. -\par -\par So in }{\cf2\lang1033 AutoDetectCharacterSetL}{\lang1033 when ISO2022 is the character set in }{\cf2\lang1033 aArrayOfCharacterSetsAvailable, }{\cf1\lang1033 the }{\cf2\lang1033 IsInThisCharacterSetL}{\i\cf2\lang1033 }{\cf1\lang1033 -implementation for the ISO2022JP\rquote s p -lug-in is called. In that function, it is checked that all the characters that make up the sample text are in range, if it is not in range, i.e. if there exist characters whose values are greater than 0x7e then the sample text is not ISO2022JP. So we can -g -ive it a confidence level of 0. 
Then it looks for the escape sequences. The presence of the escape sequences that are unique (not completely unique though. For example ISO2022JP1 is exactly the same as ISO2022JP + an extra escape sequence) to this charac -ter set increases the possibility of the sample text being encoded in this character set encoding. -\par -\par EUCJP encoding uses shift characters. 0x8E and 0x8F are the single shift characters. The range of the encoded characters + the single shift character tells whether that byte is part of a 2-byte character. Here are the ranges of EUCJP -\par -\par 0x00 - 0x7F -\par [0xA1 - 0xFE] [0xA1 - 0xFE] 2 byte character with the range of each byte shown in brackets. -\par 0x8E [0xA0 - 0xDF] single Shift Character 0x8E -\par 0x8F [0xA1 - 0xFE] [0xA1 - 0xFE] -\par -\par The sample text provided affects the outcome of the function. If the sample text is very small, there might be only one escape sequence which is not enough to give a 100% accurate answer. In order to gain more knowledge of different Character -set encoding, read chapter 4 of the following -\par -\par \ldblquote CJKV Information Processing\rdblquote - by Ken Lunde. -\par -\par }\pard\plain \s2\sb240\sa60\keepn\widctlpar\outlinelevel1\adjustright \b\i\f1\lang2057\cgrid {\i0\lang1033 Sample Code -\par }\pard\plain \widctlpar\adjustright \fs20\lang2057\cgrid { -\par Here are a few working samples of Autodetection. -\par -\par As mentioned earlier }{\cf1\lang1033 the }{\cf2\lang1033 aArrayOfCharacterSetsAvailable}{\cf1\lang1033 parameter is included for efficiency - }{\cf2\lang1033 CCnvCharacterSetConverter::AutoDetectCharacterSetL}{\cf1\lang1033 - could itself find the list of character-sets available but this is a very time-consuming task and the client may already have this information to hand. 
If the client does not have that information then the following creates an array of character sets - -\par -\par }{\cf2 RFs fileServerSession; -\par CleanupClosePushL(fileServerSession); -\par User::LeaveIfError(fileServerSession.Connect()); -\par CCnvCharacterSetConverter* characterSetConverter=CCnvCharacterSetConverter::NewLC(); -\par CArrayFix<CCnvCharacterSetConverter::SCharacterSet>* arrayOfCharacterSetsAvailable=characterSetConverter-> CreateArrayOfCharacterSetsAvailableLC(fileServerSession); -\par -\par }{\cf1 then the }{\cf1\lang1033 AutoDetectCharacterSetL}{\cf1 function can be used -\par -\par }{\cf2 _LIT8(KASCII, "The result I am expecting is that this is recognised as ASCII!"); -\par TInt Confidence = 0; -\par TUint Character = 0; -\par CCnvCharacterSetConverter::AutoDetectCharacterSetL (Confidence, Character, *arrayOfCharacterSetsAvailable, KASCII); -\par -\par }{\cf1 The sample text is ASCII encoded plain text. Confidence and Character are where the confidence level and the winning Character set are returned. -\par }{\cf2 -\par }{ -\par -\par -\par }{\cf1\lang1033 -\par }{ -\par }} \ No newline at end of file