charconvfw/Charconv/ongoing/Group/Autodetect.rtf
author William Roberts <williamr@symbian.org>
Mon, 08 Mar 2010 21:45:11 +0000
branchCompilerCompatibility
changeset 7 3969f087709d
parent 0 1fb32624e06b
permissions -rw-r--r--
Create CompilerCompatibility branch
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
0
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
     1
{\rtf1\ansi\ansicpg1252\uc1 \deff0\deflang1033\deflangfe1033{\fonttbl{\f0\froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\f1\fswiss\fcharset0\fprq2{\*\panose 020b0604020202020204}Arial;}
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
     2
{\f79\froman\fcharset238\fprq2 Times New Roman CE;}{\f80\froman\fcharset204\fprq2 Times New Roman Cyr;}{\f82\froman\fcharset161\fprq2 Times New Roman Greek;}{\f83\froman\fcharset162\fprq2 Times New Roman Tur;}
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
     3
{\f84\froman\fcharset186\fprq2 Times New Roman Baltic;}{\f85\fswiss\fcharset238\fprq2 Arial CE;}{\f86\fswiss\fcharset204\fprq2 Arial Cyr;}{\f88\fswiss\fcharset161\fprq2 Arial Greek;}{\f89\fswiss\fcharset162\fprq2 Arial Tur;}
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
     4
{\f90\fswiss\fcharset186\fprq2 Arial Baltic;}}{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
     5
\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;}{\stylesheet{\widctlpar\adjustright \fs20\lang2057\cgrid \snext0 Normal;}{
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
     6
\s2\sb240\sa60\keepn\widctlpar\adjustright \b\i\f1\lang2057\cgrid \sbasedon0 \snext0 heading 2;}{\*\cs10 \additive Default Paragraph Font;}}{\info{\title EPOC Charconv - Autodetection }{\author Arunsakh Sachamuneewongse}{\operator Symbian}
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
     7
{\creatim\yr2000\mo12\dy22\hr11\min55}{\revtim\yr2001\mo1\dy4\hr11\min1}{\version5}{\edmins47}{\nofpages3}{\nofwords1234}{\nofchars7039}{\*\company SYMBIAN}{\nofcharsws0}{\vern113}}\paperw11906\paperh16838 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
     8
\widowctrl\ftnbj\aenddoc\lytprtmet\hyphcaps0\formshade\viewkind1\viewscale100\pgbrdrhead\pgbrdrfoot \fet0\sectd \linex0\endnhere\sectdefaultcl {\*\pnseclvl1\pnucrm\pnstart1\pnindent720\pnhang{\pntxta .}}{\*\pnseclvl2\pnucltr\pnstart1\pnindent720\pnhang
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
     9
{\pntxta .}}{\*\pnseclvl3\pndec\pnstart1\pnindent720\pnhang{\pntxta .}}{\*\pnseclvl4\pnlcltr\pnstart1\pnindent720\pnhang{\pntxta )}}{\*\pnseclvl5\pndec\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl6\pnlcltr\pnstart1\pnindent720\pnhang
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    10
{\pntxtb (}{\pntxta )}}{\*\pnseclvl7\pnlcrm\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl8\pnlcltr\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl9\pnlcrm\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}\pard\plain 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    11
\s2\sb240\sa60\keepn\widctlpar\outlinelevel1\adjustright \b\i\f1\lang2057\cgrid {\i0 EPOC Charconv - Autodetection 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    12
\par }\pard\plain \widctlpar\adjustright \fs20\lang2057\cgrid {
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    13
\par Author: Arunsakh Sachamuneewongse
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    14
\par Contributors: David Batchelor. 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    15
\par Date: 19th October 2000 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    16
\par 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    17
\par }\pard\plain \s2\sb240\sa60\keepn\widctlpar\outlinelevel1\adjustright \b\i\f1\lang2057\cgrid {\i0 Introduction
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    18
\par }\pard\plain \widctlpar\adjustright \fs20\lang2057\cgrid {
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    19
\par }\pard\plain \s2\sb240\sa60\keepn\widctlpar\outlinelevel1\adjustright \b\i\f1\lang2057\cgrid {\i0 Charconv
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    20
\par }\pard\plain \widctlpar\adjustright \fs20\lang2057\cgrid {Charconv seemed the ideal place to implement the new Autodetect functionality, as autodetect can use Charconv\rquote 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    21
s information regarding all the different character sets. Since autodetect is implemented in Charconv, the autodetect framework is able to detect Charconv\rquote 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    22
s built-in character sets, which are ASCII, SMS7Bit, CP1252, ISO 8859-1, UTF7 & UTF8. Besides these character sets it also provides a support framework for Charconv\rquote 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    23
s plug-in character sets such as GBK, SHIFT-JIS, ISO2022JP and ISO 8859-2 to 8859-9, although only the Japanese and the Chinese character sets have been implemented in autodetect for 6.1. 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    24
\par 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    25
\par 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    26
\par The following static function has been added to }{\cf2\lang1033 CCnvCharacterSetConverter 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    27
\par 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    28
\par }\pard \sl240\slmult0\widctlpar\adjustright {\cf2\lang1033 IMPORT_C static void CCnvCharacterSetConverter}{\b\cf2\lang1033 ::}{\cf2\lang1033 AutoDetectCharacterSetL(
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    29
\par \tab TInt& aConfidenceLevel,
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    30
\par \tab TUint& aCharacterSetIdentifier,
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    31
\par \tab const CArrayFix<SCharacterSet>& aArrayOfCharacterSetsAvailable,
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    32
\par }\pard \widctlpar\adjustright {\cf2\lang1033 \tab const TDesC8& aSample)}{
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    33
\par 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    34
\par }{\cf1\lang1033 The first two parameters are both output-only parameters: }{\cf2\lang1033 aConfidenceLevel}{\cf1\lang1033  is set by the function to a value between 0 and 100 (inclusive), where 0 means "I have no idea what character set }{\cf2\lang1033 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    35
aSample}{\cf1\lang1033  is in" (in which case }{\cf2\lang1033 aCharacterSetIdentifier}{\cf1\lang1033  is undefined), and 100 means "I have total confidence that }{\cf2\lang1033 aCharacterSetIdentifier}{\cf1\lang1033  is the character-set of }{
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    36
\cf2\lang1033 aSample}{\cf1\lang1033 ". The }{\cf2\lang1033 aArrayOfCharacterSetsAvailable}{\cf1\lang1033  parameter is included for efficiency - }{\cf2\lang1033 CCnvCharacterSetConverter::AutoDetectCharacterSetL}{\cf1\lang1033 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    37
 could itself find the list of character-sets available but this is a very time-consuming task and the client may already have this information to hand.}{
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    38
\par 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    39
\par }\pard \sl240\slmult0\widctlpar\adjustright {The Charconv framework provides support for a few character sets built into it. These character sets
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    40
 are ASCII, SMS7Bit, CP1252, ISO 8859-1, UTF7 & UTF8. The framework also provides support for many other character sets. This support is provided by plug-in DLLs. At the CHARCONV }{\cf1\lang1033 plug-in DLL interface,}{ }{\cf1\lang1033 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    41
the first reserved function (at ordinal 4) will be replaced with a new exported function as follows:
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    42
\par 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    43
\par }{\cf2\lang1033 IMPORT_C TBool IsInThisCharacterSetL(
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    44
\par \tab TBool& aSetToTrue,
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    45
\par \tab TInt& aConfidenceLevel,
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    46
\par \tab const TDesC8& aSample)
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    47
\par 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    48
\par }\pard \widctlpar\adjustright {\cf1\lang1033 The }{\cf2\lang1033 aSetToTrue}{\cf1\lang1033  parameter is just to indicate to }{\cf2\lang1033 CCnvCharacterSetConverter::AutoDetectCharacterSetL}{\cf1\lang1033  th
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    49
at the plug-in DLL is actually implementing a function of this signature and not the old first reserved function. The meanings of the remaining parameters are the same as the meanings of the corresponding parameters in }{\cf2\lang1033 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    50
CCnvCharacterSetConverter::AutoDetectCharacterSetL}{\cf1\lang1033 .}{
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    51
\par 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    52
\par 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    53
\par }\pard\plain \s2\sb240\sa60\keepn\widctlpar\outlinelevel1\adjustright \b\i\f1\lang2057\cgrid {\i0\cf1\lang1033 CCnvCharacterSetConverter::AutoDetectCharacterSetL
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    54
\par }\pard\plain \widctlpar\adjustright \fs20\lang2057\cgrid {
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    55
\par Internally AutoDetectCharacterSetL loops through the available }{\i\cf1 a}{\i\cf1\lang1033 ArrayOfCharacterSetsAvailable }{\cf1\lang1033 and checks whether that character set is a plug-in supported one. If it is, then it loads the 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    56
corresponding plug-in DLL and calls the DLL's exported function at ordinal 4 (checking }{\cf2\lang1033 aSetToTrue}{\cf1\lang1033  to verify that the plug-in DLL is actually implementing this function as }{\cf2\lang1033 IsInThisCharacterSetL}{
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    57
\cf1\lang1033  - it may still be implementing it as }{\cf2\lang1033 Reserved_1}{\cf1\lang1033  which is perfectly legitimate).}{\i\cf1\lang1033  IsInThisCharacterSetL}{\cf1\lang1033   then sets a }{\i\cf1\lang1033 confidence level}{\cf1\lang1033 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    58
 for that character set for the sample. If the character set is not one that is supported by the plug-in libraries, it then loops through the list of character sets that are supported inte
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    59
rnally by Charconv and calls their character-set-specific functions, e.g. IsCharacterSetAscii, IsCharacterSetSMS7Bit etc. These functions also set a confidence level for that character set for the sample. Once it has looped through the entire array of }{
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    60
\i\cf1 a}{\i\cf1\lang1033 ArrayOfCharacterSetsAvailable }{\cf1\lang1033 the character set with the highest confidence level is returned as the character set encoding the sample text is encoded in. 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    61
\par 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    62
\par If the maximum confidence-level encountered in this loop is less than some threshold (say, 25), then }{\cf2\lang1033 CCnvCharacterSetConverter::AutoDetectCharacterSetL}{\cf1\lang1033  resorts to attempting to convert }{\cf2\lang1033 aSample}{
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    63
\cf1\lang1033  to each of the character sets in }{\cf2\lang1033 aArrayOfCharacterSetsAvailable}{\cf1\lang1033 . The character set that generates fewest }{\cf2\lang1033 0xfffd}{\cf1\lang1033  Unicode "REPLACEMENT CHARACTER"-s wins, with }{\cf2\lang1033 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    64
aConfidenceLevel}{\cf1\lang1033  being set according to some formula based on the difference between the number of }{\cf2\lang1033 0xfffd}{\cf1\lang1033 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    65
 characters generated by that character set and its nearest rival. If the maximum confidence level encountered (in the loop mentioned above) is shared by more than one character set, it will make the best guess possible.
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    66
\par 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    67
\par If there is more than one character set encoding with the same confidence level, }{\cf2\lang1033 CCnvCharacterSetConverter::AutoDetectCharacterSetL}{\cf1\lang1033  will convert the sample text }{\cf2\lang1033 aSample}{\cf1\lang1033
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    68
  in each character set encoding with the same confidence level and the one with the least number of Unicode replacement characters wins. 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    69
\par 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    70
\par }\pard\plain \s2\sb240\sa60\keepn\widctlpar\outlinelevel1\adjustright \b\i\f1\lang2057\cgrid {\i0\cf1\lang1033 IsInThisCharacterSetL
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    71
\par }\pard\plain \widctlpar\adjustright \fs20\lang2057\cgrid {
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    72
\par Each of the character sets supported by the plug-ins has its own implementation of }{\i\cf2\lang1033 IsInThisCharacterSetL.}{\i\cf1\lang1033  }{\cf1\lang1033 This is 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    73
achieved by using signature sequences associated to a language/  encoding which is the same technique employed in virus detection. One such example would be if the encoding was a Modal Encoding - where escape characters/sequences or special characters are
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    74
 used to switch between different character modes (1-byte, 2-byte or another character set). For example, ISO2022JP is a modal encoding. The following are the escape sequences and their meanings. 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    75
\par 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    76
\par Hex values of keys that make up the escape sequence. ISO2022JP is a multi-byte encoding. The range for the 1}{\cf1\lang1033\super st}{\cf1\lang1033  byte and the 2}{\cf1\lang1033\super nd}{\cf1\lang1033 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    77
 byte are the same, i.e. between the hex value 0x21 & 0x7E.  
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    78
\par 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    79
\par }{\lang1033 0x1B 0x28 0x42  - ASCII escape sequence. 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    80
\par 0x1B 0x28 0x4A - JIS-Roman escape sequence. 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    81
\par 0x1B 0x24 0x40 - JIS C 6226-1978. 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    82
\par \'85 etc. 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    83
\par 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    84
\par The escape sequences tell what character set the text following the escape sequence is encoded in. 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    85
\par 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    86
\par So in }{\cf2\lang1033 AutoDetectCharacterSetL}{\lang1033  when ISO2022 is the character set in }{\cf2\lang1033 aArrayOfCharacterSetsAvailable, }{\cf1\lang1033 the }{\cf2\lang1033 IsInThisCharacterSetL}{\i\cf2\lang1033  }{\cf1\lang1033 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    87
implementation for the ISO2022JP\rquote s p
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    88
lug-in is called. In that function, it is checked that all the characters that make up the sample text are in range, if it is not in range, i.e. if there exist characters whose values are greater than 0x7e then the sample text is not ISO2022JP. So we can 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    89
g
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    90
ive it a confidence level of 0. Then it looks for the escape sequences. The presence of the escape sequences that are unique (not completely unique though. For example ISO2022JP1 is exactly the same as ISO2022JP + an extra escape sequence)  to this charac
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    91
ter set increases the possibility of the sample text being encoded in this character set encoding. 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    92
\par 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    93
\par EUCJP encoding uses shift characters. 0x8E and 0x8F are the single shift characters. The range of the encoded characters + the single shift character tells whether that byte is part of a 2-byte character. Here are the ranges of EUCJP
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    94
\par 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    95
\par 0x00 - 0x7F   
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    96
\par [0xA1 - 0xFE] [0xA1 - 0xFE]  2 byte character with the range of each byte shown in brackets. 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    97
\par 0x8E [0xA0 - 0xDF]  single Shift Character 0x8E 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    98
\par 0x8F [0xA1 - 0xFE] [0xA1 - 0xFE]  single Shift Character 0x8F 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    99
\par 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   100
\par The sample text provided affects the outcome of the function. If the sample text is very small, there might be only one escape sequence which is not enough to give a 100% accurate answer. In order to gain more knowledge of different Character 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   101
set encoding, read  chapter 4 of the following 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   102
\par 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   103
\par \ldblquote CJKV Information Processing\rdblquote  - by Ken Lunde. 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   104
\par 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   105
\par }\pard\plain \s2\sb240\sa60\keepn\widctlpar\outlinelevel1\adjustright \b\i\f1\lang2057\cgrid {\i0\lang1033 Sample Code
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   106
\par }\pard\plain \widctlpar\adjustright \fs20\lang2057\cgrid {
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   107
\par Here are a few working samples of Autodetection. 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   108
\par 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   109
\par As mentioned earlier }{\cf1\lang1033 the }{\cf2\lang1033 aArrayOfCharacterSetsAvailable}{\cf1\lang1033  parameter is included for efficiency - }{\cf2\lang1033 CCnvCharacterSetConverter::AutoDetectCharacterSetL}{\cf1\lang1033 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   110
 could itself find the list of character-sets available but this is a very time-consuming task and the client may already have this information to hand. If the client does not have that information then the following creates an array of character sets
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   111
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   112
\par 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   113
\par }{\cf2 RFs fileServerSession;
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   114
\par CleanupClosePushL(fileServerSession);
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   115
\par User::LeaveIfError(fileServerSession.Connect());
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   116
\par CCnvCharacterSetConverter* characterSetConverter=CCnvCharacterSetConverter::NewLC();
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   117
\par CArrayFix<CCnvCharacterSetConverter::SCharacterSet>* arrayOfCharacterSetsAvailable=characterSetConverter-> CreateArrayOfCharacterSetsAvailableLC(fileServerSession);
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   118
\par 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   119
\par }{\cf1 then the }{\cf1\lang1033 AutoDetectCharacterSetL}{\cf1  function can be used
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   120
\par 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   121
\par }{\cf2 _LIT8(KASCII, "The result I am expecting is that this is recognised as ASCII!");
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   122
\par TInt Confidence = 0;
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   123
\par TUint Character = 0; 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   124
\par CCnvCharacterSetConverter::AutoDetectCharacterSetL (Confidence, Character, *arrayOfCharacterSetsAvailable, KASCII);
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   125
\par 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   126
\par }{\cf1 The sample text is ASCII encoded plain text. Confidence and Character are where the confidence level and the winning Character set are returned. 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   127
\par }{\cf2 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   128
\par }{ 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   129
\par 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   130
\par 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   131
\par }{\cf1\lang1033 
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   132
\par }{
1fb32624e06b Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   133
\par }}