/*--

 Copyright 2001, 2002 Elliotte Rusty Harold.
 All rights reserved.

    This file is part of XIncluder, a Java class library for integrating XInclude
    processing with SAX, DOM, and JDOM. 

    XIncluder is free software; you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License version 2.1 
    as published by the Free Software Foundation.

    XIncluder is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public License
    along with XIncluder; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

 THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED
 WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 DISCLAIMED.  IN NO EVENT SHALL ELLIOTTE RUSTY HAROLD OR ANY
 OTHER CONTRIBUTORS TO THIS PACKAGE
 BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 SUCH DAMAGE.

 */

package com.elharo.xml.xinclude;

import java.io.IOException;
import java.io.InputStreamReader;
import java.io.InputStream;

/**
 * <p>
 * <code>EncodingHeuristics</code> reads from a stream
 * (which should be buffered) and attempts to guess
 * what the encoding of the text in the stream is.
 * Byte order marks are stripped from the stream.
 * If it fails to determine the type of the encoding,
 * it returns the default UTF-8.
 * </p>
 *
 * @author Elliotte Rusty Harold
 * @version 1.0d9, July 4, 2002
 */
public class EncodingHeuristics {

    /** Encoding name reported whenever nothing better can be determined. */
    private static final String DEFAULT_ENCODING = "UTF-8";

    /**
     * Maximum number of bytes inspected. It doubles as the mark read limit,
     * so the method never reads past the point to which it can still reset.
     */
    private static final int LOOKAHEAD = 1024;

    // No instances allowed
    private EncodingHeuristics() {}

    /**
     * <p>
     * Guesses the character encoding of the text in a stream by inspecting
     * its first bytes. A byte order mark is checked for first; failing that,
     * the byte pattern of a leading <code>&lt;</code> or <code>&lt;?</code>
     * in the 16-bit and 32-bit encodings; failing that, the encoding
     * pseudo-attribute of a leading XML or text declaration.
     * </p>
     *
     * <p>
     * If a byte order mark is recognized, the stream is left positioned
     * immediately after it. In every other case the stream is reset to
     * where it was when this method was invoked. At most 1024 bytes are
     * read, so an encoding declaration that ends later than that is
     * not seen.
     * </p>
     *
     * <p>
     * An <code>IOException</code> thrown while reading is not propagated;
     * the stream is reset and the default is returned instead.
     * </p>
     *
     * @param in   <code>InputStream</code> to read from. It has to support
     *             <code>mark()</code> and <code>reset()</code>.
     * @return String  The name of the encoding; <code>"UTF-8"</code> if
     *                 none could be determined.
     * @throws IOException if the stream cannot be reset back to where it was when
     *                     the method was invoked.
     */
    public static String readEncodingFromStream(InputStream in)
      throws IOException {

        in.mark(LOOKAHEAD);

        String encoding = DEFAULT_ENCODING;
        try {
            int byte1 = in.read();
            int byte2 = in.read();
            // The byte order mark branches return without resetting, so the
            // mark is not handed on to whoever decodes the rest of the stream.
            if (byte1 == 0xFE && byte2 == 0xFF) {
                return "UnicodeBig";
            }
            if (byte1 == 0xFF && byte2 == 0xFE) {
                return "UnicodeLittle";
            }

            // NOTE: no Unicode normalization checking is done here; that
            // is left to whoever actually decodes the text.

            int byte3 = in.read();
            // UTF-8 byte order mark
            if (byte1 == 0xEF && byte2 == 0xBB && byte3 == 0xBF) {
                return "UTF-8";
            }

            int byte4 = in.read();
            // NOTE(review): both four-byte marks below report the same name,
            // so the caller cannot tell the byte order apart. XML 1.0
            // Appendix F gives FF FE 00 00 as the little-endian UCS-4 mark,
            // and that sequence is already claimed by the UnicodeLittle test
            // above. Kept as is to preserve the returned names and the
            // stream position callers see -- confirm before changing.
            if (byte1 == 0x00 && byte2 == 0x00 && byte3 == 0xFE && byte4 == 0xFF) {
                return "UCS-4";
            }
            if (byte1 == 0x00 && byte2 == 0x00 && byte3 == 0xFF && byte4 == 0xFE) {
                return "UCS-4";
            }

            // No byte order mark present. Recognize the encoding family from
            // the way a leading less-than sign is laid out in the bytes.
            if (byte1 == 0x00 && byte2 == 0x00 && byte3 == 0x00 && byte4 == '<') {
                encoding = "UCS-4";
            }
            else if (byte1 == '<' && byte2 == 0x00 && byte3 == 0x00 && byte4 == 0x00) {
                encoding = "UCS-4";
            }
            else if (byte1 == 0x00 && byte2 == '<' && byte3 == 0x00 && byte4 == '?') {
                encoding = "UnicodeBigUnmarked";
            }
            else if (byte1 == '<' && byte2 == 0x00 && byte3 == '?' && byte4 == 0x00) {
                encoding = "UnicodeLittleUnmarked";
            }
            else if (byte1 == '<' && byte2 == '?' && byte3 == 'x' && byte4 == 'm') {
                // ASCII compatible, must read the encoding declaration
                byte[] data = new byte[LOOKAHEAD];
                data[0] = (byte) byte1;
                data[1] = (byte) byte2;
                data[2] = (byte) byte3;
                data[3] = (byte) byte4;
                int length = readDeclaration(in, data, 4);
                // Use Latin-1 (ISO-8859-1) because it's ASCII compatible and
                // all byte sequences are legal Latin-1 sequences, so bytes
                // past the end of the declaration cannot cause decoding errors
                encoding = parseDeclaredEncoding(new String(data, 0, length, "8859_1"));
            }
            // An EBCDIC declaration (0x4C 0x6F 0xA7 0x94) is not recognized;
            // such a stream gets the default like any other unknown start.
        }
        catch (IOException ignored) {
            // Deliberate best effort: a failed read means the encoding could
            // not be determined, so fall back on the default.
            encoding = DEFAULT_ENCODING;
        }

        in.reset();
        return encoding;

    }

    /**
     * Reads into <code>buffer</code>, starting at index <code>filled</code>,
     * until the buffer is full, the stream ends, or the bytes read so far
     * contain the <code>?&gt;</code> that closes a declaration. A loop is
     * needed because a single read may deliver fewer bytes than requested.
     *
     * @return the number of valid bytes in <code>buffer</code>
     */
    private static int readDeclaration(InputStream in, byte[] buffer, int filled)
      throws IOException {

        int scanned = filled;
        while (filled < buffer.length) {
            int count = in.read(buffer, filled, buffer.length - filled);
            if (count <= 0) {
                break; // end of stream (or a stream that makes no progress)
            }
            filled += count;
            // Start one byte back in case "?>" straddles two reads
            for (int i = Math.max(scanned - 1, 0); i + 1 < filled; i++) {
                if (buffer[i] == '?' && buffer[i + 1] == '>') {
                    return filled;
                }
            }
            scanned = filled;
        }
        return filled;

    }

    /**
     * Extracts the value of the encoding pseudo-attribute from the XML or
     * text declaration at the start of <code>text</code>. Only the
     * declaration itself is searched, so the word "encoding" appearing
     * later in the document is never mistaken for it.
     *
     * @return the declared encoding name, or the default if there is no
     *         declaration, no encoding in it, or it is malformed
     */
    private static String parseDeclaredEncoding(String text) {

        // "<?xml" must be followed by white space; anything else, such as
        // "<?xml-stylesheet", is an ordinary processing instruction
        if (text.length() < 6 || !text.startsWith("<?xml")
          || !isWhiteSpace(text.charAt(5))) {
            return DEFAULT_ENCODING;
        }

        // If the closing "?>" was not within the bytes read,
        // work with whatever part of the declaration there is
        int end = text.indexOf("?>");
        if (end < 0) {
            end = text.length();
        }

        int position = text.indexOf("encoding");
        if (position < 0 || position >= end) {
            return DEFAULT_ENCODING; // no encoding declared
        }

        position = skipWhiteSpace(text, position + 8, end);
        if (position >= end || text.charAt(position) != '=') { // malformed
            return DEFAULT_ENCODING;
        }

        position = skipWhiteSpace(text, position + 1, end);
        if (position >= end) { // malformed
            return DEFAULT_ENCODING;
        }
        char delimiter = text.charAt(position);
        if (delimiter != '\'' && delimiter != '"') { // malformed
            return DEFAULT_ENCODING;
        }

        int close = text.indexOf(delimiter, position + 1);
        if (close < 0 || close >= end) { // unterminated value
            return DEFAULT_ENCODING;
        }

        String name = text.substring(position + 1, close);
        return name.length() == 0 ? DEFAULT_ENCODING : name;

    }

    /**
     * Returns the index of the first character at or after
     * <code>position</code> that is not white space, or <code>end</code>
     * if there is none before <code>end</code>.
     */
    private static int skipWhiteSpace(String text, int position, int end) {
        while (position < end && isWhiteSpace(text.charAt(position))) {
            position++;
        }
        return position;
    }

    /** Tests for the four characters that count as white space here. */
    private static boolean isWhiteSpace(char c) {
        return c == ' ' || c == '\t' || c == '\r' || c == '\n';
    }

}
