searchengine/cpix/cpix/inc/public/cpixidxdb.h
author Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
Thu, 02 Sep 2010 21:37:32 +0300
changeset 18 3e1f76dd2722
parent 10 afe194b6b1cd
permissions -rw-r--r--
Revision: 201033 Kit: 201035

/*
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
* All rights reserved.
* This component and the accompanying materials are made available
* under the terms of "Eclipse Public License v1.0"
* which accompanies this distribution, and is available
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
*
* Initial Contributors:
* Nokia Corporation - initial contribution.
*
* Contributors:
*
* Description: 
*
*/

#ifndef CPIX_CPIXIDXDB_H
#define CPIX_CPIXIDXDB_H

#include <inttypes.h>

#include "cpixinit.h"
#include "cpixerror.h"
#include "cpixdoc.h"
#include "cpixanalyzer.h"
#include "cpixsearch.h"


/**
 * CPix API for searching.
 *
 * The CPix engine must be initialized before use and shut down
 * afterwards (see cpixinit.h).
 *
 * Currently, this engine supports only single-threaded access.
 *
 * Short description of lifetime management
 *
 * All the cpix_XXX structs in here are wrappers over native (clucene)
 * classes. Some functions result in creation of new native instances,
 * some other functions merely return reference pointers to existing
 * ones that are owned by some other instance. While the wrappers must
 * be deallocated in both cases, the native instances wrapped by them
 * must be deallocated in one case and must not be deallocated in the
 * other. Obviously, there will have to be two different ways of
 * deallocating wrappers. To make things simple, and also to be able
 * to express intent of lifetime management in function signatures,
 * the following rules are adopted.
 *
 *   (a) Returning C-strings (char*, wchar_t*) will always be
 *       documented if the ownership of the returned string passes to
 *       the caller.
 *
 *   (b) Returning a pointer to a cpix_XXX will always indicate that
 *       the native type and its cpix_XXX wrapper are newly created
 *       instances. In this case, the caller must make sure to clean
 *       them up (both the native and the wrapper) with a call to
 *       cpix_XXX_destroy(...). DO NOT USE free() or delete on these
 *       pointers!
 *
 *   (c) Some functions will take a pointer to a cpix_XXX instance
 *       and fill it up with a pointer to a native instance. This
 *       always indicates that the native instance is not a newly
 *       created one. Therefore, deallocation of such wrappers is not
 *       needed, and cpix_XXX_destroy must not be called. The most
 *       comfortable way to use these functions is to have the
 *       cpix_XXX wrapper instance as a local variable of a function
 *       (i.e. on the stack), and when the variable's lifetime is over
 *       everything will be fine. Of course, a client may create a
 *       cpix_XXX instance on the heap too, in which case it is the
 *       caller's responsibility to deallocate the dynamically
 *       allocated cpix_XXX instance with free() or delete (depending
 *       on how it was allocated).
 *
 *
 * CPIX VOCABULARY
 *
 * Application class or "appclass", for short, is simply a string
 * representation of a type of a document in an index. It has an
 * internal structure expressing hierarchy of those types. The most
 * generic type is "root", and all documents representing files are of
 * the type "root file" or a descendant type of "root file", like
 * "root file usrdoc text". See appclass-hierarchy.txt.
 *
 * Base application class or "baseappclass" is an appclass, that an
 * application / application domain deems significant enough to assign
 * its own application index to it. Like files are stored in an index
 * identified by base app class "root file". It means that ALL
 * documents that go to that index MUST be of the type "root file" or
 * some of its descendant, like "root file usrdoc text".
 *
 * Qualified base app class: an application index, that is a whole
 * conceptually, can be broken down to several physical index
 * databases, called volumes. The usual reason is performance: if you
 * have 10 thousand documents that you don't ever need to update, but
 * there are tens of documents that change frequently, you can
 * organize your application index into two volumes, and only one of
 * them is frequently updated. Such a physical index sometimes needs
 * explicit addressing, and this is achieved by qualifying the base
 * app class with a volume id. For instance, files harvested from the
 * d drive may be stored in a volume identified by qualified base app
 * class "@d:root file", where the volume identifier, 'd', is in the
 * beginning of the string.
 *
 * Unqualified base app class: the same as a (plain) base app class.
 *
 * Domain selector: a non-empty, comma-separated list of
 * appclasses. For example, the domain selector
 * "@d:root file,root msg phone sms", when given to the index searcher
 * constructor, will search those two indexes and nothing else.
 *
 */


#ifdef __cplusplus
extern "C" {
#endif

    
    /**
     * A type storing internal state for the excerpt processing
     * utility functions cpix_getExcerptOfWText and
     * cpix_getExcerptOfText, which are meant for processing text
     * streams and therefore carry state from one call to the next.
     *
     * Must be initialized by a call to cpix_init_EPIState() before it
     * is first passed to either of those functions. As this is
     * internal state, clients should treat the value as opaque.
     */
    typedef int  cpix_EPIState;

    
    /**
     * Initializes an excerpt processor internal state instance.
     *
     * @param state the state instance to initialize. This must be
     * done before cpix_getExcerptOfWText / cpix_getExcerptOfText is
     * invoked on a text stream for the first time.
     */
    void cpix_init_EPIState(cpix_EPIState * state);
    
    /**
     * Special locale value for cpix_SetLocale(): when it is given,
     * the locale is left to be determined automatically by CPix.
     * Only declared here; the definition lives elsewhere.
     */
    extern const char* cpix_LOCALE_AUTO;
    
    /**
     * Sets the locale used by CPix. The locale is used in indexing
     * and searching for the lexical analysis of text; text of
     * different languages may be treated differently. At this point,
     * the locale should only hold a language code following the ISO
     * 639-1 two letter format, or the ISO 639-2 three letter format
     * if a two letter format is not available. If cpix_LOCALE_AUTO is
     * given, the locale is left to be determined automatically by
     * CPix.
     *
     * @param result presumably the pointer through which error
     * details are communicated back, as for the other functions
     * taking a cpix_Result - TODO confirm against the implementation.
     *
     * @param locale the new locale. Should be a language code of the
     * ISO 639-1 standard, or cpix_LOCALE_AUTO.
     */
    void cpix_SetLocale(cpix_Result* result, const char* locale);

    /**
     * A simple utility function getting the first couple of words of
     * a text by compressing adjacent blank characters. Has a
     * signature that is geared toward processing text streams: the
     * counters and the state are updated in place, so that the
     * function can be invoked repeatedly on consecutive pieces of
     * the same stream.
     *
     * @param dst the destination buffer to copy to, must not be NULL
     *
     * @param src the original text to get the first couple of words
     * of, must not be NULL
     *
     * @param maxWords pointer to the maximum number of first words to
     * get. Must not be NULL, and it is updated: it is decreased with
     * the number of words read.
     *
     * @param bufSize pointer to the size of dst buffer. Must not be
     * NULL, and it is updated: it is decreased with the number of
     * characters written to dst (excluding the terminating zero).
     *
     * @param state as this function is meant for stream processing,
     * some state may have to be preserved across function
     * calls. Must not be NULL. Before invoking this function for a
     * text stream for the first time, it must be initialized by a
     * call to cpix_init_EPIState.
     *
     * @returns the pointer to the terminating zero at the end of the
     * string in dst.
     */
    wchar_t * cpix_getExcerptOfWText(wchar_t       * dst,
                                     const wchar_t * src,
                                     size_t        * maxWords,
                                     size_t        * bufSize,
                                     cpix_EPIState * state);


    /**
     * The very same semantics as that of cpix_getExcerptOfWText,
     * except the source string here is not a wide string: src is a
     * (narrow) char string, while the destination buffer dst is
     * still a wide character buffer.
     *
     * For the parameters and the return value, see
     * cpix_getExcerptOfWText.
     */
    wchar_t * cpix_getExcerptOfText(wchar_t       * dst,
                                    const char    * src,
                                    size_t        * maxWords,
                                    size_t        * bufSize,
                                    cpix_EPIState * state);



    /**
     * A field descriptor struct for schema registration purposes. To
     * be able to index some content with cpix_IdxDb, one must define
     * document schemas for it first. Fields of documents to be
     * indexed are described by this cpix_FieldDesc. An array of these
     * descriptors is what cpix_IdxDb_addSchema() takes.
     */
    struct cpix_FieldDesc
    {
        /**
         * Name of the field.
         */
        const wchar_t           * name_;

        /**
         * Use the cpix_Store and cpix_Index flags in here (from
         * cpixdoc.h).
         */
        int                       cfg_;
    };
    // The plain name cpix_FieldDesc is used in prototypes of this
    // header without the struct keyword. In C a struct tag alone is
    // not a type name, so the alias is needed for C clients; in C++
    // it is a harmless redeclaration.
    typedef struct cpix_FieldDesc cpix_FieldDesc;


    /************************************************************************
     * Class cpix_Term
     *
     * Wrapper over a native (CLucene) term instance. Instances are
     * created with cpix_Term_create() and destroyed with
     * cpix_Term_destroy().
     *
     * NOTE TO INTERNAL DEVELOPMENT: Native instances of this class
     * MUST be specially freed using _CLDECDELETE.
     */
    struct cpix_Term_
    {
        // pointer to native (CLucene) impl
        void       * ptr_;

        // Last error, if any, that resulted from executing the
        // last operation on this cpix_XXX object
        // Use macros cpix_Failed, cpix_Succeeded and cpix_ClearError.
        // Do not attempt releasing it.
        cpix_Error * err_;
    };
    // Spelled with the struct keyword so that the alias is also valid
    // when this header is compiled as C, where a struct tag alone is
    // not a type name. For C++ clients nothing changes.
    typedef struct cpix_Term_ cpix_Term;

    
    /**
     * Constructs a term with a field name and field value.
     *
     * @param result presumably the pointer through which error
     * details are communicated back, as for the other functions
     * taking a cpix_Result - TODO confirm against the implementation.
     *
     * @param fieldName the name of the field this term is about
     *
     * @param fieldValue the textual value of the field this term
     *        is about
     *
     * @return newly created Term object the ownership of which
     * is transferred to the caller, who must release it with
     * cpix_Term_destroy(). May fail, in which case NULL is returned.
     */
    cpix_Term * cpix_Term_create(cpix_Result     * result,
                                 const wchar_t   * fieldName,
                                 const wchar_t   * fieldValue);



    /**
     * Destroys this term instance, never fails.
     *
     * @param thisTerm the term to destroy, as obtained from
     * cpix_Term_create().
     */
    void cpix_Term_destroy(cpix_Term * thisTerm);


    /**
     * cpix_IdxDb
     *
     * Client side representation of an index database. Instances are
     * obtained from cpix_IdxDb_openDb() and given back with
     * cpix_IdxDb_releaseDb().
     */
    struct cpix_IdxDb_
    {
        // Instead of a pointer to native (CLucene) impl, there is a
        // handle here only.
        // NOTE(review): the original comment ended in the unfinished
        // sentence "The type here must be equivalent to" - the type
        // it was meant to name is not visible from this header.
        IdxDbHndl    handle_;

        // Last error, if any, that resulted from executing the
        // last operation on this cpix_XXX object
        // Use macros cpix_Failed, cpix_Succeeded and cpix_ClearError.
        // Do not attempt releasing it.
        cpix_Error * err_;
    };
    // Spelled with the struct keyword so that the alias is also valid
    // when this header is compiled as C, where a struct tag alone is
    // not a type name. For C++ clients nothing changes.
    typedef struct cpix_IdxDb_ cpix_IdxDb;
    
    
    /**
     * Tells cpix_IdxDb_openDb() whether to open an existing index
     * database or to (re-)create it even if it exists.
     */
    enum cpix_IDX_OpenMode_
        {
            // open the existing index database
            cpix_IDX_OPEN   = 0,

            // (re-)create the index database, even if it exists
            cpix_IDX_CREATE = 1
        };
    // Spelled with the enum keyword so that the alias is also valid
    // when this header is compiled as C, where an enum tag alone is
    // not a type name. For C++ clients nothing changes.
    typedef enum cpix_IDX_OpenMode_ cpix_IDX_OpenMode;


    /**
     * Returns a cpix_IdxDb instance through which a client can search
     * or modify the index database responsible for a given qualified
     * base app class. The qualified base app class must have an index
     * path associated to it (via a cpix_IdxDb_defineVolume call).
     *
     * @param result pointer through which error details are
     * communicated back
     *
     * @param qualBaseAppClass the qualified base application class,
     * that is the latest common application super class for all
     * documents that will be put into this index. See
     * appclass-hierarchy.txt for details. Index databases MUST have
     * unique qualified baseappclass values.
     *
     * @param openMode whether to open an existing index database
     * or (re-)create it even if it exists.
     *
     * @return pointer to the cpix_IdxDb instance that can be used for
     * searching and modifying the index. Give it back with
     * cpix_IdxDb_releaseDb() when it is no longer needed.
     */
    cpix_IdxDb * cpix_IdxDb_openDb(cpix_Result       * result,
                                   const char        * qualBaseAppClass,
                                   cpix_IDX_OpenMode   openMode);


    /**
     * Releases the native index database manipulator (from the caller
     * client). (Actually, it only decreases its reference count, as
     * the index database manipulator is kept around for performance
     * as long as it is needed.) It also frees up the wrapper, so
     * thisIdxDb must not be used after this call.
     *
     * @param thisIdxDb the instance to release, as obtained from
     * cpix_IdxDb_openDb().
     */
    void cpix_IdxDb_releaseDb(cpix_IdxDb * thisIdxDb);
    

    /**
     * Searches the index.
     *
     * NOTE: Keep the original query parser and query instances alive
     * as long as you use the result of the search - in general, they
     * may be needed.
     *
     * @param thisIdxDb the index db instance that should
     *        perform the search, must not be NULL.
     *
     * @param query the query itself in a structured form, must not be
     *        NULL, its ownership does not change.
     *
     * @return the hits instance with the search results, that will be
     * owned by the calling client.
     *
     * NOTE(review): failures are presumably reported through the
     * err_ member of thisIdxDb (see cpix_Failed / cpix_Succeeded) -
     * confirm against the implementation.
     */
    cpix_Hits * 
    cpix_IdxDb_search(cpix_IdxDb * thisIdxDb,
                      cpix_Query * query);
    
    
    // Opaque identifier of a schema registered with
    // cpix_IdxDb_addSchema(); not meant to be persisted.
    typedef size_t SchemaId; // TODO rename -> cpix_SchemaId

    /**
     * Adds a schema for efficient (bulk) addition or update. Instead
     * of communicating the schema implicitly every time a document is
     * created and populated with fields, we can define the field
     * names and their configurations only once, and use that later
     * (see cpix_IdxDb_add2() and cpix_IdxDb_update2()).
     *
     * @param thisIdxDb the index db instance to register the schema
     * to.
     *
     * @param fieldDescs array of field descriptors
     *
     * @param count the number of field descriptors in the array
     * fieldDescs.
     *
     * @return an opaque ID for the schema. Do not persist this
     * value. Currently it is guaranteed to be valid only between a
     * cpix_IdxDb_openDb call and a cpix_IdxDb_releaseDb call. The
     * schema ID is not valid for other cpix_IdxDb instances.
     */
    SchemaId cpix_IdxDb_addSchema(cpix_IdxDb           * thisIdxDb,
                                  const cpix_FieldDesc * fieldDescs,
                                  size_t                 count);


    /**
     * Adds a document to this index.
     *
     * @param thisIdxDb the index to add the document to
     *
     * @param document the document with all the mandatory, optional
     * and custom fields populated
     *
     * @param analyzer the analyzer to use for this document - MUST
     * NOT BE NULL.
     */
    void cpix_IdxDb_add(cpix_IdxDb      * thisIdxDb,
                        cpix_Document   * document,
                        cpix_Analyzer   * analyzer);

    /**
     * Adds a document (=set of fields) to the index, using a schema
     * registered earlier with cpix_IdxDb_addSchema().
     *
     * @param thisIdxDb the index to add to
     *
     * @param schemaId identifies the schema to use
     *
     * @param docUid document unique id - whatever string the document
     * can be identified with. Needed for deletion. It needs to be
     * unique only in the context of this index database. Mandatory
     * field. See global variable cpix_DOCUID_FIELD
     * (cpixdoc.h). Stored and indexed untokenized.
     *
     * @param appClass the application class of the document. See
     * appclass-hierarchy.txt for more details. Mandatory field. See
     * global variable cpix_APPCLASS_FIELD (cpixdoc.h). Stored and
     * indexed untokenized. NOTE: this value is supposed to be ASCII,
     * so it is accepted as char*, but the actual field is stored as
     * wchar_t*, and that is how it is returned on documents.
     *
     * @param excerpt a short textual extract of the document. It's up
     * to the client to figure out what short text can represent the
     * documents best when shown in search results. Mandatory
     * field. See global variable cpix_EXCERPT_FIELD
     * (cpixdoc.h). Stored, not indexed.
     *
     * @param mimeType contains document's mime-type to be identified
     * with the application, which would be able to view or edit this
     * document.  For example, Messaging Application is able to view
     * SMS messages. Application handling the data is chosen by the
     * system based on the given mime-type.  Optional field, can be
     * NULL. See global variable cpix_MIMETYPE_FIELD
     * (cpixdoc.h). Stored and indexed not tokenized.
     *
     * @param fieldValues the array with the field values. The length
     * of the array must be exactly the same as the length of the
     * corresponding schema, identified by schemaId. However,
     * individual pointers in the array may be NULL, indicating that
     * the corresponding field will not have any value. Field values
     * are dealt with according to the schema given: field names, and
     * configurations (indexed, stored, ...) are taken from the
     * schema.
     *
     * A special use case is when the field value is not an actual
     * field value but a file system path. In this case the
     * corresponding cpix_FieldDesc defines the chain of filters that
     * needs to be applied to obtain the actual textual content for
     * the field. See cpix_FieldDesc.
     * NOTE(review): the cpix_FieldDesc declared in this header only
     * has the name_ and cfg_ members - confirm how the filter chain
     * is expressed.
     *
     * @param analyzer the analyzer to use for this document. Must NOT
     * BE NULL (currently there is no default analyzer defined for an
     * index database).
     *
     * NOTE: after a lot of additions, it is worth calling
     * cpix_IdxDb_optimize(), otherwise the search may be (much)
     * slower.
     *
     */
    void cpix_IdxDb_add2(cpix_IdxDb      * thisIdxDb,
                         SchemaId          schemaId,
                         const wchar_t   * docUid,
                         const char      * appClass,
                         const wchar_t   * excerpt,
                         const wchar_t   * mimeType,
                         const wchar_t  ** fieldValues,
                         cpix_Analyzer   * analyzer);
    

    /**
     * Deletes the document from the index using the document ID value
     * of the system field cpix_DOCUID_FIELD. To delete based on an
     * arbitrary field, use cpix_IdxDb_deleteDocuments2().
     *
     * @param thisIdxDb the index database to delete from
     *
     * @param docUid the unique id of the document to delete
     *
     * @return the number of documents deleted
     */
    int32_t cpix_IdxDb_deleteDocuments(cpix_IdxDb    * thisIdxDb,
                                       const wchar_t * docUid);

    /**
     * Deletes documents based on an arbitrary field. To delete a
     * document by its document ID, cpix_IdxDb_deleteDocuments() can
     * be used instead.
     *
     * @param thisIdxDb the index database to delete from
     *
     * @param term the field name/value pair to match documents for
     * deletion (see cpix_Term_create())
     *
     * @return the number of documents deleted
     */
    int32_t cpix_IdxDb_deleteDocuments2(cpix_IdxDb    * thisIdxDb,
                                        cpix_Term     * term);


    /**
     * Updates a document. Conceptually, it is equivalent to deleting
     * the document if it existed and re-adding it.
     *
     * @param thisIdxDb the index to update the document in
     *
     * @param document the document with all the mandatory, optional
     * and custom fields populated
     *
     * @param analyzer the analyzer to use for this document - MUST
     * NOT BE NULL.
     */
    void cpix_IdxDb_update(cpix_IdxDb      * thisIdxDb,
                           cpix_Document   * document,
                           cpix_Analyzer   * analyzer);

    /**
     * Updates a document (=set of fields) in the index. Conceptually,
     * it is equivalent to deleting the document if it existed and
     * re-adding it.
     *
     * The parameter list is the same as that of cpix_IdxDb_add2();
     * for the parameters, see the comments of cpix_IdxDb_add2().
     */
    void cpix_IdxDb_update2(cpix_IdxDb      * thisIdxDb,
                            SchemaId          schemaId,
                            const wchar_t   * docUid,
                            const char      * appClass,
                            const wchar_t   * excerpt,
                            const wchar_t   * mimeType,
                            const wchar_t  ** fieldValues,
                            cpix_Analyzer   * analyzer);


    /**
     * Sets the physical maximum size of the RAM buffer, where the
     * indexed documents are saved before flushing them into the
     * actual index database. Having a higher value will increase
     * memory consumption, but will speed up indexing operations.
     *
     * @param thisIdxDb the index database to set the maximum insert
     * buffer size for
     *
     * @param value the maximum number of bytes the RAM buffer can
     * hold before flushing
     *
     * NOTE: see cpixinit.h, cpix_InitParams_setMaxInsertBufSize()
     * - that call sets the general, default buffer size, while this
     * one sets the same value only on thisIdxDb.
     */
    void cpix_IdxDb_setMaxInsertBufSize(cpix_IdxDb * thisIdxDb,
                                        size_t       value);



    /**
     * Forces this index database to write its state (including the
     * insert buffer's state) to the disk.
     *
     * Do NOT use this operation lightly, as it is a costly
     * operation. In fact, there is only one (more-or-less) justified
     * use case for this call: a harvester, at the end of its
     * harvesting session, wanting to make sure of a quick update
     * before the regular cpix_doHousekeep() is run.
     *
     * @param thisIdxDb the index database to flush to the hard drive.
     */
    void cpix_IdxDb_flush(cpix_IdxDb * thisIdxDb);

    

    /**
     * Scraps all information about what base app classes and what
     * index databases there are, but only if there are no index db
     * handlers in use. Meant to be used only for unit testing. May
     * fail.
     *
     * @param result pointer through which error details are
     * communicated back
     *
     */
    void cpix_IdxDb_dbgScrapAll(cpix_Result * result);
    
    
    
    /**
     * Defines a base app class - path association. If the path
     * already holds the index, then it can be readily used by a
     * cpix_IdxDb_openDb call (so it can have mounting semantics).
     *
     * TODO this signature should evolve to take a human readable nice
     * display name for the volume. Actually, a human readable name
     * for the application domain as well as a human readable name for
     * the volume itself.
     *
     * @param result since it may fail, errors are reported through
     * this argument
     *
     * @param qualBaseAppClass the qualified base app class that will
     * be used to access the volume being defined now. Must NOT be
     * NULL. The same qualified base app class must not already be
     * associated to another path (redefinition to the same path is
     * allowed (= NOP)).
     *
     * @param path the path to the index volume to use. CPix will
     * access the index from under here. If the directory and/or
     * usable index does not exist, they will be created (empty
     * state). If the index exists but is corrupt, it will be
     * re-created. If the index exists and is usable, it will be kept
     * for use. NOTE: later calls to cpix_IdxDb_openDb with creation
     * mode are possible, discarding whatever state the existing index
     * db was in.
     */
    void cpix_IdxDb_defineVolume(cpix_Result * result,
                                 const char  * qualBaseAppClass,
                                 const char  * path);

    
    /**
     * Undefines a volume (does not delete it, only forgets about it).
     * If there was no such volume, nothing is done.
     *
     * @param qualBaseAppClass the qualified base app class of the
     * volume to forget, as given to cpix_IdxDb_defineVolume().
     */
    void cpix_IdxDb_undefineVolume(const char * qualBaseAppClass);


#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */


#endif