persistentstorage/sqlite3api/SQLite/btree.c
changeset 0 08ec8eefde2f
equal deleted inserted replaced
-1:000000000000 0:08ec8eefde2f
       
     1 /*
       
     2 ** 2004 April 6
       
     3 **
       
     4 ** The author disclaims copyright to this source code.  In place of
       
     5 ** a legal notice, here is a blessing:
       
     6 **
       
     7 **    May you do good and not evil.
       
     8 **    May you find forgiveness for yourself and forgive others.
       
     9 **    May you share freely, never taking more than you give.
       
    10 **
       
    11 *************************************************************************
       
    12 ** $Id: btree.c,v 1.524 2008/09/30 17:18:17 drh Exp $
       
    13 **
       
     14 ** This file implements an external (disk-based) database using BTrees.
       
    15 ** See the header comment on "btreeInt.h" for additional information.
       
    16 ** Including a description of file format and an overview of operation.
       
    17 */
       
    18 #include "btreeInt.h"
       
    19 
       
    20 /*
       
    21 ** The header string that appears at the beginning of every
       
    22 ** SQLite database.
       
    23 */
       
    24 static const char zMagicHeader[] = SQLITE_FILE_HEADER;
       
    25 
       
    26 /*
       
    27 ** Set this global variable to 1 to enable tracing using the TRACE
       
    28 ** macro.
       
    29 */
       
    30 #if 0
       
    31 int sqlite3BtreeTrace=0;  /* True to enable tracing */
       
    32 # define TRACE(X)  if(sqlite3BtreeTrace){printf X;fflush(stdout);}
       
    33 #else
       
    34 # define TRACE(X)
       
    35 #endif
       
    36 
       
    37 /*
       
    38 ** Sometimes we need a small amount of code such as a variable initialization
       
    39 ** to setup for a later assert() statement.  We do not want this code to
       
    40 ** appear when assert() is disabled.  The following macro is therefore
       
    41 ** used to contain that setup code.  The "VVA" acronym stands for
       
    42 ** "Verification, Validation, and Accreditation".  In other words, the
       
    43 ** code within VVA_ONLY() will only run during verification processes.
       
    44 */
       
    45 #ifndef NDEBUG
       
    46 # define VVA_ONLY(X)  X
       
    47 #else
       
    48 # define VVA_ONLY(X)
       
    49 #endif
       
    50 
       
    51 
       
    52 
       
    53 #ifndef SQLITE_OMIT_SHARED_CACHE
       
    54 /*
       
    55 ** A list of BtShared objects that are eligible for participation
       
    56 ** in shared cache.  This variable has file scope during normal builds,
       
    57 ** but the test harness needs to access it so we make it global for 
       
    58 ** test builds.
       
    59 */
       
    60 #ifdef SQLITE_TEST
       
    61 BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
       
    62 #else
       
    63 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
       
    64 #endif
       
    65 #endif /* SQLITE_OMIT_SHARED_CACHE */
       
    66 
       
    67 #ifndef SQLITE_OMIT_SHARED_CACHE
       
    68 /*
       
    69 ** Enable or disable the shared pager and schema features.
       
    70 **
       
    71 ** This routine has no effect on existing database connections.
       
    72 ** The shared cache setting effects only future calls to
       
    73 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
       
    74 */
       
    75 SQLITE_EXPORT int sqlite3_enable_shared_cache(int enable){
       
    76   sqlite3GlobalConfig.sharedCacheEnabled = enable;
       
    77   return SQLITE_OK;
       
    78 }
       
    79 #endif
       
    80 
       
    81 
       
    82 /*
       
    83 ** Forward declaration
       
    84 */
       
    85 static int checkReadLocks(Btree*, Pgno, BtCursor*, i64);
       
    86 
       
    87 
       
    88 #ifdef SQLITE_OMIT_SHARED_CACHE
       
    89   /*
       
    90   ** The functions queryTableLock(), lockTable() and unlockAllTables()
       
    91   ** manipulate entries in the BtShared.pLock linked list used to store
       
    92   ** shared-cache table level locks. If the library is compiled with the
       
    93   ** shared-cache feature disabled, then there is only ever one user
       
    94   ** of each BtShared structure and so this locking is not necessary. 
       
    95   ** So define the lock related functions as no-ops.
       
    96   */
       
    97   #define queryTableLock(a,b,c) SQLITE_OK
       
    98   #define lockTable(a,b,c) SQLITE_OK
       
    99   #define unlockAllTables(a)
       
   100 #endif
       
   101 
       
   102 #ifndef SQLITE_OMIT_SHARED_CACHE
       
   103 /*
       
   104 ** Query to see if btree handle p may obtain a lock of type eLock 
       
   105 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
       
   106 ** SQLITE_OK if the lock may be obtained (by calling lockTable()), or
       
   107 ** SQLITE_LOCKED if not.
       
   108 */
       
   109 static int queryTableLock(Btree *p, Pgno iTab, u8 eLock){
       
   110   BtShared *pBt = p->pBt;
       
   111   BtLock *pIter;
       
   112 
       
   113   assert( sqlite3BtreeHoldsMutex(p) );
       
   114   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
       
   115   assert( p->db!=0 );
       
   116   
       
   117   /* This is a no-op if the shared-cache is not enabled */
       
   118   if( !p->sharable ){
       
   119     return SQLITE_OK;
       
   120   }
       
   121 
       
   122   /* If some other connection is holding an exclusive lock, the
       
   123   ** requested lock may not be obtained.
       
   124   */
       
   125   if( pBt->pExclusive && pBt->pExclusive!=p ){
       
   126     return SQLITE_LOCKED;
       
   127   }
       
   128 
       
   129   /* This (along with lockTable()) is where the ReadUncommitted flag is
       
   130   ** dealt with. If the caller is querying for a read-lock and the flag is
       
   131   ** set, it is unconditionally granted - even if there are write-locks
       
   132   ** on the table. If a write-lock is requested, the ReadUncommitted flag
       
   133   ** is not considered.
       
   134   **
       
   135   ** In function lockTable(), if a read-lock is demanded and the 
       
   136   ** ReadUncommitted flag is set, no entry is added to the locks list 
       
   137   ** (BtShared.pLock).
       
   138   **
       
   139   ** To summarize: If the ReadUncommitted flag is set, then read cursors do
       
   140   ** not create or respect table locks. The locking procedure for a 
       
   141   ** write-cursor does not change.
       
   142   */
       
   143   if( 
       
   144     0==(p->db->flags&SQLITE_ReadUncommitted) || 
       
   145     eLock==WRITE_LOCK ||
       
   146     iTab==MASTER_ROOT
       
   147   ){
       
   148     for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
       
   149       if( pIter->pBtree!=p && pIter->iTable==iTab && 
       
   150           (pIter->eLock!=eLock || eLock!=READ_LOCK) ){
       
   151         return SQLITE_LOCKED;
       
   152       }
       
   153     }
       
   154   }
       
   155   return SQLITE_OK;
       
   156 }
       
   157 #endif /* !SQLITE_OMIT_SHARED_CACHE */
       
   158 
       
   159 #ifndef SQLITE_OMIT_SHARED_CACHE
       
   160 /*
       
   161 ** Add a lock on the table with root-page iTable to the shared-btree used
       
   162 ** by Btree handle p. Parameter eLock must be either READ_LOCK or 
       
   163 ** WRITE_LOCK.
       
   164 **
       
   165 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_BUSY and
       
   166 ** SQLITE_NOMEM may also be returned.
       
   167 */
       
   168 static int lockTable(Btree *p, Pgno iTable, u8 eLock){
       
   169   BtShared *pBt = p->pBt;
       
   170   BtLock *pLock = 0;
       
   171   BtLock *pIter;
       
   172 
       
   173   assert( sqlite3BtreeHoldsMutex(p) );
       
   174   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
       
   175   assert( p->db!=0 );
       
   176 
       
   177   /* This is a no-op if the shared-cache is not enabled */
       
   178   if( !p->sharable ){
       
   179     return SQLITE_OK;
       
   180   }
       
   181 
       
   182   assert( SQLITE_OK==queryTableLock(p, iTable, eLock) );
       
   183 
       
   184   /* If the read-uncommitted flag is set and a read-lock is requested,
       
   185   ** return early without adding an entry to the BtShared.pLock list. See
       
   186   ** comment in function queryTableLock() for more info on handling 
       
   187   ** the ReadUncommitted flag.
       
   188   */
       
   189   if( 
       
   190     (p->db->flags&SQLITE_ReadUncommitted) && 
       
   191     (eLock==READ_LOCK) &&
       
   192     iTable!=MASTER_ROOT
       
   193   ){
       
   194     return SQLITE_OK;
       
   195   }
       
   196 
       
   197   /* First search the list for an existing lock on this table. */
       
   198   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
       
   199     if( pIter->iTable==iTable && pIter->pBtree==p ){
       
   200       pLock = pIter;
       
   201       break;
       
   202     }
       
   203   }
       
   204 
       
   205   /* If the above search did not find a BtLock struct associating Btree p
       
   206   ** with table iTable, allocate one and link it into the list.
       
   207   */
       
   208   if( !pLock ){
       
   209     pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
       
   210     if( !pLock ){
       
   211       return SQLITE_NOMEM;
       
   212     }
       
   213     pLock->iTable = iTable;
       
   214     pLock->pBtree = p;
       
   215     pLock->pNext = pBt->pLock;
       
   216     pBt->pLock = pLock;
       
   217   }
       
   218 
       
   219   /* Set the BtLock.eLock variable to the maximum of the current lock
       
   220   ** and the requested lock. This means if a write-lock was already held
       
   221   ** and a read-lock requested, we don't incorrectly downgrade the lock.
       
   222   */
       
   223   assert( WRITE_LOCK>READ_LOCK );
       
   224   if( eLock>pLock->eLock ){
       
   225     pLock->eLock = eLock;
       
   226   }
       
   227 
       
   228   return SQLITE_OK;
       
   229 }
       
   230 #endif /* !SQLITE_OMIT_SHARED_CACHE */
       
   231 
       
   232 #ifndef SQLITE_OMIT_SHARED_CACHE
       
   233 /*
       
   234 ** Release all the table locks (locks obtained via calls to the lockTable()
       
   235 ** procedure) held by Btree handle p.
       
   236 */
       
   237 static void unlockAllTables(Btree *p){
       
   238   BtShared *pBt = p->pBt;
       
   239   BtLock **ppIter = &pBt->pLock;
       
   240 
       
   241   assert( sqlite3BtreeHoldsMutex(p) );
       
   242   assert( p->sharable || 0==*ppIter );
       
   243 
       
   244   while( *ppIter ){
       
   245     BtLock *pLock = *ppIter;
       
   246     assert( pBt->pExclusive==0 || pBt->pExclusive==pLock->pBtree );
       
   247     if( pLock->pBtree==p ){
       
   248       *ppIter = pLock->pNext;
       
   249       sqlite3_free(pLock);
       
   250     }else{
       
   251       ppIter = &pLock->pNext;
       
   252     }
       
   253   }
       
   254 
       
   255   if( pBt->pExclusive==p ){
       
   256     pBt->pExclusive = 0;
       
   257   }
       
   258 }
       
   259 #endif /* SQLITE_OMIT_SHARED_CACHE */
       
   260 
       
   261 static void releasePage(MemPage *pPage);  /* Forward reference */
       
   262 
       
   263 /*
       
   264 ** Verify that the cursor holds a mutex on the BtShared
       
   265 */
       
   266 #ifndef NDEBUG
       
   267 static int cursorHoldsMutex(BtCursor *p){
       
   268   return sqlite3_mutex_held(p->pBt->mutex);
       
   269 }
       
   270 #endif
       
   271 
       
   272 
       
   273 #ifndef SQLITE_OMIT_INCRBLOB
       
   274 /*
       
   275 ** Invalidate the overflow page-list cache for cursor pCur, if any.
       
   276 */
       
   277 static void invalidateOverflowCache(BtCursor *pCur){
       
   278   assert( cursorHoldsMutex(pCur) );
       
   279   sqlite3_free(pCur->aOverflow);
       
   280   pCur->aOverflow = 0;
       
   281 }
       
   282 
       
   283 /*
       
   284 ** Invalidate the overflow page-list cache for all cursors opened
       
   285 ** on the shared btree structure pBt.
       
   286 */
       
   287 static void invalidateAllOverflowCache(BtShared *pBt){
       
   288   BtCursor *p;
       
   289   assert( sqlite3_mutex_held(pBt->mutex) );
       
   290   for(p=pBt->pCursor; p; p=p->pNext){
       
   291     invalidateOverflowCache(p);
       
   292   }
       
   293 }
       
   294 #else
       
   295   #define invalidateOverflowCache(x)
       
   296   #define invalidateAllOverflowCache(x)
       
   297 #endif
       
   298 
       
   299 /*
       
   300 ** Save the current cursor position in the variables BtCursor.nKey 
       
   301 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
       
   302 */
       
   303 static int saveCursorPosition(BtCursor *pCur){
       
   304   int rc;
       
   305 
       
   306   assert( CURSOR_VALID==pCur->eState );
       
   307   assert( 0==pCur->pKey );
       
   308   assert( cursorHoldsMutex(pCur) );
       
   309 
       
   310   rc = sqlite3BtreeKeySize(pCur, &pCur->nKey);
       
   311 
       
   312   /* If this is an intKey table, then the above call to BtreeKeySize()
       
   313   ** stores the integer key in pCur->nKey. In this case this value is
       
   314   ** all that is required. Otherwise, if pCur is not open on an intKey
       
   315   ** table, then malloc space for and store the pCur->nKey bytes of key 
       
   316   ** data.
       
   317   */
       
   318   if( rc==SQLITE_OK && 0==pCur->apPage[0]->intKey){
       
   319     void *pKey = sqlite3Malloc(pCur->nKey);
       
   320     if( pKey ){
       
   321       rc = sqlite3BtreeKey(pCur, 0, pCur->nKey, pKey);
       
   322       if( rc==SQLITE_OK ){
       
   323         pCur->pKey = pKey;
       
   324       }else{
       
   325         sqlite3_free(pKey);
       
   326       }
       
   327     }else{
       
   328       rc = SQLITE_NOMEM;
       
   329     }
       
   330   }
       
   331   assert( !pCur->apPage[0]->intKey || !pCur->pKey );
       
   332 
       
   333   if( rc==SQLITE_OK ){
       
   334     int i;
       
   335     for(i=0; i<=pCur->iPage; i++){
       
   336       releasePage(pCur->apPage[i]);
       
   337       pCur->apPage[i] = 0;
       
   338     }
       
   339     pCur->iPage = -1;
       
   340     pCur->eState = CURSOR_REQUIRESEEK;
       
   341   }
       
   342 
       
   343   invalidateOverflowCache(pCur);
       
   344   return rc;
       
   345 }
       
   346 
       
   347 /*
       
   348 ** Save the positions of all cursors except pExcept open on the table 
       
   349 ** with root-page iRoot. Usually, this is called just before cursor
       
   350 ** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()).
       
   351 */
       
   352 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
       
   353   BtCursor *p;
       
   354   assert( sqlite3_mutex_held(pBt->mutex) );
       
   355   assert( pExcept==0 || pExcept->pBt==pBt );
       
   356   for(p=pBt->pCursor; p; p=p->pNext){
       
   357     if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) && 
       
   358         p->eState==CURSOR_VALID ){
       
   359       int rc = saveCursorPosition(p);
       
   360       if( SQLITE_OK!=rc ){
       
   361         return rc;
       
   362       }
       
   363     }
       
   364   }
       
   365   return SQLITE_OK;
       
   366 }
       
   367 
       
   368 /*
       
   369 ** Clear the current cursor position.
       
   370 */
       
   371 static void clearCursorPosition(BtCursor *pCur){
       
   372   assert( cursorHoldsMutex(pCur) );
       
   373   sqlite3_free(pCur->pKey);
       
   374   pCur->pKey = 0;
       
   375   pCur->eState = CURSOR_INVALID;
       
   376 }
       
   377 
       
   378 /*
       
   379 ** Restore the cursor to the position it was in (or as close to as possible)
       
   380 ** when saveCursorPosition() was called. Note that this call deletes the 
       
   381 ** saved position info stored by saveCursorPosition(), so there can be
       
   382 ** at most one effective restoreCursorPosition() call after each 
       
   383 ** saveCursorPosition().
       
   384 */
       
   385 int sqlite3BtreeRestoreCursorPosition(BtCursor *pCur){
       
   386   int rc;
       
   387   assert( cursorHoldsMutex(pCur) );
       
   388   assert( pCur->eState>=CURSOR_REQUIRESEEK );
       
   389   if( pCur->eState==CURSOR_FAULT ){
       
   390     return pCur->skip;
       
   391   }
       
   392   pCur->eState = CURSOR_INVALID;
       
   393   rc = sqlite3BtreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skip);
       
   394   if( rc==SQLITE_OK ){
       
   395     sqlite3_free(pCur->pKey);
       
   396     pCur->pKey = 0;
       
   397     assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
       
   398   }
       
   399   return rc;
       
   400 }
       
   401 
       
   402 #define restoreCursorPosition(p) \
       
   403   (p->eState>=CURSOR_REQUIRESEEK ? \
       
   404          sqlite3BtreeRestoreCursorPosition(p) : \
       
   405          SQLITE_OK)
       
   406 
       
   407 /*
       
   408 ** Determine whether or not a cursor has moved from the position it
       
   409 ** was last placed at.  Cursor can move when the row they are pointing
       
   410 ** at is deleted out from under them.
       
   411 **
       
   412 ** This routine returns an error code if something goes wrong.  The
       
   413 ** integer *pHasMoved is set to one if the cursor has moved and 0 if not.
       
   414 */
       
   415 int sqlite3BtreeCursorHasMoved(BtCursor *pCur, int *pHasMoved){
       
   416   int rc;
       
   417 
       
   418   rc = restoreCursorPosition(pCur);
       
   419   if( rc ){
       
   420     *pHasMoved = 1;
       
   421     return rc;
       
   422   }
       
   423   if( pCur->eState!=CURSOR_VALID || pCur->skip!=0 ){
       
   424     *pHasMoved = 1;
       
   425   }else{
       
   426     *pHasMoved = 0;
       
   427   }
       
   428   return SQLITE_OK;
       
   429 }
       
   430 
       
   431 #ifndef SQLITE_OMIT_AUTOVACUUM
       
   432 /*
       
   433 ** Given a page number of a regular database page, return the page
       
   434 ** number for the pointer-map page that contains the entry for the
       
   435 ** input page number.
       
   436 */
       
   437 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
       
   438   int nPagesPerMapPage, iPtrMap, ret;
       
   439   assert( sqlite3_mutex_held(pBt->mutex) );
       
   440   nPagesPerMapPage = (pBt->usableSize/5)+1;
       
   441   iPtrMap = (pgno-2)/nPagesPerMapPage;
       
   442   ret = (iPtrMap*nPagesPerMapPage) + 2; 
       
   443   if( ret==PENDING_BYTE_PAGE(pBt) ){
       
   444     ret++;
       
   445   }
       
   446   return ret;
       
   447 }
       
   448 
       
   449 /*
       
   450 ** Write an entry into the pointer map.
       
   451 **
       
   452 ** This routine updates the pointer map entry for page number 'key'
       
   453 ** so that it maps to type 'eType' and parent page number 'pgno'.
       
   454 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
       
   455 */
       
   456 static int ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent){
       
   457   DbPage *pDbPage;  /* The pointer map page */
       
   458   u8 *pPtrmap;      /* The pointer map data */
       
   459   Pgno iPtrmap;     /* The pointer map page number */
       
   460   int offset;       /* Offset in pointer map page */
       
   461   int rc;
       
   462 
       
   463   assert( sqlite3_mutex_held(pBt->mutex) );
       
   464   /* The master-journal page number must never be used as a pointer map page */
       
   465   assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
       
   466 
       
   467   assert( pBt->autoVacuum );
       
   468   if( key==0 ){
       
   469     return SQLITE_CORRUPT_BKPT;
       
   470   }
       
   471   iPtrmap = PTRMAP_PAGENO(pBt, key);
       
   472   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
       
   473   if( rc!=SQLITE_OK ){
       
   474     return rc;
       
   475   }
       
   476   offset = PTRMAP_PTROFFSET(iPtrmap, key);
       
   477   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
       
   478 
       
   479   if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
       
   480     TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
       
   481     rc = sqlite3PagerWrite(pDbPage);
       
   482     if( rc==SQLITE_OK ){
       
   483       pPtrmap[offset] = eType;
       
   484       put4byte(&pPtrmap[offset+1], parent);
       
   485     }
       
   486   }
       
   487 
       
   488   sqlite3PagerUnref(pDbPage);
       
   489   return rc;
       
   490 }
       
   491 
       
   492 /*
       
   493 ** Read an entry from the pointer map.
       
   494 **
       
   495 ** This routine retrieves the pointer map entry for page 'key', writing
       
   496 ** the type and parent page number to *pEType and *pPgno respectively.
       
   497 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
       
   498 */
       
   499 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
       
   500   DbPage *pDbPage;   /* The pointer map page */
       
   501   int iPtrmap;       /* Pointer map page index */
       
   502   u8 *pPtrmap;       /* Pointer map page data */
       
   503   int offset;        /* Offset of entry in pointer map */
       
   504   int rc;
       
   505 
       
   506   assert( sqlite3_mutex_held(pBt->mutex) );
       
   507 
       
   508   iPtrmap = PTRMAP_PAGENO(pBt, key);
       
   509   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
       
   510   if( rc!=0 ){
       
   511     return rc;
       
   512   }
       
   513   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
       
   514 
       
   515   offset = PTRMAP_PTROFFSET(iPtrmap, key);
       
   516   assert( pEType!=0 );
       
   517   *pEType = pPtrmap[offset];
       
   518   if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
       
   519 
       
   520   sqlite3PagerUnref(pDbPage);
       
   521   if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT;
       
   522   return SQLITE_OK;
       
   523 }
       
   524 
       
   525 #else /* if defined SQLITE_OMIT_AUTOVACUUM */
       
   526   #define ptrmapPut(w,x,y,z) SQLITE_OK
       
   527   #define ptrmapGet(w,x,y,z) SQLITE_OK
       
   528   #define ptrmapPutOvfl(y,z) SQLITE_OK
       
   529 #endif
       
   530 
       
   531 /*
       
   532 ** Given a btree page and a cell index (0 means the first cell on
       
   533 ** the page, 1 means the second cell, and so forth) return a pointer
       
   534 ** to the cell content.
       
   535 **
       
   536 ** This routine works only for pages that do not contain overflow cells.
       
   537 */
       
   538 #define findCell(P,I) \
       
   539   ((P)->aData + ((P)->maskPage & get2byte(&(P)->aData[(P)->cellOffset+2*(I)])))
       
   540 
       
   541 /*
       
   542 ** This a more complex version of findCell() that works for
       
   543 ** pages that do contain overflow cells.  See insert
       
   544 */
       
   545 static u8 *findOverflowCell(MemPage *pPage, int iCell){
       
   546   int i;
       
   547   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
       
   548   for(i=pPage->nOverflow-1; i>=0; i--){
       
   549     int k;
       
   550     struct _OvflCell *pOvfl;
       
   551     pOvfl = &pPage->aOvfl[i];
       
   552     k = pOvfl->idx;
       
   553     if( k<=iCell ){
       
   554       if( k==iCell ){
       
   555         return pOvfl->pCell;
       
   556       }
       
   557       iCell--;
       
   558     }
       
   559   }
       
   560   return findCell(pPage, iCell);
       
   561 }
       
   562 
       
   563 /*
       
   564 ** Parse a cell content block and fill in the CellInfo structure.  There
       
   565 ** are two versions of this function.  sqlite3BtreeParseCell() takes a 
       
   566 ** cell index as the second argument and sqlite3BtreeParseCellPtr() 
       
   567 ** takes a pointer to the body of the cell as its second argument.
       
   568 **
       
   569 ** Within this file, the parseCell() macro can be called instead of
       
   570 ** sqlite3BtreeParseCellPtr(). Using some compilers, this will be faster.
       
   571 */
       
   572 void sqlite3BtreeParseCellPtr(
       
   573   MemPage *pPage,         /* Page containing the cell */
       
   574   u8 *pCell,              /* Pointer to the cell text. */
       
   575   CellInfo *pInfo         /* Fill in this structure */
       
   576 ){
       
   577   int n;                  /* Number bytes in cell content header */
       
   578   u32 nPayload;           /* Number of bytes of cell payload */
       
   579 
       
   580   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
       
   581 
       
   582   pInfo->pCell = pCell;
       
   583   assert( pPage->leaf==0 || pPage->leaf==1 );
       
   584   n = pPage->childPtrSize;
       
   585   assert( n==4-4*pPage->leaf );
       
   586   if( pPage->intKey ){
       
   587     if( pPage->hasData ){
       
   588       n += getVarint32(&pCell[n], nPayload);
       
   589     }else{
       
   590       nPayload = 0;
       
   591     }
       
   592     n += getVarint(&pCell[n], (u64*)&pInfo->nKey);
       
   593     pInfo->nData = nPayload;
       
   594   }else{
       
   595     pInfo->nData = 0;
       
   596     n += getVarint32(&pCell[n], nPayload);
       
   597     pInfo->nKey = nPayload;
       
   598   }
       
   599   pInfo->nPayload = nPayload;
       
   600   pInfo->nHeader = n;
       
   601   if( likely(nPayload<=pPage->maxLocal) ){
       
   602     /* This is the (easy) common case where the entire payload fits
       
   603     ** on the local page.  No overflow is required.
       
   604     */
       
   605     int nSize;          /* Total size of cell content in bytes */
       
   606     nSize = nPayload + n;
       
   607     pInfo->nLocal = nPayload;
       
   608     pInfo->iOverflow = 0;
       
   609     if( (nSize & ~3)==0 ){
       
   610       nSize = 4;        /* Minimum cell size is 4 */
       
   611     }
       
   612     pInfo->nSize = nSize;
       
   613   }else{
       
   614     /* If the payload will not fit completely on the local page, we have
       
   615     ** to decide how much to store locally and how much to spill onto
       
   616     ** overflow pages.  The strategy is to minimize the amount of unused
       
   617     ** space on overflow pages while keeping the amount of local storage
       
   618     ** in between minLocal and maxLocal.
       
   619     **
       
   620     ** Warning:  changing the way overflow payload is distributed in any
       
   621     ** way will result in an incompatible file format.
       
   622     */
       
   623     int minLocal;  /* Minimum amount of payload held locally */
       
   624     int maxLocal;  /* Maximum amount of payload held locally */
       
   625     int surplus;   /* Overflow payload available for local storage */
       
   626 
       
   627     minLocal = pPage->minLocal;
       
   628     maxLocal = pPage->maxLocal;
       
   629     surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4);
       
   630     if( surplus <= maxLocal ){
       
   631       pInfo->nLocal = surplus;
       
   632     }else{
       
   633       pInfo->nLocal = minLocal;
       
   634     }
       
   635     pInfo->iOverflow = pInfo->nLocal + n;
       
   636     pInfo->nSize = pInfo->iOverflow + 4;
       
   637   }
       
   638 }
       
   639 #define parseCell(pPage, iCell, pInfo) \
       
   640   sqlite3BtreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo))
       
   641 void sqlite3BtreeParseCell(
       
   642   MemPage *pPage,         /* Page containing the cell */
       
   643   int iCell,              /* The cell index.  First cell is 0 */
       
   644   CellInfo *pInfo         /* Fill in this structure */
       
   645 ){
       
   646   parseCell(pPage, iCell, pInfo);
       
   647 }
       
   648 
       
   649 /*
       
   650 ** Compute the total number of bytes that a Cell needs in the cell
       
   651 ** data area of the btree-page.  The return number includes the cell
       
   652 ** data header and the local payload, but not any overflow page or
       
   653 ** the space used by the cell pointer.
       
   654 */
       
   655 #ifndef NDEBUG
       
   656 static u16 cellSize(MemPage *pPage, int iCell){
       
   657   CellInfo info;
       
   658   sqlite3BtreeParseCell(pPage, iCell, &info);
       
   659   return info.nSize;
       
   660 }
       
   661 #endif
       
   662 static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
       
   663   CellInfo info;
       
   664   sqlite3BtreeParseCellPtr(pPage, pCell, &info);
       
   665   return info.nSize;
       
   666 }
       
   667 
       
   668 #ifndef SQLITE_OMIT_AUTOVACUUM
       
   669 /*
       
   670 ** If the cell pCell, part of page pPage contains a pointer
       
   671 ** to an overflow page, insert an entry into the pointer-map
       
   672 ** for the overflow page.
       
   673 */
       
   674 static int ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell){
       
   675   CellInfo info;
       
   676   assert( pCell!=0 );
       
   677   sqlite3BtreeParseCellPtr(pPage, pCell, &info);
       
   678   assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
       
   679   if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
       
   680     Pgno ovfl = get4byte(&pCell[info.iOverflow]);
       
   681     return ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno);
       
   682   }
       
   683   return SQLITE_OK;
       
   684 }
       
   685 /*
       
   686 ** If the cell with index iCell on page pPage contains a pointer
       
   687 ** to an overflow page, insert an entry into the pointer-map
       
   688 ** for the overflow page.
       
   689 */
       
   690 static int ptrmapPutOvfl(MemPage *pPage, int iCell){
       
   691   u8 *pCell;
       
   692   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
       
   693   pCell = findOverflowCell(pPage, iCell);
       
   694   return ptrmapPutOvflPtr(pPage, pCell);
       
   695 }
       
   696 #endif
       
   697 
       
   698 
       
   699 /*
       
   700 ** Defragment the page given.  All Cells are moved to the
       
   701 ** end of the page and all free space is collected into one
       
   702 ** big FreeBlk that occurs in between the header and cell
       
   703 ** pointer array and the cell content area.
       
   704 */
       
   705 static void defragmentPage(MemPage *pPage){
       
   706   int i;                     /* Loop counter */
       
   707   int pc;                    /* Address of a i-th cell */
       
   708   int addr;                  /* Offset of first byte after cell pointer array */
       
   709   int hdr;                   /* Offset to the page header */
       
   710   int size;                  /* Size of a cell */
       
   711   int usableSize;            /* Number of usable bytes on a page */
       
   712   int cellOffset;            /* Offset to the cell pointer array */
       
   713   int cbrk;                  /* Offset to the cell content area */
       
   714   int nCell;                 /* Number of cells on the page */
       
   715   unsigned char *data;       /* The page data */
       
   716   unsigned char *temp;       /* Temp area for cell content */
       
   717 
       
   718   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
       
   719   assert( pPage->pBt!=0 );
       
   720   assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
       
   721   assert( pPage->nOverflow==0 );
       
   722   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
       
   723   temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
       
   724   data = pPage->aData;
       
   725   hdr = pPage->hdrOffset;
       
   726   cellOffset = pPage->cellOffset;
       
   727   nCell = pPage->nCell;
       
   728   assert( nCell==get2byte(&data[hdr+3]) );
       
   729   usableSize = pPage->pBt->usableSize;
       
   730   cbrk = get2byte(&data[hdr+5]);
       
   731   memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk);
       
   732   cbrk = usableSize;
       
   733   for(i=0; i<nCell; i++){
       
   734     u8 *pAddr;     /* The i-th cell pointer */
       
   735     pAddr = &data[cellOffset + i*2];
       
   736     pc = get2byte(pAddr);
       
   737     assert( pc<pPage->pBt->usableSize );
       
   738     size = cellSizePtr(pPage, &temp[pc]);
       
   739     cbrk -= size;
       
   740     memcpy(&data[cbrk], &temp[pc], size);
       
   741     put2byte(pAddr, cbrk);
       
   742   }
       
   743   assert( cbrk>=cellOffset+2*nCell );
       
   744   put2byte(&data[hdr+5], cbrk);
       
   745   data[hdr+1] = 0;
       
   746   data[hdr+2] = 0;
       
   747   data[hdr+7] = 0;
       
   748   addr = cellOffset+2*nCell;
       
   749   memset(&data[addr], 0, cbrk-addr);
       
   750 }
       
   751 
       
   752 /*
       
   753 ** Allocate nByte bytes of space on a page.
       
   754 **
       
   755 ** Return the index into pPage->aData[] of the first byte of
       
   756 ** the new allocation.  The caller guarantees that there is enough
       
   757 ** space.  This routine will never fail.
       
   758 **
       
   759 ** If the page contains nBytes of free space but does not contain
       
   760 ** nBytes of contiguous free space, then this routine automatically
       
   761 ** calls defragementPage() to consolidate all free space before 
       
   762 ** allocating the new chunk.
       
   763 */
       
   764 static int allocateSpace(MemPage *pPage, int nByte){
       
   765   int addr, pc, hdr;
       
   766   int size;
       
   767   int nFrag;
       
   768   int top;
       
   769   int nCell;
       
   770   int cellOffset;
       
   771   unsigned char *data;
       
   772   
       
   773   data = pPage->aData;
       
   774   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
       
   775   assert( pPage->pBt );
       
   776   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
       
   777   assert( nByte>=0 );  /* Minimum cell size is 4 */
       
   778   assert( pPage->nFree>=nByte );
       
   779   assert( pPage->nOverflow==0 );
       
   780   pPage->nFree -= nByte;
       
   781   hdr = pPage->hdrOffset;
       
   782 
       
   783   nFrag = data[hdr+7];
       
   784   if( nFrag<60 ){
       
   785     /* Search the freelist looking for a slot big enough to satisfy the
       
   786     ** space request. */
       
   787     addr = hdr+1;
       
   788     while( (pc = get2byte(&data[addr]))>0 ){
       
   789       size = get2byte(&data[pc+2]);
       
   790       if( size>=nByte ){
       
   791         if( size<nByte+4 ){
       
   792           memcpy(&data[addr], &data[pc], 2);
       
   793           data[hdr+7] = nFrag + size - nByte;
       
   794           return pc;
       
   795         }else{
       
   796           put2byte(&data[pc+2], size-nByte);
       
   797           return pc + size - nByte;
       
   798         }
       
   799       }
       
   800       addr = pc;
       
   801     }
       
   802   }
       
   803 
       
   804   /* Allocate memory from the gap in between the cell pointer array
       
   805   ** and the cell content area.
       
   806   */
       
   807   top = get2byte(&data[hdr+5]);
       
   808   nCell = get2byte(&data[hdr+3]);
       
   809   cellOffset = pPage->cellOffset;
       
   810   if( nFrag>=60 || cellOffset + 2*nCell > top - nByte ){
       
   811     defragmentPage(pPage);
       
   812     top = get2byte(&data[hdr+5]);
       
   813   }
       
   814   top -= nByte;
       
   815   assert( cellOffset + 2*nCell <= top );
       
   816   put2byte(&data[hdr+5], top);
       
   817   return top;
       
   818 }
       
   819 
       
   820 /*
       
   821 ** Return a section of the pPage->aData to the freelist.
       
   822 ** The first byte of the new free block is pPage->aDisk[start]
       
   823 ** and the size of the block is "size" bytes.
       
   824 **
       
   825 ** Most of the effort here is involved in coalesing adjacent
       
   826 ** free blocks into a single big free block.
       
   827 */
       
   828 static void freeSpace(MemPage *pPage, int start, int size){
       
   829   int addr, pbegin, hdr;
       
   830   unsigned char *data = pPage->aData;
       
   831 
       
   832   assert( pPage->pBt!=0 );
       
   833   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
       
   834   assert( start>=pPage->hdrOffset+6+(pPage->leaf?0:4) );
       
   835   assert( (start + size)<=pPage->pBt->usableSize );
       
   836   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
       
   837   assert( size>=0 );   /* Minimum cell size is 4 */
       
   838 
       
   839 #ifdef SQLITE_SECURE_DELETE
       
   840   /* Overwrite deleted information with zeros when the SECURE_DELETE 
       
   841   ** option is enabled at compile-time */
       
   842   memset(&data[start], 0, size);
       
   843 #endif
       
   844 
       
   845   /* Add the space back into the linked list of freeblocks */
       
   846   hdr = pPage->hdrOffset;
       
   847   addr = hdr + 1;
       
   848   while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
       
   849     assert( pbegin<=pPage->pBt->usableSize-4 );
       
   850     assert( pbegin>addr );
       
   851     addr = pbegin;
       
   852   }
       
   853   assert( pbegin<=pPage->pBt->usableSize-4 );
       
   854   assert( pbegin>addr || pbegin==0 );
       
   855   put2byte(&data[addr], start);
       
   856   put2byte(&data[start], pbegin);
       
   857   put2byte(&data[start+2], size);
       
   858   pPage->nFree += size;
       
   859 
       
   860   /* Coalesce adjacent free blocks */
       
   861   addr = pPage->hdrOffset + 1;
       
   862   while( (pbegin = get2byte(&data[addr]))>0 ){
       
   863     int pnext, psize;
       
   864     assert( pbegin>addr );
       
   865     assert( pbegin<=pPage->pBt->usableSize-4 );
       
   866     pnext = get2byte(&data[pbegin]);
       
   867     psize = get2byte(&data[pbegin+2]);
       
   868     if( pbegin + psize + 3 >= pnext && pnext>0 ){
       
   869       int frag = pnext - (pbegin+psize);
       
   870       assert( frag<=data[pPage->hdrOffset+7] );
       
   871       data[pPage->hdrOffset+7] -= frag;
       
   872       put2byte(&data[pbegin], get2byte(&data[pnext]));
       
   873       put2byte(&data[pbegin+2], pnext+get2byte(&data[pnext+2])-pbegin);
       
   874     }else{
       
   875       addr = pbegin;
       
   876     }
       
   877   }
       
   878 
       
   879   /* If the cell content area begins with a freeblock, remove it. */
       
   880   if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){
       
   881     int top;
       
   882     pbegin = get2byte(&data[hdr+1]);
       
   883     memcpy(&data[hdr+1], &data[pbegin], 2);
       
   884     top = get2byte(&data[hdr+5]);
       
   885     put2byte(&data[hdr+5], top + get2byte(&data[pbegin+2]));
       
   886   }
       
   887 }
       
   888 
       
   889 /*
       
   890 ** Decode the flags byte (the first byte of the header) for a page
       
   891 ** and initialize fields of the MemPage structure accordingly.
       
   892 **
       
   893 ** Only the following combinations are supported.  Anything different
       
   894 ** indicates a corrupt database files:
       
   895 **
       
   896 **         PTF_ZERODATA
       
   897 **         PTF_ZERODATA | PTF_LEAF
       
   898 **         PTF_LEAFDATA | PTF_INTKEY
       
   899 **         PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
       
   900 */
       
   901 static int decodeFlags(MemPage *pPage, int flagByte){
       
   902   BtShared *pBt;     /* A copy of pPage->pBt */
       
   903 
       
   904   assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
       
   905   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
       
   906   pPage->leaf = flagByte>>3;  assert( PTF_LEAF == 1<<3 );
       
   907   flagByte &= ~PTF_LEAF;
       
   908   pPage->childPtrSize = 4-4*pPage->leaf;
       
   909   pBt = pPage->pBt;
       
   910   if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
       
   911     pPage->intKey = 1;
       
   912     pPage->hasData = pPage->leaf;
       
   913     pPage->maxLocal = pBt->maxLeaf;
       
   914     pPage->minLocal = pBt->minLeaf;
       
   915   }else if( flagByte==PTF_ZERODATA ){
       
   916     pPage->intKey = 0;
       
   917     pPage->hasData = 0;
       
   918     pPage->maxLocal = pBt->maxLocal;
       
   919     pPage->minLocal = pBt->minLocal;
       
   920   }else{
       
   921     return SQLITE_CORRUPT_BKPT;
       
   922   }
       
   923   return SQLITE_OK;
       
   924 }
       
   925 
       
   926 /*
       
   927 ** Initialize the auxiliary information for a disk block.
       
   928 **
       
   929 ** Return SQLITE_OK on success.  If we see that the page does
       
   930 ** not contain a well-formed database page, then return 
       
   931 ** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
       
   932 ** guarantee that the page is well-formed.  It only shows that
       
   933 ** we failed to detect any corruption.
       
   934 */
       
   935 int sqlite3BtreeInitPage(MemPage *pPage){
       
   936 
       
   937   assert( pPage->pBt!=0 );
       
   938   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
       
   939   assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
       
   940   assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
       
   941   assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
       
   942 
       
   943   if( !pPage->isInit ){
       
   944     int pc;            /* Address of a freeblock within pPage->aData[] */
       
   945     int hdr;           /* Offset to beginning of page header */
       
   946     u8 *data;          /* Equal to pPage->aData */
       
   947     BtShared *pBt;        /* The main btree structure */
       
   948     int usableSize;    /* Amount of usable space on each page */
       
   949     int cellOffset;    /* Offset from start of page to first cell pointer */
       
   950     int nFree;         /* Number of unused bytes on the page */
       
   951     int top;           /* First byte of the cell content area */
       
   952 
       
   953     pBt = pPage->pBt;
       
   954 
       
   955     hdr = pPage->hdrOffset;
       
   956     data = pPage->aData;
       
   957     if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT;
       
   958     assert( pBt->pageSize>=512 && pBt->pageSize<=32768 );
       
   959     pPage->maskPage = pBt->pageSize - 1;
       
   960     pPage->nOverflow = 0;
       
   961     usableSize = pBt->usableSize;
       
   962     pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf;
       
   963     top = get2byte(&data[hdr+5]);
       
   964     pPage->nCell = get2byte(&data[hdr+3]);
       
   965     if( pPage->nCell>MX_CELL(pBt) ){
       
   966       /* To many cells for a single page.  The page must be corrupt */
       
   967       return SQLITE_CORRUPT_BKPT;
       
   968     }
       
   969   
       
   970     /* Compute the total free space on the page */
       
   971     pc = get2byte(&data[hdr+1]);
       
   972     nFree = data[hdr+7] + top - (cellOffset + 2*pPage->nCell);
       
   973     while( pc>0 ){
       
   974       int next, size;
       
   975       if( pc>usableSize-4 ){
       
   976         /* Free block is off the page */
       
   977         return SQLITE_CORRUPT_BKPT; 
       
   978       }
       
   979       next = get2byte(&data[pc]);
       
   980       size = get2byte(&data[pc+2]);
       
   981       if( next>0 && next<=pc+size+3 ){
       
   982         /* Free blocks must be in accending order */
       
   983         return SQLITE_CORRUPT_BKPT; 
       
   984       }
       
   985       nFree += size;
       
   986       pc = next;
       
   987     }
       
   988     pPage->nFree = nFree;
       
   989     if( nFree>=usableSize ){
       
   990       /* Free space cannot exceed total page size */
       
   991       return SQLITE_CORRUPT_BKPT; 
       
   992     }
       
   993 
       
   994 #if 0
       
   995   /* Check that all the offsets in the cell offset array are within range. 
       
   996   ** 
       
   997   ** Omitting this consistency check and using the pPage->maskPage mask
       
   998   ** to prevent overrunning the page buffer in findCell() results in a
       
   999   ** 2.5% performance gain.
       
  1000   */
       
  1001   {
       
  1002     u8 *pOff;        /* Iterator used to check all cell offsets are in range */
       
  1003     u8 *pEnd;        /* Pointer to end of cell offset array */
       
  1004     u8 mask;         /* Mask of bits that must be zero in MSB of cell offsets */
       
  1005     mask = ~(((u8)(pBt->pageSize>>8))-1);
       
  1006     pEnd = &data[cellOffset + pPage->nCell*2];
       
  1007     for(pOff=&data[cellOffset]; pOff!=pEnd && !((*pOff)&mask); pOff+=2);
       
  1008     if( pOff!=pEnd ){
       
  1009       return SQLITE_CORRUPT_BKPT;
       
  1010     }
       
  1011   }
       
  1012 #endif
       
  1013 
       
  1014     pPage->isInit = 1;
       
  1015   }
       
  1016   return SQLITE_OK;
       
  1017 }
       
  1018 
       
  1019 /*
       
  1020 ** Set up a raw page so that it looks like a database page holding
       
  1021 ** no entries.
       
  1022 */
       
  1023 static void zeroPage(MemPage *pPage, int flags){
       
  1024   unsigned char *data = pPage->aData;
       
  1025   BtShared *pBt = pPage->pBt;
       
  1026   int hdr = pPage->hdrOffset;
       
  1027   int first;
       
  1028 
       
  1029   assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
       
  1030   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
       
  1031   assert( sqlite3PagerGetData(pPage->pDbPage) == data );
       
  1032   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
       
  1033   assert( sqlite3_mutex_held(pBt->mutex) );
       
  1034   /*memset(&data[hdr], 0, pBt->usableSize - hdr);*/
       
  1035   data[hdr] = flags;
       
  1036   first = hdr + 8 + 4*((flags&PTF_LEAF)==0);
       
  1037   memset(&data[hdr+1], 0, 4);
       
  1038   data[hdr+7] = 0;
       
  1039   put2byte(&data[hdr+5], pBt->usableSize);
       
  1040   pPage->nFree = pBt->usableSize - first;
       
  1041   decodeFlags(pPage, flags);
       
  1042   pPage->hdrOffset = hdr;
       
  1043   pPage->cellOffset = first;
       
  1044   pPage->nOverflow = 0;
       
  1045   assert( pBt->pageSize>=512 && pBt->pageSize<=32768 );
       
  1046   pPage->maskPage = pBt->pageSize - 1;
       
  1047   pPage->nCell = 0;
       
  1048   pPage->isInit = 1;
       
  1049 }
       
  1050 
       
  1051 
       
  1052 /*
       
  1053 ** Convert a DbPage obtained from the pager into a MemPage used by
       
  1054 ** the btree layer.
       
  1055 */
       
  1056 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
       
  1057   MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
       
  1058   pPage->aData = sqlite3PagerGetData(pDbPage);
       
  1059   pPage->pDbPage = pDbPage;
       
  1060   pPage->pBt = pBt;
       
  1061   pPage->pgno = pgno;
       
  1062   pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
       
  1063   return pPage; 
       
  1064 }
       
  1065 
       
  1066 /*
       
  1067 ** Get a page from the pager.  Initialize the MemPage.pBt and
       
  1068 ** MemPage.aData elements if needed.
       
  1069 **
       
  1070 ** If the noContent flag is set, it means that we do not care about
       
  1071 ** the content of the page at this time.  So do not go to the disk
       
  1072 ** to fetch the content.  Just fill in the content with zeros for now.
       
  1073 ** If in the future we call sqlite3PagerWrite() on this page, that
       
  1074 ** means we have started to be concerned about content and the disk
       
  1075 ** read should occur at that point.
       
  1076 */
       
  1077 int sqlite3BtreeGetPage(
       
  1078   BtShared *pBt,       /* The btree */
       
  1079   Pgno pgno,           /* Number of the page to fetch */
       
  1080   MemPage **ppPage,    /* Return the page in this parameter */
       
  1081   int noContent        /* Do not load page content if true */
       
  1082 ){
       
  1083   int rc;
       
  1084   DbPage *pDbPage;
       
  1085 
       
  1086   assert( sqlite3_mutex_held(pBt->mutex) );
       
  1087   rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent);
       
  1088   if( rc ) return rc;
       
  1089   *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
       
  1090   return SQLITE_OK;
       
  1091 }
       
  1092 
       
  1093 /*
       
  1094 ** Return the size of the database file in pages.  Or return -1 if
       
  1095 ** there is any kind of error.
       
  1096 */
       
  1097 static int pagerPagecount(Pager *pPager){
       
  1098   int rc;
       
  1099   int nPage;
       
  1100   rc = sqlite3PagerPagecount(pPager, &nPage);
       
  1101   return (rc==SQLITE_OK?nPage:-1);
       
  1102 }
       
  1103 
       
  1104 /*
       
  1105 ** Get a page from the pager and initialize it.  This routine
       
  1106 ** is just a convenience wrapper around separate calls to
       
  1107 ** sqlite3BtreeGetPage() and sqlite3BtreeInitPage().
       
  1108 */
       
  1109 static int getAndInitPage(
       
  1110   BtShared *pBt,          /* The database file */
       
  1111   Pgno pgno,           /* Number of the page to get */
       
  1112   MemPage **ppPage     /* Write the page pointer here */
       
  1113 ){
       
  1114   int rc;
       
  1115   DbPage *pDbPage;
       
  1116   MemPage *pPage;
       
  1117 
       
  1118   assert( sqlite3_mutex_held(pBt->mutex) );
       
  1119   if( pgno==0 ){
       
  1120     return SQLITE_CORRUPT_BKPT; 
       
  1121   }
       
  1122 
       
  1123   /* It is often the case that the page we want is already in cache.
       
  1124   ** If so, get it directly.  This saves us from having to call
       
  1125   ** pagerPagecount() to make sure pgno is within limits, which results
       
  1126   ** in a measureable performance improvements.
       
  1127   */
       
  1128   pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
       
  1129   if( pDbPage ){
       
  1130     /* Page is already in cache */
       
  1131     *ppPage = pPage = btreePageFromDbPage(pDbPage, pgno, pBt);
       
  1132     rc = SQLITE_OK;
       
  1133   }else{
       
  1134     /* Page not in cache.  Acquire it. */
       
  1135     if( pgno>pagerPagecount(pBt->pPager) ){
       
  1136       return SQLITE_CORRUPT_BKPT; 
       
  1137     }
       
  1138     rc = sqlite3BtreeGetPage(pBt, pgno, ppPage, 0);
       
  1139     if( rc ) return rc;
       
  1140     pPage = *ppPage;
       
  1141   }
       
  1142   if( !pPage->isInit ){
       
  1143     rc = sqlite3BtreeInitPage(pPage);
       
  1144   }
       
  1145   if( rc!=SQLITE_OK ){
       
  1146     releasePage(pPage);
       
  1147     *ppPage = 0;
       
  1148   }
       
  1149   return rc;
       
  1150 }
       
  1151 
       
  1152 /*
       
  1153 ** Release a MemPage.  This should be called once for each prior
       
  1154 ** call to sqlite3BtreeGetPage.
       
  1155 */
       
  1156 static void releasePage(MemPage *pPage){
       
  1157   if( pPage ){
       
  1158     assert( pPage->aData );
       
  1159     assert( pPage->pBt );
       
  1160     assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
       
  1161     assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
       
  1162     assert( sqlite3_mutex_held(pPage->pBt->mutex) );
       
  1163     sqlite3PagerUnref(pPage->pDbPage);
       
  1164   }
       
  1165 }
       
  1166 
       
  1167 /*
       
  1168 ** During a rollback, when the pager reloads information into the cache
       
  1169 ** so that the cache is restored to its original state at the start of
       
  1170 ** the transaction, for each page restored this routine is called.
       
  1171 **
       
  1172 ** This routine needs to reset the extra data section at the end of the
       
  1173 ** page to agree with the restored data.
       
  1174 */
       
  1175 static void pageReinit(DbPage *pData){
       
  1176   MemPage *pPage;
       
  1177   pPage = (MemPage *)sqlite3PagerGetExtra(pData);
       
  1178   if( pPage->isInit ){
       
  1179     assert( sqlite3_mutex_held(pPage->pBt->mutex) );
       
  1180     pPage->isInit = 0;
       
  1181     if( sqlite3PagerPageRefcount(pData)>0 ){
       
  1182       sqlite3BtreeInitPage(pPage);
       
  1183     }
       
  1184   }
       
  1185 }
       
  1186 
       
  1187 /*
       
  1188 ** Invoke the busy handler for a btree.
       
  1189 */
       
  1190 static int sqlite3BtreeInvokeBusyHandler(void *pArg, int n){
       
  1191   BtShared *pBt = (BtShared*)pArg;
       
  1192   assert( pBt->db );
       
  1193   assert( sqlite3_mutex_held(pBt->db->mutex) );
       
  1194   return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
       
  1195 }
       
  1196 
       
  1197 /*
       
  1198 ** Open a database file.
       
  1199 ** 
       
  1200 ** zFilename is the name of the database file.  If zFilename is NULL
       
  1201 ** a new database with a random name is created.  This randomly named
       
  1202 ** database file will be deleted when sqlite3BtreeClose() is called.
       
  1203 ** If zFilename is ":memory:" then an in-memory database is created
       
  1204 ** that is automatically destroyed when it is closed.
       
  1205 */
       
  1206 int sqlite3BtreeOpen(
       
  1207   const char *zFilename,  /* Name of the file containing the BTree database */
       
  1208   sqlite3 *db,            /* Associated database handle */
       
  1209   Btree **ppBtree,        /* Pointer to new Btree object written here */
       
  1210   int flags,              /* Options */
       
  1211   int vfsFlags            /* Flags passed through to sqlite3_vfs.xOpen() */
       
  1212 ){
       
  1213   sqlite3_vfs *pVfs;      /* The VFS to use for this btree */
       
  1214   BtShared *pBt = 0;      /* Shared part of btree structure */
       
  1215   Btree *p;               /* Handle to return */
       
  1216   int rc = SQLITE_OK;
       
  1217   int nReserve;
       
  1218   unsigned char zDbHeader[100];
       
  1219 
       
  1220   /* Set the variable isMemdb to true for an in-memory database, or 
       
  1221   ** false for a file-based database. This symbol is only required if
       
  1222   ** either of the shared-data or autovacuum features are compiled 
       
  1223   ** into the library.
       
  1224   */
       
  1225 #if !defined(SQLITE_OMIT_SHARED_CACHE) || !defined(SQLITE_OMIT_AUTOVACUUM)
       
  1226   #ifdef SQLITE_OMIT_MEMORYDB
       
  1227     const int isMemdb = 0;
       
  1228   #else
       
  1229     const int isMemdb = zFilename && !strcmp(zFilename, ":memory:");
       
  1230   #endif
       
  1231 #endif
       
  1232 
       
  1233   assert( db!=0 );
       
  1234   assert( sqlite3_mutex_held(db->mutex) );
       
  1235 
       
  1236   pVfs = db->pVfs;
       
  1237   p = sqlite3MallocZero(sizeof(Btree));
       
  1238   if( !p ){
       
  1239     return SQLITE_NOMEM;
       
  1240   }
       
  1241   p->inTrans = TRANS_NONE;
       
  1242   p->db = db;
       
  1243 
       
  1244 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
       
  1245   /*
       
  1246   ** If this Btree is a candidate for shared cache, try to find an
       
  1247   ** existing BtShared object that we can share with
       
  1248   */
       
  1249   if( isMemdb==0
       
  1250    && (db->flags & SQLITE_Vtab)==0
       
  1251    && zFilename && zFilename[0]
       
  1252   ){
       
  1253     if( sqlite3GlobalConfig.sharedCacheEnabled ){
       
  1254       int nFullPathname = pVfs->mxPathname+1;
       
  1255       char *zFullPathname = sqlite3Malloc(nFullPathname);
       
  1256       sqlite3_mutex *mutexShared;
       
  1257       p->sharable = 1;
       
  1258       db->flags |= SQLITE_SharedCache;
       
  1259       if( !zFullPathname ){
       
  1260         sqlite3_free(p);
       
  1261         return SQLITE_NOMEM;
       
  1262       }
       
  1263       sqlite3OsFullPathname(pVfs, zFilename, nFullPathname, zFullPathname);
       
  1264       mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
       
  1265       sqlite3_mutex_enter(mutexShared);
       
  1266       for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
       
  1267         assert( pBt->nRef>0 );
       
  1268         if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager))
       
  1269                  && sqlite3PagerVfs(pBt->pPager)==pVfs ){
       
  1270           p->pBt = pBt;
       
  1271           pBt->nRef++;
       
  1272           break;
       
  1273         }
       
  1274       }
       
  1275       sqlite3_mutex_leave(mutexShared);
       
  1276       sqlite3_free(zFullPathname);
       
  1277     }
       
  1278 #ifdef SQLITE_DEBUG
       
  1279     else{
       
  1280       /* In debug mode, we mark all persistent databases as sharable
       
  1281       ** even when they are not.  This exercises the locking code and
       
  1282       ** gives more opportunity for asserts(sqlite3_mutex_held())
       
  1283       ** statements to find locking problems.
       
  1284       */
       
  1285       p->sharable = 1;
       
  1286     }
       
  1287 #endif
       
  1288   }
       
  1289 #endif
       
  1290   if( pBt==0 ){
       
  1291     /*
       
  1292     ** The following asserts make sure that structures used by the btree are
       
  1293     ** the right size.  This is to guard against size changes that result
       
  1294     ** when compiling on a different architecture.
       
  1295     */
       
  1296     assert( sizeof(i64)==8 || sizeof(i64)==4 );
       
  1297     assert( sizeof(u64)==8 || sizeof(u64)==4 );
       
  1298     assert( sizeof(u32)==4 );
       
  1299     assert( sizeof(u16)==2 );
       
  1300     assert( sizeof(Pgno)==4 );
       
  1301   
       
  1302     pBt = sqlite3MallocZero( sizeof(*pBt) );
       
  1303     if( pBt==0 ){
       
  1304       rc = SQLITE_NOMEM;
       
  1305       goto btree_open_out;
       
  1306     }
       
  1307     pBt->busyHdr.xFunc = sqlite3BtreeInvokeBusyHandler;
       
  1308     pBt->busyHdr.pArg = pBt;
       
  1309     rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
       
  1310                           EXTRA_SIZE, flags, vfsFlags);
       
  1311     if( rc==SQLITE_OK ){
       
  1312       rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
       
  1313     }
       
  1314     if( rc!=SQLITE_OK ){
       
  1315       goto btree_open_out;
       
  1316     }
       
  1317     sqlite3PagerSetBusyhandler(pBt->pPager, &pBt->busyHdr);
       
  1318     p->pBt = pBt;
       
  1319   
       
  1320     sqlite3PagerSetReiniter(pBt->pPager, pageReinit);
       
  1321     pBt->pCursor = 0;
       
  1322     pBt->pPage1 = 0;
       
  1323     pBt->readOnly = sqlite3PagerIsreadonly(pBt->pPager);
       
  1324     pBt->pageSize = get2byte(&zDbHeader[16]);
       
  1325     if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
       
  1326          || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
       
  1327       pBt->pageSize = 0;
       
  1328       sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
       
  1329 #ifndef SQLITE_OMIT_AUTOVACUUM
       
  1330       /* If the magic name ":memory:" will create an in-memory database, then
       
  1331       ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
       
  1332       ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
       
  1333       ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
       
  1334       ** regular file-name. In this case the auto-vacuum applies as per normal.
       
  1335       */
       
  1336       if( zFilename && !isMemdb ){
       
  1337         pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
       
  1338         pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
       
  1339       }
       
  1340 #endif
       
  1341       nReserve = 0;
       
  1342     }else{
       
  1343       nReserve = zDbHeader[20];
       
  1344       pBt->pageSizeFixed = 1;
       
  1345 #ifndef SQLITE_OMIT_AUTOVACUUM
       
  1346       pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
       
  1347       pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
       
  1348 #endif
       
  1349     }
       
  1350     pBt->usableSize = pBt->pageSize - nReserve;
       
  1351     assert( (pBt->pageSize & 7)==0 );  /* 8-byte alignment of pageSize */
       
  1352     sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
       
  1353    
       
  1354 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
       
  1355     /* Add the new BtShared object to the linked list sharable BtShareds.
       
  1356     */
       
  1357     if( p->sharable ){
       
  1358       sqlite3_mutex *mutexShared;
       
  1359       pBt->nRef = 1;
       
  1360       mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
       
  1361       if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
       
  1362         pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
       
  1363         if( pBt->mutex==0 ){
       
  1364           rc = SQLITE_NOMEM;
       
  1365           db->mallocFailed = 0;
       
  1366           goto btree_open_out;
       
  1367         }
       
  1368       }
       
  1369       sqlite3_mutex_enter(mutexShared);
       
  1370       pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
       
  1371       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
       
  1372       sqlite3_mutex_leave(mutexShared);
       
  1373     }
       
  1374 #endif
       
  1375   }
       
  1376 
       
  1377 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
       
  1378   /* If the new Btree uses a sharable pBtShared, then link the new
       
  1379   ** Btree into the list of all sharable Btrees for the same connection.
       
  1380   ** The list is kept in ascending order by pBt address.
       
  1381   */
       
  1382   if( p->sharable ){
       
  1383     int i;
       
  1384     Btree *pSib;
       
  1385     for(i=0; i<db->nDb; i++){
       
  1386       if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
       
  1387         while( pSib->pPrev ){ pSib = pSib->pPrev; }
       
  1388         if( p->pBt<pSib->pBt ){
       
  1389           p->pNext = pSib;
       
  1390           p->pPrev = 0;
       
  1391           pSib->pPrev = p;
       
  1392         }else{
       
  1393           while( pSib->pNext && pSib->pNext->pBt<p->pBt ){
       
  1394             pSib = pSib->pNext;
       
  1395           }
       
  1396           p->pNext = pSib->pNext;
       
  1397           p->pPrev = pSib;
       
  1398           if( p->pNext ){
       
  1399             p->pNext->pPrev = p;
       
  1400           }
       
  1401           pSib->pNext = p;
       
  1402         }
       
  1403         break;
       
  1404       }
       
  1405     }
       
  1406   }
       
  1407 #endif
       
  1408   *ppBtree = p;
       
  1409 
       
  1410 btree_open_out:
       
  1411   if( rc!=SQLITE_OK ){
       
  1412     if( pBt && pBt->pPager ){
       
  1413       sqlite3PagerClose(pBt->pPager);
       
  1414     }
       
  1415     sqlite3_free(pBt);
       
  1416     sqlite3_free(p);
       
  1417     *ppBtree = 0;
       
  1418   }
       
  1419   return rc;
       
  1420 }
       
  1421 
       
  1422 /*
       
  1423 ** Decrement the BtShared.nRef counter.  When it reaches zero,
       
  1424 ** remove the BtShared structure from the sharing list.  Return
       
  1425 ** true if the BtShared.nRef counter reaches zero and return
       
  1426 ** false if it is still positive.
       
  1427 */
       
  1428 static int removeFromSharingList(BtShared *pBt){
       
  1429 #ifndef SQLITE_OMIT_SHARED_CACHE
       
  1430   sqlite3_mutex *pMaster;
       
  1431   BtShared *pList;
       
  1432   int removed = 0;
       
  1433 
       
  1434   assert( sqlite3_mutex_notheld(pBt->mutex) );
       
  1435   pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
       
  1436   sqlite3_mutex_enter(pMaster);
       
  1437   pBt->nRef--;
       
  1438   if( pBt->nRef<=0 ){
       
  1439     if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
       
  1440       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
       
  1441     }else{
       
  1442       pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
       
  1443       while( ALWAYS(pList) && pList->pNext!=pBt ){
       
  1444         pList=pList->pNext;
       
  1445       }
       
  1446       if( ALWAYS(pList) ){
       
  1447         pList->pNext = pBt->pNext;
       
  1448       }
       
  1449     }
       
  1450     if( SQLITE_THREADSAFE ){
       
  1451       sqlite3_mutex_free(pBt->mutex);
       
  1452     }
       
  1453     removed = 1;
       
  1454   }
       
  1455   sqlite3_mutex_leave(pMaster);
       
  1456   return removed;
       
  1457 #else
       
  1458   return 1;
       
  1459 #endif
       
  1460 }
       
  1461 
       
  1462 /*
       
  1463 ** Make sure pBt->pTmpSpace points to an allocation of 
       
  1464 ** MX_CELL_SIZE(pBt) bytes.
       
  1465 */
       
  1466 static void allocateTempSpace(BtShared *pBt){
       
  1467   if( !pBt->pTmpSpace ){
       
  1468     pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
       
  1469   }
       
  1470 }
       
  1471 
       
  1472 /*
       
  1473 ** Free the pBt->pTmpSpace allocation
       
  1474 */
       
  1475 static void freeTempSpace(BtShared *pBt){
       
  1476   sqlite3PageFree( pBt->pTmpSpace);
       
  1477   pBt->pTmpSpace = 0;
       
  1478 }
       
  1479 
       
  1480 /*
       
  1481 ** Close an open database and invalidate all cursors.
       
  1482 */
       
  1483 int sqlite3BtreeClose(Btree *p){
       
  1484   BtShared *pBt = p->pBt;
       
  1485   BtCursor *pCur;
       
  1486 
       
  1487   /* Close all cursors opened via this handle.  */
       
  1488   assert( sqlite3_mutex_held(p->db->mutex) );
       
  1489   sqlite3BtreeEnter(p);
       
  1490   pBt->db = p->db;
       
  1491   pCur = pBt->pCursor;
       
  1492   while( pCur ){
       
  1493     BtCursor *pTmp = pCur;
       
  1494     pCur = pCur->pNext;
       
  1495     if( pTmp->pBtree==p ){
       
  1496       sqlite3BtreeCloseCursor(pTmp);
       
  1497     }
       
  1498   }
       
  1499 
       
  1500   /* Rollback any active transaction and free the handle structure.
       
  1501   ** The call to sqlite3BtreeRollback() drops any table-locks held by
       
  1502   ** this handle.
       
  1503   */
       
  1504   sqlite3BtreeRollback(p);
       
  1505   sqlite3BtreeLeave(p);
       
  1506 
       
  1507   /* If there are still other outstanding references to the shared-btree
       
  1508   ** structure, return now. The remainder of this procedure cleans 
       
  1509   ** up the shared-btree.
       
  1510   */
       
  1511   assert( p->wantToLock==0 && p->locked==0 );
       
  1512   if( !p->sharable || removeFromSharingList(pBt) ){
       
  1513     /* The pBt is no longer on the sharing list, so we can access
       
  1514     ** it without having to hold the mutex.
       
  1515     **
       
  1516     ** Clean out and delete the BtShared object.
       
  1517     */
       
  1518     assert( !pBt->pCursor );
       
  1519     sqlite3PagerClose(pBt->pPager);
       
  1520     if( pBt->xFreeSchema && pBt->pSchema ){
       
  1521       pBt->xFreeSchema(pBt->pSchema);
       
  1522     }
       
  1523     sqlite3_free(pBt->pSchema);
       
  1524     freeTempSpace(pBt);
       
  1525     sqlite3_free(pBt);
       
  1526   }
       
  1527 
       
  1528 #ifndef SQLITE_OMIT_SHARED_CACHE
       
  1529   assert( p->wantToLock==0 );
       
  1530   assert( p->locked==0 );
       
  1531   if( p->pPrev ) p->pPrev->pNext = p->pNext;
       
  1532   if( p->pNext ) p->pNext->pPrev = p->pPrev;
       
  1533 #endif
       
  1534 
       
  1535   sqlite3_free(p);
       
  1536   return SQLITE_OK;
       
  1537 }
       
  1538 
       
  1539 /*
       
  1540 ** Change the limit on the number of pages allowed in the cache.
       
  1541 **
       
  1542 ** The maximum number of cache pages is set to the absolute
       
  1543 ** value of mxPage.  If mxPage is negative, the pager will
       
  1544 ** operate asynchronously - it will not stop to do fsync()s
       
  1545 ** to insure data is written to the disk surface before
       
  1546 ** continuing.  Transactions still work if synchronous is off,
       
  1547 ** and the database cannot be corrupted if this program
       
  1548 ** crashes.  But if the operating system crashes or there is
       
  1549 ** an abrupt power failure when synchronous is off, the database
       
  1550 ** could be left in an inconsistent and unrecoverable state.
       
  1551 ** Synchronous is on by default so database corruption is not
       
  1552 ** normally a worry.
       
  1553 */
       
  1554 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
       
  1555   BtShared *pBt = p->pBt;
       
  1556   assert( sqlite3_mutex_held(p->db->mutex) );
       
  1557   sqlite3BtreeEnter(p);
       
  1558   sqlite3PagerSetCachesize(pBt->pPager, mxPage);
       
  1559   sqlite3BtreeLeave(p);
       
  1560   return SQLITE_OK;
       
  1561 }
       
  1562 
       
  1563 /*
       
  1564 ** Change the way data is synced to disk in order to increase or decrease
       
  1565 ** how well the database resists damage due to OS crashes and power
       
  1566 ** failures.  Level 1 is the same as asynchronous (no syncs() occur and
       
  1567 ** there is a high probability of damage)  Level 2 is the default.  There
       
  1568 ** is a very low but non-zero probability of damage.  Level 3 reduces the
       
  1569 ** probability of damage to near zero but with a write performance reduction.
       
  1570 */
       
  1571 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
       
  1572 int sqlite3BtreeSetSafetyLevel(Btree *p, int level, int fullSync){
       
  1573   BtShared *pBt = p->pBt;
       
  1574   assert( sqlite3_mutex_held(p->db->mutex) );
       
  1575   sqlite3BtreeEnter(p);
       
  1576   sqlite3PagerSetSafetyLevel(pBt->pPager, level, fullSync);
       
  1577   sqlite3BtreeLeave(p);
       
  1578   return SQLITE_OK;
       
  1579 }
       
  1580 #endif
       
  1581 
       
  1582 /*
       
  1583 ** Return TRUE if the given btree is set to safety level 1.  In other
       
  1584 ** words, return TRUE if no sync() occurs on the disk files.
       
  1585 */
       
  1586 int sqlite3BtreeSyncDisabled(Btree *p){
       
  1587   BtShared *pBt = p->pBt;
       
  1588   int rc;
       
  1589   assert( sqlite3_mutex_held(p->db->mutex) );  
       
  1590   sqlite3BtreeEnter(p);
       
  1591   assert( pBt && pBt->pPager );
       
  1592   rc = sqlite3PagerNosync(pBt->pPager);
       
  1593   sqlite3BtreeLeave(p);
       
  1594   return rc;
       
  1595 }
       
  1596 
       
  1597 #if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM)
       
  1598 /*
       
  1599 ** Change the default pages size and the number of reserved bytes per page.
       
  1600 **
       
  1601 ** The page size must be a power of 2 between 512 and 65536.  If the page
       
  1602 ** size supplied does not meet this constraint then the page size is not
       
  1603 ** changed.
       
  1604 **
       
  1605 ** Page sizes are constrained to be a power of two so that the region
       
  1606 ** of the database file used for locking (beginning at PENDING_BYTE,
       
  1607 ** the first byte past the 1GB boundary, 0x40000000) needs to occur
       
  1608 ** at the beginning of a page.
       
  1609 **
       
  1610 ** If parameter nReserve is less than zero, then the number of reserved
       
  1611 ** bytes per page is left unchanged.
       
  1612 */
       
  1613 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve){
       
  1614   int rc = SQLITE_OK;
       
  1615   BtShared *pBt = p->pBt;
       
  1616   sqlite3BtreeEnter(p);
       
  1617   if( pBt->pageSizeFixed ){
       
  1618     sqlite3BtreeLeave(p);
       
  1619     return SQLITE_READONLY;
       
  1620   }
       
  1621   if( nReserve<0 ){
       
  1622     nReserve = pBt->pageSize - pBt->usableSize;
       
  1623   }
       
  1624   if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
       
  1625         ((pageSize-1)&pageSize)==0 ){
       
  1626     assert( (pageSize & 7)==0 );
       
  1627     assert( !pBt->pPage1 && !pBt->pCursor );
       
  1628     pBt->pageSize = pageSize;
       
  1629     freeTempSpace(pBt);
       
  1630     rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
       
  1631   }
       
  1632   pBt->usableSize = pBt->pageSize - nReserve;
       
  1633   sqlite3BtreeLeave(p);
       
  1634   return rc;
       
  1635 }
       
  1636 
       
  1637 /*
       
  1638 ** Return the currently defined page size
       
  1639 */
       
  1640 int sqlite3BtreeGetPageSize(Btree *p){
       
  1641   return p->pBt->pageSize;
       
  1642 }
       
  1643 int sqlite3BtreeGetReserve(Btree *p){
       
  1644   int n;
       
  1645   sqlite3BtreeEnter(p);
       
  1646   n = p->pBt->pageSize - p->pBt->usableSize;
       
  1647   sqlite3BtreeLeave(p);
       
  1648   return n;
       
  1649 }
       
  1650 
       
  1651 /*
       
  1652 ** Set the maximum page count for a database if mxPage is positive.
       
  1653 ** No changes are made if mxPage is 0 or negative.
       
  1654 ** Regardless of the value of mxPage, return the maximum page count.
       
  1655 */
       
  1656 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
       
  1657   int n;
       
  1658   sqlite3BtreeEnter(p);
       
  1659   n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
       
  1660   sqlite3BtreeLeave(p);
       
  1661   return n;
       
  1662 }
       
  1663 #endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */
       
  1664 
       
  1665 /*
       
  1666 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
       
  1667 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
       
  1668 ** is disabled. The default value for the auto-vacuum property is 
       
  1669 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
       
  1670 */
       
  1671 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
       
  1672 #ifdef SQLITE_OMIT_AUTOVACUUM
       
  1673   return SQLITE_READONLY;
       
  1674 #else
       
  1675   BtShared *pBt = p->pBt;
       
  1676   int rc = SQLITE_OK;
       
  1677   int av = (autoVacuum?1:0);
       
  1678 
       
  1679   sqlite3BtreeEnter(p);
       
  1680   if( pBt->pageSizeFixed && av!=pBt->autoVacuum ){
       
  1681     rc = SQLITE_READONLY;
       
  1682   }else{
       
  1683     pBt->autoVacuum = av;
       
  1684   }
       
  1685   sqlite3BtreeLeave(p);
       
  1686   return rc;
       
  1687 #endif
       
  1688 }
       
  1689 
       
  1690 /*
       
  1691 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is 
       
  1692 ** enabled 1 is returned. Otherwise 0.
       
  1693 */
       
  1694 int sqlite3BtreeGetAutoVacuum(Btree *p){
       
  1695 #ifdef SQLITE_OMIT_AUTOVACUUM
       
  1696   return BTREE_AUTOVACUUM_NONE;
       
  1697 #else
       
  1698   int rc;
       
  1699   sqlite3BtreeEnter(p);
       
  1700   rc = (
       
  1701     (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
       
  1702     (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
       
  1703     BTREE_AUTOVACUUM_INCR
       
  1704   );
       
  1705   sqlite3BtreeLeave(p);
       
  1706   return rc;
       
  1707 #endif
       
  1708 }
       
  1709 
       
  1710 
       
  1711 /*
       
  1712 ** Get a reference to pPage1 of the database file.  This will
       
  1713 ** also acquire a readlock on that file.
       
  1714 **
       
  1715 ** SQLITE_OK is returned on success.  If the file is not a
       
  1716 ** well-formed database file, then SQLITE_CORRUPT is returned.
       
  1717 ** SQLITE_BUSY is returned if the database is locked.  SQLITE_NOMEM
       
  1718 ** is returned if we run out of memory. 
       
  1719 */
       
  1720 static int lockBtree(BtShared *pBt){
       
  1721   int rc;
       
  1722   MemPage *pPage1;
       
  1723   int nPage;
       
  1724 
       
  1725   assert( sqlite3_mutex_held(pBt->mutex) );
       
  1726   if( pBt->pPage1 ) return SQLITE_OK;
       
  1727   rc = sqlite3BtreeGetPage(pBt, 1, &pPage1, 0);
       
  1728   if( rc!=SQLITE_OK ) return rc;
       
  1729 
       
  1730   /* Do some checking to help insure the file we opened really is
       
  1731   ** a valid database file. 
       
  1732   */
       
  1733   rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
       
  1734   if( rc!=SQLITE_OK ){
       
  1735     goto page1_init_failed;
       
  1736   }else if( nPage>0 ){
       
  1737     int pageSize;
       
  1738     int usableSize;
       
  1739     u8 *page1 = pPage1->aData;
       
  1740     rc = SQLITE_NOTADB;
       
  1741     if( memcmp(page1, zMagicHeader, 16)!=0 ){
       
  1742       goto page1_init_failed;
       
  1743     }
       
  1744     if( page1[18]>1 ){
       
  1745       pBt->readOnly = 1;
       
  1746     }
       
  1747     if( page1[19]>1 ){
       
  1748       goto page1_init_failed;
       
  1749     }
       
  1750 
       
  1751     /* The maximum embedded fraction must be exactly 25%.  And the minimum
       
  1752     ** embedded fraction must be 12.5% for both leaf-data and non-leaf-data.
       
  1753     ** The original design allowed these amounts to vary, but as of
       
  1754     ** version 3.6.0, we require them to be fixed.
       
  1755     */
       
  1756     if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
       
  1757       goto page1_init_failed;
       
  1758     }
       
  1759     pageSize = get2byte(&page1[16]);
       
  1760     if( ((pageSize-1)&pageSize)!=0 || pageSize<512 ||
       
  1761         (SQLITE_MAX_PAGE_SIZE<32768 && pageSize>SQLITE_MAX_PAGE_SIZE)
       
  1762     ){
       
  1763       goto page1_init_failed;
       
  1764     }
       
  1765     assert( (pageSize & 7)==0 );
       
  1766     usableSize = pageSize - page1[20];
       
  1767     if( pageSize!=pBt->pageSize ){
       
  1768       /* After reading the first page of the database assuming a page size
       
  1769       ** of BtShared.pageSize, we have discovered that the page-size is
       
  1770       ** actually pageSize. Unlock the database, leave pBt->pPage1 at
       
  1771       ** zero and return SQLITE_OK. The caller will call this function
       
  1772       ** again with the correct page-size.
       
  1773       */
       
  1774       releasePage(pPage1);
       
  1775       pBt->usableSize = usableSize;
       
  1776       pBt->pageSize = pageSize;
       
  1777       freeTempSpace(pBt);
       
  1778       sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
       
  1779       return SQLITE_OK;
       
  1780     }
       
  1781     if( usableSize<500 ){
       
  1782       goto page1_init_failed;
       
  1783     }
       
  1784     pBt->pageSize = pageSize;
       
  1785     pBt->usableSize = usableSize;
       
  1786 #ifndef SQLITE_OMIT_AUTOVACUUM
       
  1787     pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
       
  1788     pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
       
  1789 #endif
       
  1790   }
       
  1791 
       
  1792   /* maxLocal is the maximum amount of payload to store locally for
       
  1793   ** a cell.  Make sure it is small enough so that at least minFanout
       
  1794   ** cells can will fit on one page.  We assume a 10-byte page header.
       
  1795   ** Besides the payload, the cell must store:
       
  1796   **     2-byte pointer to the cell
       
  1797   **     4-byte child pointer
       
  1798   **     9-byte nKey value
       
  1799   **     4-byte nData value
       
  1800   **     4-byte overflow page pointer
       
  1801   ** So a cell consists of a 2-byte poiner, a header which is as much as
       
  1802   ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
       
  1803   ** page pointer.
       
  1804   */
       
  1805   pBt->maxLocal = (pBt->usableSize-12)*64/255 - 23;
       
  1806   pBt->minLocal = (pBt->usableSize-12)*32/255 - 23;
       
  1807   pBt->maxLeaf = pBt->usableSize - 35;
       
  1808   pBt->minLeaf = (pBt->usableSize-12)*32/255 - 23;
       
  1809   assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
       
  1810   pBt->pPage1 = pPage1;
       
  1811   return SQLITE_OK;
       
  1812 
       
  1813 page1_init_failed:
       
  1814   releasePage(pPage1);
       
  1815   pBt->pPage1 = 0;
       
  1816   return rc;
       
  1817 }
       
  1818 
       
  1819 /*
       
  1820 ** This routine works like lockBtree() except that it also invokes the
       
  1821 ** busy callback if there is lock contention.
       
  1822 */
       
  1823 static int lockBtreeWithRetry(Btree *pRef){
       
  1824   int rc = SQLITE_OK;
       
  1825 
       
  1826   assert( sqlite3BtreeHoldsMutex(pRef) );
       
  1827   if( pRef->inTrans==TRANS_NONE ){
       
  1828     u8 inTransaction = pRef->pBt->inTransaction;
       
  1829     btreeIntegrity(pRef);
       
  1830     rc = sqlite3BtreeBeginTrans(pRef, 0);
       
  1831     pRef->pBt->inTransaction = inTransaction;
       
  1832     pRef->inTrans = TRANS_NONE;
       
  1833     if( rc==SQLITE_OK ){
       
  1834       pRef->pBt->nTransaction--;
       
  1835     }
       
  1836     btreeIntegrity(pRef);
       
  1837   }
       
  1838   return rc;
       
  1839 }
       
  1840        
       
  1841 
       
  1842 /*
       
  1843 ** If there are no outstanding cursors and we are not in the middle
       
  1844 ** of a transaction but there is a read lock on the database, then
       
  1845 ** this routine unrefs the first page of the database file which 
       
  1846 ** has the effect of releasing the read lock.
       
  1847 **
       
  1848 ** If there are any outstanding cursors, this routine is a no-op.
       
  1849 **
       
  1850 ** If there is a transaction in progress, this routine is a no-op.
       
  1851 */
       
  1852 static void unlockBtreeIfUnused(BtShared *pBt){
       
  1853   assert( sqlite3_mutex_held(pBt->mutex) );
       
  1854   if( pBt->inTransaction==TRANS_NONE && pBt->pCursor==0 && pBt->pPage1!=0 ){
       
  1855     if( sqlite3PagerRefcount(pBt->pPager)>=1 ){
       
  1856       assert( pBt->pPage1->aData );
       
  1857 #if 0
       
  1858       if( pBt->pPage1->aData==0 ){
       
  1859         MemPage *pPage = pBt->pPage1;
       
  1860         pPage->aData = sqlite3PagerGetData(pPage->pDbPage);
       
  1861         pPage->pBt = pBt;
       
  1862         pPage->pgno = 1;
       
  1863       }
       
  1864 #endif
       
  1865       releasePage(pBt->pPage1);
       
  1866     }
       
  1867     pBt->pPage1 = 0;
       
  1868     pBt->inStmt = 0;
       
  1869   }
       
  1870 }
       
  1871 
       
  1872 /*
       
  1873 ** Create a new database by initializing the first page of the
       
  1874 ** file.
       
  1875 */
       
  1876 static int newDatabase(BtShared *pBt){
       
  1877   MemPage *pP1;
       
  1878   unsigned char *data;
       
  1879   int rc;
       
  1880   int nPage;
       
  1881 
       
  1882   assert( sqlite3_mutex_held(pBt->mutex) );
       
  1883   rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
       
  1884   if( rc!=SQLITE_OK || nPage>0 ){
       
  1885     return rc;
       
  1886   }
       
  1887   pP1 = pBt->pPage1;
       
  1888   assert( pP1!=0 );
       
  1889   data = pP1->aData;
       
  1890   rc = sqlite3PagerWrite(pP1->pDbPage);
       
  1891   if( rc ) return rc;
       
  1892   memcpy(data, zMagicHeader, sizeof(zMagicHeader));
       
  1893   assert( sizeof(zMagicHeader)==16 );
       
  1894   put2byte(&data[16], pBt->pageSize);
       
  1895   data[18] = 1;
       
  1896   data[19] = 1;
       
  1897   data[20] = pBt->pageSize - pBt->usableSize;
       
  1898   data[21] = 64;
       
  1899   data[22] = 32;
       
  1900   data[23] = 32;
       
  1901   memset(&data[24], 0, 100-24);
       
  1902   zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
       
  1903   pBt->pageSizeFixed = 1;
       
  1904 #ifndef SQLITE_OMIT_AUTOVACUUM
       
  1905   assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
       
  1906   assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
       
  1907   put4byte(&data[36 + 4*4], pBt->autoVacuum);
       
  1908   put4byte(&data[36 + 7*4], pBt->incrVacuum);
       
  1909 #endif
       
  1910   return SQLITE_OK;
       
  1911 }
       
  1912 
       
  1913 /*
       
  1914 ** Attempt to start a new transaction. A write-transaction
       
  1915 ** is started if the second argument is nonzero, otherwise a read-
       
  1916 ** transaction.  If the second argument is 2 or more and exclusive
       
  1917 ** transaction is started, meaning that no other process is allowed
       
  1918 ** to access the database.  A preexisting transaction may not be
       
  1919 ** upgraded to exclusive by calling this routine a second time - the
       
  1920 ** exclusivity flag only works for a new transaction.
       
  1921 **
       
  1922 ** A write-transaction must be started before attempting any 
       
  1923 ** changes to the database.  None of the following routines 
       
  1924 ** will work unless a transaction is started first:
       
  1925 **
       
  1926 **      sqlite3BtreeCreateTable()
       
  1927 **      sqlite3BtreeCreateIndex()
       
  1928 **      sqlite3BtreeClearTable()
       
  1929 **      sqlite3BtreeDropTable()
       
  1930 **      sqlite3BtreeInsert()
       
  1931 **      sqlite3BtreeDelete()
       
  1932 **      sqlite3BtreeUpdateMeta()
       
  1933 **
       
  1934 ** If an initial attempt to acquire the lock fails because of lock contention
       
  1935 ** and the database was previously unlocked, then invoke the busy handler
       
  1936 ** if there is one.  But if there was previously a read-lock, do not
       
  1937 ** invoke the busy handler - just return SQLITE_BUSY.  SQLITE_BUSY is 
       
  1938 ** returned when there is already a read-lock in order to avoid a deadlock.
       
  1939 **
       
  1940 ** Suppose there are two processes A and B.  A has a read lock and B has
       
  1941 ** a reserved lock.  B tries to promote to exclusive but is blocked because
       
  1942 ** of A's read lock.  A tries to promote to reserved but is blocked by B.
       
  1943 ** One or the other of the two processes must give way or there can be
       
  1944 ** no progress.  By returning SQLITE_BUSY and not invoking the busy callback
       
  1945 ** when A already has a read lock, we encourage A to give up and let B
       
  1946 ** proceed.
       
  1947 */
       
  1948 int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
       
  1949   BtShared *pBt = p->pBt;
       
  1950   int rc = SQLITE_OK;
       
  1951 
       
  1952   sqlite3BtreeEnter(p);
       
  1953   pBt->db = p->db;
       
  1954   btreeIntegrity(p);
       
  1955 
       
  1956   /* If the btree is already in a write-transaction, or it
       
  1957   ** is already in a read-transaction and a read-transaction
       
  1958   ** is requested, this is a no-op.
       
  1959   */
       
  1960   if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
       
  1961     goto trans_begun;
       
  1962   }
       
  1963 
       
  1964   /* Write transactions are not possible on a read-only database */
       
  1965   if( pBt->readOnly && wrflag ){
       
  1966     rc = SQLITE_READONLY;
       
  1967     goto trans_begun;
       
  1968   }
       
  1969 
       
  1970   /* If another database handle has already opened a write transaction 
       
  1971   ** on this shared-btree structure and a second write transaction is
       
  1972   ** requested, return SQLITE_BUSY.
       
  1973   */
       
  1974   if( pBt->inTransaction==TRANS_WRITE && wrflag ){
       
  1975     rc = SQLITE_BUSY;
       
  1976     goto trans_begun;
       
  1977   }
       
  1978 
       
  1979 #ifndef SQLITE_OMIT_SHARED_CACHE
       
  1980   if( wrflag>1 ){
       
  1981     BtLock *pIter;
       
  1982     for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
       
  1983       if( pIter->pBtree!=p ){
       
  1984         rc = SQLITE_BUSY;
       
  1985         goto trans_begun;
       
  1986       }
       
  1987     }
       
  1988   }
       
  1989 #endif
       
  1990 
       
  1991   do {
       
  1992     if( pBt->pPage1==0 ){
       
  1993       do{
       
  1994         rc = lockBtree(pBt);
       
  1995       }while( pBt->pPage1==0 && rc==SQLITE_OK );
       
  1996     }
       
  1997 
       
  1998     if( rc==SQLITE_OK && wrflag ){
       
  1999       if( pBt->readOnly ){
       
  2000         rc = SQLITE_READONLY;
       
  2001       }else{
       
  2002         rc = sqlite3PagerBegin(pBt->pPage1->pDbPage, wrflag>1);
       
  2003         if( rc==SQLITE_OK ){
       
  2004           rc = newDatabase(pBt);
       
  2005         }
       
  2006       }
       
  2007     }
       
  2008   
       
  2009     if( rc==SQLITE_OK ){
       
  2010       if( wrflag ) pBt->inStmt = 0;
       
  2011     }else{
       
  2012       unlockBtreeIfUnused(pBt);
       
  2013     }
       
  2014   }while( rc==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
       
  2015           sqlite3BtreeInvokeBusyHandler(pBt, 0) );
       
  2016 
       
  2017   if( rc==SQLITE_OK ){
       
  2018     if( p->inTrans==TRANS_NONE ){
       
  2019       pBt->nTransaction++;
       
  2020     }
       
  2021     p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
       
  2022     if( p->inTrans>pBt->inTransaction ){
       
  2023       pBt->inTransaction = p->inTrans;
       
  2024     }
       
  2025 #ifndef SQLITE_OMIT_SHARED_CACHE
       
  2026     if( wrflag>1 ){
       
  2027       assert( !pBt->pExclusive );
       
  2028       pBt->pExclusive = p;
       
  2029     }
       
  2030 #endif
       
  2031   }
       
  2032 
       
  2033 
       
  2034 trans_begun:
       
  2035   btreeIntegrity(p);
       
  2036   sqlite3BtreeLeave(p);
       
  2037   return rc;
       
  2038 }
       
  2039 
       
  2040 
       
  2041 #ifndef SQLITE_OMIT_AUTOVACUUM
       
  2042 
       
  2043 /*
       
  2044 ** Set the pointer-map entries for all children of page pPage. Also, if
       
  2045 ** pPage contains cells that point to overflow pages, set the pointer
       
  2046 ** map entries for the overflow pages as well.
       
  2047 */
       
  2048 static int setChildPtrmaps(MemPage *pPage){
       
  2049   int i;                             /* Counter variable */
       
  2050   int nCell;                         /* Number of cells in page pPage */
       
  2051   int rc;                            /* Return code */
       
  2052   BtShared *pBt = pPage->pBt;
       
  2053   int isInitOrig = pPage->isInit;
       
  2054   Pgno pgno = pPage->pgno;
       
  2055 
       
  2056   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
       
  2057   rc = sqlite3BtreeInitPage(pPage);
       
  2058   if( rc!=SQLITE_OK ){
       
  2059     goto set_child_ptrmaps_out;
       
  2060   }
       
  2061   nCell = pPage->nCell;
       
  2062 
       
  2063   for(i=0; i<nCell; i++){
       
  2064     u8 *pCell = findCell(pPage, i);
       
  2065 
       
  2066     rc = ptrmapPutOvflPtr(pPage, pCell);
       
  2067     if( rc!=SQLITE_OK ){
       
  2068       goto set_child_ptrmaps_out;
       
  2069     }
       
  2070 
       
  2071     if( !pPage->leaf ){
       
  2072       Pgno childPgno = get4byte(pCell);
       
  2073       rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
       
  2074       if( rc!=SQLITE_OK ) goto set_child_ptrmaps_out;
       
  2075     }
       
  2076   }
       
  2077 
       
  2078   if( !pPage->leaf ){
       
  2079     Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
       
  2080     rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
       
  2081   }
       
  2082 
       
  2083 set_child_ptrmaps_out:
       
  2084   pPage->isInit = isInitOrig;
       
  2085   return rc;
       
  2086 }
       
  2087 
       
  2088 /*
       
  2089 ** Somewhere on pPage, which is guaranteed to be a btree page, not an overflow
       
  2090 ** page, is a pointer to page iFrom. Modify this pointer so that it points to
       
  2091 ** iTo. Parameter eType describes the type of pointer to be modified, as 
       
  2092 ** follows:
       
  2093 **
       
  2094 ** PTRMAP_BTREE:     pPage is a btree-page. The pointer points at a child 
       
  2095 **                   page of pPage.
       
  2096 **
       
  2097 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
       
  2098 **                   page pointed to by one of the cells on pPage.
       
  2099 **
       
  2100 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
       
  2101 **                   overflow page in the list.
       
  2102 */
       
  2103 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
       
  2104   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
       
  2105   if( eType==PTRMAP_OVERFLOW2 ){
       
  2106     /* The pointer is always the first 4 bytes of the page in this case.  */
       
  2107     if( get4byte(pPage->aData)!=iFrom ){
       
  2108       return SQLITE_CORRUPT_BKPT;
       
  2109     }
       
  2110     put4byte(pPage->aData, iTo);
       
  2111   }else{
       
  2112     int isInitOrig = pPage->isInit;
       
  2113     int i;
       
  2114     int nCell;
       
  2115 
       
  2116     sqlite3BtreeInitPage(pPage);
       
  2117     nCell = pPage->nCell;
       
  2118 
       
  2119     for(i=0; i<nCell; i++){
       
  2120       u8 *pCell = findCell(pPage, i);
       
  2121       if( eType==PTRMAP_OVERFLOW1 ){
       
  2122         CellInfo info;
       
  2123         sqlite3BtreeParseCellPtr(pPage, pCell, &info);
       
  2124         if( info.iOverflow ){
       
  2125           if( iFrom==get4byte(&pCell[info.iOverflow]) ){
       
  2126             put4byte(&pCell[info.iOverflow], iTo);
       
  2127             break;
       
  2128           }
       
  2129         }
       
  2130       }else{
       
  2131         if( get4byte(pCell)==iFrom ){
       
  2132           put4byte(pCell, iTo);
       
  2133           break;
       
  2134         }
       
  2135       }
       
  2136     }
       
  2137   
       
  2138     if( i==nCell ){
       
  2139       if( eType!=PTRMAP_BTREE || 
       
  2140           get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
       
  2141         return SQLITE_CORRUPT_BKPT;
       
  2142       }
       
  2143       put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
       
  2144     }
       
  2145 
       
  2146     pPage->isInit = isInitOrig;
       
  2147   }
       
  2148   return SQLITE_OK;
       
  2149 }
       
  2150 
       
  2151 
       
  2152 /*
       
  2153 ** Move the open database page pDbPage to location iFreePage in the 
       
  2154 ** database. The pDbPage reference remains valid.
       
  2155 */
       
  2156 static int relocatePage(
       
  2157   BtShared *pBt,           /* Btree */
       
  2158   MemPage *pDbPage,        /* Open page to move */
       
  2159   u8 eType,                /* Pointer map 'type' entry for pDbPage */
       
  2160   Pgno iPtrPage,           /* Pointer map 'page-no' entry for pDbPage */
       
  2161   Pgno iFreePage,          /* The location to move pDbPage to */
       
  2162   int isCommit
       
  2163 ){
       
  2164   MemPage *pPtrPage;   /* The page that contains a pointer to pDbPage */
       
  2165   Pgno iDbPage = pDbPage->pgno;
       
  2166   Pager *pPager = pBt->pPager;
       
  2167   int rc;
       
  2168 
       
  2169   assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 || 
       
  2170       eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
       
  2171   assert( sqlite3_mutex_held(pBt->mutex) );
       
  2172   assert( pDbPage->pBt==pBt );
       
  2173 
       
  2174   /* Move page iDbPage from its current location to page number iFreePage */
       
  2175   TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n", 
       
  2176       iDbPage, iFreePage, iPtrPage, eType));
       
  2177   rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
       
  2178   if( rc!=SQLITE_OK ){
       
  2179     return rc;
       
  2180   }
       
  2181   pDbPage->pgno = iFreePage;
       
  2182 
       
  2183   /* If pDbPage was a btree-page, then it may have child pages and/or cells
       
  2184   ** that point to overflow pages. The pointer map entries for all these
       
  2185   ** pages need to be changed.
       
  2186   **
       
  2187   ** If pDbPage is an overflow page, then the first 4 bytes may store a
       
  2188   ** pointer to a subsequent overflow page. If this is the case, then
       
  2189   ** the pointer map needs to be updated for the subsequent overflow page.
       
  2190   */
       
  2191   if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
       
  2192     rc = setChildPtrmaps(pDbPage);
       
  2193     if( rc!=SQLITE_OK ){
       
  2194       return rc;
       
  2195     }
       
  2196   }else{
       
  2197     Pgno nextOvfl = get4byte(pDbPage->aData);
       
  2198     if( nextOvfl!=0 ){
       
  2199       rc = ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage);
       
  2200       if( rc!=SQLITE_OK ){
       
  2201         return rc;
       
  2202       }
       
  2203     }
       
  2204   }
       
  2205 
       
  2206   /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
       
  2207   ** that it points at iFreePage. Also fix the pointer map entry for
       
  2208   ** iPtrPage.
       
  2209   */
       
  2210   if( eType!=PTRMAP_ROOTPAGE ){
       
  2211     rc = sqlite3BtreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
       
  2212     if( rc!=SQLITE_OK ){
       
  2213       return rc;
       
  2214     }
       
  2215     rc = sqlite3PagerWrite(pPtrPage->pDbPage);
       
  2216     if( rc!=SQLITE_OK ){
       
  2217       releasePage(pPtrPage);
       
  2218       return rc;
       
  2219     }
       
  2220     rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
       
  2221     releasePage(pPtrPage);
       
  2222     if( rc==SQLITE_OK ){
       
  2223       rc = ptrmapPut(pBt, iFreePage, eType, iPtrPage);
       
  2224     }
       
  2225   }
       
  2226   return rc;
       
  2227 }
       
  2228 
       
  2229 /* Forward declaration required by incrVacuumStep(). */
       
  2230 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
       
  2231 
       
  2232 /*
       
  2233 ** Perform a single step of an incremental-vacuum. If successful,
       
  2234 ** return SQLITE_OK. If there is no work to do (and therefore no
       
  2235 ** point in calling this function again), return SQLITE_DONE.
       
  2236 **
       
  2237 ** More specificly, this function attempts to re-organize the 
       
  2238 ** database so that the last page of the file currently in use
       
  2239 ** is no longer in use.
       
  2240 **
       
  2241 ** If the nFin parameter is non-zero, the implementation assumes
       
  2242 ** that the caller will keep calling incrVacuumStep() until
       
  2243 ** it returns SQLITE_DONE or an error, and that nFin is the
       
  2244 ** number of pages the database file will contain after this 
       
  2245 ** process is complete.
       
  2246 */
       
  2247 static int incrVacuumStep(BtShared *pBt, Pgno nFin){
       
  2248   Pgno iLastPg;             /* Last page in the database */
       
  2249   Pgno nFreeList;           /* Number of pages still on the free-list */
       
  2250 
       
  2251   assert( sqlite3_mutex_held(pBt->mutex) );
       
  2252   iLastPg = pBt->nTrunc;
       
  2253   if( iLastPg==0 ){
       
  2254     iLastPg = pagerPagecount(pBt->pPager);
       
  2255   }
       
  2256 
       
  2257   if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
       
  2258     int rc;
       
  2259     u8 eType;
       
  2260     Pgno iPtrPage;
       
  2261 
       
  2262     nFreeList = get4byte(&pBt->pPage1->aData[36]);
       
  2263     if( nFreeList==0 || nFin==iLastPg ){
       
  2264       return SQLITE_DONE;
       
  2265     }
       
  2266 
       
  2267     rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
       
  2268     if( rc!=SQLITE_OK ){
       
  2269       return rc;
       
  2270     }
       
  2271     if( eType==PTRMAP_ROOTPAGE ){
       
  2272       return SQLITE_CORRUPT_BKPT;
       
  2273     }
       
  2274 
       
  2275     if( eType==PTRMAP_FREEPAGE ){
       
  2276       if( nFin==0 ){
       
  2277         /* Remove the page from the files free-list. This is not required
       
  2278         ** if nFin is non-zero. In that case, the free-list will be
       
  2279         ** truncated to zero after this function returns, so it doesn't 
       
  2280         ** matter if it still contains some garbage entries.
       
  2281         */
       
  2282         Pgno iFreePg;
       
  2283         MemPage *pFreePg;
       
  2284         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, 1);
       
  2285         if( rc!=SQLITE_OK ){
       
  2286           return rc;
       
  2287         }
       
  2288         assert( iFreePg==iLastPg );
       
  2289         releasePage(pFreePg);
       
  2290       }
       
  2291     } else {
       
  2292       Pgno iFreePg;             /* Index of free page to move pLastPg to */
       
  2293       MemPage *pLastPg;
       
  2294 
       
  2295       rc = sqlite3BtreeGetPage(pBt, iLastPg, &pLastPg, 0);
       
  2296       if( rc!=SQLITE_OK ){
       
  2297         return rc;
       
  2298       }
       
  2299 
       
  2300       /* If nFin is zero, this loop runs exactly once and page pLastPg
       
  2301       ** is swapped with the first free page pulled off the free list.
       
  2302       **
       
  2303       ** On the other hand, if nFin is greater than zero, then keep
       
  2304       ** looping until a free-page located within the first nFin pages
       
  2305       ** of the file is found.
       
  2306       */
       
  2307       do {
       
  2308         MemPage *pFreePg;
       
  2309         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, 0, 0);
       
  2310         if( rc!=SQLITE_OK ){
       
  2311           releasePage(pLastPg);
       
  2312           return rc;
       
  2313         }
       
  2314         releasePage(pFreePg);
       
  2315       }while( nFin!=0 && iFreePg>nFin );
       
  2316       assert( iFreePg<iLastPg );
       
  2317       
       
  2318       rc = sqlite3PagerWrite(pLastPg->pDbPage);
       
  2319       if( rc==SQLITE_OK ){
       
  2320         rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, nFin!=0);
       
  2321       }
       
  2322       releasePage(pLastPg);
       
  2323       if( rc!=SQLITE_OK ){
       
  2324         return rc;
       
  2325       }
       
  2326     }
       
  2327   }
       
  2328 
       
  2329   pBt->nTrunc = iLastPg - 1;
       
  2330   while( pBt->nTrunc==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, pBt->nTrunc) ){
       
  2331     pBt->nTrunc--;
       
  2332   }
       
  2333   return SQLITE_OK;
       
  2334 }
       
  2335 
       
  2336 /*
       
  2337 ** A write-transaction must be opened before calling this function.
       
  2338 ** It performs a single unit of work towards an incremental vacuum.
       
  2339 **
       
  2340 ** If the incremental vacuum is finished after this function has run,
       
  2341 ** SQLITE_DONE is returned. If it is not finished, but no error occured,
       
  2342 ** SQLITE_OK is returned. Otherwise an SQLite error code. 
       
  2343 */
       
  2344 int sqlite3BtreeIncrVacuum(Btree *p){
       
  2345   int rc;
       
  2346   BtShared *pBt = p->pBt;
       
  2347 
       
  2348   sqlite3BtreeEnter(p);
       
  2349   pBt->db = p->db;
       
  2350   assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
       
  2351   if( !pBt->autoVacuum ){
       
  2352     rc = SQLITE_DONE;
       
  2353   }else{
       
  2354     invalidateAllOverflowCache(pBt);
       
  2355     rc = incrVacuumStep(pBt, 0);
       
  2356   }
       
  2357   sqlite3BtreeLeave(p);
       
  2358   return rc;
       
  2359 }
       
  2360 
       
  2361 /*
       
  2362 ** This routine is called prior to sqlite3PagerCommit when a transaction
       
  2363 ** is commited for an auto-vacuum database.
       
  2364 **
       
  2365 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
       
  2366 ** the database file should be truncated to during the commit process. 
       
  2367 ** i.e. the database has been reorganized so that only the first *pnTrunc
       
  2368 ** pages are in use.
       
  2369 */
       
  2370 static int autoVacuumCommit(BtShared *pBt, Pgno *pnTrunc){
       
  2371   int rc = SQLITE_OK;
       
  2372   Pager *pPager = pBt->pPager;
       
  2373   VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager) );
       
  2374 
       
  2375   assert( sqlite3_mutex_held(pBt->mutex) );
       
  2376   invalidateAllOverflowCache(pBt);
       
  2377   assert(pBt->autoVacuum);
       
  2378   if( !pBt->incrVacuum ){
       
  2379     Pgno nFin = 0;
       
  2380 
       
  2381     if( pBt->nTrunc==0 ){
       
  2382       Pgno nFree;
       
  2383       Pgno nPtrmap;
       
  2384       const int pgsz = pBt->pageSize;
       
  2385       int nOrig = pagerPagecount(pBt->pPager);
       
  2386 
       
  2387       if( PTRMAP_ISPAGE(pBt, nOrig) ){
       
  2388         return SQLITE_CORRUPT_BKPT;
       
  2389       }
       
  2390       if( nOrig==PENDING_BYTE_PAGE(pBt) ){
       
  2391         nOrig--;
       
  2392       }
       
  2393       nFree = get4byte(&pBt->pPage1->aData[36]);
       
  2394       nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+pgsz/5)/(pgsz/5);
       
  2395       nFin = nOrig - nFree - nPtrmap;
       
  2396       if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<=PENDING_BYTE_PAGE(pBt) ){
       
  2397         nFin--;
       
  2398       }
       
  2399       while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
       
  2400         nFin--;
       
  2401       }
       
  2402     }
       
  2403 
       
  2404     while( rc==SQLITE_OK ){
       
  2405       rc = incrVacuumStep(pBt, nFin);
       
  2406     }
       
  2407     if( rc==SQLITE_DONE ){
       
  2408       assert(nFin==0 || pBt->nTrunc==0 || nFin<=pBt->nTrunc);
       
  2409       rc = SQLITE_OK;
       
  2410       if( pBt->nTrunc && nFin ){
       
  2411         rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
       
  2412         put4byte(&pBt->pPage1->aData[32], 0);
       
  2413         put4byte(&pBt->pPage1->aData[36], 0);
       
  2414         pBt->nTrunc = nFin;
       
  2415       }
       
  2416     }
       
  2417     if( rc!=SQLITE_OK ){
       
  2418       sqlite3PagerRollback(pPager);
       
  2419     }
       
  2420   }
       
  2421 
       
  2422   if( rc==SQLITE_OK ){
       
  2423     *pnTrunc = pBt->nTrunc;
       
  2424     pBt->nTrunc = 0;
       
  2425   }
       
  2426   assert( nRef==sqlite3PagerRefcount(pPager) );
       
  2427   return rc;
       
  2428 }
       
  2429 
       
  2430 #endif
       
  2431 
       
  2432 /*
       
  2433 ** This routine does the first phase of a two-phase commit.  This routine
       
  2434 ** causes a rollback journal to be created (if it does not already exist)
       
  2435 ** and populated with enough information so that if a power loss occurs
       
  2436 ** the database can be restored to its original state by playing back
       
  2437 ** the journal.  Then the contents of the journal are flushed out to
       
  2438 ** the disk.  After the journal is safely on oxide, the changes to the
       
  2439 ** database are written into the database file and flushed to oxide.
       
  2440 ** At the end of this call, the rollback journal still exists on the
       
  2441 ** disk and we are still holding all locks, so the transaction has not
       
  2442 ** committed.  See sqlite3BtreeCommit() for the second phase of the
       
  2443 ** commit process.
       
  2444 **
       
  2445 ** This call is a no-op if no write-transaction is currently active on pBt.
       
  2446 **
       
  2447 ** Otherwise, sync the database file for the btree pBt. zMaster points to
       
  2448 ** the name of a master journal file that should be written into the
       
  2449 ** individual journal file, or is NULL, indicating no master journal file 
       
  2450 ** (single database transaction).
       
  2451 **
       
  2452 ** When this is called, the master journal should already have been
       
  2453 ** created, populated with this journal pointer and synced to disk.
       
  2454 **
       
  2455 ** Once this is routine has returned, the only thing required to commit
       
  2456 ** the write-transaction for this database file is to delete the journal.
       
  2457 */
       
  2458 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
       
  2459   int rc = SQLITE_OK;
       
  2460   if( p->inTrans==TRANS_WRITE ){
       
  2461     BtShared *pBt = p->pBt;
       
  2462     Pgno nTrunc = 0;
       
  2463     sqlite3BtreeEnter(p);
       
  2464     pBt->db = p->db;
       
  2465 #ifndef SQLITE_OMIT_AUTOVACUUM
       
  2466     if( pBt->autoVacuum ){
       
  2467       rc = autoVacuumCommit(pBt, &nTrunc); 
       
  2468       if( rc!=SQLITE_OK ){
       
  2469         sqlite3BtreeLeave(p);
       
  2470         return rc;
       
  2471       }
       
  2472     }
       
  2473 #endif
       
  2474     rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, nTrunc, 0);
       
  2475     sqlite3BtreeLeave(p);
       
  2476   }
       
  2477   return rc;
       
  2478 }
       
  2479 
       
  2480 /*
       
  2481 ** Commit the transaction currently in progress.
       
  2482 **
       
  2483 ** This routine implements the second phase of a 2-phase commit.  The
       
  2484 ** sqlite3BtreeSync() routine does the first phase and should be invoked
       
  2485 ** prior to calling this routine.  The sqlite3BtreeSync() routine did
       
  2486 ** all the work of writing information out to disk and flushing the
       
  2487 ** contents so that they are written onto the disk platter.  All this
       
  2488 ** routine has to do is delete or truncate the rollback journal
       
  2489 ** (which causes the transaction to commit) and drop locks.
       
  2490 **
       
  2491 ** This will release the write lock on the database file.  If there
       
  2492 ** are no active cursors, it also releases the read lock.
       
  2493 */
       
  2494 int sqlite3BtreeCommitPhaseTwo(Btree *p){
       
  2495   BtShared *pBt = p->pBt;
       
  2496 
       
  2497   sqlite3BtreeEnter(p);
       
  2498   pBt->db = p->db;
       
  2499   btreeIntegrity(p);
       
  2500 
       
  2501   /* If the handle has a write-transaction open, commit the shared-btrees 
       
  2502   ** transaction and set the shared state to TRANS_READ.
       
  2503   */
       
  2504   if( p->inTrans==TRANS_WRITE ){
       
  2505     int rc;
       
  2506     assert( pBt->inTransaction==TRANS_WRITE );
       
  2507     assert( pBt->nTransaction>0 );
       
  2508     rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
       
  2509     if( rc!=SQLITE_OK ){
       
  2510       sqlite3BtreeLeave(p);
       
  2511       return rc;
       
  2512     }
       
  2513     pBt->inTransaction = TRANS_READ;
       
  2514     pBt->inStmt = 0;
       
  2515   }
       
  2516   unlockAllTables(p);
       
  2517 
       
  2518   /* If the handle has any kind of transaction open, decrement the transaction
       
  2519   ** count of the shared btree. If the transaction count reaches 0, set
       
  2520   ** the shared state to TRANS_NONE. The unlockBtreeIfUnused() call below
       
  2521   ** will unlock the pager.
       
  2522   */
       
  2523   if( p->inTrans!=TRANS_NONE ){
       
  2524     pBt->nTransaction--;
       
  2525     if( 0==pBt->nTransaction ){
       
  2526       pBt->inTransaction = TRANS_NONE;
       
  2527     }
       
  2528   }
       
  2529 
       
  2530   /* Set the handles current transaction state to TRANS_NONE and unlock
       
  2531   ** the pager if this call closed the only read or write transaction.
       
  2532   */
       
  2533   p->inTrans = TRANS_NONE;
       
  2534   unlockBtreeIfUnused(pBt);
       
  2535 
       
  2536   btreeIntegrity(p);
       
  2537   sqlite3BtreeLeave(p);
       
  2538   return SQLITE_OK;
       
  2539 }
       
  2540 
       
  2541 /*
       
  2542 ** Do both phases of a commit.
       
  2543 */
       
  2544 int sqlite3BtreeCommit(Btree *p){
       
  2545   int rc;
       
  2546   sqlite3BtreeEnter(p);
       
  2547   rc = sqlite3BtreeCommitPhaseOne(p, 0);
       
  2548   if( rc==SQLITE_OK ){
       
  2549     rc = sqlite3BtreeCommitPhaseTwo(p);
       
  2550   }
       
  2551   sqlite3BtreeLeave(p);
       
  2552   return rc;
       
  2553 }
       
  2554 
       
  2555 #ifndef NDEBUG
       
  2556 /*
       
  2557 ** Return the number of write-cursors open on this handle. This is for use
       
  2558 ** in assert() expressions, so it is only compiled if NDEBUG is not
       
  2559 ** defined.
       
  2560 **
       
  2561 ** For the purposes of this routine, a write-cursor is any cursor that
       
  2562 ** is capable of writing to the databse.  That means the cursor was
       
  2563 ** originally opened for writing and the cursor has not be disabled
       
  2564 ** by having its state changed to CURSOR_FAULT.
       
  2565 */
       
  2566 static int countWriteCursors(BtShared *pBt){
       
  2567   BtCursor *pCur;
       
  2568   int r = 0;
       
  2569   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
       
  2570     if( pCur->wrFlag && pCur->eState!=CURSOR_FAULT ) r++; 
       
  2571   }
       
  2572   return r;
       
  2573 }
       
  2574 #endif
       
  2575 
       
  2576 /*
       
  2577 ** This routine sets the state to CURSOR_FAULT and the error
       
  2578 ** code to errCode for every cursor on BtShared that pBtree
       
  2579 ** references.
       
  2580 **
       
  2581 ** Every cursor is tripped, including cursors that belong
       
  2582 ** to other database connections that happen to be sharing
       
  2583 ** the cache with pBtree.
       
  2584 **
       
  2585 ** This routine gets called when a rollback occurs.
       
  2586 ** All cursors using the same cache must be tripped
       
  2587 ** to prevent them from trying to use the btree after
       
  2588 ** the rollback.  The rollback may have deleted tables
       
  2589 ** or moved root pages, so it is not sufficient to
       
  2590 ** save the state of the cursor.  The cursor must be
       
  2591 ** invalidated.
       
  2592 */
       
  2593 void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){
       
  2594   BtCursor *p;
       
  2595   sqlite3BtreeEnter(pBtree);
       
  2596   for(p=pBtree->pBt->pCursor; p; p=p->pNext){
       
  2597     clearCursorPosition(p);
       
  2598     p->eState = CURSOR_FAULT;
       
  2599     p->skip = errCode;
       
  2600   }
       
  2601   sqlite3BtreeLeave(pBtree);
       
  2602 }
       
  2603 
       
  2604 /*
       
  2605 ** Rollback the transaction in progress.  All cursors will be
       
  2606 ** invalided by this operation.  Any attempt to use a cursor
       
  2607 ** that was open at the beginning of this operation will result
       
  2608 ** in an error.
       
  2609 **
       
  2610 ** This will release the write lock on the database file.  If there
       
  2611 ** are no active cursors, it also releases the read lock.
       
  2612 */
       
  2613 int sqlite3BtreeRollback(Btree *p){
       
  2614   int rc;
       
  2615   BtShared *pBt = p->pBt;
       
  2616   MemPage *pPage1;
       
  2617 
       
  2618   sqlite3BtreeEnter(p);
       
  2619   pBt->db = p->db;
       
  2620   rc = saveAllCursors(pBt, 0, 0);
       
  2621 #ifndef SQLITE_OMIT_SHARED_CACHE
       
  2622   if( rc!=SQLITE_OK ){
       
  2623     /* This is a horrible situation. An IO or malloc() error occured whilst
       
  2624     ** trying to save cursor positions. If this is an automatic rollback (as
       
  2625     ** the result of a constraint, malloc() failure or IO error) then 
       
  2626     ** the cache may be internally inconsistent (not contain valid trees) so
       
  2627     ** we cannot simply return the error to the caller. Instead, abort 
       
  2628     ** all queries that may be using any of the cursors that failed to save.
       
  2629     */
       
  2630     sqlite3BtreeTripAllCursors(p, rc);
       
  2631   }
       
  2632 #endif
       
  2633   btreeIntegrity(p);
       
  2634   unlockAllTables(p);
       
  2635 
       
  2636   if( p->inTrans==TRANS_WRITE ){
       
  2637     int rc2;
       
  2638 
       
  2639 #ifndef SQLITE_OMIT_AUTOVACUUM
       
  2640     pBt->nTrunc = 0;
       
  2641 #endif
       
  2642 
       
  2643     assert( TRANS_WRITE==pBt->inTransaction );
       
  2644     rc2 = sqlite3PagerRollback(pBt->pPager);
       
  2645     if( rc2!=SQLITE_OK ){
       
  2646       rc = rc2;
       
  2647     }
       
  2648 
       
  2649     /* The rollback may have destroyed the pPage1->aData value.  So
       
  2650     ** call sqlite3BtreeGetPage() on page 1 again to make
       
  2651     ** sure pPage1->aData is set correctly. */
       
  2652     if( sqlite3BtreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
       
  2653       releasePage(pPage1);
       
  2654     }
       
  2655     assert( countWriteCursors(pBt)==0 );
       
  2656     pBt->inTransaction = TRANS_READ;
       
  2657   }
       
  2658 
       
  2659   if( p->inTrans!=TRANS_NONE ){
       
  2660     assert( pBt->nTransaction>0 );
       
  2661     pBt->nTransaction--;
       
  2662     if( 0==pBt->nTransaction ){
       
  2663       pBt->inTransaction = TRANS_NONE;
       
  2664     }
       
  2665   }
       
  2666 
       
  2667   p->inTrans = TRANS_NONE;
       
  2668   pBt->inStmt = 0;
       
  2669   unlockBtreeIfUnused(pBt);
       
  2670 
       
  2671   btreeIntegrity(p);
       
  2672   sqlite3BtreeLeave(p);
       
  2673   return rc;
       
  2674 }
       
  2675 
       
  2676 /*
       
  2677 ** Start a statement subtransaction.  The subtransaction can
       
  2678 ** can be rolled back independently of the main transaction.
       
  2679 ** You must start a transaction before starting a subtransaction.
       
  2680 ** The subtransaction is ended automatically if the main transaction
       
  2681 ** commits or rolls back.
       
  2682 **
       
  2683 ** Only one subtransaction may be active at a time.  It is an error to try
       
  2684 ** to start a new subtransaction if another subtransaction is already active.
       
  2685 **
       
  2686 ** Statement subtransactions are used around individual SQL statements
       
  2687 ** that are contained within a BEGIN...COMMIT block.  If a constraint
       
  2688 ** error occurs within the statement, the effect of that one statement
       
  2689 ** can be rolled back without having to rollback the entire transaction.
       
  2690 */
       
  2691 int sqlite3BtreeBeginStmt(Btree *p){
       
  2692   int rc;
       
  2693   BtShared *pBt = p->pBt;
       
  2694   sqlite3BtreeEnter(p);
       
  2695   pBt->db = p->db;
       
  2696   if( (p->inTrans!=TRANS_WRITE) || pBt->inStmt ){
       
  2697     rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
       
  2698   }else{
       
  2699     assert( pBt->inTransaction==TRANS_WRITE );
       
  2700     rc = pBt->readOnly ? SQLITE_OK : sqlite3PagerStmtBegin(pBt->pPager);
       
  2701     pBt->inStmt = 1;
       
  2702   }
       
  2703   sqlite3BtreeLeave(p);
       
  2704   return rc;
       
  2705 }
       
  2706 
       
  2707 
       
  2708 /*
       
  2709 ** Commit the statment subtransaction currently in progress.  If no
       
  2710 ** subtransaction is active, this is a no-op.
       
  2711 */
       
  2712 int sqlite3BtreeCommitStmt(Btree *p){
       
  2713   int rc;
       
  2714   BtShared *pBt = p->pBt;
       
  2715   sqlite3BtreeEnter(p);
       
  2716   pBt->db = p->db;
       
  2717   if( pBt->inStmt && !pBt->readOnly ){
       
  2718     rc = sqlite3PagerStmtCommit(pBt->pPager);
       
  2719   }else{
       
  2720     rc = SQLITE_OK;
       
  2721   }
       
  2722   pBt->inStmt = 0;
       
  2723   sqlite3BtreeLeave(p);
       
  2724   return rc;
       
  2725 }
       
  2726 
       
  2727 /*
       
  2728 ** Rollback the active statement subtransaction.  If no subtransaction
       
  2729 ** is active this routine is a no-op.
       
  2730 **
       
  2731 ** All cursors will be invalidated by this operation.  Any attempt
       
  2732 ** to use a cursor that was open at the beginning of this operation
       
  2733 ** will result in an error.
       
  2734 */
       
  2735 int sqlite3BtreeRollbackStmt(Btree *p){
       
  2736   int rc = SQLITE_OK;
       
  2737   BtShared *pBt = p->pBt;
       
  2738   sqlite3BtreeEnter(p);
       
  2739   pBt->db = p->db;
       
  2740   if( pBt->inStmt && !pBt->readOnly ){
       
  2741     rc = sqlite3PagerStmtRollback(pBt->pPager);
       
  2742     pBt->inStmt = 0;
       
  2743   }
       
  2744   sqlite3BtreeLeave(p);
       
  2745   return rc;
       
  2746 }
       
  2747 
       
  2748 /*
       
  2749 ** Create a new cursor for the BTree whose root is on the page
       
  2750 ** iTable.  The act of acquiring a cursor gets a read lock on 
       
  2751 ** the database file.
       
  2752 **
       
  2753 ** If wrFlag==0, then the cursor can only be used for reading.
       
  2754 ** If wrFlag==1, then the cursor can be used for reading or for
       
  2755 ** writing if other conditions for writing are also met.  These
       
  2756 ** are the conditions that must be met in order for writing to
       
  2757 ** be allowed:
       
  2758 **
       
  2759 ** 1:  The cursor must have been opened with wrFlag==1
       
  2760 **
       
  2761 ** 2:  Other database connections that share the same pager cache
       
  2762 **     but which are not in the READ_UNCOMMITTED state may not have
       
  2763 **     cursors open with wrFlag==0 on the same table.  Otherwise
       
  2764 **     the changes made by this write cursor would be visible to
       
  2765 **     the read cursors in the other database connection.
       
  2766 **
       
  2767 ** 3:  The database must be writable (not on read-only media)
       
  2768 **
       
  2769 ** 4:  There must be an active transaction.
       
  2770 **
       
  2771 ** No checking is done to make sure that page iTable really is the
       
  2772 ** root page of a b-tree.  If it is not, then the cursor acquired
       
  2773 ** will not work correctly.
       
  2774 **
       
  2775 ** It is assumed that the sqlite3BtreeCursorSize() bytes of memory 
       
  2776 ** pointed to by pCur have been zeroed by the caller.
       
  2777 */
       
  2778 static int btreeCursor(
       
  2779   Btree *p,                              /* The btree */
       
  2780   int iTable,                            /* Root page of table to open */
       
  2781   int wrFlag,                            /* 1 to write. 0 read-only */
       
  2782   struct KeyInfo *pKeyInfo,              /* First arg to comparison function */
       
  2783   BtCursor *pCur                         /* Space for new cursor */
       
  2784 ){
       
  2785   int rc;
       
  2786   BtShared *pBt = p->pBt;
       
  2787 
       
  2788   assert( sqlite3BtreeHoldsMutex(p) );
       
  2789   if( wrFlag ){
       
  2790     if( pBt->readOnly ){
       
  2791       return SQLITE_READONLY;
       
  2792     }
       
  2793     if( checkReadLocks(p, iTable, 0, 0) ){
       
  2794       return SQLITE_LOCKED;
       
  2795     }
       
  2796   }
       
  2797 
       
  2798   if( pBt->pPage1==0 ){
       
  2799     rc = lockBtreeWithRetry(p);
       
  2800     if( rc!=SQLITE_OK ){
       
  2801       return rc;
       
  2802     }
       
  2803     if( pBt->readOnly && wrFlag ){
       
  2804       return SQLITE_READONLY;
       
  2805     }
       
  2806   }
       
  2807   pCur->pgnoRoot = (Pgno)iTable;
       
  2808   if( iTable==1 && pagerPagecount(pBt->pPager)==0 ){
       
  2809     rc = SQLITE_EMPTY;
       
  2810     goto create_cursor_exception;
       
  2811   }
       
  2812   rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]);
       
  2813   if( rc!=SQLITE_OK ){
       
  2814     goto create_cursor_exception;
       
  2815   }
       
  2816 
       
  2817   /* Now that no other errors can occur, finish filling in the BtCursor
       
  2818   ** variables, link the cursor into the BtShared list and set *ppCur (the
       
  2819   ** output argument to this function).
       
  2820   */
       
  2821   pCur->pKeyInfo = pKeyInfo;
       
  2822   pCur->pBtree = p;
       
  2823   pCur->pBt = pBt;
       
  2824   pCur->wrFlag = wrFlag;
       
  2825   pCur->pNext = pBt->pCursor;
       
  2826   if( pCur->pNext ){
       
  2827     pCur->pNext->pPrev = pCur;
       
  2828   }
       
  2829   pBt->pCursor = pCur;
       
  2830   pCur->eState = CURSOR_INVALID;
       
  2831 
       
  2832   return SQLITE_OK;
       
  2833 
       
  2834 create_cursor_exception:
       
  2835   releasePage(pCur->apPage[0]);
       
  2836   unlockBtreeIfUnused(pBt);
       
  2837   return rc;
       
  2838 }
       
  2839 int sqlite3BtreeCursor(
       
  2840   Btree *p,                                   /* The btree */
       
  2841   int iTable,                                 /* Root page of table to open */
       
  2842   int wrFlag,                                 /* 1 to write. 0 read-only */
       
  2843   struct KeyInfo *pKeyInfo,                   /* First arg to xCompare() */
       
  2844   BtCursor *pCur                              /* Write new cursor here */
       
  2845 ){
       
  2846   int rc;
       
  2847   sqlite3BtreeEnter(p);
       
  2848   p->pBt->db = p->db;
       
  2849   rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
       
  2850   sqlite3BtreeLeave(p);
       
  2851   return rc;
       
  2852 }
       
  2853 int sqlite3BtreeCursorSize(){
       
  2854   return sizeof(BtCursor);
       
  2855 }
       
  2856 
       
  2857 
       
  2858 
       
  2859 /*
       
  2860 ** Close a cursor.  The read lock on the database file is released
       
  2861 ** when the last cursor is closed.
       
  2862 */
       
  2863 int sqlite3BtreeCloseCursor(BtCursor *pCur){
       
  2864   Btree *pBtree = pCur->pBtree;
       
  2865   if( pBtree ){
       
  2866     int i;
       
  2867     BtShared *pBt = pCur->pBt;
       
  2868     sqlite3BtreeEnter(pBtree);
       
  2869     pBt->db = pBtree->db;
       
  2870     clearCursorPosition(pCur);
       
  2871     if( pCur->pPrev ){
       
  2872       pCur->pPrev->pNext = pCur->pNext;
       
  2873     }else{
       
  2874       pBt->pCursor = pCur->pNext;
       
  2875     }
       
  2876     if( pCur->pNext ){
       
  2877       pCur->pNext->pPrev = pCur->pPrev;
       
  2878     }
       
  2879     for(i=0; i<=pCur->iPage; i++){
       
  2880       releasePage(pCur->apPage[i]);
       
  2881     }
       
  2882     unlockBtreeIfUnused(pBt);
       
  2883     invalidateOverflowCache(pCur);
       
  2884     /* sqlite3_free(pCur); */
       
  2885     sqlite3BtreeLeave(pBtree);
       
  2886   }
       
  2887   return SQLITE_OK;
       
  2888 }
       
  2889 
       
  2890 /*
       
  2891 ** Make a temporary cursor by filling in the fields of pTempCur.
       
  2892 ** The temporary cursor is not on the cursor list for the Btree.
       
  2893 */
       
  2894 void sqlite3BtreeGetTempCursor(BtCursor *pCur, BtCursor *pTempCur){
       
  2895   int i;
       
  2896   assert( cursorHoldsMutex(pCur) );
       
  2897   memcpy(pTempCur, pCur, sizeof(BtCursor));
       
  2898   pTempCur->pNext = 0;
       
  2899   pTempCur->pPrev = 0;
       
  2900   for(i=0; i<=pTempCur->iPage; i++){
       
  2901     sqlite3PagerRef(pTempCur->apPage[i]->pDbPage);
       
  2902   }
       
  2903   assert( pCur->pKey==0 );
       
  2904 }
       
  2905 
       
  2906 /*
       
  2907 ** Delete a temporary cursor such as was made by the CreateTemporaryCursor()
       
  2908 ** function above.
       
  2909 */
       
  2910 void sqlite3BtreeReleaseTempCursor(BtCursor *pCur){
       
  2911   int i;
       
  2912   assert( cursorHoldsMutex(pCur) );
       
  2913   for(i=0; i<=pCur->iPage; i++){
       
  2914     sqlite3PagerUnref(pCur->apPage[i]->pDbPage);
       
  2915   }
       
  2916   sqlite3_free(pCur->pKey);
       
  2917 }
       
  2918 
       
  2919 /*
       
  2920 ** Make sure the BtCursor* given in the argument has a valid
       
  2921 ** BtCursor.info structure.  If it is not already valid, call
       
  2922 ** sqlite3BtreeParseCell() to fill it in.
       
  2923 **
       
  2924 ** BtCursor.info is a cache of the information in the current cell.
       
  2925 ** Using this cache reduces the number of calls to sqlite3BtreeParseCell().
       
  2926 **
       
  2927 ** 2007-06-25:  There is a bug in some versions of MSVC that cause the
       
  2928 ** compiler to crash when getCellInfo() is implemented as a macro.
       
  2929 ** But there is a measureable speed advantage to using the macro on gcc
       
  2930 ** (when less compiler optimizations like -Os or -O0 are used and the
       
  2931 ** compiler is not doing agressive inlining.)  So we use a real function
       
  2932 ** for MSVC and a macro for everything else.  Ticket #2457.
       
  2933 */
       
  2934 #ifndef NDEBUG
       
  2935   static void assertCellInfo(BtCursor *pCur){
       
  2936     CellInfo info;
       
  2937     int iPage = pCur->iPage;
       
  2938     memset(&info, 0, sizeof(info));
       
  2939     sqlite3BtreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info);
       
  2940     assert( memcmp(&info, &pCur->info, sizeof(info))==0 );
       
  2941   }
       
  2942 #else
       
  2943   #define assertCellInfo(x)
       
  2944 #endif
       
  2945 #ifdef _MSC_VER
       
  2946   /* Use a real function in MSVC to work around bugs in that compiler. */
       
  2947   static void getCellInfo(BtCursor *pCur){
       
  2948     if( pCur->info.nSize==0 ){
       
  2949       int iPage = pCur->iPage;
       
  2950       sqlite3BtreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info);
       
  2951       pCur->validNKey = 1;
       
  2952     }else{
       
  2953       assertCellInfo(pCur);
       
  2954     }
       
  2955   }
       
  2956 #else /* if not _MSC_VER */
       
  2957   /* Use a macro in all other compilers so that the function is inlined */
       
  2958 #define getCellInfo(pCur)                                                      \
       
  2959   if( pCur->info.nSize==0 ){                                                   \
       
  2960     int iPage = pCur->iPage;                                                   \
       
  2961     sqlite3BtreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info); \
       
  2962     pCur->validNKey = 1;                                                       \
       
  2963   }else{                                                                       \
       
  2964     assertCellInfo(pCur);                                                      \
       
  2965   }
       
  2966 #endif /* _MSC_VER */
       
  2967 
       
  2968 /*
       
  2969 ** Set *pSize to the size of the buffer needed to hold the value of
       
  2970 ** the key for the current entry.  If the cursor is not pointing
       
  2971 ** to a valid entry, *pSize is set to 0. 
       
  2972 **
       
  2973 ** For a table with the INTKEY flag set, this routine returns the key
       
  2974 ** itself, not the number of bytes in the key.
       
  2975 */
       
  2976 int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
       
  2977   int rc;
       
  2978 
       
  2979   assert( cursorHoldsMutex(pCur) );
       
  2980   rc = restoreCursorPosition(pCur);
       
  2981   if( rc==SQLITE_OK ){
       
  2982     assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
       
  2983     if( pCur->eState==CURSOR_INVALID ){
       
  2984       *pSize = 0;
       
  2985     }else{
       
  2986       getCellInfo(pCur);
       
  2987       *pSize = pCur->info.nKey;
       
  2988     }
       
  2989   }
       
  2990   return rc;
       
  2991 }
       
  2992 
       
  2993 /*
       
  2994 ** Set *pSize to the number of bytes of data in the entry the
       
  2995 ** cursor currently points to.  Always return SQLITE_OK.
       
  2996 ** Failure is not possible.  If the cursor is not currently
       
  2997 ** pointing to an entry (which can happen, for example, if
       
  2998 ** the database is empty) then *pSize is set to 0.
       
  2999 */
       
  3000 int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
       
  3001   int rc;
       
  3002 
       
  3003   assert( cursorHoldsMutex(pCur) );
       
  3004   rc = restoreCursorPosition(pCur);
       
  3005   if( rc==SQLITE_OK ){
       
  3006     assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
       
  3007     if( pCur->eState==CURSOR_INVALID ){
       
  3008       /* Not pointing at a valid entry - set *pSize to 0. */
       
  3009       *pSize = 0;
       
  3010     }else{
       
  3011       getCellInfo(pCur);
       
  3012       *pSize = pCur->info.nData;
       
  3013     }
       
  3014   }
       
  3015   return rc;
       
  3016 }
       
  3017 
       
  3018 /*
       
  3019 ** Given the page number of an overflow page in the database (parameter
       
  3020 ** ovfl), this function finds the page number of the next page in the 
       
  3021 ** linked list of overflow pages. If possible, it uses the auto-vacuum
       
  3022 ** pointer-map data instead of reading the content of page ovfl to do so. 
       
  3023 **
       
  3024 ** If an error occurs an SQLite error code is returned. Otherwise:
       
  3025 **
       
  3026 ** Unless pPgnoNext is NULL, the page number of the next overflow 
       
  3027 ** page in the linked list is written to *pPgnoNext. If page ovfl
       
  3028 ** is the last page in its linked list, *pPgnoNext is set to zero. 
       
  3029 **
       
  3030 ** If ppPage is not NULL, *ppPage is set to the MemPage* handle
       
  3031 ** for page ovfl. The underlying pager page may have been requested
       
  3032 ** with the noContent flag set, so the page data accessable via
       
  3033 ** this handle may not be trusted.
       
  3034 */
       
  3035 static int getOverflowPage(
       
  3036   BtShared *pBt, 
       
  3037   Pgno ovfl,                   /* Overflow page */
       
  3038   MemPage **ppPage,            /* OUT: MemPage handle */
       
  3039   Pgno *pPgnoNext              /* OUT: Next overflow page number */
       
  3040 ){
       
  3041   Pgno next = 0;
       
  3042   int rc;
       
  3043 
       
  3044   assert( sqlite3_mutex_held(pBt->mutex) );
       
  3045   /* One of these must not be NULL. Otherwise, why call this function? */
       
  3046   assert(ppPage || pPgnoNext);
       
  3047 
       
  3048   /* If pPgnoNext is NULL, then this function is being called to obtain
       
  3049   ** a MemPage* reference only. No page-data is required in this case.
       
  3050   */
       
  3051   if( !pPgnoNext ){
       
  3052     return sqlite3BtreeGetPage(pBt, ovfl, ppPage, 1);
       
  3053   }
       
  3054 
       
  3055 #ifndef SQLITE_OMIT_AUTOVACUUM
       
  3056   /* Try to find the next page in the overflow list using the
       
  3057   ** autovacuum pointer-map pages. Guess that the next page in 
       
  3058   ** the overflow list is page number (ovfl+1). If that guess turns 
       
  3059   ** out to be wrong, fall back to loading the data of page 
       
  3060   ** number ovfl to determine the next page number.
       
  3061   */
       
  3062   if( pBt->autoVacuum ){
       
  3063     Pgno pgno;
       
  3064     Pgno iGuess = ovfl+1;
       
  3065     u8 eType;
       
  3066 
       
  3067     while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
       
  3068       iGuess++;
       
  3069     }
       
  3070 
       
  3071     if( iGuess<=pagerPagecount(pBt->pPager) ){
       
  3072       rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
       
  3073       if( rc!=SQLITE_OK ){
       
  3074         return rc;
       
  3075       }
       
  3076       if( eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
       
  3077         next = iGuess;
       
  3078       }
       
  3079     }
       
  3080   }
       
  3081 #endif
       
  3082 
       
  3083   if( next==0 || ppPage ){
       
  3084     MemPage *pPage = 0;
       
  3085 
       
  3086     rc = sqlite3BtreeGetPage(pBt, ovfl, &pPage, next!=0);
       
  3087     assert(rc==SQLITE_OK || pPage==0);
       
  3088     if( next==0 && rc==SQLITE_OK ){
       
  3089       next = get4byte(pPage->aData);
       
  3090     }
       
  3091 
       
  3092     if( ppPage ){
       
  3093       *ppPage = pPage;
       
  3094     }else{
       
  3095       releasePage(pPage);
       
  3096     }
       
  3097   }
       
  3098   *pPgnoNext = next;
       
  3099 
       
  3100   return rc;
       
  3101 }
       
  3102 
       
  3103 /*
       
  3104 ** Copy data from a buffer to a page, or from a page to a buffer.
       
  3105 **
       
  3106 ** pPayload is a pointer to data stored on database page pDbPage.
       
  3107 ** If argument eOp is false, then nByte bytes of data are copied
       
  3108 ** from pPayload to the buffer pointed at by pBuf. If eOp is true,
       
  3109 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
       
  3110 ** of data are copied from the buffer pBuf to pPayload.
       
  3111 **
       
  3112 ** SQLITE_OK is returned on success, otherwise an error code.
       
  3113 */
       
  3114 static int copyPayload(
       
  3115   void *pPayload,           /* Pointer to page data */
       
  3116   void *pBuf,               /* Pointer to buffer */
       
  3117   int nByte,                /* Number of bytes to copy */
       
  3118   int eOp,                  /* 0 -> copy from page, 1 -> copy to page */
       
  3119   DbPage *pDbPage           /* Page containing pPayload */
       
  3120 ){
       
  3121   if( eOp ){
       
  3122     /* Copy data from buffer to page (a write operation) */
       
  3123     int rc = sqlite3PagerWrite(pDbPage);
       
  3124     if( rc!=SQLITE_OK ){
       
  3125       return rc;
       
  3126     }
       
  3127     memcpy(pPayload, pBuf, nByte);
       
  3128   }else{
       
  3129     /* Copy data from page to buffer (a read operation) */
       
  3130     memcpy(pBuf, pPayload, nByte);
       
  3131   }
       
  3132   return SQLITE_OK;
       
  3133 }
       
  3134 
       
  3135 /*
       
  3136 ** This function is used to read or overwrite payload information
       
  3137 ** for the entry that the pCur cursor is pointing to. If the eOp
       
  3138 ** parameter is 0, this is a read operation (data copied into
       
  3139 ** buffer pBuf). If it is non-zero, a write (data copied from
       
  3140 ** buffer pBuf).
       
  3141 **
       
  3142 ** A total of "amt" bytes are read or written beginning at "offset".
       
  3143 ** Data is read to or from the buffer pBuf.
       
  3144 **
       
  3145 ** This routine does not make a distinction between key and data.
       
  3146 ** It just reads or writes bytes from the payload area.  Data might 
       
  3147 ** appear on the main page or be scattered out on multiple overflow 
       
  3148 ** pages.
       
  3149 **
       
  3150 ** If the BtCursor.isIncrblobHandle flag is set, and the current
       
  3151 ** cursor entry uses one or more overflow pages, this function
       
  3152 ** allocates space for and lazily popluates the overflow page-list 
       
  3153 ** cache array (BtCursor.aOverflow). Subsequent calls use this
       
  3154 ** cache to make seeking to the supplied offset more efficient.
       
  3155 **
       
  3156 ** Once an overflow page-list cache has been allocated, it may be
       
  3157 ** invalidated if some other cursor writes to the same table, or if
       
  3158 ** the cursor is moved to a different row. Additionally, in auto-vacuum
       
  3159 ** mode, the following events may invalidate an overflow page-list cache.
       
  3160 **
       
  3161 **   * An incremental vacuum,
       
  3162 **   * A commit in auto_vacuum="full" mode,
       
  3163 **   * Creating a table (may require moving an overflow page).
       
  3164 */
       
  3165 static int accessPayload(
       
  3166   BtCursor *pCur,      /* Cursor pointing to entry to read from */
       
  3167   int offset,          /* Begin reading this far into payload */
       
  3168   int amt,             /* Read this many bytes */
       
  3169   unsigned char *pBuf, /* Write the bytes into this buffer */ 
       
  3170   int skipKey,         /* offset begins at data if this is true */
       
  3171   int eOp              /* zero to read. non-zero to write. */
       
  3172 ){
       
  3173   unsigned char *aPayload;
       
  3174   int rc = SQLITE_OK;
       
  3175   u32 nKey;
       
  3176   int iIdx = 0;
       
  3177   MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */
       
  3178   BtShared *pBt;                              /* Btree this cursor belongs to */
       
  3179 
       
  3180   assert( pPage );
       
  3181   assert( pCur->eState==CURSOR_VALID );
       
  3182   assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
       
  3183   assert( offset>=0 );
       
  3184   assert( cursorHoldsMutex(pCur) );
       
  3185 
       
  3186   getCellInfo(pCur);
       
  3187   aPayload = pCur->info.pCell + pCur->info.nHeader;
       
  3188   nKey = (pPage->intKey ? 0 : pCur->info.nKey);
       
  3189 
       
  3190   if( skipKey ){
       
  3191     offset += nKey;
       
  3192   }
       
  3193   if( offset+amt > nKey+pCur->info.nData ){
       
  3194     /* Trying to read or write past the end of the data is an error */
       
  3195     return SQLITE_CORRUPT_BKPT;
       
  3196   }
       
  3197 
       
  3198   /* Check if data must be read/written to/from the btree page itself. */
       
  3199   if( offset<pCur->info.nLocal ){
       
  3200     int a = amt;
       
  3201     if( a+offset>pCur->info.nLocal ){
       
  3202       a = pCur->info.nLocal - offset;
       
  3203     }
       
  3204     rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
       
  3205     offset = 0;
       
  3206     pBuf += a;
       
  3207     amt -= a;
       
  3208   }else{
       
  3209     offset -= pCur->info.nLocal;
       
  3210   }
       
  3211 
       
  3212   pBt = pCur->pBt;
       
  3213   if( rc==SQLITE_OK && amt>0 ){
       
  3214     const int ovflSize = pBt->usableSize - 4;  /* Bytes content per ovfl page */
       
  3215     Pgno nextPage;
       
  3216 
       
  3217     nextPage = get4byte(&aPayload[pCur->info.nLocal]);
       
  3218 
       
  3219 #ifndef SQLITE_OMIT_INCRBLOB
       
  3220     /* If the isIncrblobHandle flag is set and the BtCursor.aOverflow[]
       
  3221     ** has not been allocated, allocate it now. The array is sized at
       
  3222     ** one entry for each overflow page in the overflow chain. The
       
  3223     ** page number of the first overflow page is stored in aOverflow[0],
       
  3224     ** etc. A value of 0 in the aOverflow[] array means "not yet known"
       
  3225     ** (the cache is lazily populated).
       
  3226     */
       
  3227     if( pCur->isIncrblobHandle && !pCur->aOverflow ){
       
  3228       int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
       
  3229       pCur->aOverflow = (Pgno *)sqlite3MallocZero(sizeof(Pgno)*nOvfl);
       
  3230       if( nOvfl && !pCur->aOverflow ){
       
  3231         rc = SQLITE_NOMEM;
       
  3232       }
       
  3233     }
       
  3234 
       
  3235     /* If the overflow page-list cache has been allocated and the
       
  3236     ** entry for the first required overflow page is valid, skip
       
  3237     ** directly to it.
       
  3238     */
       
  3239     if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){
       
  3240       iIdx = (offset/ovflSize);
       
  3241       nextPage = pCur->aOverflow[iIdx];
       
  3242       offset = (offset%ovflSize);
       
  3243     }
       
  3244 #endif
       
  3245 
       
  3246     for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){
       
  3247 
       
  3248 #ifndef SQLITE_OMIT_INCRBLOB
       
  3249       /* If required, populate the overflow page-list cache. */
       
  3250       if( pCur->aOverflow ){
       
  3251         assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage);
       
  3252         pCur->aOverflow[iIdx] = nextPage;
       
  3253       }
       
  3254 #endif
       
  3255 
       
  3256       if( offset>=ovflSize ){
       
  3257         /* The only reason to read this page is to obtain the page
       
  3258         ** number for the next page in the overflow chain. The page
       
  3259         ** data is not required. So first try to lookup the overflow
       
  3260         ** page-list cache, if any, then fall back to the getOverflowPage()
       
  3261         ** function.
       
  3262         */
       
  3263 #ifndef SQLITE_OMIT_INCRBLOB
       
  3264         if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){
       
  3265           nextPage = pCur->aOverflow[iIdx+1];
       
  3266         } else 
       
  3267 #endif
       
  3268           rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
       
  3269         offset -= ovflSize;
       
  3270       }else{
       
  3271         /* Need to read this page properly. It contains some of the
       
  3272         ** range of data that is being read (eOp==0) or written (eOp!=0).
       
  3273         */
       
  3274         DbPage *pDbPage;
       
  3275         int a = amt;
       
  3276         rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage);
       
  3277         if( rc==SQLITE_OK ){
       
  3278           aPayload = sqlite3PagerGetData(pDbPage);
       
  3279           nextPage = get4byte(aPayload);
       
  3280           if( a + offset > ovflSize ){
       
  3281             a = ovflSize - offset;
       
  3282           }
       
  3283           rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
       
  3284           sqlite3PagerUnref(pDbPage);
       
  3285           offset = 0;
       
  3286           amt -= a;
       
  3287           pBuf += a;
       
  3288         }
       
  3289       }
       
  3290     }
       
  3291   }
       
  3292 
       
  3293   if( rc==SQLITE_OK && amt>0 ){
       
  3294     return SQLITE_CORRUPT_BKPT;
       
  3295   }
       
  3296   return rc;
       
  3297 }
       
  3298 
       
  3299 /*
       
  3300 ** Read part of the key associated with cursor pCur.  Exactly
       
  3301 ** "amt" bytes will be transfered into pBuf[].  The transfer
       
  3302 ** begins at "offset".
       
  3303 **
       
  3304 ** Return SQLITE_OK on success or an error code if anything goes
       
  3305 ** wrong.  An error is returned if "offset+amt" is larger than
       
  3306 ** the available payload.
       
  3307 */
       
  3308 int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
       
  3309   int rc;
       
  3310 
       
  3311   assert( cursorHoldsMutex(pCur) );
       
  3312   rc = restoreCursorPosition(pCur);
       
  3313   if( rc==SQLITE_OK ){
       
  3314     assert( pCur->eState==CURSOR_VALID );
       
  3315     assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
       
  3316     if( pCur->apPage[0]->intKey ){
       
  3317       return SQLITE_CORRUPT_BKPT;
       
  3318     }
       
  3319     assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
       
  3320     rc = accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0, 0);
       
  3321   }
       
  3322   return rc;
       
  3323 }
       
  3324 
       
  3325 /*
       
  3326 ** Read part of the data associated with cursor pCur.  Exactly
       
  3327 ** "amt" bytes will be transfered into pBuf[].  The transfer
       
  3328 ** begins at "offset".
       
  3329 **
       
  3330 ** Return SQLITE_OK on success or an error code if anything goes
       
  3331 ** wrong.  An error is returned if "offset+amt" is larger than
       
  3332 ** the available payload.
       
  3333 */
       
  3334 int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
       
  3335   int rc;
       
  3336 
       
  3337 #ifndef SQLITE_OMIT_INCRBLOB
       
  3338   if ( pCur->eState==CURSOR_INVALID ){
       
  3339     return SQLITE_ABORT;
       
  3340   }
       
  3341 #endif
       
  3342 
       
  3343   assert( cursorHoldsMutex(pCur) );
       
  3344   rc = restoreCursorPosition(pCur);
       
  3345   if( rc==SQLITE_OK ){
       
  3346     assert( pCur->eState==CURSOR_VALID );
       
  3347     assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
       
  3348     assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
       
  3349     rc = accessPayload(pCur, offset, amt, pBuf, 1, 0);
       
  3350   }
       
  3351   return rc;
       
  3352 }
       
  3353 
       
  3354 /*
       
  3355 ** Return a pointer to payload information from the entry that the 
       
  3356 ** pCur cursor is pointing to.  The pointer is to the beginning of
       
  3357 ** the key if skipKey==0 and it points to the beginning of data if
       
  3358 ** skipKey==1.  The number of bytes of available key/data is written
       
  3359 ** into *pAmt.  If *pAmt==0, then the value returned will not be
       
  3360 ** a valid pointer.
       
  3361 **
       
  3362 ** This routine is an optimization.  It is common for the entire key
       
  3363 ** and data to fit on the local page and for there to be no overflow
       
  3364 ** pages.  When that is so, this routine can be used to access the
       
  3365 ** key and data without making a copy.  If the key and/or data spills
       
  3366 ** onto overflow pages, then accessPayload() must be used to reassembly
       
  3367 ** the key/data and copy it into a preallocated buffer.
       
  3368 **
       
  3369 ** The pointer returned by this routine looks directly into the cached
       
  3370 ** page of the database.  The data might change or move the next time
       
  3371 ** any btree routine is called.
       
  3372 */
       
  3373 static const unsigned char *fetchPayload(
       
  3374   BtCursor *pCur,      /* Cursor pointing to entry to read from */
       
  3375   int *pAmt,           /* Write the number of available bytes here */
       
  3376   int skipKey          /* read beginning at data if this is true */
       
  3377 ){
       
  3378   unsigned char *aPayload;
       
  3379   MemPage *pPage;
       
  3380   u32 nKey;
       
  3381   int nLocal;
       
  3382 
       
  3383   assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]);
       
  3384   assert( pCur->eState==CURSOR_VALID );
       
  3385   assert( cursorHoldsMutex(pCur) );
       
  3386   pPage = pCur->apPage[pCur->iPage];
       
  3387   assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
       
  3388   getCellInfo(pCur);
       
  3389   aPayload = pCur->info.pCell;
       
  3390   aPayload += pCur->info.nHeader;
       
  3391   if( pPage->intKey ){
       
  3392     nKey = 0;
       
  3393   }else{
       
  3394     nKey = pCur->info.nKey;
       
  3395   }
       
  3396   if( skipKey ){
       
  3397     aPayload += nKey;
       
  3398     nLocal = pCur->info.nLocal - nKey;
       
  3399   }else{
       
  3400     nLocal = pCur->info.nLocal;
       
  3401     if( nLocal>nKey ){
       
  3402       nLocal = nKey;
       
  3403     }
       
  3404   }
       
  3405   *pAmt = nLocal;
       
  3406   return aPayload;
       
  3407 }
       
  3408 
       
  3409 
       
  3410 /*
       
  3411 ** For the entry that cursor pCur is point to, return as
       
  3412 ** many bytes of the key or data as are available on the local
       
  3413 ** b-tree page.  Write the number of available bytes into *pAmt.
       
  3414 **
       
  3415 ** The pointer returned is ephemeral.  The key/data may move
       
  3416 ** or be destroyed on the next call to any Btree routine,
       
  3417 ** including calls from other threads against the same cache.
       
  3418 ** Hence, a mutex on the BtShared should be held prior to calling
       
  3419 ** this routine.
       
  3420 **
       
  3421 ** These routines is used to get quick access to key and data
       
  3422 ** in the common case where no overflow pages are used.
       
  3423 */
       
  3424 const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){
       
  3425   assert( cursorHoldsMutex(pCur) );
       
  3426   if( pCur->eState==CURSOR_VALID ){
       
  3427     return (const void*)fetchPayload(pCur, pAmt, 0);
       
  3428   }
       
  3429   return 0;
       
  3430 }
       
  3431 const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){
       
  3432   assert( cursorHoldsMutex(pCur) );
       
  3433   if( pCur->eState==CURSOR_VALID ){
       
  3434     return (const void*)fetchPayload(pCur, pAmt, 1);
       
  3435   }
       
  3436   return 0;
       
  3437 }
       
  3438 
       
  3439 
       
  3440 /*
       
  3441 ** Move the cursor down to a new child page.  The newPgno argument is the
       
  3442 ** page number of the child page to move to.
       
  3443 */
       
  3444 static int moveToChild(BtCursor *pCur, u32 newPgno){
       
  3445   int rc;
       
  3446   int i = pCur->iPage;
       
  3447   MemPage *pNewPage;
       
  3448   BtShared *pBt = pCur->pBt;
       
  3449 
       
  3450   assert( cursorHoldsMutex(pCur) );
       
  3451   assert( pCur->eState==CURSOR_VALID );
       
  3452   assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
       
  3453   if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
       
  3454     return SQLITE_CORRUPT_BKPT;
       
  3455   }
       
  3456   rc = getAndInitPage(pBt, newPgno, &pNewPage);
       
  3457   if( rc ) return rc;
       
  3458   pCur->apPage[i+1] = pNewPage;
       
  3459   pCur->aiIdx[i+1] = 0;
       
  3460   pCur->iPage++;
       
  3461 
       
  3462   pCur->info.nSize = 0;
       
  3463   pCur->validNKey = 0;
       
  3464   if( pNewPage->nCell<1 ){
       
  3465     return SQLITE_CORRUPT_BKPT;
       
  3466   }
       
  3467   return SQLITE_OK;
       
  3468 }
       
  3469 
       
  3470 #ifndef NDEBUG
       
  3471 /*
       
  3472 ** Page pParent is an internal (non-leaf) tree page. This function 
       
  3473 ** asserts that page number iChild is the left-child if the iIdx'th
       
  3474 ** cell in page pParent. Or, if iIdx is equal to the total number of
       
  3475 ** cells in pParent, that page number iChild is the right-child of
       
  3476 ** the page.
       
  3477 */
       
  3478 static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
       
  3479   assert( iIdx<=pParent->nCell );
       
  3480   if( iIdx==pParent->nCell ){
       
  3481     assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );
       
  3482   }else{
       
  3483     assert( get4byte(findCell(pParent, iIdx))==iChild );
       
  3484   }
       
  3485 }
       
  3486 #else
       
  3487 #  define assertParentIndex(x,y,z) 
       
  3488 #endif
       
  3489 
       
  3490 /*
       
  3491 ** Move the cursor up to the parent page.
       
  3492 **
       
  3493 ** pCur->idx is set to the cell index that contains the pointer
       
  3494 ** to the page we are coming from.  If we are coming from the
       
  3495 ** right-most child page then pCur->idx is set to one more than
       
  3496 ** the largest cell index.
       
  3497 */
       
  3498 void sqlite3BtreeMoveToParent(BtCursor *pCur){
       
  3499   assert( cursorHoldsMutex(pCur) );
       
  3500   assert( pCur->eState==CURSOR_VALID );
       
  3501   assert( pCur->iPage>0 );
       
  3502   assert( pCur->apPage[pCur->iPage] );
       
  3503   assertParentIndex(
       
  3504     pCur->apPage[pCur->iPage-1], 
       
  3505     pCur->aiIdx[pCur->iPage-1], 
       
  3506     pCur->apPage[pCur->iPage]->pgno
       
  3507   );
       
  3508   releasePage(pCur->apPage[pCur->iPage]);
       
  3509   pCur->iPage--;
       
  3510   pCur->info.nSize = 0;
       
  3511   pCur->validNKey = 0;
       
  3512 }
       
  3513 
       
  3514 /*
       
  3515 ** Move the cursor to the root page
       
  3516 */
       
  3517 static int moveToRoot(BtCursor *pCur){
       
  3518   MemPage *pRoot;
       
  3519   int rc = SQLITE_OK;
       
  3520   Btree *p = pCur->pBtree;
       
  3521   BtShared *pBt = p->pBt;
       
  3522 
       
  3523   assert( cursorHoldsMutex(pCur) );
       
  3524   assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
       
  3525   assert( CURSOR_VALID   < CURSOR_REQUIRESEEK );
       
  3526   assert( CURSOR_FAULT   > CURSOR_REQUIRESEEK );
       
  3527   if( pCur->eState>=CURSOR_REQUIRESEEK ){
       
  3528     if( pCur->eState==CURSOR_FAULT ){
       
  3529       return pCur->skip;
       
  3530     }
       
  3531     clearCursorPosition(pCur);
       
  3532   }
       
  3533 
       
  3534   if( pCur->iPage>=0 ){
       
  3535     int i;
       
  3536     for(i=1; i<=pCur->iPage; i++){
       
  3537       releasePage(pCur->apPage[i]);
       
  3538     }
       
  3539   }else{
       
  3540     if( 
       
  3541       SQLITE_OK!=(rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]))
       
  3542     ){
       
  3543       pCur->eState = CURSOR_INVALID;
       
  3544       return rc;
       
  3545     }
       
  3546   }
       
  3547 
       
  3548   pRoot = pCur->apPage[0];
       
  3549   assert( pRoot->pgno==pCur->pgnoRoot );
       
  3550   pCur->iPage = 0;
       
  3551   pCur->aiIdx[0] = 0;
       
  3552   pCur->info.nSize = 0;
       
  3553   pCur->atLast = 0;
       
  3554   pCur->validNKey = 0;
       
  3555 
       
  3556   if( pRoot->nCell==0 && !pRoot->leaf ){
       
  3557     Pgno subpage;
       
  3558     assert( pRoot->pgno==1 );
       
  3559     subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
       
  3560     assert( subpage>0 );
       
  3561     pCur->eState = CURSOR_VALID;
       
  3562     rc = moveToChild(pCur, subpage);
       
  3563   }else{
       
  3564     pCur->eState = ((pRoot->nCell>0)?CURSOR_VALID:CURSOR_INVALID);
       
  3565   }
       
  3566   return rc;
       
  3567 }
       
  3568 
       
  3569 /*
       
  3570 ** Move the cursor down to the left-most leaf entry beneath the
       
  3571 ** entry to which it is currently pointing.
       
  3572 **
       
  3573 ** The left-most leaf is the one with the smallest key - the first
       
  3574 ** in ascending order.
       
  3575 */
       
  3576 static int moveToLeftmost(BtCursor *pCur){
       
  3577   Pgno pgno;
       
  3578   int rc = SQLITE_OK;
       
  3579   MemPage *pPage;
       
  3580 
       
  3581   assert( cursorHoldsMutex(pCur) );
       
  3582   assert( pCur->eState==CURSOR_VALID );
       
  3583   while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
       
  3584     assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
       
  3585     pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage]));
       
  3586     rc = moveToChild(pCur, pgno);
       
  3587   }
       
  3588   return rc;
       
  3589 }
       
  3590 
       
  3591 /*
       
  3592 ** Move the cursor down to the right-most leaf entry beneath the
       
  3593 ** page to which it is currently pointing.  Notice the difference
       
  3594 ** between moveToLeftmost() and moveToRightmost().  moveToLeftmost()
       
  3595 ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
       
  3596 ** finds the right-most entry beneath the *page*.
       
  3597 **
       
  3598 ** The right-most entry is the one with the largest key - the last
       
  3599 ** key in ascending order.
       
  3600 */
       
  3601 static int moveToRightmost(BtCursor *pCur){
       
  3602   Pgno pgno;
       
  3603   int rc = SQLITE_OK;
       
  3604   MemPage *pPage;
       
  3605 
       
  3606   assert( cursorHoldsMutex(pCur) );
       
  3607   assert( pCur->eState==CURSOR_VALID );
       
  3608   while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
       
  3609     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
       
  3610     pCur->aiIdx[pCur->iPage] = pPage->nCell;
       
  3611     rc = moveToChild(pCur, pgno);
       
  3612   }
       
  3613   if( rc==SQLITE_OK ){
       
  3614     pCur->aiIdx[pCur->iPage] = pPage->nCell-1;
       
  3615     pCur->info.nSize = 0;
       
  3616     pCur->validNKey = 0;
       
  3617   }
       
  3618   return rc;
       
  3619 }
       
  3620 
       
  3621 /* Move the cursor to the first entry in the table.  Return SQLITE_OK
       
  3622 ** on success.  Set *pRes to 0 if the cursor actually points to something
       
  3623 ** or set *pRes to 1 if the table is empty.
       
  3624 */
       
  3625 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
       
  3626   int rc;
       
  3627 
       
  3628   assert( cursorHoldsMutex(pCur) );
       
  3629   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
       
  3630   rc = moveToRoot(pCur);
       
  3631   if( rc==SQLITE_OK ){
       
  3632     if( pCur->eState==CURSOR_INVALID ){
       
  3633       assert( pCur->apPage[pCur->iPage]->nCell==0 );
       
  3634       *pRes = 1;
       
  3635       rc = SQLITE_OK;
       
  3636     }else{
       
  3637       assert( pCur->apPage[pCur->iPage]->nCell>0 );
       
  3638       *pRes = 0;
       
  3639       rc = moveToLeftmost(pCur);
       
  3640     }
       
  3641   }
       
  3642   return rc;
       
  3643 }
       
  3644 
       
  3645 /* Move the cursor to the last entry in the table.  Return SQLITE_OK
       
  3646 ** on success.  Set *pRes to 0 if the cursor actually points to something
       
  3647 ** or set *pRes to 1 if the table is empty.
       
  3648 */
       
  3649 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
       
  3650   int rc;
       
  3651  
       
  3652   assert( cursorHoldsMutex(pCur) );
       
  3653   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
       
  3654   rc = moveToRoot(pCur);
       
  3655   if( rc==SQLITE_OK ){
       
  3656     if( CURSOR_INVALID==pCur->eState ){
       
  3657       assert( pCur->apPage[pCur->iPage]->nCell==0 );
       
  3658       *pRes = 1;
       
  3659     }else{
       
  3660       assert( pCur->eState==CURSOR_VALID );
       
  3661       *pRes = 0;
       
  3662       rc = moveToRightmost(pCur);
       
  3663       getCellInfo(pCur);
       
  3664       pCur->atLast = rc==SQLITE_OK;
       
  3665     }
       
  3666   }
       
  3667   return rc;
       
  3668 }
       
  3669 
       
  3670 /* Move the cursor so that it points to an entry near the key 
       
  3671 ** specified by pIdxKey or intKey.   Return a success code.
       
  3672 **
       
  3673 ** For INTKEY tables, the intKey parameter is used.  pIdxKey 
       
  3674 ** must be NULL.  For index tables, pIdxKey is used and intKey
       
  3675 ** is ignored.
       
  3676 **
       
  3677 ** If an exact match is not found, then the cursor is always
       
  3678 ** left pointing at a leaf page which would hold the entry if it
       
  3679 ** were present.  The cursor might point to an entry that comes
       
  3680 ** before or after the key.
       
  3681 **
       
  3682 ** The result of comparing the key with the entry to which the
       
  3683 ** cursor is written to *pRes if pRes!=NULL.  The meaning of
       
  3684 ** this value is as follows:
       
  3685 **
       
  3686 **     *pRes<0      The cursor is left pointing at an entry that
       
  3687 **                  is smaller than pKey or if the table is empty
       
  3688 **                  and the cursor is therefore left point to nothing.
       
  3689 **
       
  3690 **     *pRes==0     The cursor is left pointing at an entry that
       
  3691 **                  exactly matches pKey.
       
  3692 **
       
  3693 **     *pRes>0      The cursor is left pointing at an entry that
       
  3694 **                  is larger than pKey.
       
  3695 **
       
  3696 */
       
  3697 int sqlite3BtreeMovetoUnpacked(
       
  3698   BtCursor *pCur,          /* The cursor to be moved */
       
  3699   UnpackedRecord *pIdxKey, /* Unpacked index key */
       
  3700   i64 intKey,              /* The table key */
       
  3701   int biasRight,           /* If true, bias the search to the high end */
       
  3702   int *pRes                /* Write search results here */
       
  3703 ){
       
  3704   int rc;
       
  3705 
       
  3706   assert( cursorHoldsMutex(pCur) );
       
  3707   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
       
  3708 
       
  3709   /* If the cursor is already positioned at the point we are trying
       
  3710   ** to move to, then just return without doing any work */
       
  3711   if( pCur->eState==CURSOR_VALID && pCur->validNKey 
       
  3712    && pCur->apPage[0]->intKey 
       
  3713   ){
       
  3714     if( pCur->info.nKey==intKey ){
       
  3715       *pRes = 0;
       
  3716       return SQLITE_OK;
       
  3717     }
       
  3718     if( pCur->atLast && pCur->info.nKey<intKey ){
       
  3719       *pRes = -1;
       
  3720       return SQLITE_OK;
       
  3721     }
       
  3722   }
       
  3723 
       
  3724   rc = moveToRoot(pCur);
       
  3725   if( rc ){
       
  3726     return rc;
       
  3727   }
       
  3728   assert( pCur->apPage[pCur->iPage] );
       
  3729   assert( pCur->apPage[pCur->iPage]->isInit );
       
  3730   if( pCur->eState==CURSOR_INVALID ){
       
  3731     *pRes = -1;
       
  3732     assert( pCur->apPage[pCur->iPage]->nCell==0 );
       
  3733     return SQLITE_OK;
       
  3734   }
       
  3735   assert( pCur->apPage[0]->intKey || pIdxKey );
       
  3736   for(;;){
       
  3737     int lwr, upr;
       
  3738     Pgno chldPg;
       
  3739     MemPage *pPage = pCur->apPage[pCur->iPage];
       
  3740     int c = -1;  /* pRes return if table is empty must be -1 */
       
  3741     lwr = 0;
       
  3742     upr = pPage->nCell-1;
       
  3743     if( !pPage->intKey && pIdxKey==0 ){
       
  3744       rc = SQLITE_CORRUPT_BKPT;
       
  3745       goto moveto_finish;
       
  3746     }
       
  3747     if( biasRight ){
       
  3748       pCur->aiIdx[pCur->iPage] = upr;
       
  3749     }else{
       
  3750       pCur->aiIdx[pCur->iPage] = (upr+lwr)/2;
       
  3751     }
       
  3752     if( lwr<=upr ) for(;;){
       
  3753       void *pCellKey;
       
  3754       i64 nCellKey;
       
  3755       int idx = pCur->aiIdx[pCur->iPage];
       
  3756       pCur->info.nSize = 0;
       
  3757       pCur->validNKey = 1;
       
  3758       if( pPage->intKey ){
       
  3759         u8 *pCell;
       
  3760         pCell = findCell(pPage, idx) + pPage->childPtrSize;
       
  3761         if( pPage->hasData ){
       
  3762           u32 dummy;
       
  3763           pCell += getVarint32(pCell, dummy);
       
  3764         }
       
  3765         getVarint(pCell, (u64*)&nCellKey);
       
  3766         if( nCellKey==intKey ){
       
  3767           c = 0;
       
  3768         }else if( nCellKey<intKey ){
       
  3769           c = -1;
       
  3770         }else{
       
  3771           assert( nCellKey>intKey );
       
  3772           c = +1;
       
  3773         }
       
  3774       }else{
       
  3775         int available;
       
  3776         pCellKey = (void *)fetchPayload(pCur, &available, 0);
       
  3777         nCellKey = pCur->info.nKey;
       
  3778         if( available>=nCellKey ){
       
  3779           c = sqlite3VdbeRecordCompare(nCellKey, pCellKey, pIdxKey);
       
  3780         }else{
       
  3781           pCellKey = sqlite3Malloc( nCellKey );
       
  3782           if( pCellKey==0 ){
       
  3783             rc = SQLITE_NOMEM;
       
  3784             goto moveto_finish;
       
  3785           }
       
  3786           rc = sqlite3BtreeKey(pCur, 0, nCellKey, (void *)pCellKey);
       
  3787           c = sqlite3VdbeRecordCompare(nCellKey, pCellKey, pIdxKey);
       
  3788           sqlite3_free(pCellKey);
       
  3789           if( rc ) goto moveto_finish;
       
  3790         }
       
  3791       }
       
  3792       if( c==0 ){
       
  3793         pCur->info.nKey = nCellKey;
       
  3794         if( pPage->intKey && !pPage->leaf ){
       
  3795           lwr = idx;
       
  3796           upr = lwr - 1;
       
  3797           break;
       
  3798         }else{
       
  3799           if( pRes ) *pRes = 0;
       
  3800           rc = SQLITE_OK;
       
  3801           goto moveto_finish;
       
  3802         }
       
  3803       }
       
  3804       if( c<0 ){
       
  3805         lwr = idx+1;
       
  3806       }else{
       
  3807         upr = idx-1;
       
  3808       }
       
  3809       if( lwr>upr ){
       
  3810         pCur->info.nKey = nCellKey;
       
  3811         break;
       
  3812       }
       
  3813       pCur->aiIdx[pCur->iPage] = (lwr+upr)/2;
       
  3814     }
       
  3815     assert( lwr==upr+1 );
       
  3816     assert( pPage->isInit );
       
  3817     if( pPage->leaf ){
       
  3818       chldPg = 0;
       
  3819     }else if( lwr>=pPage->nCell ){
       
  3820       chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
       
  3821     }else{
       
  3822       chldPg = get4byte(findCell(pPage, lwr));
       
  3823     }
       
  3824     if( chldPg==0 ){
       
  3825       assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
       
  3826       if( pRes ) *pRes = c;
       
  3827       rc = SQLITE_OK;
       
  3828       goto moveto_finish;
       
  3829     }
       
  3830     pCur->aiIdx[pCur->iPage] = lwr;
       
  3831     pCur->info.nSize = 0;
       
  3832     pCur->validNKey = 0;
       
  3833     rc = moveToChild(pCur, chldPg);
       
  3834     if( rc ) goto moveto_finish;
       
  3835   }
       
  3836 moveto_finish:
       
  3837   return rc;
       
  3838 }
       
  3839 
       
  3840 /*
       
  3841 ** In this version of BtreeMoveto, pKey is a packed index record
       
  3842 ** such as is generated by the OP_MakeRecord opcode.  Unpack the
       
  3843 ** record and then call BtreeMovetoUnpacked() to do the work.
       
  3844 */
       
  3845 int sqlite3BtreeMoveto(
       
  3846   BtCursor *pCur,     /* Cursor open on the btree to be searched */
       
  3847   const void *pKey,   /* Packed key if the btree is an index */
       
  3848   i64 nKey,           /* Integer key for tables.  Size of pKey for indices */
       
  3849   int bias,           /* Bias search to the high end */
       
  3850   int *pRes           /* Write search results here */
       
  3851 ){
       
  3852   int rc;                    /* Status code */
       
  3853   UnpackedRecord *pIdxKey;   /* Unpacked index key */
       
  3854   UnpackedRecord aSpace[16]; /* Temp space for pIdxKey - to avoid a malloc */
       
  3855 
       
  3856   if( pKey ){
       
  3857     pIdxKey = sqlite3VdbeRecordUnpack(pCur->pKeyInfo, nKey, pKey,
       
  3858                                       aSpace, sizeof(aSpace));
       
  3859     if( pIdxKey==0 ) return SQLITE_NOMEM;
       
  3860   }else{
       
  3861     pIdxKey = 0;
       
  3862   }
       
  3863   rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);
       
  3864   if( pKey ){
       
  3865     sqlite3VdbeDeleteUnpackedRecord(pIdxKey);
       
  3866   }
       
  3867   return rc;
       
  3868 }
       
  3869 
       
  3870 
       
  3871 /*
       
  3872 ** Return TRUE if the cursor is not pointing at an entry of the table.
       
  3873 **
       
  3874 ** TRUE will be returned after a call to sqlite3BtreeNext() moves
       
  3875 ** past the last entry in the table or sqlite3BtreePrev() moves past
       
  3876 ** the first entry.  TRUE is also returned if the table is empty.
       
  3877 */
       
  3878 int sqlite3BtreeEof(BtCursor *pCur){
       
  3879   /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
       
  3880   ** have been deleted? This API will need to change to return an error code
       
  3881   ** as well as the boolean result value.
       
  3882   */
       
  3883   return (CURSOR_VALID!=pCur->eState);
       
  3884 }
       
  3885 
       
  3886 /*
       
  3887 ** Return the database connection handle for a cursor.
       
  3888 */
       
  3889 sqlite3 *sqlite3BtreeCursorDb(const BtCursor *pCur){
       
  3890   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
       
  3891   return pCur->pBtree->db;
       
  3892 }
       
  3893 
       
  3894 /*
       
  3895 ** Advance the cursor to the next entry in the database.  If
       
  3896 ** successful then set *pRes=0.  If the cursor
       
  3897 ** was already pointing to the last entry in the database before
       
  3898 ** this routine was called, then set *pRes=1.
       
  3899 */
       
  3900 int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
       
  3901   int rc;
       
  3902   int idx;
       
  3903   MemPage *pPage;
       
  3904 
       
  3905   assert( cursorHoldsMutex(pCur) );
       
  3906   rc = restoreCursorPosition(pCur);
       
  3907   if( rc!=SQLITE_OK ){
       
  3908     return rc;
       
  3909   }
       
  3910   assert( pRes!=0 );
       
  3911   if( CURSOR_INVALID==pCur->eState ){
       
  3912     *pRes = 1;
       
  3913     return SQLITE_OK;
       
  3914   }
       
  3915   if( pCur->skip>0 ){
       
  3916     pCur->skip = 0;
       
  3917     *pRes = 0;
       
  3918     return SQLITE_OK;
       
  3919   }
       
  3920   pCur->skip = 0;
       
  3921 
       
  3922   pPage = pCur->apPage[pCur->iPage];
       
  3923   idx = ++pCur->aiIdx[pCur->iPage];
       
  3924   assert( pPage->isInit );
       
  3925   assert( idx<=pPage->nCell );
       
  3926 
       
  3927   pCur->info.nSize = 0;
       
  3928   pCur->validNKey = 0;
       
  3929   if( idx>=pPage->nCell ){
       
  3930     if( !pPage->leaf ){
       
  3931       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
       
  3932       if( rc ) return rc;
       
  3933       rc = moveToLeftmost(pCur);
       
  3934       *pRes = 0;
       
  3935       return rc;
       
  3936     }
       
  3937     do{
       
  3938       if( pCur->iPage==0 ){
       
  3939         *pRes = 1;
       
  3940         pCur->eState = CURSOR_INVALID;
       
  3941         return SQLITE_OK;
       
  3942       }
       
  3943       sqlite3BtreeMoveToParent(pCur);
       
  3944       pPage = pCur->apPage[pCur->iPage];
       
  3945     }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell );
       
  3946     *pRes = 0;
       
  3947     if( pPage->intKey ){
       
  3948       rc = sqlite3BtreeNext(pCur, pRes);
       
  3949     }else{
       
  3950       rc = SQLITE_OK;
       
  3951     }
       
  3952     return rc;
       
  3953   }
       
  3954   *pRes = 0;
       
  3955   if( pPage->leaf ){
       
  3956     return SQLITE_OK;
       
  3957   }
       
  3958   rc = moveToLeftmost(pCur);
       
  3959   return rc;
       
  3960 }
       
  3961 
       
  3962 
       
  3963 /*
       
  3964 ** Step the cursor to the back to the previous entry in the database.  If
       
  3965 ** successful then set *pRes=0.  If the cursor
       
  3966 ** was already pointing to the first entry in the database before
       
  3967 ** this routine was called, then set *pRes=1.
       
  3968 */
       
  3969 int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
       
  3970   int rc;
       
  3971   MemPage *pPage;
       
  3972 
       
  3973   assert( cursorHoldsMutex(pCur) );
       
  3974   rc = restoreCursorPosition(pCur);
       
  3975   if( rc!=SQLITE_OK ){
       
  3976     return rc;
       
  3977   }
       
  3978   pCur->atLast = 0;
       
  3979   if( CURSOR_INVALID==pCur->eState ){
       
  3980     *pRes = 1;
       
  3981     return SQLITE_OK;
       
  3982   }
       
  3983   if( pCur->skip<0 ){
       
  3984     pCur->skip = 0;
       
  3985     *pRes = 0;
       
  3986     return SQLITE_OK;
       
  3987   }
       
  3988   pCur->skip = 0;
       
  3989 
       
  3990   pPage = pCur->apPage[pCur->iPage];
       
  3991   assert( pPage->isInit );
       
  3992   if( !pPage->leaf ){
       
  3993     int idx = pCur->aiIdx[pCur->iPage];
       
  3994     rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
       
  3995     if( rc ){
       
  3996       return rc;
       
  3997     }
       
  3998     rc = moveToRightmost(pCur);
       
  3999   }else{
       
  4000     while( pCur->aiIdx[pCur->iPage]==0 ){
       
  4001       if( pCur->iPage==0 ){
       
  4002         pCur->eState = CURSOR_INVALID;
       
  4003         *pRes = 1;
       
  4004         return SQLITE_OK;
       
  4005       }
       
  4006       sqlite3BtreeMoveToParent(pCur);
       
  4007     }
       
  4008     pCur->info.nSize = 0;
       
  4009     pCur->validNKey = 0;
       
  4010 
       
  4011     pCur->aiIdx[pCur->iPage]--;
       
  4012     pPage = pCur->apPage[pCur->iPage];
       
  4013     if( pPage->intKey && !pPage->leaf ){
       
  4014       rc = sqlite3BtreePrevious(pCur, pRes);
       
  4015     }else{
       
  4016       rc = SQLITE_OK;
       
  4017     }
       
  4018   }
       
  4019   *pRes = 0;
       
  4020   return rc;
       
  4021 }
       
  4022 
       
  4023 /*
       
  4024 ** Allocate a new page from the database file.
       
  4025 **
       
  4026 ** The new page is marked as dirty.  (In other words, sqlite3PagerWrite()
       
  4027 ** has already been called on the new page.)  The new page has also
       
  4028 ** been referenced and the calling routine is responsible for calling
       
  4029 ** sqlite3PagerUnref() on the new page when it is done.
       
  4030 **
       
  4031 ** SQLITE_OK is returned on success.  Any other return value indicates
       
  4032 ** an error.  *ppPage and *pPgno are undefined in the event of an error.
       
  4033 ** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned.
       
  4034 **
       
  4035 ** If the "nearby" parameter is not 0, then a (feeble) effort is made to 
       
  4036 ** locate a page close to the page number "nearby".  This can be used in an
       
  4037 ** attempt to keep related pages close to each other in the database file,
       
  4038 ** which in turn can make database access faster.
       
  4039 **
       
  4040 ** If the "exact" parameter is not 0, and the page-number nearby exists 
       
  4041 ** anywhere on the free-list, then it is guaranteed to be returned. This
       
  4042 ** is only used by auto-vacuum databases when allocating a new table.
       
  4043 */
       
  4044 static int allocateBtreePage(
       
  4045   BtShared *pBt, 
       
  4046   MemPage **ppPage, 
       
  4047   Pgno *pPgno, 
       
  4048   Pgno nearby,
       
  4049   u8 exact
       
  4050 ){
       
  4051   MemPage *pPage1;
       
  4052   int rc;
       
  4053   int n;     /* Number of pages on the freelist */
       
  4054   int k;     /* Number of leaves on the trunk of the freelist */
       
  4055   MemPage *pTrunk = 0;
       
  4056   MemPage *pPrevTrunk = 0;
       
  4057 
       
  4058   assert( sqlite3_mutex_held(pBt->mutex) );
       
  4059   pPage1 = pBt->pPage1;
       
  4060   n = get4byte(&pPage1->aData[36]);
       
  4061   if( n>0 ){
       
  4062     /* There are pages on the freelist.  Reuse one of those pages. */
       
  4063     Pgno iTrunk;
       
  4064     u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
       
  4065     
       
  4066     /* If the 'exact' parameter was true and a query of the pointer-map
       
  4067     ** shows that the page 'nearby' is somewhere on the free-list, then
       
  4068     ** the entire-list will be searched for that page.
       
  4069     */
       
  4070 #ifndef SQLITE_OMIT_AUTOVACUUM
       
  4071     if( exact && nearby<=pagerPagecount(pBt->pPager) ){
       
  4072       u8 eType;
       
  4073       assert( nearby>0 );
       
  4074       assert( pBt->autoVacuum );
       
  4075       rc = ptrmapGet(pBt, nearby, &eType, 0);
       
  4076       if( rc ) return rc;
       
  4077       if( eType==PTRMAP_FREEPAGE ){
       
  4078         searchList = 1;
       
  4079       }
       
  4080       *pPgno = nearby;
       
  4081     }
       
  4082 #endif
       
  4083 
       
  4084     /* Decrement the free-list count by 1. Set iTrunk to the index of the
       
  4085     ** first free-list trunk page. iPrevTrunk is initially 1.
       
  4086     */
       
  4087     rc = sqlite3PagerWrite(pPage1->pDbPage);
       
  4088     if( rc ) return rc;
       
  4089     put4byte(&pPage1->aData[36], n-1);
       
  4090 
       
  4091     /* The code within this loop is run only once if the 'searchList' variable
       
  4092     ** is not true. Otherwise, it runs once for each trunk-page on the
       
  4093     ** free-list until the page 'nearby' is located.
       
  4094     */
       
  4095     do {
       
  4096       pPrevTrunk = pTrunk;
       
  4097       if( pPrevTrunk ){
       
  4098         iTrunk = get4byte(&pPrevTrunk->aData[0]);
       
  4099       }else{
       
  4100         iTrunk = get4byte(&pPage1->aData[32]);
       
  4101       }
       
  4102       rc = sqlite3BtreeGetPage(pBt, iTrunk, &pTrunk, 0);
       
  4103       if( rc ){
       
  4104         pTrunk = 0;
       
  4105         goto end_allocate_page;
       
  4106       }
       
  4107 
       
  4108       k = get4byte(&pTrunk->aData[4]);
       
  4109       if( k==0 && !searchList ){
       
  4110         /* The trunk has no leaves and the list is not being searched. 
       
  4111         ** So extract the trunk page itself and use it as the newly 
       
  4112         ** allocated page */
       
  4113         assert( pPrevTrunk==0 );
       
  4114         rc = sqlite3PagerWrite(pTrunk->pDbPage);
       
  4115         if( rc ){
       
  4116           goto end_allocate_page;
       
  4117         }
       
  4118         *pPgno = iTrunk;
       
  4119         memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
       
  4120         *ppPage = pTrunk;
       
  4121         pTrunk = 0;
       
  4122         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
       
  4123       }else if( k>pBt->usableSize/4 - 2 ){
       
  4124         /* Value of k is out of range.  Database corruption */
       
  4125         rc = SQLITE_CORRUPT_BKPT;
       
  4126         goto end_allocate_page;
       
  4127 #ifndef SQLITE_OMIT_AUTOVACUUM
       
  4128       }else if( searchList && nearby==iTrunk ){
       
  4129         /* The list is being searched and this trunk page is the page
       
  4130         ** to allocate, regardless of whether it has leaves.
       
  4131         */
       
  4132         assert( *pPgno==iTrunk );
       
  4133         *ppPage = pTrunk;
       
  4134         searchList = 0;
       
  4135         rc = sqlite3PagerWrite(pTrunk->pDbPage);
       
  4136         if( rc ){
       
  4137           goto end_allocate_page;
       
  4138         }
       
  4139         if( k==0 ){
       
  4140           if( !pPrevTrunk ){
       
  4141             memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
       
  4142           }else{
       
  4143             memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
       
  4144           }
       
  4145         }else{
       
  4146           /* The trunk page is required by the caller but it contains 
       
  4147           ** pointers to free-list leaves. The first leaf becomes a trunk
       
  4148           ** page in this case.
       
  4149           */
       
  4150           MemPage *pNewTrunk;
       
  4151           Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
       
  4152           rc = sqlite3BtreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0);
       
  4153           if( rc!=SQLITE_OK ){
       
  4154             goto end_allocate_page;
       
  4155           }
       
  4156           rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
       
  4157           if( rc!=SQLITE_OK ){
       
  4158             releasePage(pNewTrunk);
       
  4159             goto end_allocate_page;
       
  4160           }
       
  4161           memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
       
  4162           put4byte(&pNewTrunk->aData[4], k-1);
       
  4163           memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
       
  4164           releasePage(pNewTrunk);
       
  4165           if( !pPrevTrunk ){
       
  4166             put4byte(&pPage1->aData[32], iNewTrunk);
       
  4167           }else{
       
  4168             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
       
  4169             if( rc ){
       
  4170               goto end_allocate_page;
       
  4171             }
       
  4172             put4byte(&pPrevTrunk->aData[0], iNewTrunk);
       
  4173           }
       
  4174         }
       
  4175         pTrunk = 0;
       
  4176         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
       
  4177 #endif
       
  4178       }else{
       
  4179         /* Extract a leaf from the trunk */
       
  4180         int closest;
       
  4181         Pgno iPage;
       
  4182         unsigned char *aData = pTrunk->aData;
       
  4183         rc = sqlite3PagerWrite(pTrunk->pDbPage);
       
  4184         if( rc ){
       
  4185           goto end_allocate_page;
       
  4186         }
       
  4187         if( nearby>0 ){
       
  4188           int i, dist;
       
  4189           closest = 0;
       
  4190           dist = get4byte(&aData[8]) - nearby;
       
  4191           if( dist<0 ) dist = -dist;
       
  4192           for(i=1; i<k; i++){
       
  4193             int d2 = get4byte(&aData[8+i*4]) - nearby;
       
  4194             if( d2<0 ) d2 = -d2;
       
  4195             if( d2<dist ){
       
  4196               closest = i;
       
  4197               dist = d2;
       
  4198             }
       
  4199           }
       
  4200         }else{
       
  4201           closest = 0;
       
  4202         }
       
  4203 
       
  4204         iPage = get4byte(&aData[8+closest*4]);
       
  4205         if( !searchList || iPage==nearby ){
       
  4206           int nPage;
       
  4207           *pPgno = iPage;
       
  4208           nPage = pagerPagecount(pBt->pPager);
       
  4209           if( *pPgno>nPage ){
       
  4210             /* Free page off the end of the file */
       
  4211             rc = SQLITE_CORRUPT_BKPT;
       
  4212             goto end_allocate_page;
       
  4213           }
       
  4214           TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
       
  4215                  ": %d more free pages\n",
       
  4216                  *pPgno, closest+1, k, pTrunk->pgno, n-1));
       
  4217           if( closest<k-1 ){
       
  4218             memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
       
  4219           }
       
  4220           put4byte(&aData[4], k-1);
       
  4221           rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 1);
       
  4222           if( rc==SQLITE_OK ){
       
  4223             sqlite3PagerDontRollback((*ppPage)->pDbPage);
       
  4224             rc = sqlite3PagerWrite((*ppPage)->pDbPage);
       
  4225             if( rc!=SQLITE_OK ){
       
  4226               releasePage(*ppPage);
       
  4227             }
       
  4228           }
       
  4229           searchList = 0;
       
  4230         }
       
  4231       }
       
  4232       releasePage(pPrevTrunk);
       
  4233       pPrevTrunk = 0;
       
  4234     }while( searchList );
       
  4235   }else{
       
  4236     /* There are no pages on the freelist, so create a new page at the
       
  4237     ** end of the file */
       
  4238     int nPage = pagerPagecount(pBt->pPager);
       
  4239     *pPgno = nPage + 1;
       
  4240 
       
  4241 #ifndef SQLITE_OMIT_AUTOVACUUM
       
  4242     if( pBt->nTrunc ){
       
  4243       /* An incr-vacuum has already run within this transaction. So the
       
  4244       ** page to allocate is not from the physical end of the file, but
       
  4245       ** at pBt->nTrunc. 
       
  4246       */
       
  4247       *pPgno = pBt->nTrunc+1;
       
  4248       if( *pPgno==PENDING_BYTE_PAGE(pBt) ){
       
  4249         (*pPgno)++;
       
  4250       }
       
  4251     }
       
  4252     if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, *pPgno) ){
       
  4253       /* If *pPgno refers to a pointer-map page, allocate two new pages
       
  4254       ** at the end of the file instead of one. The first allocated page
       
  4255       ** becomes a new pointer-map page, the second is used by the caller.
       
  4256       */
       
  4257       TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", *pPgno));
       
  4258       assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
       
  4259       (*pPgno)++;
       
  4260       if( *pPgno==PENDING_BYTE_PAGE(pBt) ){ (*pPgno)++; }
       
  4261     }
       
  4262     if( pBt->nTrunc ){
       
  4263       pBt->nTrunc = *pPgno;
       
  4264     }
       
  4265 #endif
       
  4266 
       
  4267     assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
       
  4268     rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 0);
       
  4269     if( rc ) return rc;
       
  4270     rc = sqlite3PagerWrite((*ppPage)->pDbPage);
       
  4271     if( rc!=SQLITE_OK ){
       
  4272       releasePage(*ppPage);
       
  4273     }
       
  4274     TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
       
  4275   }
       
  4276 
       
  4277   assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
       
  4278 
       
  4279 end_allocate_page:
       
  4280   releasePage(pTrunk);
       
  4281   releasePage(pPrevTrunk);
       
  4282   if( rc==SQLITE_OK && sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
       
  4283     releasePage(*ppPage);
       
  4284     return SQLITE_CORRUPT_BKPT;
       
  4285   }
       
  4286   return rc;
       
  4287 }
       
  4288 
       
  4289 /*
       
  4290 ** Add a page of the database file to the freelist.
       
  4291 **
       
  4292 ** sqlite3PagerUnref() is NOT called for pPage.
       
  4293 */
       
  4294 static int freePage(MemPage *pPage){
       
  4295   BtShared *pBt = pPage->pBt;
       
  4296   MemPage *pPage1 = pBt->pPage1;
       
  4297   int rc, n, k;
       
  4298 
       
  4299   /* Prepare the page for freeing */
       
  4300   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
       
  4301   assert( pPage->pgno>1 );
       
  4302   pPage->isInit = 0;
       
  4303 
       
  4304   /* Increment the free page count on pPage1 */
       
  4305   rc = sqlite3PagerWrite(pPage1->pDbPage);
       
  4306   if( rc ) return rc;
       
  4307   n = get4byte(&pPage1->aData[36]);
       
  4308   put4byte(&pPage1->aData[36], n+1);
       
  4309 
       
  4310 #ifdef SQLITE_SECURE_DELETE
       
  4311   /* If the SQLITE_SECURE_DELETE compile-time option is enabled, then
       
  4312   ** always fully overwrite deleted information with zeros.
       
  4313   */
       
  4314   rc = sqlite3PagerWrite(pPage->pDbPage);
       
  4315   if( rc ) return rc;
       
  4316   memset(pPage->aData, 0, pPage->pBt->pageSize);
       
  4317 #endif
       
  4318 
       
  4319   /* If the database supports auto-vacuum, write an entry in the pointer-map
       
  4320   ** to indicate that the page is free.
       
  4321   */
       
  4322   if( ISAUTOVACUUM ){
       
  4323     rc = ptrmapPut(pBt, pPage->pgno, PTRMAP_FREEPAGE, 0);
       
  4324     if( rc ) return rc;
       
  4325   }
       
  4326 
       
  4327   if( n==0 ){
       
  4328     /* This is the first free page */
       
  4329     rc = sqlite3PagerWrite(pPage->pDbPage);
       
  4330     if( rc ) return rc;
       
  4331     memset(pPage->aData, 0, 8);
       
  4332     put4byte(&pPage1->aData[32], pPage->pgno);
       
  4333     TRACE(("FREE-PAGE: %d first\n", pPage->pgno));
       
  4334   }else{
       
  4335     /* Other free pages already exist.  Retrive the first trunk page
       
  4336     ** of the freelist and find out how many leaves it has. */
       
  4337     MemPage *pTrunk;
       
  4338     rc = sqlite3BtreeGetPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk, 0);
       
  4339     if( rc ) return rc;
       
  4340     k = get4byte(&pTrunk->aData[4]);
       
  4341     if( k>=pBt->usableSize/4 - 8 ){
       
  4342       /* The trunk is full.  Turn the page being freed into a new
       
  4343       ** trunk page with no leaves.
       
  4344       **
       
  4345       ** Note that the trunk page is not really full until it contains
       
  4346       ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
       
  4347       ** coded.  But due to a coding error in versions of SQLite prior to
       
  4348       ** 3.6.0, databases with freelist trunk pages holding more than
       
  4349       ** usableSize/4 - 8 entries will be reported as corrupt.  In order
       
  4350       ** to maintain backwards compatibility with older versions of SQLite,
       
  4351       ** we will contain to restrict the number of entries to usableSize/4 - 8
       
  4352       ** for now.  At some point in the future (once everyone has upgraded
       
  4353       ** to 3.6.0 or later) we should consider fixing the conditional above
       
  4354       ** to read "usableSize/4-2" instead of "usableSize/4-8".
       
  4355       */
       
  4356       rc = sqlite3PagerWrite(pPage->pDbPage);
       
  4357       if( rc==SQLITE_OK ){
       
  4358         put4byte(pPage->aData, pTrunk->pgno);
       
  4359         put4byte(&pPage->aData[4], 0);
       
  4360         put4byte(&pPage1->aData[32], pPage->pgno);
       
  4361         TRACE(("FREE-PAGE: %d new trunk page replacing %d\n",
       
  4362                 pPage->pgno, pTrunk->pgno));
       
  4363       }
       
  4364     }else if( k<0 ){
       
  4365       rc = SQLITE_CORRUPT;
       
  4366     }else{
       
  4367       /* Add the newly freed page as a leaf on the current trunk */
       
  4368       rc = sqlite3PagerWrite(pTrunk->pDbPage);
       
  4369       if( rc==SQLITE_OK ){
       
  4370         put4byte(&pTrunk->aData[4], k+1);
       
  4371         put4byte(&pTrunk->aData[8+k*4], pPage->pgno);
       
  4372 #ifndef SQLITE_SECURE_DELETE
       
  4373         rc = sqlite3PagerDontWrite(pPage->pDbPage);
       
  4374 #endif
       
  4375       }
       
  4376       TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
       
  4377     }
       
  4378     releasePage(pTrunk);
       
  4379   }
       
  4380   return rc;
       
  4381 }
       
  4382 
       
  4383 /*
       
  4384 ** Free any overflow pages associated with the given Cell.
       
  4385 */
       
  4386 static int clearCell(MemPage *pPage, unsigned char *pCell){
       
  4387   BtShared *pBt = pPage->pBt;
       
  4388   CellInfo info;
       
  4389   Pgno ovflPgno;
       
  4390   int rc;
       
  4391   int nOvfl;
       
  4392   int ovflPageSize;
       
  4393 
       
  4394   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
       
  4395   sqlite3BtreeParseCellPtr(pPage, pCell, &info);
       
  4396   if( info.iOverflow==0 ){
       
  4397     return SQLITE_OK;  /* No overflow pages. Return without doing anything */
       
  4398   }
       
  4399   ovflPgno = get4byte(&pCell[info.iOverflow]);
       
  4400   ovflPageSize = pBt->usableSize - 4;
       
  4401   nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize;
       
  4402   assert( ovflPgno==0 || nOvfl>0 );
       
  4403   while( nOvfl-- ){
       
  4404     MemPage *pOvfl;
       
  4405     if( ovflPgno==0 || ovflPgno>pagerPagecount(pBt->pPager) ){
       
  4406       return SQLITE_CORRUPT_BKPT;
       
  4407     }
       
  4408 
       
  4409     rc = getOverflowPage(pBt, ovflPgno, &pOvfl, (nOvfl==0)?0:&ovflPgno);
       
  4410     if( rc ) return rc;
       
  4411     rc = freePage(pOvfl);
       
  4412     sqlite3PagerUnref(pOvfl->pDbPage);
       
  4413     if( rc ) return rc;
       
  4414   }
       
  4415   return SQLITE_OK;
       
  4416 }
       
  4417 
       
  4418 /*
       
  4419 ** Create the byte sequence used to represent a cell on page pPage
       
  4420 ** and write that byte sequence into pCell[].  Overflow pages are
       
  4421 ** allocated and filled in as necessary.  The calling procedure
       
  4422 ** is responsible for making sure sufficient space has been allocated
       
  4423 ** for pCell[].
       
  4424 **
       
  4425 ** Note that pCell does not necessary need to point to the pPage->aData
       
  4426 ** area.  pCell might point to some temporary storage.  The cell will
       
  4427 ** be constructed in this temporary area then copied into pPage->aData
       
  4428 ** later.
       
  4429 */
       
  4430 static int fillInCell(
       
  4431   MemPage *pPage,                /* The page that contains the cell */
       
  4432   unsigned char *pCell,          /* Complete text of the cell */
       
  4433   const void *pKey, i64 nKey,    /* The key */
       
  4434   const void *pData,int nData,   /* The data */
       
  4435   int nZero,                     /* Extra zero bytes to append to pData */
       
  4436   int *pnSize                    /* Write cell size here */
       
  4437 ){
       
  4438   int nPayload;
       
  4439   const u8 *pSrc;
       
  4440   int nSrc, n, rc;
       
  4441   int spaceLeft;
       
  4442   MemPage *pOvfl = 0;
       
  4443   MemPage *pToRelease = 0;
       
  4444   unsigned char *pPrior;
       
  4445   unsigned char *pPayload;
       
  4446   BtShared *pBt = pPage->pBt;
       
  4447   Pgno pgnoOvfl = 0;
       
  4448   int nHeader;
       
  4449   CellInfo info;
       
  4450 
       
  4451   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
       
  4452 
       
  4453   /* Fill in the header. */
       
  4454   nHeader = 0;
       
  4455   if( !pPage->leaf ){
       
  4456     nHeader += 4;
       
  4457   }
       
  4458   if( pPage->hasData ){
       
  4459     nHeader += putVarint(&pCell[nHeader], nData+nZero);
       
  4460   }else{
       
  4461     nData = nZero = 0;
       
  4462   }
       
  4463   nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
       
  4464   sqlite3BtreeParseCellPtr(pPage, pCell, &info);
       
  4465   assert( info.nHeader==nHeader );
       
  4466   assert( info.nKey==nKey );
       
  4467   assert( info.nData==nData+nZero );
       
  4468   
       
  4469   /* Fill in the payload */
       
  4470   nPayload = nData + nZero;
       
  4471   if( pPage->intKey ){
       
  4472     pSrc = pData;
       
  4473     nSrc = nData;
       
  4474     nData = 0;
       
  4475   }else{
       
  4476     nPayload += nKey;
       
  4477     pSrc = pKey;
       
  4478     nSrc = nKey;
       
  4479   }
       
  4480   *pnSize = info.nSize;
       
  4481   spaceLeft = info.nLocal;
       
  4482   pPayload = &pCell[nHeader];
       
  4483   pPrior = &pCell[info.iOverflow];
       
  4484 
       
  4485   while( nPayload>0 ){
       
  4486     if( spaceLeft==0 ){
       
  4487       int isExact = 0;
       
  4488 #ifndef SQLITE_OMIT_AUTOVACUUM
       
  4489       Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
       
  4490       if( pBt->autoVacuum ){
       
  4491         do{
       
  4492           pgnoOvfl++;
       
  4493         } while( 
       
  4494           PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt) 
       
  4495         );
       
  4496         if( pgnoOvfl>1 ){
       
  4497           /* isExact = 1; */
       
  4498         }
       
  4499       }
       
  4500 #endif
       
  4501       rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, isExact);
       
  4502 #ifndef SQLITE_OMIT_AUTOVACUUM
       
  4503       /* If the database supports auto-vacuum, and the second or subsequent
       
  4504       ** overflow page is being allocated, add an entry to the pointer-map
       
  4505       ** for that page now. 
       
  4506       **
       
  4507       ** If this is the first overflow page, then write a partial entry 
       
  4508       ** to the pointer-map. If we write nothing to this pointer-map slot,
       
  4509       ** then the optimistic overflow chain processing in clearCell()
       
  4510       ** may misinterpret the uninitialised values and delete the
       
  4511       ** wrong pages from the database.
       
  4512       */
       
  4513       if( pBt->autoVacuum && rc==SQLITE_OK ){
       
  4514         u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
       
  4515         rc = ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap);
       
  4516         if( rc ){
       
  4517           releasePage(pOvfl);
       
  4518         }
       
  4519       }
       
  4520 #endif
       
  4521       if( rc ){
       
  4522         releasePage(pToRelease);
       
  4523         return rc;
       
  4524       }
       
  4525       put4byte(pPrior, pgnoOvfl);
       
  4526       releasePage(pToRelease);
       
  4527       pToRelease = pOvfl;
       
  4528       pPrior = pOvfl->aData;
       
  4529       put4byte(pPrior, 0);
       
  4530       pPayload = &pOvfl->aData[4];
       
  4531       spaceLeft = pBt->usableSize - 4;
       
  4532     }
       
  4533     n = nPayload;
       
  4534     if( n>spaceLeft ) n = spaceLeft;
       
  4535     if( nSrc>0 ){
       
  4536       if( n>nSrc ) n = nSrc;
       
  4537       assert( pSrc );
       
  4538       memcpy(pPayload, pSrc, n);
       
  4539     }else{
       
  4540       memset(pPayload, 0, n);
       
  4541     }
       
  4542     nPayload -= n;
       
  4543     pPayload += n;
       
  4544     pSrc += n;
       
  4545     nSrc -= n;
       
  4546     spaceLeft -= n;
       
  4547     if( nSrc==0 ){
       
  4548       nSrc = nData;
       
  4549       pSrc = pData;
       
  4550     }
       
  4551   }
       
  4552   releasePage(pToRelease);
       
  4553   return SQLITE_OK;
       
  4554 }
       
  4555 
       
  4556 /*
       
  4557 ** Remove the i-th cell from pPage.  This routine effects pPage only.
       
  4558 ** The cell content is not freed or deallocated.  It is assumed that
       
  4559 ** the cell content has been copied someplace else.  This routine just
       
  4560 ** removes the reference to the cell from pPage.
       
  4561 **
       
  4562 ** "sz" must be the number of bytes in the cell.
       
  4563 */
       
  4564 static void dropCell(MemPage *pPage, int idx, int sz){
       
  4565   int i;          /* Loop counter */
       
  4566   int pc;         /* Offset to cell content of cell being deleted */
       
  4567   u8 *data;       /* pPage->aData */
       
  4568   u8 *ptr;        /* Used to move bytes around within data[] */
       
  4569 
       
  4570   assert( idx>=0 && idx<pPage->nCell );
       
  4571   assert( sz==cellSize(pPage, idx) );
       
  4572   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
       
  4573   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
       
  4574   data = pPage->aData;
       
  4575   ptr = &data[pPage->cellOffset + 2*idx];
       
  4576   pc = get2byte(ptr);
       
  4577   assert( pc>10 && pc+sz<=pPage->pBt->usableSize );
       
  4578   freeSpace(pPage, pc, sz);
       
  4579   for(i=idx+1; i<pPage->nCell; i++, ptr+=2){
       
  4580     ptr[0] = ptr[2];
       
  4581     ptr[1] = ptr[3];
       
  4582   }
       
  4583   pPage->nCell--;
       
  4584   put2byte(&data[pPage->hdrOffset+3], pPage->nCell);
       
  4585   pPage->nFree += 2;
       
  4586 }
       
  4587 
       
  4588 /*
       
  4589 ** Insert a new cell on pPage at cell index "i".  pCell points to the
       
  4590 ** content of the cell.
       
  4591 **
       
  4592 ** If the cell content will fit on the page, then put it there.  If it
       
  4593 ** will not fit, then make a copy of the cell content into pTemp if
       
  4594 ** pTemp is not null.  Regardless of pTemp, allocate a new entry
       
  4595 ** in pPage->aOvfl[] and make it point to the cell content (either
       
  4596 ** in pTemp or the original pCell) and also record its index. 
       
  4597 ** Allocating a new entry in pPage->aCell[] implies that 
       
  4598 ** pPage->nOverflow is incremented.
       
  4599 **
       
  4600 ** If nSkip is non-zero, then do not copy the first nSkip bytes of the
       
  4601 ** cell. The caller will overwrite them after this function returns. If
       
  4602 ** nSkip is non-zero, then pCell may not point to an invalid memory location 
       
  4603 ** (but pCell+nSkip is always valid).
       
  4604 */
       
  4605 static int insertCell(
       
  4606   MemPage *pPage,   /* Page into which we are copying */
       
  4607   int i,            /* New cell becomes the i-th cell of the page */
       
  4608   u8 *pCell,        /* Content of the new cell */
       
  4609   int sz,           /* Bytes of content in pCell */
       
  4610   u8 *pTemp,        /* Temp storage space for pCell, if needed */
       
  4611   u8 nSkip          /* Do not write the first nSkip bytes of the cell */
       
  4612 ){
       
  4613   int idx;          /* Where to write new cell content in data[] */
       
  4614   int j;            /* Loop counter */
       
  4615   int top;          /* First byte of content for any cell in data[] */
       
  4616   int end;          /* First byte past the last cell pointer in data[] */
       
  4617   int ins;          /* Index in data[] where new cell pointer is inserted */
       
  4618   int hdr;          /* Offset into data[] of the page header */
       
  4619   int cellOffset;   /* Address of first cell pointer in data[] */
       
  4620   u8 *data;         /* The content of the whole page */
       
  4621   u8 *ptr;          /* Used for moving information around in data[] */
       
  4622 
       
  4623   assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
       
  4624   assert( sz==cellSizePtr(pPage, pCell) );
       
  4625   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
       
  4626   if( pPage->nOverflow || sz+2>pPage->nFree ){
       
  4627     if( pTemp ){
       
  4628       memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip);
       
  4629       pCell = pTemp;
       
  4630     }
       
  4631     j = pPage->nOverflow++;
       
  4632     assert( j<sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0]) );
       
  4633     pPage->aOvfl[j].pCell = pCell;
       
  4634     pPage->aOvfl[j].idx = i;
       
  4635     pPage->nFree = 0;
       
  4636   }else{
       
  4637     int rc = sqlite3PagerWrite(pPage->pDbPage);
       
  4638     if( rc!=SQLITE_OK ){
       
  4639       return rc;
       
  4640     }
       
  4641     assert( sqlite3PagerIswriteable(pPage->pDbPage) );
       
  4642     data = pPage->aData;
       
  4643     hdr = pPage->hdrOffset;
       
  4644     top = get2byte(&data[hdr+5]);
       
  4645     cellOffset = pPage->cellOffset;
       
  4646     end = cellOffset + 2*pPage->nCell + 2;
       
  4647     ins = cellOffset + 2*i;
       
  4648     if( end > top - sz ){
       
  4649       defragmentPage(pPage);
       
  4650       top = get2byte(&data[hdr+5]);
       
  4651       assert( end + sz <= top );
       
  4652     }
       
  4653     idx = allocateSpace(pPage, sz);
       
  4654     assert( idx>0 );
       
  4655     assert( end <= get2byte(&data[hdr+5]) );
       
  4656     pPage->nCell++;
       
  4657     pPage->nFree -= 2;
       
  4658     memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip);
       
  4659     for(j=end-2, ptr=&data[j]; j>ins; j-=2, ptr-=2){
       
  4660       ptr[0] = ptr[-2];
       
  4661       ptr[1] = ptr[-1];
       
  4662     }
       
  4663     put2byte(&data[ins], idx);
       
  4664     put2byte(&data[hdr+3], pPage->nCell);
       
  4665 #ifndef SQLITE_OMIT_AUTOVACUUM
       
  4666     if( pPage->pBt->autoVacuum ){
       
  4667       /* The cell may contain a pointer to an overflow page. If so, write
       
  4668       ** the entry for the overflow page into the pointer map.
       
  4669       */
       
  4670       CellInfo info;
       
  4671       sqlite3BtreeParseCellPtr(pPage, pCell, &info);
       
  4672       assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
       
  4673       if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
       
  4674         Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
       
  4675         rc = ptrmapPut(pPage->pBt, pgnoOvfl, PTRMAP_OVERFLOW1, pPage->pgno);
       
  4676         if( rc!=SQLITE_OK ) return rc;
       
  4677       }
       
  4678     }
       
  4679 #endif
       
  4680   }
       
  4681 
       
  4682   return SQLITE_OK;
       
  4683 }
       
  4684 
       
  4685 /*
       
  4686 ** Add a list of cells to a page.  The page should be initially empty.
       
  4687 ** The cells are guaranteed to fit on the page.
       
  4688 */
       
  4689 static void assemblePage(
       
  4690   MemPage *pPage,   /* The page to be assemblied */
       
  4691   int nCell,        /* The number of cells to add to this page */
       
  4692   u8 **apCell,      /* Pointers to cell bodies */
       
  4693   u16 *aSize        /* Sizes of the cells */
       
  4694 ){
       
  4695   int i;            /* Loop counter */
       
  4696   int totalSize;    /* Total size of all cells */
       
  4697   int hdr;          /* Index of page header */
       
  4698   int cellptr;      /* Address of next cell pointer */
       
  4699   int cellbody;     /* Address of next cell body */
       
  4700   u8 *data;         /* Data for the page */
       
  4701 
       
  4702   assert( pPage->nOverflow==0 );
       
  4703   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
       
  4704   totalSize = 0;
       
  4705   for(i=0; i<nCell; i++){
       
  4706     totalSize += aSize[i];
       
  4707   }
       
  4708   assert( totalSize+2*nCell<=pPage->nFree );
       
  4709   assert( pPage->nCell==0 );
       
  4710   cellptr = pPage->cellOffset;
       
  4711   data = pPage->aData;
       
  4712   hdr = pPage->hdrOffset;
       
  4713   put2byte(&data[hdr+3], nCell);
       
  4714   if( nCell ){
       
  4715     cellbody = allocateSpace(pPage, totalSize);
       
  4716     assert( cellbody>0 );
       
  4717     assert( pPage->nFree >= 2*nCell );
       
  4718     pPage->nFree -= 2*nCell;
       
  4719     for(i=0; i<nCell; i++){
       
  4720       put2byte(&data[cellptr], cellbody);
       
  4721       memcpy(&data[cellbody], apCell[i], aSize[i]);
       
  4722       cellptr += 2;
       
  4723       cellbody += aSize[i];
       
  4724     }
       
  4725     assert( cellbody==pPage->pBt->usableSize );
       
  4726   }
       
  4727   pPage->nCell = nCell;
       
  4728 }
       
  4729 
       
  4730 /*
       
  4731 ** The following parameters determine how many adjacent pages get involved
       
  4732 ** in a balancing operation.  NN is the number of neighbors on either side
       
  4733 ** of the page that participate in the balancing operation.  NB is the
       
  4734 ** total number of pages that participate, including the target page and
       
  4735 ** NN neighbors on either side.
       
  4736 **
       
  4737 ** The minimum value of NN is 1 (of course).  Increasing NN above 1
       
  4738 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
       
  4739 ** in exchange for a larger degradation in INSERT and UPDATE performance.
       
  4740 ** A value of 1 for NN appears to give the best results overall.
       
  4741 */
       
  4742 #define NN 1             /* Number of neighbors on either side of pPage */
       
  4743 #define NB (NN*2+1)      /* Total pages involved in the balance */
       
  4744 
       
  4745 /* Forward reference */
       
  4746 static int balance(BtCursor*, int);
       
  4747 
       
  4748 #ifndef SQLITE_OMIT_QUICKBALANCE
       
  4749 /*
       
  4750 ** This version of balance() handles the common special case where
       
  4751 ** a new entry is being inserted on the extreme right-end of the
       
  4752 ** tree, in other words, when the new entry will become the largest
       
  4753 ** entry in the tree.
       
  4754 **
       
  4755 ** Instead of trying balance the 3 right-most leaf pages, just add
       
  4756 ** a new page to the right-hand side and put the one new entry in
       
  4757 ** that page.  This leaves the right side of the tree somewhat
       
  4758 ** unbalanced.  But odds are that we will be inserting new entries
       
  4759 ** at the end soon afterwards so the nearly empty page will quickly
       
  4760 ** fill up.  On average.
       
  4761 **
       
  4762 ** pPage is the leaf page which is the right-most page in the tree.
       
  4763 ** pParent is its parent.  pPage must have a single overflow entry
       
  4764 ** which is also the right-most entry on the page.
       
  4765 */
       
  4766 static int balance_quick(BtCursor *pCur){
       
  4767   int rc;
       
  4768   MemPage *pNew = 0;
       
  4769   Pgno pgnoNew;
       
  4770   u8 *pCell;
       
  4771   u16 szCell;
       
  4772   CellInfo info;
       
  4773   MemPage *pPage = pCur->apPage[pCur->iPage];
       
  4774   MemPage *pParent = pCur->apPage[pCur->iPage-1];
       
  4775   BtShared *pBt = pPage->pBt;
       
  4776   int parentIdx = pParent->nCell;   /* pParent new divider cell index */
       
  4777   int parentSize;                   /* Size of new divider cell */
       
  4778   u8 parentCell[64];                /* Space for the new divider cell */
       
  4779 
       
  4780   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
       
  4781 
       
  4782   /* Allocate a new page. Insert the overflow cell from pPage
       
  4783   ** into it. Then remove the overflow cell from pPage.
       
  4784   */
       
  4785   rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
       
  4786   if( rc==SQLITE_OK ){
       
  4787     pCell = pPage->aOvfl[0].pCell;
       
  4788     szCell = cellSizePtr(pPage, pCell);
       
  4789     zeroPage(pNew, pPage->aData[0]);
       
  4790     assemblePage(pNew, 1, &pCell, &szCell);
       
  4791     pPage->nOverflow = 0;
       
  4792   
       
  4793     /* pPage is currently the right-child of pParent. Change this
       
  4794     ** so that the right-child is the new page allocated above and
       
  4795     ** pPage is the next-to-right child. 
       
  4796     **
       
  4797     ** Ignore the return value of the call to fillInCell(). fillInCell()
       
  4798     ** may only return other than SQLITE_OK if it is required to allocate
       
  4799     ** one or more overflow pages. Since an internal table B-Tree cell 
       
  4800     ** may never spill over onto an overflow page (it is a maximum of 
       
  4801     ** 13 bytes in size), it is not neccessary to check the return code.
       
  4802     **
       
  4803     ** Similarly, the insertCell() function cannot fail if the page
       
  4804     ** being inserted into is already writable and the cell does not 
       
  4805     ** contain an overflow pointer. So ignore this return code too.
       
  4806     */
       
  4807     assert( pPage->nCell>0 );
       
  4808     pCell = findCell(pPage, pPage->nCell-1);
       
  4809     sqlite3BtreeParseCellPtr(pPage, pCell, &info);
       
  4810     fillInCell(pParent, parentCell, 0, info.nKey, 0, 0, 0, &parentSize);
       
  4811     assert( parentSize<64 );
       
  4812     assert( sqlite3PagerIswriteable(pParent->pDbPage) );
       
  4813     insertCell(pParent, parentIdx, parentCell, parentSize, 0, 4);
       
  4814     put4byte(findOverflowCell(pParent,parentIdx), pPage->pgno);
       
  4815     put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
       
  4816   
       
  4817     /* If this is an auto-vacuum database, update the pointer map
       
  4818     ** with entries for the new page, and any pointer from the 
       
  4819     ** cell on the page to an overflow page.
       
  4820     */
       
  4821     if( ISAUTOVACUUM ){
       
  4822       rc = ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno);
       
  4823       if( rc==SQLITE_OK ){
       
  4824         rc = ptrmapPutOvfl(pNew, 0);
       
  4825       }
       
  4826     }
       
  4827 
       
  4828     /* Release the reference to the new page. */
       
  4829     releasePage(pNew);
       
  4830   }
       
  4831 
       
  4832   /* At this point the pPage->nFree variable is not set correctly with
       
  4833   ** respect to the content of the page (because it was set to 0 by 
       
  4834   ** insertCell). So call sqlite3BtreeInitPage() to make sure it is
       
  4835   ** correct.
       
  4836   **
       
  4837   ** This has to be done even if an error will be returned. Normally, if
       
  4838   ** an error occurs during tree balancing, the contents of MemPage are
       
  4839   ** not important, as they will be recalculated when the page is rolled
       
  4840   ** back. But here, in balance_quick(), it is possible that pPage has 
       
  4841   ** not yet been marked dirty or written into the journal file. Therefore
       
  4842   ** it will not be rolled back and so it is important to make sure that
       
  4843   ** the page data and contents of MemPage are consistent.
       
  4844   */
       
  4845   pPage->isInit = 0;
       
  4846   sqlite3BtreeInitPage(pPage);
       
  4847 
       
  4848   /* If everything else succeeded, balance the parent page, in 
       
  4849   ** case the divider cell inserted caused it to become overfull.
       
  4850   */
       
  4851   if( rc==SQLITE_OK ){
       
  4852     releasePage(pPage);
       
  4853     pCur->iPage--;
       
  4854     rc = balance(pCur, 0);
       
  4855   }
       
  4856   return rc;
       
  4857 }
       
  4858 #endif /* SQLITE_OMIT_QUICKBALANCE */
       
  4859 
       
  4860 /*
       
  4861 ** This routine redistributes Cells on pPage and up to NN*2 siblings
       
  4862 ** of pPage so that all pages have about the same amount of free space.
       
  4863 ** Usually NN siblings on either side of pPage is used in the balancing,
       
  4864 ** though more siblings might come from one side if pPage is the first
       
  4865 ** or last child of its parent.  If pPage has fewer than 2*NN siblings
       
  4866 ** (something which can only happen if pPage is the root page or a 
       
  4867 ** child of root) then all available siblings participate in the balancing.
       
  4868 **
       
  4869 ** The number of siblings of pPage might be increased or decreased by one or
       
  4870 ** two in an effort to keep pages nearly full but not over full. The root page
       
  4871 ** is special and is allowed to be nearly empty. If pPage is 
       
  4872 ** the root page, then the depth of the tree might be increased
       
  4873 ** or decreased by one, as necessary, to keep the root page from being
       
  4874 ** overfull or completely empty.
       
  4875 **
       
  4876 ** Note that when this routine is called, some of the Cells on pPage
       
  4877 ** might not actually be stored in pPage->aData[].  This can happen
       
  4878 ** if the page is overfull.  Part of the job of this routine is to
       
  4879 ** make sure all Cells for pPage once again fit in pPage->aData[].
       
  4880 **
       
  4881 ** In the course of balancing the siblings of pPage, the parent of pPage
       
  4882 ** might become overfull or underfull.  If that happens, then this routine
       
  4883 ** is called recursively on the parent.
       
  4884 **
       
  4885 ** If this routine fails for any reason, it might leave the database
       
  4886 ** in a corrupted state.  So if this routine fails, the database should
       
  4887 ** be rolled back.
       
       **
       ** Parameters:
       **   pCur - Cursor whose current page, pCur->apPage[pCur->iPage], is the
       **          page to balance.  pCur->iPage must be greater than zero so
       **          that pCur->apPage[pCur->iPage-1] is the parent page.
       **
       ** Return value:
       **   SQLITE_NOMEM if the temporary working space cannot be allocated,
       **   or the error code of the first subroutine that fails.  If the
       **   quick-balance shortcut applies, the result of balance_quick() is
       **   returned.  Otherwise, once the siblings have been rebuilt, the
       **   cursor's reference to pPage is released, pCur->iPage is decremented
       **   and the result of calling balance() on the parent is returned.
  4888 */
       
  4889 static int balance_nonroot(BtCursor *pCur){
       
  4890   MemPage *pPage;              /* The over or underfull page to balance */
       
  4891   MemPage *pParent;            /* The parent of pPage */
       
  4892   BtShared *pBt;               /* The whole database */
       
  4893   int nCell = 0;               /* Number of cells in apCell[] */
       
  4894   int nMaxCells = 0;           /* Allocated size of apCell, szCell, aFrom. */
       
  4895   int nOld;                    /* Number of pages in apOld[] */
       
  4896   int nNew;                    /* Number of pages in apNew[] */
       
  4897   int nDiv;                    /* Number of cells in apDiv[] */
       
  4898   int i, j, k;                 /* Loop counters */
       
  4899   int idx;                     /* Index of pPage in pParent->aCell[] */
       
  4900   int nxDiv;                   /* Next divider slot in pParent->aCell[] */
       
  4901   int rc;                      /* The return code */
       
  4902   int leafCorrection;          /* 4 if pPage is a leaf.  0 if not */
       
  4903   int leafData;                /* True if pPage is a leaf of a LEAFDATA tree */
       
  4904   int usableSpace;             /* Bytes in pPage beyond the header */
       
  4905   int pageFlags;               /* Value of pPage->aData[0] */
       
  4906   int subtotal;                /* Subtotal of bytes in cells on one page */
       
  4907   int iSpace1 = 0;             /* First unused byte of aSpace1[] */
       
  4908   int iSpace2 = 0;             /* First unused byte of aSpace2[] */
       
  4909   int szScratch;               /* Size of scratch memory requested */
       
  4910   MemPage *apOld[NB];          /* pPage and up to two siblings */
       
  4911   Pgno pgnoOld[NB];            /* Page numbers for each page in apOld[] */
       
  4912   MemPage *apCopy[NB];         /* Private copies of apOld[] pages */
       
  4913   MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
       
  4914   Pgno pgnoNew[NB+2];          /* Page numbers for each page in apNew[] */
       
  4915   u8 *apDiv[NB];               /* Divider cells in pParent */
       
  4916   int cntNew[NB+2];            /* Index in aCell[] of cell after i-th page */
       
  4917   int szNew[NB+2];             /* Combined size of cells placed on i-th page */
       
  4918   u8 **apCell = 0;             /* All cells being balanced */
       
  4919   u16 *szCell;                 /* Local size of all cells in apCell[] */
       
  4920   u8 *aCopy[NB];         /* Space for holding data of apCopy[] */
       
  4921   u8 *aSpace1;           /* Space for copies of divider cells before balance */
       
  4922   u8 *aSpace2 = 0;       /* Space for overflow divider cells after balance */
       
  4923   u8 *aFrom = 0;         /* Autovacuum only: apCopy[] index of the page each
                                ** apCell[] entry was stored on, or 0xFF for
                                ** overflow cells and divider cells */
       
  4924 
       
  4925   pPage = pCur->apPage[pCur->iPage];
       
  4926   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
       
  4927   VVA_ONLY( pCur->pagesShuffled = 1 );
       
  4928 
       
  4929   /* 
       
  4930   ** Find the parent page.
       
  4931   */
       
  4932   assert( pCur->iPage>0 );
       
  4933   assert( pPage->isInit );
       
  4934   assert( sqlite3PagerIswriteable(pPage->pDbPage) || pPage->nOverflow==1 );
       
  4935   pBt = pPage->pBt;
       
  4936   pParent = pCur->apPage[pCur->iPage-1];
       
  4937   assert( pParent );
       
  4938   if( SQLITE_OK!=(rc = sqlite3PagerWrite(pParent->pDbPage)) ){
       
  4939     return rc;
       
  4940   }
       
  4941 
       
  4942   TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
       
  4943 
       
  4944 #ifndef SQLITE_OMIT_QUICKBALANCE
       
  4945   /*
       
  4946   ** A special case:  If a new entry has just been inserted into a
       
  4947   ** table (that is, a btree with integer keys and all data at the leaves)
       
  4948   ** and the new entry is the right-most entry in the tree (it has the
       
  4949   ** largest key) then use the special balance_quick() routine for
       
  4950   ** balancing.  balance_quick() is much faster and results in a tighter
       
  4951   ** packing of data in the common case.
       
  4952   */
       
  4953   if( pPage->leaf &&
       
  4954       pPage->intKey &&
       
  4955       pPage->nOverflow==1 &&
       
  4956       pPage->aOvfl[0].idx==pPage->nCell &&
       
  4957       pParent->pgno!=1 &&
       
  4958       get4byte(&pParent->aData[pParent->hdrOffset+8])==pPage->pgno
       
  4959   ){
       
  4960     assert( pPage->intKey );
       
  4961     /*
       
  4962     ** TODO: Check the siblings to the left of pPage. It may be that
       
  4963     ** they are not full and no new page is required.
       
  4964     */
       
  4965     return balance_quick(pCur);
       
  4966   }
       
  4967 #endif
       
  4968 
       
  4969   if( SQLITE_OK!=(rc = sqlite3PagerWrite(pPage->pDbPage)) ){
       
  4970     return rc;
       
  4971   }
       
  4972 
       
  4973   /*
       
  4974   ** Find the cell in the parent page whose left child points back
       
  4975   ** to pPage.  The "idx" variable is the index of that cell.  If pPage
       
  4976   ** is the rightmost child of pParent then set idx to pParent->nCell 
       
  4977   */
       
  4978   idx = pCur->aiIdx[pCur->iPage-1];
       
  4979   assertParentIndex(pParent, idx, pPage->pgno);
       
  4980 
       
  4981   /*
       
  4982   ** Initialize variables so that it will be safe to jump
       
  4983   ** directly to balance_cleanup at any moment.
       
  4984   */
       
  4985   nOld = nNew = 0;
       
  4986 
       
  4987   /*
       
  4988   ** Find sibling pages to pPage and the cells in pParent that divide
       
  4989   ** the siblings.  An attempt is made to find NN siblings on either
       
  4990   ** side of pPage.  More siblings are taken from one side, however, if
       
  4991   ** there are fewer than NN siblings on the other side.  If pParent
       
  4992   ** has NB or fewer children then all children of pParent are taken.
       
  4993   */
       
  4994   nxDiv = idx - NN;
       
  4995   if( nxDiv + NB > pParent->nCell ){
       
  4996     nxDiv = pParent->nCell - NB + 1;
       
  4997   }
       
  4998   if( nxDiv<0 ){
       
  4999     nxDiv = 0;
       
  5000   }
       
  5001   nDiv = 0;
       
  5002   for(i=0, k=nxDiv; i<NB; i++, k++){
       
  5003     if( k<pParent->nCell ){
       
  5004       apDiv[i] = findCell(pParent, k);
       
  5005       nDiv++;
       
  5006       assert( !pParent->leaf );
       
  5007       pgnoOld[i] = get4byte(apDiv[i]);
       
  5008     }else if( k==pParent->nCell ){
       
  5009       pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+8]);
       
  5010     }else{
       
  5011       break;
       
  5012     }
       
  5013     rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i]);
       
  5014     if( rc ) goto balance_cleanup;
       
  5015     /* apOld[i]->idxParent = k; */
       
  5016     apCopy[i] = 0;
       
  5017     assert( i==nOld );
       
  5018     nOld++;
       
           /* Reserve a slot for every cell of this sibling (overflow cells
           ** included) plus one more, which leaves room for the divider cell
           ** that may be appended to apCell[] after this sibling's cells. */
  5019     nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
       
  5020   }
       
  5021 
       
  5022   /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
       
  5023   ** alignment */
       
  5024   nMaxCells = (nMaxCells + 3)&~3;
       
  5025 
       
  5026   /*
       
  5027   ** Allocate space for memory structures
       
  5028   */
       
  5029   szScratch =
       
  5030        nMaxCells*sizeof(u8*)                       /* apCell */
       
  5031      + nMaxCells*sizeof(u16)                       /* szCell */
       
  5032      + (ROUND8(sizeof(MemPage))+pBt->pageSize)*NB  /* aCopy */
       
  5033      + pBt->pageSize                               /* aSpace1 */
       
  5034      + (ISAUTOVACUUM ? nMaxCells : 0);             /* aFrom */
       
  5035   apCell = sqlite3ScratchMalloc( szScratch ); 
       
  5036   if( apCell==0 ){
       
  5037     rc = SQLITE_NOMEM;
       
  5038     goto balance_cleanup;
       
  5039   }
       
         /* Carve the single scratch allocation into the sub-arrays, in the same
         ** order in which their sizes were summed into szScratch above. */
  5040   szCell = (u16*)&apCell[nMaxCells];
       
  5041   aCopy[0] = (u8*)&szCell[nMaxCells];
       
  5042   assert( ((aCopy[0] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
       
  5043   for(i=1; i<NB; i++){
       
  5044     aCopy[i] = &aCopy[i-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
       
  5045     assert( ((aCopy[i] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
       
  5046   }
       
  5047   aSpace1 = &aCopy[NB-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
       
  5048   assert( ((aSpace1 - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
       
  5049   if( ISAUTOVACUUM ){
       
  5050     aFrom = &aSpace1[pBt->pageSize];
       
  5051   }
       
         /* aSpace2 is not part of the scratch block; it is a separate page-sized
         ** allocation that is freed at balance_cleanup. */
  5052   aSpace2 = sqlite3PageMalloc(pBt->pageSize);
       
  5053   if( aSpace2==0 ){
       
  5054     rc = SQLITE_NOMEM;
       
  5055     goto balance_cleanup;
       
  5056   }
       
  5057   
       
  5058   /*
       
  5059   ** Make copies of the content of pPage and its siblings into apCopy[].
       
  5060   ** The rest of this function will use data from the copies rather
       
  5061   ** than the original pages since the original pages will be in the
       
  5062   ** process of being overwritten.
       
  5063   */
       
  5064   for(i=0; i<nOld; i++){
       
  5065     MemPage *p = apCopy[i] = (MemPage*)aCopy[i];
       
  5066     memcpy(p, apOld[i], sizeof(MemPage));
       
           /* The copied page image is stored directly after the MemPage copy. */
  5067     p->aData = (void*)&p[1];
       
  5068     memcpy(p->aData, apOld[i]->aData, pBt->pageSize);
       
  5069   }
       
  5070 
       
  5071   /*
       
  5072   ** Load pointers to all cells on sibling pages and the divider cells
       
  5073   ** into the local apCell[] array.  Make copies of the divider cells
       
  5074   ** into space obtained from aSpace1[] and remove the divider Cells
       
  5075   ** from pParent.
       
  5076   **
       
  5077   ** If the siblings are on leaf pages, then the child pointers of the
       
  5078   ** divider cells are stripped from the cells before they are copied
       
  5079   ** into aSpace1[].  In this way, all cells in apCell[] are without
       
  5080   ** child pointers.  If siblings are not leaves, then all cell in
       
  5081   ** apCell[] include child pointers.  Either way, all cells in apCell[]
       
  5082   ** are alike.
       
  5083   **
       
  5084   ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf.
       
  5085   **       leafData:  1 if pPage holds key+data and pParent holds only keys.
       
  5086   */
       
  5087   nCell = 0;
       
  5088   leafCorrection = pPage->leaf*4;
       
  5089   leafData = pPage->hasData;
       
  5090   for(i=0; i<nOld; i++){
       
  5091     MemPage *pOld = apCopy[i];
       
  5092     int limit = pOld->nCell+pOld->nOverflow;
       
  5093     for(j=0; j<limit; j++){
       
  5094       assert( nCell<nMaxCells );
       
  5095       apCell[nCell] = findOverflowCell(pOld, j);
       
  5096       szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
       
  5097       if( ISAUTOVACUUM ){
       
  5098         int a;
       
             /* Record which sibling the cell came from; a cell that is one of
             ** the page's overflow cells is tagged 0xFF instead. */
  5099         aFrom[nCell] = i;
       
  5100         for(a=0; a<pOld->nOverflow; a++){
       
  5101           if( pOld->aOvfl[a].pCell==apCell[nCell] ){
       
  5102             aFrom[nCell] = 0xFF;
       
  5103             break;
       
  5104           }
       
  5105         }
       
  5106       }
       
  5107       nCell++;
       
  5108     }
       
  5109     if( i<nOld-1 ){
       
  5110       u16 sz = cellSizePtr(pParent, apDiv[i]);
       
  5111       if( leafData ){
       
  5112         /* With the LEAFDATA flag, pParent cells hold only INTKEYs that
       
  5113         ** are duplicates of keys on the child pages.  We need to remove
       
  5114         ** the divider cells from pParent, but the dividers cells are not
       
  5115         ** added to apCell[] because they are duplicates of child cells.
       
  5116         */
       
  5117         dropCell(pParent, nxDiv, sz);
       
  5118       }else{
       
  5119         u8 *pTemp;
       
  5120         assert( nCell<nMaxCells );
       
  5121         szCell[nCell] = sz;
       
  5122         pTemp = &aSpace1[iSpace1];
       
  5123         iSpace1 += sz;
       
  5124         assert( sz<=pBt->pageSize/4 );
       
  5125         assert( iSpace1<=pBt->pageSize );
       
  5126         memcpy(pTemp, apDiv[i], sz);
       
             /* For leaf siblings leafCorrection is 4, so the stored pointer
             ** skips the first 4 bytes of the copied divider cell. */
  5127         apCell[nCell] = pTemp+leafCorrection;
       
  5128         if( ISAUTOVACUUM ){
       
  5129           aFrom[nCell] = 0xFF;
       
  5130         }
       
  5131         dropCell(pParent, nxDiv, sz);
       
  5132         szCell[nCell] -= leafCorrection;
       
  5133         assert( get4byte(pTemp)==pgnoOld[i] );
       
  5134         if( !pOld->leaf ){
       
  5135           assert( leafCorrection==0 );
       
  5136           /* The right pointer of the child page pOld becomes the left
       
  5137           ** pointer of the divider cell */
       
  5138           memcpy(apCell[nCell], &pOld->aData[pOld->hdrOffset+8], 4);
       
  5139         }else{
       
  5140           assert( leafCorrection==4 );
       
  5141           if( szCell[nCell]<4 ){
       
  5142             /* Do not allow any cells smaller than 4 bytes. */
       
  5143             szCell[nCell] = 4;
       
  5144           }
       
  5145         }
       
  5146         nCell++;
       
  5147       }
       
  5148     }
       
  5149   }
       
  5150 
       
  5151   /*
       
  5152   ** Figure out the number of pages needed to hold all nCell cells.
       
  5153   ** Store this number in "k".  Also compute szNew[] which is the total
       
  5154   ** size of all cells on the i-th page and cntNew[] which is the index
       
  5155   ** in apCell[] of the cell that divides page i from page i+1.  
       
  5156   ** cntNew[k] should equal nCell.
       
  5157   **
       
  5158   ** Values computed by this block:
       
  5159   **
       
  5160   **           k: The total number of sibling pages
       
  5161   **    szNew[i]: Space used on the i-th sibling page.
       
  5162   **   cntNew[i]: Index in apCell[] and szCell[] for the first cell to
       
  5163   **              the right of the i-th sibling page.
       
  5164   ** usableSpace: Number of bytes of space available on each sibling.
       
  5165   ** 
       
  5166   */
       
  5167   usableSpace = pBt->usableSize - 12 + leafCorrection;
       
  5168   for(subtotal=k=i=0; i<nCell; i++){
       
  5169     assert( i<nMaxCells );
       
           /* Each cell is charged its own size plus 2 bytes. */
  5170     subtotal += szCell[i] + 2;
       
  5171     if( subtotal > usableSpace ){
       
  5172       szNew[k] = subtotal - szCell[i];
       
  5173       cntNew[k] = i;
       
             /* In a leaf-data tree no cell of apCell[] is used up as a divider,
             ** so cell i is visited again and counted toward the next page. */
  5174       if( leafData ){ i--; }
       
  5175       subtotal = 0;
       
  5176       k++;
       
  5177     }
       
  5178   }
       
  5179   szNew[k] = subtotal;
       
  5180   cntNew[k] = nCell;
       
  5181   k++;
       
  5182 
       
  5183   /*
       
  5184   ** The packing computed by the previous block is biased toward the siblings
       
  5185   ** on the left side.  The left siblings are always nearly full, while the
       
  5186   ** right-most sibling might be nearly empty.  This block of code attempts
       
  5187   ** to adjust the packing of siblings to get a better balance.
       
  5188   **
       
  5189   ** This adjustment is more than an optimization.  The packing above might
       
  5190   ** be so out of balance as to be illegal.  For example, the right-most
       
  5191   ** sibling might be completely empty.  This adjustment is not optional.
       
  5192   */
       
  5193   for(i=k-1; i>0; i--){
       
  5194     int szRight = szNew[i];  /* Size of sibling on the right */
       
  5195     int szLeft = szNew[i-1]; /* Size of sibling on the left */
       
  5196     int r;              /* Index of right-most cell in left sibling */
       
  5197     int d;              /* Index of first cell to the left of right sibling */
       
  5198 
       
  5199     r = cntNew[i-1] - 1;
       
  5200     d = r + 1 - leafData;
       
  5201     assert( d<nMaxCells );
       
  5202     assert( r<nMaxCells );
       
           /* Shift the boundary one cell to the left for as long as the right
           ** sibling is empty or would still be no larger than the left one. */
  5203     while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){
       
  5204       szRight += szCell[d] + 2;
       
  5205       szLeft -= szCell[r] + 2;
       
  5206       cntNew[i-1]--;
       
  5207       r = cntNew[i-1] - 1;
       
  5208       d = r + 1 - leafData;
       
  5209     }
       
  5210     szNew[i] = szRight;
       
  5211     szNew[i-1] = szLeft;
       
  5212   }
       
  5213 
       
  5214   /* Either we found one or more cells (cntNew[0]>0) or we are a
       
  5215   ** virtual root page.  A virtual root page is when the real root
       
  5216   ** page is page 1 and we are the only child of that page.
       
  5217   */
       
  5218   assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );
       
  5219 
       
  5220   /*
       
  5221   ** Allocate k new pages.  Reuse old pages where possible.
       
  5222   */
       
  5223   assert( pPage->pgno>1 );
       
  5224   pageFlags = pPage->aData[0];
       
  5225   for(i=0; i<k; i++){
       
  5226     MemPage *pNew;
       
  5227     if( i<nOld ){
       
             /* Ownership of the page reference moves from apOld[] to apNew[];
             ** nNew is incremented before rc is tested so that the cleanup
             ** code still releases this page if sqlite3PagerWrite() fails. */
  5228       pNew = apNew[i] = apOld[i];
       
  5229       pgnoNew[i] = pgnoOld[i];
       
  5230       apOld[i] = 0;
       
  5231       rc = sqlite3PagerWrite(pNew->pDbPage);
       
  5232       nNew++;
       
  5233       if( rc ) goto balance_cleanup;
       
  5234     }else{
       
  5235       assert( i>0 );
       
  5236       rc = allocateBtreePage(pBt, &pNew, &pgnoNew[i], pgnoNew[i-1], 0);
       
  5237       if( rc ) goto balance_cleanup;
       
  5238       apNew[i] = pNew;
       
  5239       nNew++;
       
  5240     }
       
  5241   }
       
  5242 
       
  5243   /* Free any old pages that were not reused as new pages.
       
  5244   */
       
  5245   while( i<nOld ){
       
  5246     rc = freePage(apOld[i]);
       
  5247     if( rc ) goto balance_cleanup;
       
  5248     releasePage(apOld[i]);
       
  5249     apOld[i] = 0;
       
  5250     i++;
       
  5251   }
       
  5252 
       
  5253   /*
       
  5254   ** Put the new pages in ascending order.  This helps to
       
  5255   ** keep entries in the disk file in order so that a scan
       
  5256   ** of the table is a linear scan through the file.  That
       
  5257   ** in turn helps the operating system to deliver pages
       
  5258   ** from the disk more rapidly.
       
  5259   **
       
  5260   ** An O(n^2) selection sort algorithm is used, but since
       
  5261   ** n is bounded by a small constant (apNew[] holds at most
       
  5262   ** NB+2 pages), that should not be a problem.
       
  5263   **
       
  5264   ** When NB==3, this one optimization makes the database
       
  5265   ** about 25% faster for large insertions and deletions.
       
  5266   */
       
  5267   for(i=0; i<k-1; i++){
       
  5268     int minV = pgnoNew[i];
       
  5269     int minI = i;
       
  5270     for(j=i+1; j<k; j++){
       
  5271       if( pgnoNew[j]<(unsigned)minV ){
       
  5272         minI = j;
       
  5273         minV = pgnoNew[j];
       
  5274       }
       
  5275     }
       
  5276     if( minI>i ){
       
  5277       int t;
       
  5278       MemPage *pT;
       
  5279       t = pgnoNew[i];
       
  5280       pT = apNew[i];
       
  5281       pgnoNew[i] = pgnoNew[minI];
       
  5282       apNew[i] = apNew[minI];
       
  5283       pgnoNew[minI] = t;
       
  5284       apNew[minI] = pT;
       
  5285     }
       
  5286   }
       
  5287   TRACE(("BALANCE: old: %d %d %d  new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
       
  5288     pgnoOld[0], 
       
  5289     nOld>=2 ? pgnoOld[1] : 0,
       
  5290     nOld>=3 ? pgnoOld[2] : 0,
       
  5291     pgnoNew[0], szNew[0],
       
  5292     nNew>=2 ? pgnoNew[1] : 0, nNew>=2 ? szNew[1] : 0,
       
  5293     nNew>=3 ? pgnoNew[2] : 0, nNew>=3 ? szNew[2] : 0,
       
  5294     nNew>=4 ? pgnoNew[3] : 0, nNew>=4 ? szNew[3] : 0,
       
  5295     nNew>=5 ? pgnoNew[4] : 0, nNew>=5 ? szNew[4] : 0));
       
  5296 
       
  5297   /*
       
  5298   ** Evenly distribute the data in apCell[] across the new pages.
       
  5299   ** Insert divider cells into pParent as necessary.
       
  5300   */
       
  5301   j = 0;
       
  5302   for(i=0; i<nNew; i++){
       
  5303     /* Assemble the new sibling page. */
       
  5304     MemPage *pNew = apNew[i];
       
  5305     assert( j<nMaxCells );
       
  5306     assert( pNew->pgno==pgnoNew[i] );
       
  5307     zeroPage(pNew, pageFlags);
       
  5308     assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
       
  5309     assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );
       
  5310     assert( pNew->nOverflow==0 );
       
  5311 
       
  5312     /* If this is an auto-vacuum database, update the pointer map entries
       
  5313     ** that point to the siblings that were rearranged. These can be: left
       
  5314     ** children of cells, the right-child of the page, or overflow pages
       
  5315     ** pointed to by cells.
       
  5316     */
       
  5317     if( ISAUTOVACUUM ){
       
  5318       for(k=j; k<cntNew[i]; k++){
       
  5319         assert( k<nMaxCells );
       
             /* Entries are only rewritten for cells tagged 0xFF or cells whose
             ** source page number differs from the page they now live on. */
  5320         if( aFrom[k]==0xFF || apCopy[aFrom[k]]->pgno!=pNew->pgno ){
       
  5321           rc = ptrmapPutOvfl(pNew, k-j);
       
  5322           if( rc==SQLITE_OK && leafCorrection==0 ){
       
  5323             rc = ptrmapPut(pBt, get4byte(apCell[k]), PTRMAP_BTREE, pNew->pgno);
       
  5324           }
       
  5325           if( rc!=SQLITE_OK ){
       
  5326             goto balance_cleanup;
       
  5327           }
       
  5328         }
       
  5329       }
       
  5330     }
       
  5331 
       
  5332     j = cntNew[i];
       
  5333 
       
  5334     /* If the sibling page assembled above was not the right-most sibling,
       
  5335     ** insert a divider cell into the parent page.
       
  5336     */
       
  5337     if( i<nNew-1 && j<nCell ){
       
  5338       u8 *pCell;
       
  5339       u8 *pTemp;
       
  5340       int sz;
       
  5341 
       
  5342       assert( j<nMaxCells );
       
  5343       pCell = apCell[j];
       
  5344       sz = szCell[j] + leafCorrection;
       
  5345       pTemp = &aSpace2[iSpace2];
       
  5346       if( !pNew->leaf ){
       
             /* Interior siblings: the first 4 bytes of the divider cell are
             ** copied into bytes 8..11 of the new page. */
  5347         memcpy(&pNew->aData[8], pCell, 4);
       
  5348         if( ISAUTOVACUUM 
       
  5349          && (aFrom[j]==0xFF || apCopy[aFrom[j]]->pgno!=pNew->pgno)
       
  5350         ){
       
  5351           rc = ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno);
       
  5352           if( rc!=SQLITE_OK ){
       
  5353             goto balance_cleanup;
       
  5354           }
       
  5355         }
       
  5356       }else if( leafData ){
       
  5357         /* If the tree is a leaf-data tree, and the siblings are leaves, 
       
  5358         ** then there is no divider cell in apCell[]. Instead, the divider 
       
  5359         ** cell consists of the integer key for the right-most cell of 
       
  5360         ** the sibling-page assembled above only.
       
  5361         */
       
  5362         CellInfo info;
       
  5363         j--;
       
  5364         sqlite3BtreeParseCellPtr(pNew, apCell[j], &info);
       
  5365         pCell = pTemp;
       
  5366         fillInCell(pParent, pCell, 0, info.nKey, 0, 0, 0, &sz);
       
  5367         pTemp = 0;
       
  5368       }else{
       
  5369         pCell -= 4;
       
  5370         /* Obscure case for non-leaf-data trees: If the cell at pCell was
       
  5371         ** previously stored on a leaf node, and its reported size was 4
       
  5372         ** bytes, then it may actually be smaller than this 
       
  5373         ** (see sqlite3BtreeParseCellPtr(), 4 bytes is the minimum size of
       
  5374         ** any cell). But it is important to pass the correct size to 
       
  5375         ** insertCell(), so reparse the cell now.
       
  5376         **
       
  5377         ** Note that this can never happen in an SQLite data file, as all
       
  5378         ** cells are at least 4 bytes. It only happens in b-trees used
       
  5379         ** to evaluate "IN (SELECT ...)" and similar clauses.
       
  5380         */
       
  5381         if( szCell[j]==4 ){
       
  5382           assert(leafCorrection==4);
       
  5383           sz = cellSizePtr(pParent, pCell);
       
  5384         }
       
  5385       }
       
  5386       iSpace2 += sz;
       
  5387       assert( sz<=pBt->pageSize/4 );
       
  5388       assert( iSpace2<=pBt->pageSize );
       
  5389       rc = insertCell(pParent, nxDiv, pCell, sz, pTemp, 4);
       
  5390       if( rc!=SQLITE_OK ) goto balance_cleanup;
       
           /* Overwrite the first 4 bytes of the divider just inserted with the
           ** page number of the sibling assembled above. */
  5391       put4byte(findOverflowCell(pParent,nxDiv), pNew->pgno);
       
  5392 
       
  5393       /* If this is an auto-vacuum database, and not a leaf-data tree,
       
  5394       ** then update the pointer map with an entry for the overflow page
       
  5395       ** that the cell just inserted points to (if any).
       
  5396       */
       
  5397       if( ISAUTOVACUUM && !leafData ){
       
  5398         rc = ptrmapPutOvfl(pParent, nxDiv);
       
  5399         if( rc!=SQLITE_OK ){
       
  5400           goto balance_cleanup;
       
  5401         }
       
  5402       }
       
  5403       j++;
       
  5404       nxDiv++;
       
  5405     }
       
  5406 
       
  5407     /* Set the pointer-map entry for the new sibling page. */
       
  5408     if( ISAUTOVACUUM ){
       
  5409       rc = ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno);
       
  5410       if( rc!=SQLITE_OK ){
       
  5411         goto balance_cleanup;
       
  5412       }
       
  5413     }
       
  5414   }
       
  5415   assert( j==nCell );
       
  5416   assert( nOld>0 );
       
  5417   assert( nNew>0 );
       
  5418   if( (pageFlags & PTF_LEAF)==0 ){
       
         /* Interior siblings: bytes 8..11 of the last old sibling's copy are
         ** carried over to the last new sibling. */
  5419     u8 *zChild = &apCopy[nOld-1]->aData[8];
       
  5420     memcpy(&apNew[nNew-1]->aData[8], zChild, 4);
       
  5421     if( ISAUTOVACUUM ){
       
  5422       rc = ptrmapPut(pBt, get4byte(zChild), PTRMAP_BTREE, apNew[nNew-1]->pgno);
       
  5423       if( rc!=SQLITE_OK ){
       
  5424         goto balance_cleanup;
       
  5425       }
       
  5426     }
       
  5427   }
       
  5428   if( nxDiv==pParent->nCell+pParent->nOverflow ){
       
  5429     /* Right-most sibling is the right-most child of pParent */
       
  5430     put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew[nNew-1]);
       
  5431   }else{
       
  5432     /* Right-most sibling is the left child of the first entry in pParent
       
  5433     ** past the right-most divider entry */
       
  5434     put4byte(findOverflowCell(pParent, nxDiv), pgnoNew[nNew-1]);
       
  5435   }
       
  5436 
       
  5437   /*
       
  5438   ** Balance the parent page.  Note that the current page (pPage) might
       
  5439   ** have been added to the freelist so it might no longer be initialized.
       
  5440   ** But the parent page will always be initialized.
       
  5441   */
       
  5442   assert( pParent->isInit );
       
         /* The scratch block is freed, and apCell cleared, before balance() is
         ** called on the parent. */
  5443   sqlite3ScratchFree(apCell);
       
  5444   apCell = 0;
       
  5445   releasePage(pPage);
       
  5446   pCur->iPage--;
       
  5447   rc = balance(pCur, 0);
       
  5448   
       
  5449   /*
       
  5450   ** Cleanup before returning.
       
  5451   */
       
  5452 balance_cleanup:
       
         /* This label is reached both by falling through after success and by
         ** goto on error.  On the success path apCell is already 0, and
         ** aSpace2 may still be 0 if an early step failed.
         ** NOTE(review): entries of apOld[] that were moved to apNew[] or freed
         ** above have been set to 0, so the calls below appear to rely on
         ** sqlite3PageFree(), sqlite3ScratchFree() and releasePage() accepting
         ** a null pointer -- confirm against their definitions. */
  5453   sqlite3PageFree(aSpace2);
       
  5454   sqlite3ScratchFree(apCell);
       
  5455   for(i=0; i<nOld; i++){
       
  5456     releasePage(apOld[i]);
       
  5457   }
       
  5458   for(i=0; i<nNew; i++){
       
  5459     releasePage(apNew[i]);
       
  5460   }
       
  5461 
       
  5462   /* releasePage(pParent); */
       
  5463   TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n",
       
  5464           pPage->pgno, nOld, nNew, nCell));
       
  5465 
       
  5466   return rc;
       
  5467 }
       
  5468 
       
  5469 /*
       
  5470 ** This routine is called for the root page of a btree when the root
       
  5471 ** page contains no cells.  This is an opportunity to make the tree
       
  5472 ** shallower by one level.
       
  5473 */
       
  5474 static int balance_shallower(BtCursor *pCur){
       
  5475   MemPage *pPage;              /* Root page of B-Tree */
       
  5476   MemPage *pChild;             /* The only child page of pPage */
       
  5477   Pgno pgnoChild;              /* Page number for pChild */
       
  5478   int rc = SQLITE_OK;          /* Return code from subprocedures */
       
  5479   BtShared *pBt;                  /* The main BTree structure */
       
  5480   int mxCellPerPage;           /* Maximum number of cells per page */
       
  5481   u8 **apCell;                 /* All cells from pages being balanced */
       
  5482   u16 *szCell;                 /* Local size of all cells */
       
  5483 
       
  5484   assert( pCur->iPage==0 );
       
  5485   pPage = pCur->apPage[0];
       
  5486 
       
  5487   assert( pPage->nCell==0 );
       
  5488   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
       
  5489   pBt = pPage->pBt;
       
  5490   mxCellPerPage = MX_CELL(pBt);
       
  5491   apCell = sqlite3Malloc( mxCellPerPage*(sizeof(u8*)+sizeof(u16)) );
       
  5492   if( apCell==0 ) return SQLITE_NOMEM;
       
  5493   szCell = (u16*)&apCell[mxCellPerPage];
       
  5494   if( pPage->leaf ){
       
  5495     /* The table is completely empty */
       
  5496     TRACE(("BALANCE: empty table %d\n", pPage->pgno));
       
  5497   }else{
       
  5498     /* The root page is empty but has one child.  Transfer the
       
  5499     ** information from that one child into the root page if it 
       
  5500     ** will fit.  This reduces the depth of the tree by one.
       
  5501     **
       
  5502     ** If the root page is page 1, it has less space available than
       
  5503     ** its child (due to the 100 byte header that occurs at the beginning
       
  5504     ** of the database fle), so it might not be able to hold all of the 
       
  5505     ** information currently contained in the child.  If this is the 
       
  5506     ** case, then do not do the transfer.  Leave page 1 empty except
       
  5507     ** for the right-pointer to the child page.  The child page becomes
       
  5508     ** the virtual root of the tree.
       
  5509     */
       
  5510     VVA_ONLY( pCur->pagesShuffled = 1 );
       
  5511     pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+8]);
       
  5512     assert( pgnoChild>0 );
       
  5513     assert( pgnoChild<=pagerPagecount(pPage->pBt->pPager) );
       
  5514     rc = sqlite3BtreeGetPage(pPage->pBt, pgnoChild, &pChild, 0);
       
  5515     if( rc ) goto end_shallow_balance;
       
  5516     if( pPage->pgno==1 ){
       
  5517       rc = sqlite3BtreeInitPage(pChild);
       
  5518       if( rc ) goto end_shallow_balance;
       
  5519       assert( pChild->nOverflow==0 );
       
  5520       if( pChild->nFree>=100 ){
       
  5521         /* The child information will fit on the root page, so do the
       
  5522         ** copy */
       
  5523         int i;
       
  5524         zeroPage(pPage, pChild->aData[0]);
       
  5525         for(i=0; i<pChild->nCell; i++){
       
  5526           apCell[i] = findCell(pChild,i);
       
  5527           szCell[i] = cellSizePtr(pChild, apCell[i]);
       
  5528         }
       
  5529         assemblePage(pPage, pChild->nCell, apCell, szCell);
       
  5530         /* Copy the right-pointer of the child to the parent. */
       
  5531         put4byte(&pPage->aData[pPage->hdrOffset+8], 
       
  5532             get4byte(&pChild->aData[pChild->hdrOffset+8]));
       
  5533         freePage(pChild);
       
  5534         TRACE(("BALANCE: child %d transfer to page 1\n", pChild->pgno));
       
  5535       }else{
       
  5536         /* The child has more information that will fit on the root.
       
  5537         ** The tree is already balanced.  Do nothing. */
       
  5538         TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno));
       
  5539       }
       
  5540     }else{
       
  5541       memcpy(pPage->aData, pChild->aData, pPage->pBt->usableSize);
       
  5542       pPage->isInit = 0;
       
  5543       rc = sqlite3BtreeInitPage(pPage);
       
  5544       assert( rc==SQLITE_OK );
       
  5545       freePage(pChild);
       
  5546       TRACE(("BALANCE: transfer child %d into root %d\n",
       
  5547               pChild->pgno, pPage->pgno));
       
  5548     }
       
  5549     assert( pPage->nOverflow==0 );
       
  5550     if( ISAUTOVACUUM ){
       
  5551       rc = setChildPtrmaps(pPage);
       
  5552     }
       
  5553     releasePage(pChild);
       
  5554   }
       
  5555 end_shallow_balance:
       
  5556   sqlite3_free(apCell);
       
  5557   return rc;
       
  5558 }
       
  5559 
       
  5560 
       
  5561 /*
       
  5562 ** The root page is overfull
       
  5563 **
       
  5564 ** When this happens, Create a new child page and copy the
       
  5565 ** contents of the root into the child.  Then make the root
       
  5566 ** page an empty page with rightChild pointing to the new
       
  5567 ** child.   Finally, call balance_internal() on the new child
       
  5568 ** to cause it to split.
       
  5569 */
       
  5570 static int balance_deeper(BtCursor *pCur){
       
  5571   int rc;             /* Return value from subprocedures */
       
  5572   MemPage *pPage;     /* Pointer to the root page */
       
  5573   MemPage *pChild;    /* Pointer to a new child page */
       
  5574   Pgno pgnoChild;     /* Page number of the new child page */
       
  5575   BtShared *pBt;         /* The BTree */
       
  5576   int usableSize;     /* Total usable size of a page */
       
  5577   u8 *data;           /* Content of the parent page */
       
  5578   u8 *cdata;          /* Content of the child page */
       
  5579   int hdr;            /* Offset to page header in parent */
       
  5580   int cbrk;           /* Offset to content of first cell in parent */
       
  5581 
       
  5582   assert( pCur->iPage==0 );
       
  5583   assert( pCur->apPage[0]->nOverflow>0 );
       
  5584 
       
  5585   VVA_ONLY( pCur->pagesShuffled = 1 );
       
  5586   pPage = pCur->apPage[0];
       
  5587   pBt = pPage->pBt;
       
  5588   assert( sqlite3_mutex_held(pBt->mutex) );
       
  5589   rc = allocateBtreePage(pBt, &pChild, &pgnoChild, pPage->pgno, 0);
       
  5590   if( rc ) return rc;
       
  5591   assert( sqlite3PagerIswriteable(pChild->pDbPage) );
       
  5592   usableSize = pBt->usableSize;
       
  5593   data = pPage->aData;
       
  5594   hdr = pPage->hdrOffset;
       
  5595   cbrk = get2byte(&data[hdr+5]);
       
  5596   cdata = pChild->aData;
       
  5597   memcpy(cdata, &data[hdr], pPage->cellOffset+2*pPage->nCell-hdr);
       
  5598   memcpy(&cdata[cbrk], &data[cbrk], usableSize-cbrk);
       
  5599   
       
  5600   rc = sqlite3BtreeInitPage(pChild);
       
  5601   if( rc==SQLITE_OK ){
       
  5602     int nCopy = pPage->nOverflow*sizeof(pPage->aOvfl[0]);
       
  5603     memcpy(pChild->aOvfl, pPage->aOvfl, nCopy);
       
  5604     pChild->nOverflow = pPage->nOverflow;
       
  5605     if( pChild->nOverflow ){
       
  5606       pChild->nFree = 0;
       
  5607     }
       
  5608     assert( pChild->nCell==pPage->nCell );
       
  5609     zeroPage(pPage, pChild->aData[0] & ~PTF_LEAF);
       
  5610     put4byte(&pPage->aData[pPage->hdrOffset+8], pgnoChild);
       
  5611     TRACE(("BALANCE: copy root %d into %d\n", pPage->pgno, pChild->pgno));
       
  5612     if( ISAUTOVACUUM ){
       
  5613       rc = ptrmapPut(pBt, pChild->pgno, PTRMAP_BTREE, pPage->pgno);
       
  5614       if( rc==SQLITE_OK ){
       
  5615         rc = setChildPtrmaps(pChild);
       
  5616       }
       
  5617     }
       
  5618   }
       
  5619 
       
  5620   if( rc==SQLITE_OK ){
       
  5621     pCur->iPage++;
       
  5622     pCur->apPage[1] = pChild;
       
  5623     pCur->aiIdx[0] = 0;
       
  5624     rc = balance_nonroot(pCur);
       
  5625   }else{
       
  5626     releasePage(pChild);
       
  5627   }
       
  5628 
       
  5629   return rc;
       
  5630 }
       
  5631 
       
  5632 /*
       
  5633 ** The page that pCur currently points to has just been modified in
       
  5634 ** some way. This function figures out if this modification means the
       
  5635 ** tree needs to be balanced, and if so calls the appropriate balancing 
       
  5636 ** routine.
       
  5637 ** 
       
  5638 ** Parameter isInsert is true if a new cell was just inserted into the
       
  5639 ** page, or false otherwise.
       
  5640 */
       
  5641 static int balance(BtCursor *pCur, int isInsert){
       
  5642   int rc = SQLITE_OK;
       
  5643   MemPage *pPage = pCur->apPage[pCur->iPage];
       
  5644 
       
  5645   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
       
  5646   if( pCur->iPage==0 ){
       
  5647     rc = sqlite3PagerWrite(pPage->pDbPage);
       
  5648     if( rc==SQLITE_OK && pPage->nOverflow>0 ){
       
  5649       rc = balance_deeper(pCur);
       
  5650     }
       
  5651     if( rc==SQLITE_OK && pPage->nCell==0 ){
       
  5652       rc = balance_shallower(pCur);
       
  5653     }
       
  5654   }else{
       
  5655     if( pPage->nOverflow>0 || 
       
  5656         (!isInsert && pPage->nFree>pPage->pBt->usableSize*2/3) ){
       
  5657       rc = balance_nonroot(pCur);
       
  5658     }
       
  5659   }
       
  5660   return rc;
       
  5661 }
       
  5662 
       
  5663 /*
       
  5664 ** This routine checks all cursors that point to table pgnoRoot.
       
  5665 ** If any of those cursors were opened with wrFlag==0 in a different
       
  5666 ** database connection (a database connection that shares the pager
       
  5667 ** cache with the current connection) and that other connection 
       
  5668 ** is not in the ReadUncommmitted state, then this routine returns 
       
  5669 ** SQLITE_LOCKED.
       
  5670 **
       
  5671 ** As well as cursors with wrFlag==0, cursors with wrFlag==1 and 
       
  5672 ** isIncrblobHandle==1 are also considered 'read' cursors. Incremental 
       
  5673 ** blob cursors are used for both reading and writing.
       
  5674 **
       
  5675 ** When pgnoRoot is the root page of an intkey table, this function is also
       
  5676 ** responsible for invalidating incremental blob cursors when the table row
       
  5677 ** on which they are opened is deleted or modified. Cursors are invalidated
       
  5678 ** according to the following rules:
       
  5679 **
       
  5680 **   1) When BtreeClearTable() is called to completely delete the contents
       
  5681 **      of a B-Tree table, pExclude is set to zero and parameter iRow is 
       
  5682 **      set to non-zero. In this case all incremental blob cursors open
       
  5683 **      on the table rooted at pgnoRoot are invalidated.
       
  5684 **
       
  5685 **   2) When BtreeInsert(), BtreeDelete() or BtreePutData() is called to 
       
  5686 **      modify a table row via an SQL statement, pExclude is set to the 
       
  5687 **      write cursor used to do the modification and parameter iRow is set
       
  5688 **      to the integer row id of the B-Tree entry being modified. Unless
       
  5689 **      pExclude is itself an incremental blob cursor, then all incremental
       
  5690 **      blob cursors open on row iRow of the B-Tree are invalidated.
       
  5691 **
       
  5692 **   3) If both pExclude and iRow are set to zero, no incremental blob 
       
  5693 **      cursors are invalidated.
       
  5694 */
       
  5695 static int checkReadLocks(
       
  5696   Btree *pBtree, 
       
  5697   Pgno pgnoRoot, 
       
  5698   BtCursor *pExclude,
       
  5699   i64 iRow
       
  5700 ){
       
  5701   BtCursor *p;
       
  5702   BtShared *pBt = pBtree->pBt;
       
  5703   sqlite3 *db = pBtree->db;
       
  5704   assert( sqlite3BtreeHoldsMutex(pBtree) );
       
  5705   for(p=pBt->pCursor; p; p=p->pNext){
       
  5706     if( p==pExclude ) continue;
       
  5707     if( p->pgnoRoot!=pgnoRoot ) continue;
       
  5708 #ifndef SQLITE_OMIT_INCRBLOB
       
  5709     if( p->isIncrblobHandle && ( 
       
  5710          (!pExclude && iRow)
       
  5711       || (pExclude && !pExclude->isIncrblobHandle && p->info.nKey==iRow)
       
  5712     )){
       
  5713       p->eState = CURSOR_INVALID;
       
  5714     }
       
  5715 #endif
       
  5716     if( p->eState!=CURSOR_VALID ) continue;
       
  5717     if( p->wrFlag==0 
       
  5718 #ifndef SQLITE_OMIT_INCRBLOB
       
  5719      || p->isIncrblobHandle
       
  5720 #endif
       
  5721     ){
       
  5722       sqlite3 *dbOther = p->pBtree->db;
       
  5723       if( dbOther==0 ||
       
  5724          (dbOther!=db && (dbOther->flags & SQLITE_ReadUncommitted)==0) ){
       
  5725         return SQLITE_LOCKED;
       
  5726       }
       
  5727     }
       
  5728   }
       
  5729   return SQLITE_OK;
       
  5730 }
       
  5731 
       
  5732 /*
       
  5733 ** Insert a new record into the BTree.  The key is given by (pKey,nKey)
       
  5734 ** and the data is given by (pData,nData).  The cursor is used only to
       
  5735 ** define what table the record should be inserted into.  The cursor
       
  5736 ** is left pointing at a random location.
       
  5737 **
       
  5738 ** For an INTKEY table, only the nKey value of the key is used.  pKey is
       
  5739 ** ignored.  For a ZERODATA table, the pData and nData are both ignored.
       
  5740 */
       
  5741 int sqlite3BtreeInsert(
       
  5742   BtCursor *pCur,                /* Insert data into the table of this cursor */
       
  5743   const void *pKey, i64 nKey,    /* The key of the new record */
       
  5744   const void *pData, int nData,  /* The data of the new record */
       
  5745   int nZero,                     /* Number of extra 0 bytes to append to data */
       
  5746   int appendBias                 /* True if this is likely an append */
       
  5747 ){
       
  5748   int rc;
       
  5749   int loc;
       
  5750   int szNew;
       
  5751   int idx;
       
  5752   MemPage *pPage;
       
  5753   Btree *p = pCur->pBtree;
       
  5754   BtShared *pBt = p->pBt;
       
  5755   unsigned char *oldCell;
       
  5756   unsigned char *newCell = 0;
       
  5757 
       
  5758   assert( cursorHoldsMutex(pCur) );
       
  5759   if( pBt->inTransaction!=TRANS_WRITE ){
       
  5760     /* Must start a transaction before doing an insert */
       
  5761     rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
       
  5762     return rc;
       
  5763   }
       
  5764   assert( !pBt->readOnly );
       
  5765   if( !pCur->wrFlag ){
       
  5766     return SQLITE_PERM;   /* Cursor not open for writing */
       
  5767   }
       
  5768   if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur, nKey) ){
       
  5769     return SQLITE_LOCKED; /* The table pCur points to has a read lock */
       
  5770   }
       
  5771   if( pCur->eState==CURSOR_FAULT ){
       
  5772     return pCur->skip;
       
  5773   }
       
  5774 
       
  5775   /* Save the positions of any other cursors open on this table */
       
  5776   clearCursorPosition(pCur);
       
  5777   if( 
       
  5778     SQLITE_OK!=(rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur)) ||
       
  5779     SQLITE_OK!=(rc = sqlite3BtreeMoveto(pCur, pKey, nKey, appendBias, &loc))
       
  5780   ){
       
  5781     return rc;
       
  5782   }
       
  5783 
       
  5784   pPage = pCur->apPage[pCur->iPage];
       
  5785   assert( pPage->intKey || nKey>=0 );
       
  5786   assert( pPage->leaf || !pPage->intKey );
       
  5787   TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
       
  5788           pCur->pgnoRoot, nKey, nData, pPage->pgno,
       
  5789           loc==0 ? "overwrite" : "new entry"));
       
  5790   assert( pPage->isInit );
       
  5791   allocateTempSpace(pBt);
       
  5792   newCell = pBt->pTmpSpace;
       
  5793   if( newCell==0 ) return SQLITE_NOMEM;
       
  5794   rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew);
       
  5795   if( rc ) goto end_insert;
       
  5796   assert( szNew==cellSizePtr(pPage, newCell) );
       
  5797   assert( szNew<=MX_CELL_SIZE(pBt) );
       
  5798   idx = pCur->aiIdx[pCur->iPage];
       
  5799   if( loc==0 && CURSOR_VALID==pCur->eState ){
       
  5800     u16 szOld;
       
  5801     assert( idx<pPage->nCell );
       
  5802     rc = sqlite3PagerWrite(pPage->pDbPage);
       
  5803     if( rc ){
       
  5804       goto end_insert;
       
  5805     }
       
  5806     oldCell = findCell(pPage, idx);
       
  5807     if( !pPage->leaf ){
       
  5808       memcpy(newCell, oldCell, 4);
       
  5809     }
       
  5810     szOld = cellSizePtr(pPage, oldCell);
       
  5811     rc = clearCell(pPage, oldCell);
       
  5812     if( rc ) goto end_insert;
       
  5813     dropCell(pPage, idx, szOld);
       
  5814   }else if( loc<0 && pPage->nCell>0 ){
       
  5815     assert( pPage->leaf );
       
  5816     idx = ++pCur->aiIdx[pCur->iPage];
       
  5817     pCur->info.nSize = 0;
       
  5818     pCur->validNKey = 0;
       
  5819   }else{
       
  5820     assert( pPage->leaf );
       
  5821   }
       
  5822   rc = insertCell(pPage, idx, newCell, szNew, 0, 0);
       
  5823   if( rc!=SQLITE_OK ) goto end_insert;
       
  5824   rc = balance(pCur, 1);
       
  5825   if( rc==SQLITE_OK ){
       
  5826     moveToRoot(pCur);
       
  5827   }
       
  5828 end_insert:
       
  5829   return rc;
       
  5830 }
       
  5831 
       
  5832 /*
       
  5833 ** Delete the entry that the cursor is pointing to.  The cursor
       
  5834 ** is left pointing at a arbitrary location.
       
  5835 */
       
  5836 int sqlite3BtreeDelete(BtCursor *pCur){
       
  5837   MemPage *pPage = pCur->apPage[pCur->iPage];
       
  5838   int idx;
       
  5839   unsigned char *pCell;
       
  5840   int rc;
       
  5841   Pgno pgnoChild = 0;
       
  5842   Btree *p = pCur->pBtree;
       
  5843   BtShared *pBt = p->pBt;
       
  5844 
       
  5845   assert( cursorHoldsMutex(pCur) );
       
  5846   assert( pPage->isInit );
       
  5847   if( pBt->inTransaction!=TRANS_WRITE ){
       
  5848     /* Must start a transaction before doing a delete */
       
  5849     rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
       
  5850     return rc;
       
  5851   }
       
  5852   assert( !pBt->readOnly );
       
  5853   if( pCur->eState==CURSOR_FAULT ){
       
  5854     return pCur->skip;
       
  5855   }
       
  5856   if( pCur->aiIdx[pCur->iPage]>=pPage->nCell ){
       
  5857     return SQLITE_ERROR;  /* The cursor is not pointing to anything */
       
  5858   }
       
  5859   if( !pCur->wrFlag ){
       
  5860     return SQLITE_PERM;   /* Did not open this cursor for writing */
       
  5861   }
       
  5862   if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur, pCur->info.nKey) ){
       
  5863     return SQLITE_LOCKED; /* The table pCur points to has a read lock */
       
  5864   }
       
  5865 
       
  5866   /* Restore the current cursor position (a no-op if the cursor is not in 
       
  5867   ** CURSOR_REQUIRESEEK state) and save the positions of any other cursors 
       
  5868   ** open on the same table. Then call sqlite3PagerWrite() on the page
       
  5869   ** that the entry will be deleted from.
       
  5870   */
       
  5871   if( 
       
  5872     (rc = restoreCursorPosition(pCur))!=0 ||
       
  5873     (rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur))!=0 ||
       
  5874     (rc = sqlite3PagerWrite(pPage->pDbPage))!=0
       
  5875   ){
       
  5876     return rc;
       
  5877   }
       
  5878 
       
  5879   /* Locate the cell within its page and leave pCell pointing to the
       
  5880   ** data. The clearCell() call frees any overflow pages associated with the
       
  5881   ** cell. The cell itself is still intact.
       
  5882   */
       
  5883   idx = pCur->aiIdx[pCur->iPage];
       
  5884   pCell = findCell(pPage, idx);
       
  5885   if( !pPage->leaf ){
       
  5886     pgnoChild = get4byte(pCell);
       
  5887   }
       
  5888   rc = clearCell(pPage, pCell);
       
  5889   if( rc ){
       
  5890     return rc;
       
  5891   }
       
  5892 
       
  5893   if( !pPage->leaf ){
       
  5894     /*
       
  5895     ** The entry we are about to delete is not a leaf so if we do not
       
  5896     ** do something we will leave a hole on an internal page.
       
  5897     ** We have to fill the hole by moving in a cell from a leaf.  The
       
  5898     ** next Cell after the one to be deleted is guaranteed to exist and
       
  5899     ** to be a leaf so we can use it.
       
  5900     */
       
  5901     BtCursor leafCur;
       
  5902     MemPage *pLeafPage;
       
  5903 
       
  5904     unsigned char *pNext;
       
  5905     int notUsed;
       
  5906     unsigned char *tempCell = 0;
       
  5907     assert( !pPage->intKey );
       
  5908     sqlite3BtreeGetTempCursor(pCur, &leafCur);
       
  5909     rc = sqlite3BtreeNext(&leafCur, &notUsed);
       
  5910     if( rc==SQLITE_OK ){
       
  5911       assert( leafCur.aiIdx[leafCur.iPage]==0 );
       
  5912       pLeafPage = leafCur.apPage[leafCur.iPage];
       
  5913       rc = sqlite3PagerWrite(pLeafPage->pDbPage);
       
  5914     }
       
  5915     if( rc==SQLITE_OK ){
       
  5916       int leafCursorInvalid = 0;
       
  5917       u16 szNext;
       
  5918       TRACE(("DELETE: table=%d delete internal from %d replace from leaf %d\n",
       
  5919          pCur->pgnoRoot, pPage->pgno, pLeafPage->pgno));
       
  5920       dropCell(pPage, idx, cellSizePtr(pPage, pCell));
       
  5921       pNext = findCell(pLeafPage, 0);
       
  5922       szNext = cellSizePtr(pLeafPage, pNext);
       
  5923       assert( MX_CELL_SIZE(pBt)>=szNext+4 );
       
  5924       allocateTempSpace(pBt);
       
  5925       tempCell = pBt->pTmpSpace;
       
  5926       if( tempCell==0 ){
       
  5927         rc = SQLITE_NOMEM;
       
  5928       }
       
  5929       if( rc==SQLITE_OK ){
       
  5930         rc = insertCell(pPage, idx, pNext-4, szNext+4, tempCell, 0);
       
  5931       }
       
  5932 
       
  5933 
       
  5934       /* The "if" statement in the next code block is critical.  The
       
  5935       ** slightest error in that statement would allow SQLite to operate
       
  5936       ** correctly most of the time but produce very rare failures.  To
       
  5937       ** guard against this, the following macros help to verify that
       
  5938       ** the "if" statement is well tested.
       
  5939       */
       
  5940       testcase( pPage->nOverflow==0 && pPage->nFree<pBt->usableSize*2/3 
       
  5941                  && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
       
  5942       testcase( pPage->nOverflow==0 && pPage->nFree==pBt->usableSize*2/3 
       
  5943                  && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
       
  5944       testcase( pPage->nOverflow==0 && pPage->nFree==pBt->usableSize*2/3+1 
       
  5945                  && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
       
  5946       testcase( pPage->nOverflow>0 && pPage->nFree<=pBt->usableSize*2/3
       
  5947                  && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
       
  5948       testcase( (pPage->nOverflow>0 || (pPage->nFree > pBt->usableSize*2/3))
       
  5949                  && pLeafPage->nFree+2+szNext == pBt->usableSize*2/3 );
       
  5950 
       
  5951 
       
  5952       if( (pPage->nOverflow>0 || (pPage->nFree > pBt->usableSize*2/3)) &&
       
  5953           (pLeafPage->nFree+2+szNext > pBt->usableSize*2/3)
       
  5954       ){
       
  5955         /* This branch is taken if the internal node is now either overflowing
       
  5956         ** or underfull and the leaf node will be underfull after the just cell 
       
  5957         ** copied to the internal node is deleted from it. This is a special
       
  5958         ** case because the call to balance() to correct the internal node
       
  5959         ** may change the tree structure and invalidate the contents of
       
  5960         ** the leafCur.apPage[] and leafCur.aiIdx[] arrays, which will be
       
  5961         ** used by the balance() required to correct the underfull leaf
       
  5962         ** node.
       
  5963         **
       
  5964         ** The formula used in the expression above are based on facets of
       
  5965         ** the SQLite file-format that do not change over time.
       
  5966         */
       
  5967         testcase( pPage->nFree==pBt->usableSize*2/3+1 );
       
  5968         testcase( pLeafPage->nFree+2+szNext==pBt->usableSize*2/3+1 );
       
  5969         leafCursorInvalid = 1;
       
  5970       }        
       
  5971 
       
  5972       if( rc==SQLITE_OK ){
       
  5973         put4byte(findOverflowCell(pPage, idx), pgnoChild);
       
  5974         VVA_ONLY( pCur->pagesShuffled = 0 );
       
  5975         rc = balance(pCur, 0);
       
  5976       }
       
  5977 
       
  5978       if( rc==SQLITE_OK && leafCursorInvalid ){
       
  5979         /* The leaf-node is now underfull and so the tree needs to be 
       
  5980         ** rebalanced. However, the balance() operation on the internal
       
  5981         ** node above may have modified the structure of the B-Tree and
       
  5982         ** so the current contents of leafCur.apPage[] and leafCur.aiIdx[]
       
  5983         ** may not be trusted.
       
  5984         **
       
  5985         ** It is not possible to copy the ancestry from pCur, as the same
       
  5986         ** balance() call has invalidated the pCur->apPage[] and aiIdx[]
       
  5987         ** arrays. 
       
  5988         **
       
  5989         ** The call to saveCursorPosition() below internally saves the 
       
  5990         ** key that leafCur is currently pointing to. Currently, there
       
  5991         ** are two copies of that key in the tree - one here on the leaf
       
  5992         ** page and one on some internal node in the tree. The copy on
       
  5993         ** the leaf node is always the next key in tree-order after the 
       
  5994         ** copy on the internal node. So, the call to sqlite3BtreeNext()
       
  5995         ** calls restoreCursorPosition() to point the cursor to the copy
       
  5996         ** stored on the internal node, then advances to the next entry,
       
  5997         ** which happens to be the copy of the key on the internal node.
       
  5998         ** Net effect: leafCur is pointing back to the duplicate cell
       
  5999         ** that needs to be removed, and the leafCur.apPage[] and
       
  6000         ** leafCur.aiIdx[] arrays are correct.
       
  6001         */
       
  6002         VVA_ONLY( Pgno leafPgno = pLeafPage->pgno );
       
  6003         rc = saveCursorPosition(&leafCur);
       
  6004         if( rc==SQLITE_OK ){
       
  6005           rc = sqlite3BtreeNext(&leafCur, &notUsed);
       
  6006         }
       
  6007         pLeafPage = leafCur.apPage[leafCur.iPage];
       
  6008         assert( rc!=SQLITE_OK || pLeafPage->pgno==leafPgno );
       
  6009         assert( rc!=SQLITE_OK || leafCur.aiIdx[leafCur.iPage]==0 );
       
  6010       }
       
  6011 
       
  6012       if( rc==SQLITE_OK ){
       
  6013         dropCell(pLeafPage, 0, szNext);
       
  6014         VVA_ONLY( leafCur.pagesShuffled = 0 );
       
  6015         rc = balance(&leafCur, 0);
       
  6016         assert( leafCursorInvalid || !leafCur.pagesShuffled
       
  6017                                    || !pCur->pagesShuffled );
       
  6018       }
       
  6019     }
       
  6020     sqlite3BtreeReleaseTempCursor(&leafCur);
       
  6021   }else{
       
  6022     TRACE(("DELETE: table=%d delete from leaf %d\n",
       
  6023        pCur->pgnoRoot, pPage->pgno));
       
  6024     dropCell(pPage, idx, cellSizePtr(pPage, pCell));
       
  6025     rc = balance(pCur, 0);
       
  6026   }
       
  6027   if( rc==SQLITE_OK ){
       
  6028     moveToRoot(pCur);
       
  6029   }
       
  6030   return rc;
       
  6031 }
       
  6032 
       
  6033 /*
       
  6034 ** Create a new BTree table.  Write into *piTable the page
       
  6035 ** number for the root page of the new table.
       
  6036 **
       
  6037 ** The type of table is determined by the flags parameter.  Only the
       
  6038 ** following values of flags are currently in use.  Other values for
       
  6039 ** flags might not work:
       
  6040 **
       
  6041 **     BTREE_INTKEY|BTREE_LEAFDATA     Used for SQL tables with rowid keys
       
  6042 **     BTREE_ZERODATA                  Used for SQL indices
       
  6043 */
       
  6044 static int btreeCreateTable(Btree *p, int *piTable, int flags){
       
  6045   BtShared *pBt = p->pBt;
       
  6046   MemPage *pRoot;
       
  6047   Pgno pgnoRoot;
       
  6048   int rc;
       
  6049 
       
  6050   assert( sqlite3BtreeHoldsMutex(p) );
       
  6051   if( pBt->inTransaction!=TRANS_WRITE ){
       
  6052     /* Must start a transaction first */
       
  6053     rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
       
  6054     return rc;
       
  6055   }
       
  6056   assert( !pBt->readOnly );
       
  6057 
       
  6058 #ifdef SQLITE_OMIT_AUTOVACUUM
       
  6059   rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
       
  6060   if( rc ){
       
  6061     return rc;
       
  6062   }
       
  6063 #else
       
  6064   if( pBt->autoVacuum ){
       
  6065     Pgno pgnoMove;      /* Move a page here to make room for the root-page */
       
  6066     MemPage *pPageMove; /* The page to move to. */
       
  6067 
       
  6068     /* Creating a new table may probably require moving an existing database
       
  6069     ** to make room for the new tables root page. In case this page turns
       
  6070     ** out to be an overflow page, delete all overflow page-map caches
       
  6071     ** held by open cursors.
       
  6072     */
       
  6073     invalidateAllOverflowCache(pBt);
       
  6074 
       
  6075     /* Read the value of meta[3] from the database to determine where the
       
  6076     ** root page of the new table should go. meta[3] is the largest root-page
       
  6077     ** created so far, so the new root-page is (meta[3]+1).
       
  6078     */
       
  6079     rc = sqlite3BtreeGetMeta(p, 4, &pgnoRoot);
       
  6080     if( rc!=SQLITE_OK ){
       
  6081       return rc;
       
  6082     }
       
  6083     pgnoRoot++;
       
  6084 
       
  6085     /* The new root-page may not be allocated on a pointer-map page, or the
       
  6086     ** PENDING_BYTE page.
       
  6087     */
       
  6088     while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
       
  6089         pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
       
  6090       pgnoRoot++;
       
  6091     }
       
  6092     assert( pgnoRoot>=3 );
       
  6093 
       
  6094     /* Allocate a page. The page that currently resides at pgnoRoot will
       
  6095     ** be moved to the allocated page (unless the allocated page happens
       
  6096     ** to reside at pgnoRoot).
       
  6097     */
       
  6098     rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1);
       
  6099     if( rc!=SQLITE_OK ){
       
  6100       return rc;
       
  6101     }
       
  6102 
       
  6103     if( pgnoMove!=pgnoRoot ){
       
  6104       /* pgnoRoot is the page that will be used for the root-page of
       
  6105       ** the new table (assuming an error did not occur). But we were
       
  6106       ** allocated pgnoMove. If required (i.e. if it was not allocated
       
  6107       ** by extending the file), the current page at position pgnoMove
       
  6108       ** is already journaled.
       
  6109       */
       
  6110       u8 eType;
       
  6111       Pgno iPtrPage;
       
  6112 
       
  6113       releasePage(pPageMove);
       
  6114 
       
  6115       /* Move the page currently at pgnoRoot to pgnoMove. */
       
  6116       rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0);
       
  6117       if( rc!=SQLITE_OK ){
       
  6118         return rc;
       
  6119       }
       
  6120       rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
       
  6121       if( rc!=SQLITE_OK || eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
       
  6122         releasePage(pRoot);
       
  6123         return rc;
       
  6124       }
       
  6125       assert( eType!=PTRMAP_ROOTPAGE );
       
  6126       assert( eType!=PTRMAP_FREEPAGE );
       
  6127       rc = sqlite3PagerWrite(pRoot->pDbPage);
       
  6128       if( rc!=SQLITE_OK ){
       
  6129         releasePage(pRoot);
       
  6130         return rc;
       
  6131       }
       
  6132       rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
       
  6133       releasePage(pRoot);
       
  6134 
       
  6135       /* Obtain the page at pgnoRoot */
       
  6136       if( rc!=SQLITE_OK ){
       
  6137         return rc;
       
  6138       }
       
  6139       rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0);
       
  6140       if( rc!=SQLITE_OK ){
       
  6141         return rc;
       
  6142       }
       
  6143       rc = sqlite3PagerWrite(pRoot->pDbPage);
       
  6144       if( rc!=SQLITE_OK ){
       
  6145         releasePage(pRoot);
       
  6146         return rc;
       
  6147       }
       
  6148     }else{
       
  6149       pRoot = pPageMove;
       
  6150     } 
       
  6151 
       
  6152     /* Update the pointer-map and meta-data with the new root-page number. */
       
  6153     rc = ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0);
       
  6154     if( rc ){
       
  6155       releasePage(pRoot);
       
  6156       return rc;
       
  6157     }
       
  6158     rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
       
  6159     if( rc ){
       
  6160       releasePage(pRoot);
       
  6161       return rc;
       
  6162     }
       
  6163 
       
  6164   }else{
       
  6165     rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
       
  6166     if( rc ) return rc;
       
  6167   }
       
  6168 #endif
       
  6169   assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
       
  6170   zeroPage(pRoot, flags | PTF_LEAF);
       
  6171   sqlite3PagerUnref(pRoot->pDbPage);
       
  6172   *piTable = (int)pgnoRoot;
       
  6173   return SQLITE_OK;
       
  6174 }
       
  6175 int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
       
  6176   int rc;
       
  6177   sqlite3BtreeEnter(p);
       
  6178   p->pBt->db = p->db;
       
  6179   rc = btreeCreateTable(p, piTable, flags);
       
  6180   sqlite3BtreeLeave(p);
       
  6181   return rc;
       
  6182 }
       
  6183 
       
  6184 /*
       
  6185 ** Erase the given database page and all its children.  Return
       
  6186 ** the page to the freelist.
       
  6187 */
       
  6188 static int clearDatabasePage(
       
  6189   BtShared *pBt,           /* The BTree that contains the table */
       
  6190   Pgno pgno,            /* Page number to clear */
       
  6191   MemPage *pParent,     /* Parent page.  NULL for the root */
       
  6192   int freePageFlag      /* Deallocate page if true */
       
  6193 ){
       
  6194   MemPage *pPage = 0;
       
  6195   int rc;
       
  6196   unsigned char *pCell;
       
  6197   int i;
       
  6198 
       
  6199   assert( sqlite3_mutex_held(pBt->mutex) );
       
  6200   if( pgno>pagerPagecount(pBt->pPager) ){
       
  6201     return SQLITE_CORRUPT_BKPT;
       
  6202   }
       
  6203 
       
  6204   rc = getAndInitPage(pBt, pgno, &pPage);
       
  6205   if( rc ) goto cleardatabasepage_out;
       
  6206   for(i=0; i<pPage->nCell; i++){
       
  6207     pCell = findCell(pPage, i);
       
  6208     if( !pPage->leaf ){
       
  6209       rc = clearDatabasePage(pBt, get4byte(pCell), pPage, 1);
       
  6210       if( rc ) goto cleardatabasepage_out;
       
  6211     }
       
  6212     rc = clearCell(pPage, pCell);
       
  6213     if( rc ) goto cleardatabasepage_out;
       
  6214   }
       
  6215   if( !pPage->leaf ){
       
  6216     rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), pPage, 1);
       
  6217     if( rc ) goto cleardatabasepage_out;
       
  6218   }
       
  6219   if( freePageFlag ){
       
  6220     rc = freePage(pPage);
       
  6221   }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
       
  6222     zeroPage(pPage, pPage->aData[0] | PTF_LEAF);
       
  6223   }
       
  6224 
       
  6225 cleardatabasepage_out:
       
  6226   releasePage(pPage);
       
  6227   return rc;
       
  6228 }
       
  6229 
       
  6230 /*
       
  6231 ** Delete all information from a single table in the database.  iTable is
       
  6232 ** the page number of the root of the table.  After this routine returns,
       
  6233 ** the root page is empty, but still exists.
       
  6234 **
       
  6235 ** This routine will fail with SQLITE_LOCKED if there are any open
       
  6236 ** read cursors on the table.  Open write cursors are moved to the
       
  6237 ** root of the table.
       
  6238 */
       
  6239 int sqlite3BtreeClearTable(Btree *p, int iTable){
       
  6240   int rc;
       
  6241   BtShared *pBt = p->pBt;
       
  6242   sqlite3BtreeEnter(p);
       
  6243   pBt->db = p->db;
       
  6244   if( p->inTrans!=TRANS_WRITE ){
       
  6245     rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
       
  6246   }else if( (rc = checkReadLocks(p, iTable, 0, 1))!=SQLITE_OK ){
       
  6247     /* nothing to do */
       
  6248   }else if( SQLITE_OK!=(rc = saveAllCursors(pBt, iTable, 0)) ){
       
  6249     /* nothing to do */
       
  6250   }else{
       
  6251     rc = clearDatabasePage(pBt, (Pgno)iTable, 0, 0);
       
  6252   }
       
  6253   sqlite3BtreeLeave(p);
       
  6254   return rc;
       
  6255 }
       
  6256 
       
  6257 /*
       
  6258 ** Erase all information in a table and add the root of the table to
       
  6259 ** the freelist.  Except, the root of the principle table (the one on
       
  6260 ** page 1) is never added to the freelist.
       
  6261 **
       
  6262 ** This routine will fail with SQLITE_LOCKED if there are any open
       
  6263 ** cursors on the table.
       
  6264 **
       
  6265 ** If AUTOVACUUM is enabled and the page at iTable is not the last
       
  6266 ** root page in the database file, then the last root page 
       
  6267 ** in the database file is moved into the slot formerly occupied by
       
  6268 ** iTable and that last slot formerly occupied by the last root page
       
  6269 ** is added to the freelist instead of iTable.  In this say, all
       
  6270 ** root pages are kept at the beginning of the database file, which
       
  6271 ** is necessary for AUTOVACUUM to work right.  *piMoved is set to the 
       
  6272 ** page number that used to be the last root page in the file before
       
  6273 ** the move.  If no page gets moved, *piMoved is set to 0.
       
  6274 ** The last root page is recorded in meta[3] and the value of
       
  6275 ** meta[3] is updated by this procedure.
       
  6276 */
       
  6277 static int btreeDropTable(Btree *p, int iTable, int *piMoved){
       
  6278   int rc;
       
  6279   MemPage *pPage = 0;
       
  6280   BtShared *pBt = p->pBt;
       
  6281 
       
  6282   assert( sqlite3BtreeHoldsMutex(p) );
       
  6283   if( p->inTrans!=TRANS_WRITE ){
       
  6284     return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
       
  6285   }
       
  6286 
       
  6287   /* It is illegal to drop a table if any cursors are open on the
       
  6288   ** database. This is because in auto-vacuum mode the backend may
       
  6289   ** need to move another root-page to fill a gap left by the deleted
       
  6290   ** root page. If an open cursor was using this page a problem would 
       
  6291   ** occur.
       
  6292   */
       
  6293   if( pBt->pCursor ){
       
  6294     return SQLITE_LOCKED;
       
  6295   }
       
  6296 
       
  6297   rc = sqlite3BtreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
       
  6298   if( rc ) return rc;
       
  6299   rc = sqlite3BtreeClearTable(p, iTable);
       
  6300   if( rc ){
       
  6301     releasePage(pPage);
       
  6302     return rc;
       
  6303   }
       
  6304 
       
  6305   *piMoved = 0;
       
  6306 
       
  6307   if( iTable>1 ){
       
  6308 #ifdef SQLITE_OMIT_AUTOVACUUM
       
  6309     rc = freePage(pPage);
       
  6310     releasePage(pPage);
       
  6311 #else
       
  6312     if( pBt->autoVacuum ){
       
  6313       Pgno maxRootPgno;
       
  6314       rc = sqlite3BtreeGetMeta(p, 4, &maxRootPgno);
       
  6315       if( rc!=SQLITE_OK ){
       
  6316         releasePage(pPage);
       
  6317         return rc;
       
  6318       }
       
  6319 
       
  6320       if( iTable==maxRootPgno ){
       
  6321         /* If the table being dropped is the table with the largest root-page
       
  6322         ** number in the database, put the root page on the free list. 
       
  6323         */
       
  6324         rc = freePage(pPage);
       
  6325         releasePage(pPage);
       
  6326         if( rc!=SQLITE_OK ){
       
  6327           return rc;
       
  6328         }
       
  6329       }else{
       
  6330         /* The table being dropped does not have the largest root-page
       
  6331         ** number in the database. So move the page that does into the 
       
  6332         ** gap left by the deleted root-page.
       
  6333         */
       
  6334         MemPage *pMove;
       
  6335         releasePage(pPage);
       
  6336         rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0);
       
  6337         if( rc!=SQLITE_OK ){
       
  6338           return rc;
       
  6339         }
       
  6340         rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
       
  6341         releasePage(pMove);
       
  6342         if( rc!=SQLITE_OK ){
       
  6343           return rc;
       
  6344         }
       
  6345         rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0);
       
  6346         if( rc!=SQLITE_OK ){
       
  6347           return rc;
       
  6348         }
       
  6349         rc = freePage(pMove);
       
  6350         releasePage(pMove);
       
  6351         if( rc!=SQLITE_OK ){
       
  6352           return rc;
       
  6353         }
       
  6354         *piMoved = maxRootPgno;
       
  6355       }
       
  6356 
       
  6357       /* Set the new 'max-root-page' value in the database header. This
       
  6358       ** is the old value less one, less one more if that happens to
       
  6359       ** be a root-page number, less one again if that is the
       
  6360       ** PENDING_BYTE_PAGE.
       
  6361       */
       
  6362       maxRootPgno--;
       
  6363       if( maxRootPgno==PENDING_BYTE_PAGE(pBt) ){
       
  6364         maxRootPgno--;
       
  6365       }
       
  6366       if( maxRootPgno==PTRMAP_PAGENO(pBt, maxRootPgno) ){
       
  6367         maxRootPgno--;
       
  6368       }
       
  6369       assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
       
  6370 
       
  6371       rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
       
  6372     }else{
       
  6373       rc = freePage(pPage);
       
  6374       releasePage(pPage);
       
  6375     }
       
  6376 #endif
       
  6377   }else{
       
  6378     /* If sqlite3BtreeDropTable was called on page 1. */
       
  6379     zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
       
  6380     releasePage(pPage);
       
  6381   }
       
  6382   return rc;  
       
  6383 }
       
  6384 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
       
  6385   int rc;
       
  6386   sqlite3BtreeEnter(p);
       
  6387   p->pBt->db = p->db;
       
  6388   rc = btreeDropTable(p, iTable, piMoved);
       
  6389   sqlite3BtreeLeave(p);
       
  6390   return rc;
       
  6391 }
       
  6392 
       
  6393 
       
  6394 /*
       
  6395 ** Read the meta-information out of a database file.  Meta[0]
       
  6396 ** is the number of free pages currently in the database.  Meta[1]
       
  6397 ** through meta[15] are available for use by higher layers.  Meta[0]
       
  6398 ** is read-only, the others are read/write.
       
  6399 ** 
       
  6400 ** The schema layer numbers meta values differently.  At the schema
       
  6401 ** layer (and the SetCookie and ReadCookie opcodes) the number of
       
  6402 ** free pages is not visible.  So Cookie[0] is the same as Meta[1].
       
  6403 */
       
  6404 int sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
       
  6405   DbPage *pDbPage;
       
  6406   int rc;
       
  6407   unsigned char *pP1;
       
  6408   BtShared *pBt = p->pBt;
       
  6409 
       
  6410   sqlite3BtreeEnter(p);
       
  6411   pBt->db = p->db;
       
  6412 
       
  6413   /* Reading a meta-data value requires a read-lock on page 1 (and hence
       
  6414   ** the sqlite_master table. We grab this lock regardless of whether or
       
  6415   ** not the SQLITE_ReadUncommitted flag is set (the table rooted at page
       
  6416   ** 1 is treated as a special case by queryTableLock() and lockTable()).
       
  6417   */
       
  6418   rc = queryTableLock(p, 1, READ_LOCK);
       
  6419   if( rc!=SQLITE_OK ){
       
  6420     sqlite3BtreeLeave(p);
       
  6421     return rc;
       
  6422   }
       
  6423 
       
  6424   assert( idx>=0 && idx<=15 );
       
  6425   if( pBt->pPage1 ){
       
  6426     /* The b-tree is already holding a reference to page 1 of the database
       
  6427     ** file. In this case the required meta-data value can be read directly
       
  6428     ** from the page data of this reference. This is slightly faster than
       
  6429     ** requesting a new reference from the pager layer.
       
  6430     */
       
  6431     pP1 = (unsigned char *)pBt->pPage1->aData;
       
  6432   }else{
       
  6433     /* The b-tree does not have a reference to page 1 of the database file.
       
  6434     ** Obtain one from the pager layer.
       
  6435     */
       
  6436     rc = sqlite3PagerGet(pBt->pPager, 1, &pDbPage);
       
  6437     if( rc ){
       
  6438       sqlite3BtreeLeave(p);
       
  6439       return rc;
       
  6440     }
       
  6441     pP1 = (unsigned char *)sqlite3PagerGetData(pDbPage);
       
  6442   }
       
  6443   *pMeta = get4byte(&pP1[36 + idx*4]);
       
  6444 
       
  6445   /* If the b-tree is not holding a reference to page 1, then one was 
       
  6446   ** requested from the pager layer in the above block. Release it now.
       
  6447   */
       
  6448   if( !pBt->pPage1 ){
       
  6449     sqlite3PagerUnref(pDbPage);
       
  6450   }
       
  6451 
       
  6452   /* If autovacuumed is disabled in this build but we are trying to 
       
  6453   ** access an autovacuumed database, then make the database readonly. 
       
  6454   */
       
  6455 #ifdef SQLITE_OMIT_AUTOVACUUM
       
  6456   if( idx==4 && *pMeta>0 ) pBt->readOnly = 1;
       
  6457 #endif
       
  6458 
       
  6459   /* Grab the read-lock on page 1. */
       
  6460   rc = lockTable(p, 1, READ_LOCK);
       
  6461   sqlite3BtreeLeave(p);
       
  6462   return rc;
       
  6463 }
       
  6464 
       
  6465 /*
       
  6466 ** Write meta-information back into the database.  Meta[0] is
       
  6467 ** read-only and may not be written.
       
  6468 */
       
  6469 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
       
  6470   BtShared *pBt = p->pBt;
       
  6471   unsigned char *pP1;
       
  6472   int rc;
       
  6473   assert( idx>=1 && idx<=15 );
       
  6474   sqlite3BtreeEnter(p);
       
  6475   pBt->db = p->db;
       
  6476   if( p->inTrans!=TRANS_WRITE ){
       
  6477     rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
       
  6478   }else{
       
  6479     assert( pBt->pPage1!=0 );
       
  6480     pP1 = pBt->pPage1->aData;
       
  6481     rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
       
  6482     if( rc==SQLITE_OK ){
       
  6483       put4byte(&pP1[36 + idx*4], iMeta);
       
  6484 #ifndef SQLITE_OMIT_AUTOVACUUM
       
  6485       if( idx==7 ){
       
  6486         assert( pBt->autoVacuum || iMeta==0 );
       
  6487         assert( iMeta==0 || iMeta==1 );
       
  6488         pBt->incrVacuum = iMeta;
       
  6489       }
       
  6490 #endif
       
  6491     }
       
  6492   }
       
  6493   sqlite3BtreeLeave(p);
       
  6494   return rc;
       
  6495 }
       
  6496 
       
  6497 /*
       
  6498 ** Return the flag byte at the beginning of the page that the cursor
       
  6499 ** is currently pointing to.
       
  6500 */
       
  6501 int sqlite3BtreeFlags(BtCursor *pCur){
       
  6502   /* TODO: What about CURSOR_REQUIRESEEK state? Probably need to call
       
  6503   ** restoreCursorPosition() here.
       
  6504   */
       
  6505   MemPage *pPage;
       
  6506   restoreCursorPosition(pCur);
       
  6507   pPage = pCur->apPage[pCur->iPage];
       
  6508   assert( cursorHoldsMutex(pCur) );
       
  6509   assert( pPage->pBt==pCur->pBt );
       
  6510   return pPage ? pPage->aData[pPage->hdrOffset] : 0;
       
  6511 }
       
  6512 
       
  6513 
       
  6514 /*
       
  6515 ** Return the pager associated with a BTree.  This routine is used for
       
  6516 ** testing and debugging only.
       
  6517 */
       
  6518 Pager *sqlite3BtreePager(Btree *p){
       
  6519   return p->pBt->pPager;
       
  6520 }
       
  6521 
       
  6522 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
       
  6523 /*
       
  6524 ** Append a message to the error message string.
       
  6525 */
       
  6526 static void checkAppendMsg(
       
  6527   IntegrityCk *pCheck,
       
  6528   char *zMsg1,
       
  6529   const char *zFormat,
       
  6530   ...
       
  6531 ){
       
  6532   va_list ap;
       
  6533   if( !pCheck->mxErr ) return;
       
  6534   pCheck->mxErr--;
       
  6535   pCheck->nErr++;
       
  6536   va_start(ap, zFormat);
       
  6537   if( pCheck->errMsg.nChar ){
       
  6538     sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1);
       
  6539   }
       
  6540   if( zMsg1 ){
       
  6541     sqlite3StrAccumAppend(&pCheck->errMsg, zMsg1, -1);
       
  6542   }
       
  6543   sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap);
       
  6544   va_end(ap);
       
  6545   if( pCheck->errMsg.mallocFailed ){
       
  6546     pCheck->mallocFailed = 1;
       
  6547   }
       
  6548 }
       
  6549 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
       
  6550 
       
  6551 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
       
  6552 /*
       
  6553 ** Add 1 to the reference count for page iPage.  If this is the second
       
  6554 ** reference to the page, add an error message to pCheck->zErrMsg.
       
  6555 ** Return 1 if there are 2 ore more references to the page and 0 if
       
  6556 ** if this is the first reference to the page.
       
  6557 **
       
  6558 ** Also check that the page number is in bounds.
       
  6559 */
       
  6560 static int checkRef(IntegrityCk *pCheck, int iPage, char *zContext){
       
  6561   if( iPage==0 ) return 1;
       
  6562   if( iPage>pCheck->nPage || iPage<0 ){
       
  6563     checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage);
       
  6564     return 1;
       
  6565   }
       
  6566   if( pCheck->anRef[iPage]==1 ){
       
  6567     checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage);
       
  6568     return 1;
       
  6569   }
       
  6570   return  (pCheck->anRef[iPage]++)>1;
       
  6571 }
       
  6572 
       
  6573 #ifndef SQLITE_OMIT_AUTOVACUUM
       
  6574 /*
       
  6575 ** Check that the entry in the pointer-map for page iChild maps to 
       
  6576 ** page iParent, pointer type ptrType. If not, append an error message
       
  6577 ** to pCheck.
       
  6578 */
       
  6579 static void checkPtrmap(
       
  6580   IntegrityCk *pCheck,   /* Integrity check context */
       
  6581   Pgno iChild,           /* Child page number */
       
  6582   u8 eType,              /* Expected pointer map type */
       
  6583   Pgno iParent,          /* Expected pointer map parent page number */
       
  6584   char *zContext         /* Context description (used for error msg) */
       
  6585 ){
       
  6586   int rc;
       
  6587   u8 ePtrmapType;
       
  6588   Pgno iPtrmapParent;
       
  6589 
       
  6590   rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
       
  6591   if( rc!=SQLITE_OK ){
       
  6592     checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild);
       
  6593     return;
       
  6594   }
       
  6595 
       
  6596   if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
       
  6597     checkAppendMsg(pCheck, zContext, 
       
  6598       "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)", 
       
  6599       iChild, eType, iParent, ePtrmapType, iPtrmapParent);
       
  6600   }
       
  6601 }
       
  6602 #endif
       
  6603 
       
  6604 /*
       
  6605 ** Check the integrity of the freelist or of an overflow page list.
       
  6606 ** Verify that the number of pages on the list is N.
       
  6607 */
       
  6608 static void checkList(
       
  6609   IntegrityCk *pCheck,  /* Integrity checking context */
       
  6610   int isFreeList,       /* True for a freelist.  False for overflow page list */
       
  6611   int iPage,            /* Page number for first page in the list */
       
  6612   int N,                /* Expected number of pages in the list */
       
  6613   char *zContext        /* Context for error messages */
       
  6614 ){
       
  6615   int i;
       
  6616   int expected = N;
       
  6617   int iFirst = iPage;
       
  6618   while( N-- > 0 && pCheck->mxErr ){
       
  6619     DbPage *pOvflPage;
       
  6620     unsigned char *pOvflData;
       
  6621     if( iPage<1 ){
       
  6622       checkAppendMsg(pCheck, zContext,
       
  6623          "%d of %d pages missing from overflow list starting at %d",
       
  6624           N+1, expected, iFirst);
       
  6625       break;
       
  6626     }
       
  6627     if( checkRef(pCheck, iPage, zContext) ) break;
       
  6628     if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){
       
  6629       checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage);
       
  6630       break;
       
  6631     }
       
  6632     pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
       
  6633     if( isFreeList ){
       
  6634       int n = get4byte(&pOvflData[4]);
       
  6635 #ifndef SQLITE_OMIT_AUTOVACUUM
       
  6636       if( pCheck->pBt->autoVacuum ){
       
  6637         checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext);
       
  6638       }
       
  6639 #endif
       
  6640       if( n>pCheck->pBt->usableSize/4-2 ){
       
  6641         checkAppendMsg(pCheck, zContext,
       
  6642            "freelist leaf count too big on page %d", iPage);
       
  6643         N--;
       
  6644       }else{
       
  6645         for(i=0; i<n; i++){
       
  6646           Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
       
  6647 #ifndef SQLITE_OMIT_AUTOVACUUM
       
  6648           if( pCheck->pBt->autoVacuum ){
       
  6649             checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext);
       
  6650           }
       
  6651 #endif
       
  6652           checkRef(pCheck, iFreePage, zContext);
       
  6653         }
       
  6654         N -= n;
       
  6655       }
       
  6656     }
       
  6657 #ifndef SQLITE_OMIT_AUTOVACUUM
       
  6658     else{
       
  6659       /* If this database supports auto-vacuum and iPage is not the last
       
  6660       ** page in this overflow list, check that the pointer-map entry for
       
  6661       ** the following page matches iPage.
       
  6662       */
       
  6663       if( pCheck->pBt->autoVacuum && N>0 ){
       
  6664         i = get4byte(pOvflData);
       
  6665         checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext);
       
  6666       }
       
  6667     }
       
  6668 #endif
       
  6669     iPage = get4byte(pOvflData);
       
  6670     sqlite3PagerUnref(pOvflPage);
       
  6671   }
       
  6672 }
       
  6673 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
       
  6674 
       
  6675 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
       
  6676 /*
       
  6677 ** Do various sanity checks on a single page of a tree.  Return
       
  6678 ** the tree depth.  Root pages return 0.  Parents of root pages
       
  6679 ** return 1, and so forth.
       
  6680 ** 
       
  6681 ** These checks are done:
       
  6682 **
       
  6683 **      1.  Make sure that cells and freeblocks do not overlap
       
  6684 **          but combine to completely cover the page.
       
  6685 **  NO  2.  Make sure cell keys are in order.
       
  6686 **  NO  3.  Make sure no key is less than or equal to zLowerBound.
       
  6687 **  NO  4.  Make sure no key is greater than or equal to zUpperBound.
       
  6688 **      5.  Check the integrity of overflow pages.
       
  6689 **      6.  Recursively call checkTreePage on all children.
       
  6690 **      7.  Verify that the depth of all children is the same.
       
  6691 **      8.  Make sure this page is at least 33% full or else it is
       
  6692 **          the root of the tree.
       
  6693 */
       
  6694 static int checkTreePage(
       
  6695   IntegrityCk *pCheck,  /* Context for the sanity check */
       
  6696   int iPage,            /* Page number of the page to check */
       
  6697   MemPage *pParent,     /* Parent page */
       
  6698   char *zParentContext  /* Parent context */
       
  6699 ){
       
  6700   MemPage *pPage;
       
  6701   int i, rc, depth, d2, pgno, cnt;
       
  6702   int hdr, cellStart;
       
  6703   int nCell;
       
  6704   u8 *data;
       
  6705   BtShared *pBt;
       
  6706   int usableSize;
       
  6707   char zContext[100];
       
  6708   char *hit;
       
  6709 
       
  6710   sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage);
       
  6711 
       
  6712   /* Check that the page exists
       
  6713   */
       
  6714   pBt = pCheck->pBt;
       
  6715   usableSize = pBt->usableSize;
       
  6716   if( iPage==0 ) return 0;
       
  6717   if( checkRef(pCheck, iPage, zParentContext) ) return 0;
       
  6718   if( (rc = sqlite3BtreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
       
  6719     checkAppendMsg(pCheck, zContext,
       
  6720        "unable to get the page. error code=%d", rc);
       
  6721     return 0;
       
  6722   }
       
  6723   if( (rc = sqlite3BtreeInitPage(pPage))!=0 ){
       
  6724     checkAppendMsg(pCheck, zContext, 
       
  6725                    "sqlite3BtreeInitPage() returns error code %d", rc);
       
  6726     releasePage(pPage);
       
  6727     return 0;
       
  6728   }
       
  6729 
       
  6730   /* Check out all the cells.
       
  6731   */
       
  6732   depth = 0;
       
  6733   for(i=0; i<pPage->nCell && pCheck->mxErr; i++){
       
  6734     u8 *pCell;
       
  6735     int sz;
       
  6736     CellInfo info;
       
  6737 
       
  6738     /* Check payload overflow pages
       
  6739     */
       
  6740     sqlite3_snprintf(sizeof(zContext), zContext,
       
  6741              "On tree page %d cell %d: ", iPage, i);
       
  6742     pCell = findCell(pPage,i);
       
  6743     sqlite3BtreeParseCellPtr(pPage, pCell, &info);
       
  6744     sz = info.nData;
       
  6745     if( !pPage->intKey ) sz += info.nKey;
       
  6746     assert( sz==info.nPayload );
       
  6747     if( sz>info.nLocal ){
       
  6748       int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4);
       
  6749       Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
       
  6750 #ifndef SQLITE_OMIT_AUTOVACUUM
       
  6751       if( pBt->autoVacuum ){
       
  6752         checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext);
       
  6753       }
       
  6754 #endif
       
  6755       checkList(pCheck, 0, pgnoOvfl, nPage, zContext);
       
  6756     }
       
  6757 
       
  6758     /* Check sanity of left child page.
       
  6759     */
       
  6760     if( !pPage->leaf ){
       
  6761       pgno = get4byte(pCell);
       
  6762 #ifndef SQLITE_OMIT_AUTOVACUUM
       
  6763       if( pBt->autoVacuum ){
       
  6764         checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
       
  6765       }
       
  6766 #endif
       
  6767       d2 = checkTreePage(pCheck,pgno,pPage,zContext);
       
  6768       if( i>0 && d2!=depth ){
       
  6769         checkAppendMsg(pCheck, zContext, "Child page depth differs");
       
  6770       }
       
  6771       depth = d2;
       
  6772     }
       
  6773   }
       
  6774   if( !pPage->leaf ){
       
  6775     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
       
  6776     sqlite3_snprintf(sizeof(zContext), zContext, 
       
  6777                      "On page %d at right child: ", iPage);
       
  6778 #ifndef SQLITE_OMIT_AUTOVACUUM
       
  6779     if( pBt->autoVacuum ){
       
  6780       checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, 0);
       
  6781     }
       
  6782 #endif
       
  6783     checkTreePage(pCheck, pgno, pPage, zContext);
       
  6784   }
       
  6785  
       
  6786   /* Check for complete coverage of the page
       
  6787   */
       
  6788   data = pPage->aData;
       
  6789   hdr = pPage->hdrOffset;
       
  6790   hit = sqlite3PageMalloc( pBt->pageSize );
       
  6791   if( hit==0 ){
       
  6792     pCheck->mallocFailed = 1;
       
  6793   }else{
       
  6794     memset(hit, 0, usableSize );
       
  6795     memset(hit, 1, get2byte(&data[hdr+5]));
       
  6796     nCell = get2byte(&data[hdr+3]);
       
  6797     cellStart = hdr + 12 - 4*pPage->leaf;
       
  6798     for(i=0; i<nCell; i++){
       
  6799       int pc = get2byte(&data[cellStart+i*2]);
       
  6800       u16 size = 1024;
       
  6801       int j;
       
  6802       if( pc<=usableSize ){
       
  6803         size = cellSizePtr(pPage, &data[pc]);
       
  6804       }
       
  6805       if( (pc+size-1)>=usableSize || pc<0 ){
       
  6806         checkAppendMsg(pCheck, 0, 
       
  6807             "Corruption detected in cell %d on page %d",i,iPage,0);
       
  6808       }else{
       
  6809         for(j=pc+size-1; j>=pc; j--) hit[j]++;
       
  6810       }
       
  6811     }
       
  6812     for(cnt=0, i=get2byte(&data[hdr+1]); i>0 && i<usableSize && cnt<10000; 
       
  6813            cnt++){
       
  6814       int size = get2byte(&data[i+2]);
       
  6815       int j;
       
  6816       if( (i+size-1)>=usableSize || i<0 ){
       
  6817         checkAppendMsg(pCheck, 0,  
       
  6818             "Corruption detected in cell %d on page %d",i,iPage,0);
       
  6819       }else{
       
  6820         for(j=i+size-1; j>=i; j--) hit[j]++;
       
  6821       }
       
  6822       i = get2byte(&data[i]);
       
  6823     }
       
  6824     for(i=cnt=0; i<usableSize; i++){
       
  6825       if( hit[i]==0 ){
       
  6826         cnt++;
       
  6827       }else if( hit[i]>1 ){
       
  6828         checkAppendMsg(pCheck, 0,
       
  6829           "Multiple uses for byte %d of page %d", i, iPage);
       
  6830         break;
       
  6831       }
       
  6832     }
       
  6833     if( cnt!=data[hdr+7] ){
       
  6834       checkAppendMsg(pCheck, 0, 
       
  6835           "Fragmented space is %d byte reported as %d on page %d",
       
  6836           cnt, data[hdr+7], iPage);
       
  6837     }
       
  6838   }
       
  6839   sqlite3PageFree(hit);
       
  6840 
       
  6841   releasePage(pPage);
       
  6842   return depth+1;
       
  6843 }
       
  6844 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
       
  6845 
       
  6846 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
       
  6847 /*
       
  6848 ** This routine does a complete check of the given BTree file.  aRoot[] is
       
  6849 ** an array of pages numbers were each page number is the root page of
       
  6850 ** a table.  nRoot is the number of entries in aRoot.
       
  6851 **
       
  6852 ** Write the number of error seen in *pnErr.  Except for some memory
       
  6853 ** allocation errors,  nn error message is held in memory obtained from
       
  6854 ** malloc is returned if *pnErr is non-zero.  If *pnErr==0 then NULL is
       
  6855 ** returned.
       
  6856 */
       
  6857 char *sqlite3BtreeIntegrityCheck(
       
  6858   Btree *p,     /* The btree to be checked */
       
  6859   int *aRoot,   /* An array of root pages numbers for individual trees */
       
  6860   int nRoot,    /* Number of entries in aRoot[] */
       
  6861   int mxErr,    /* Stop reporting errors after this many */
       
  6862   int *pnErr    /* Write number of errors seen to this variable */
       
  6863 ){
       
  6864   int i;
       
  6865   int nRef;
       
  6866   IntegrityCk sCheck;
       
  6867   BtShared *pBt = p->pBt;
       
  6868   char zErr[100];
       
  6869 
       
  6870   sqlite3BtreeEnter(p);
       
  6871   pBt->db = p->db;
       
  6872   nRef = sqlite3PagerRefcount(pBt->pPager);
       
  6873   if( lockBtreeWithRetry(p)!=SQLITE_OK ){
       
  6874     *pnErr = 1;
       
  6875     sqlite3BtreeLeave(p);
       
  6876     return sqlite3DbStrDup(0, "cannot acquire a read lock on the database");
       
  6877   }
       
  6878   sCheck.pBt = pBt;
       
  6879   sCheck.pPager = pBt->pPager;
       
  6880   sCheck.nPage = pagerPagecount(sCheck.pPager);
       
  6881   sCheck.mxErr = mxErr;
       
  6882   sCheck.nErr = 0;
       
  6883   sCheck.mallocFailed = 0;
       
  6884   *pnErr = 0;
       
  6885 #ifndef SQLITE_OMIT_AUTOVACUUM
       
  6886   if( pBt->nTrunc!=0 ){
       
  6887     sCheck.nPage = pBt->nTrunc;
       
  6888   }
       
  6889 #endif
       
  6890   if( sCheck.nPage==0 ){
       
  6891     unlockBtreeIfUnused(pBt);
       
  6892     sqlite3BtreeLeave(p);
       
  6893     return 0;
       
  6894   }
       
  6895   sCheck.anRef = sqlite3Malloc( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) );
       
  6896   if( !sCheck.anRef ){
       
  6897     unlockBtreeIfUnused(pBt);
       
  6898     *pnErr = 1;
       
  6899     sqlite3BtreeLeave(p);
       
  6900     return 0;
       
  6901   }
       
  6902   for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; }
       
  6903   i = PENDING_BYTE_PAGE(pBt);
       
  6904   if( i<=sCheck.nPage ){
       
  6905     sCheck.anRef[i] = 1;
       
  6906   }
       
  6907   sqlite3StrAccumInit(&sCheck.errMsg, zErr, sizeof(zErr), 20000);
       
  6908 
       
  6909   /* Check the integrity of the freelist
       
  6910   */
       
  6911   checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
       
  6912             get4byte(&pBt->pPage1->aData[36]), "Main freelist: ");
       
  6913 
       
  6914   /* Check all the tables.
       
  6915   */
       
  6916   for(i=0; i<nRoot && sCheck.mxErr; i++){
       
  6917     if( aRoot[i]==0 ) continue;
       
  6918 #ifndef SQLITE_OMIT_AUTOVACUUM
       
  6919     if( pBt->autoVacuum && aRoot[i]>1 ){
       
  6920       checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0);
       
  6921     }
       
  6922 #endif
       
  6923     checkTreePage(&sCheck, aRoot[i], 0, "List of tree roots: ");
       
  6924   }
       
  6925 
       
  6926   /* Make sure every page in the file is referenced
       
  6927   */
       
  6928   for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
       
  6929 #ifdef SQLITE_OMIT_AUTOVACUUM
       
  6930     if( sCheck.anRef[i]==0 ){
       
  6931       checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
       
  6932     }
       
  6933 #else
       
  6934     /* If the database supports auto-vacuum, make sure no tables contain
       
  6935     ** references to pointer-map pages.
       
  6936     */
       
  6937     if( sCheck.anRef[i]==0 && 
       
  6938        (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
       
  6939       checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
       
  6940     }
       
  6941     if( sCheck.anRef[i]!=0 && 
       
  6942        (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
       
  6943       checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i);
       
  6944     }
       
  6945 #endif
       
  6946   }
       
  6947 
       
  6948   /* Make sure this analysis did not leave any unref() pages
       
  6949   */
       
  6950   unlockBtreeIfUnused(pBt);
       
  6951   if( nRef != sqlite3PagerRefcount(pBt->pPager) ){
       
  6952     checkAppendMsg(&sCheck, 0, 
       
  6953       "Outstanding page count goes from %d to %d during this analysis",
       
  6954       nRef, sqlite3PagerRefcount(pBt->pPager)
       
  6955     );
       
  6956   }
       
  6957 
       
  6958   /* Clean  up and report errors.
       
  6959   */
       
  6960   sqlite3BtreeLeave(p);
       
  6961   sqlite3_free(sCheck.anRef);
       
  6962   if( sCheck.mallocFailed ){
       
  6963     sqlite3StrAccumReset(&sCheck.errMsg);
       
  6964     *pnErr = sCheck.nErr+1;
       
  6965     return 0;
       
  6966   }
       
  6967   *pnErr = sCheck.nErr;
       
  6968   if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg);
       
  6969   return sqlite3StrAccumFinish(&sCheck.errMsg);
       
  6970 }
       
  6971 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
       
  6972 
       
  6973 /*
       
  6974 ** Return the full pathname of the underlying database file.
       
  6975 **
       
  6976 ** The pager filename is invariant as long as the pager is
       
  6977 ** open so it is safe to access without the BtShared mutex.
       
  6978 */
       
  6979 const char *sqlite3BtreeGetFilename(Btree *p){
       
  6980   assert( p->pBt->pPager!=0 );
       
  6981   return sqlite3PagerFilename(p->pBt->pPager);
       
  6982 }
       
  6983 
       
  6984 /*
       
  6985 ** Return the pathname of the directory that contains the database file.
       
  6986 **
       
  6987 ** The pager directory name is invariant as long as the pager is
       
  6988 ** open so it is safe to access without the BtShared mutex.
       
  6989 */
       
  6990 const char *sqlite3BtreeGetDirname(Btree *p){
       
  6991   assert( p->pBt->pPager!=0 );
       
  6992   return sqlite3PagerDirname(p->pBt->pPager);
       
  6993 }
       
  6994 
       
  6995 /*
       
  6996 ** Return the pathname of the journal file for this database. The return
       
  6997 ** value of this routine is the same regardless of whether the journal file
       
  6998 ** has been created or not.
       
  6999 **
       
  7000 ** The pager journal filename is invariant as long as the pager is
       
  7001 ** open so it is safe to access without the BtShared mutex.
       
  7002 */
       
  7003 const char *sqlite3BtreeGetJournalname(Btree *p){
       
  7004   assert( p->pBt->pPager!=0 );
       
  7005   return sqlite3PagerJournalname(p->pBt->pPager);
       
  7006 }
       
  7007 
       
  7008 #ifndef SQLITE_OMIT_VACUUM
       
  7009 /*
       
  7010 ** Copy the complete content of pBtFrom into pBtTo.  A transaction
       
  7011 ** must be active for both files.
       
  7012 **
       
  7013 ** The size of file pTo may be reduced by this operation.
       
  7014 ** If anything goes wrong, the transaction on pTo is rolled back. 
       
  7015 **
       
  7016 ** If successful, CommitPhaseOne() may be called on pTo before returning. 
       
  7017 ** The caller should finish committing the transaction on pTo by calling
       
  7018 ** sqlite3BtreeCommit().
       
  7019 */
       
  7020 static int btreeCopyFile(Btree *pTo, Btree *pFrom){
       
  7021   int rc = SQLITE_OK;
       
  7022   Pgno i;
       
  7023 
       
  7024   Pgno nFromPage;     /* Number of pages in pFrom */
       
  7025   Pgno nToPage;       /* Number of pages in pTo */
       
  7026   Pgno nNewPage;      /* Number of pages in pTo after the copy */
       
  7027 
       
  7028   Pgno iSkip;         /* Pending byte page in pTo */
       
  7029   int nToPageSize;    /* Page size of pTo in bytes */
       
  7030   int nFromPageSize;  /* Page size of pFrom in bytes */
       
  7031 
       
  7032   BtShared *pBtTo = pTo->pBt;
       
  7033   BtShared *pBtFrom = pFrom->pBt;
       
  7034   pBtTo->db = pTo->db;
       
  7035   pBtFrom->db = pFrom->db;
       
  7036 
       
  7037   nToPageSize = pBtTo->pageSize;
       
  7038   nFromPageSize = pBtFrom->pageSize;
       
  7039 
       
  7040   if( pTo->inTrans!=TRANS_WRITE || pFrom->inTrans!=TRANS_WRITE ){
       
  7041     return SQLITE_ERROR;
       
  7042   }
       
  7043   if( pBtTo->pCursor ){
       
  7044     return SQLITE_BUSY;
       
  7045   }
       
  7046 
       
  7047   nToPage = pagerPagecount(pBtTo->pPager);
       
  7048   nFromPage = pagerPagecount(pBtFrom->pPager);
       
  7049   iSkip = PENDING_BYTE_PAGE(pBtTo);
       
  7050 
       
  7051   /* Variable nNewPage is the number of pages required to store the
       
  7052   ** contents of pFrom using the current page-size of pTo.
       
  7053   */
       
  7054   nNewPage = ((i64)nFromPage * (i64)nFromPageSize + (i64)nToPageSize - 1) / 
       
  7055       (i64)nToPageSize;
       
  7056 
       
  7057   for(i=1; rc==SQLITE_OK && (i<=nToPage || i<=nNewPage); i++){
       
  7058 
       
  7059     /* Journal the original page.
       
  7060     **
       
  7061     ** iSkip is the page number of the locking page (PENDING_BYTE_PAGE)
       
  7062     ** in database *pTo (before the copy). This page is never written 
       
  7063     ** into the journal file. Unless i==iSkip or the page was not
       
  7064     ** present in pTo before the copy operation, journal page i from pTo.
       
  7065     */
       
  7066     if( i!=iSkip && i<=nToPage ){
       
  7067       DbPage *pDbPage = 0;
       
  7068       rc = sqlite3PagerGet(pBtTo->pPager, i, &pDbPage);
       
  7069       if( rc==SQLITE_OK ){
       
  7070         rc = sqlite3PagerWrite(pDbPage);
       
  7071         if( rc==SQLITE_OK && i>nFromPage ){
       
  7072           /* Yeah.  It seems wierd to call DontWrite() right after Write(). But
       
  7073           ** that is because the names of those procedures do not exactly 
       
  7074           ** represent what they do.  Write() really means "put this page in the
       
  7075           ** rollback journal and mark it as dirty so that it will be written
       
  7076           ** to the database file later."  DontWrite() undoes the second part of
       
  7077           ** that and prevents the page from being written to the database. The
       
  7078           ** page is still on the rollback journal, though.  And that is the 
       
  7079           ** whole point of this block: to put pages on the rollback journal. 
       
  7080           */
       
  7081           rc = sqlite3PagerDontWrite(pDbPage);
       
  7082         }
       
  7083         sqlite3PagerUnref(pDbPage);
       
  7084       }
       
  7085     }
       
  7086 
       
  7087     /* Overwrite the data in page i of the target database */
       
  7088     if( rc==SQLITE_OK && i!=iSkip && i<=nNewPage ){
       
  7089 
       
  7090       DbPage *pToPage = 0;
       
  7091       sqlite3_int64 iOff;
       
  7092 
       
  7093       rc = sqlite3PagerGet(pBtTo->pPager, i, &pToPage);
       
  7094       if( rc==SQLITE_OK ){
       
  7095         rc = sqlite3PagerWrite(pToPage);
       
  7096       }
       
  7097 
       
  7098       for(
       
  7099         iOff=(i-1)*nToPageSize; 
       
  7100         rc==SQLITE_OK && iOff<i*nToPageSize; 
       
  7101         iOff += nFromPageSize
       
  7102       ){
       
  7103         DbPage *pFromPage = 0;
       
  7104         Pgno iFrom = (iOff/nFromPageSize)+1;
       
  7105 
       
  7106         if( iFrom==PENDING_BYTE_PAGE(pBtFrom) ){
       
  7107           continue;
       
  7108         }
       
  7109 
       
  7110         rc = sqlite3PagerGet(pBtFrom->pPager, iFrom, &pFromPage);
       
  7111         if( rc==SQLITE_OK ){
       
  7112           char *zTo = sqlite3PagerGetData(pToPage);
       
  7113           char *zFrom = sqlite3PagerGetData(pFromPage);
       
  7114           int nCopy;
       
  7115 
       
  7116           if( nFromPageSize>=nToPageSize ){
       
  7117             zFrom += ((i-1)*nToPageSize - ((iFrom-1)*nFromPageSize));
       
  7118             nCopy = nToPageSize;
       
  7119           }else{
       
  7120             zTo += (((iFrom-1)*nFromPageSize) - (i-1)*nToPageSize);
       
  7121             nCopy = nFromPageSize;
       
  7122           }
       
  7123 
       
  7124           memcpy(zTo, zFrom, nCopy);
       
  7125           sqlite3PagerUnref(pFromPage);
       
  7126         }
       
  7127       }
       
  7128 
       
  7129       if( pToPage ){
       
  7130         MemPage *p = (MemPage *)sqlite3PagerGetExtra(pToPage);
       
  7131         p->isInit = 0;
       
  7132         sqlite3PagerUnref(pToPage);
       
  7133       }
       
  7134     }
       
  7135   }
       
  7136 
       
  7137   /* If things have worked so far, the database file may need to be 
       
  7138   ** truncated. The complex part is that it may need to be truncated to
       
  7139   ** a size that is not an integer multiple of nToPageSize - the current
       
  7140   ** page size used by the pager associated with B-Tree pTo.
       
  7141   **
       
  7142   ** For example, say the page-size of pTo is 2048 bytes and the original 
       
  7143   ** number of pages is 5 (10 KB file). If pFrom has a page size of 1024 
       
  7144   ** bytes and 9 pages, then the file needs to be truncated to 9KB.
       
  7145   */
       
  7146   if( rc==SQLITE_OK ){
       
  7147     if( nFromPageSize!=nToPageSize ){
       
  7148       sqlite3_file *pFile = sqlite3PagerFile(pBtTo->pPager);
       
  7149       i64 iSize = (i64)nFromPageSize * (i64)nFromPage;
       
  7150       i64 iNow = (i64)((nToPage>nNewPage)?nToPage:nNewPage) * (i64)nToPageSize; 
       
  7151       i64 iPending = ((i64)PENDING_BYTE_PAGE(pBtTo)-1) *(i64)nToPageSize;
       
  7152   
       
  7153       assert( iSize<=iNow );
       
  7154   
       
  7155       /* Commit phase one syncs the journal file associated with pTo 
       
  7156       ** containing the original data. It does not sync the database file
       
  7157       ** itself. After doing this it is safe to use OsTruncate() and other
       
  7158       ** file APIs on the database file directly.
       
  7159       */
       
  7160       pBtTo->db = pTo->db;
       
  7161       rc = sqlite3PagerCommitPhaseOne(pBtTo->pPager, 0, 0, 1);
       
  7162       if( iSize<iNow && rc==SQLITE_OK ){
       
  7163         rc = sqlite3OsTruncate(pFile, iSize);
       
  7164       }
       
  7165   
       
  7166       /* The loop that copied data from database pFrom to pTo did not
       
  7167       ** populate the locking page of database pTo. If the page-size of
       
  7168       ** pFrom is smaller than that of pTo, this means some data will
       
  7169       ** not have been copied. 
       
  7170       **
       
  7171       ** This block copies the missing data from database pFrom to pTo 
       
  7172       ** using file APIs. This is safe because at this point we know that
       
  7173       ** all of the original data from pTo has been synced into the 
       
  7174       ** journal file. At this point it would be safe to do anything at
       
  7175       ** all to the database file except truncate it to zero bytes.
       
  7176       */
       
  7177       if( rc==SQLITE_OK && nFromPageSize<nToPageSize && iSize>iPending){
       
  7178         i64 iOff;
       
  7179         for(
       
  7180           iOff=iPending; 
       
  7181           rc==SQLITE_OK && iOff<(iPending+nToPageSize); 
       
  7182           iOff += nFromPageSize
       
  7183         ){
       
  7184           DbPage *pFromPage = 0;
       
  7185           Pgno iFrom = (iOff/nFromPageSize)+1;
       
  7186   
       
  7187           if( iFrom==PENDING_BYTE_PAGE(pBtFrom) || iFrom>nFromPage ){
       
  7188             continue;
       
  7189           }
       
  7190   
       
  7191           rc = sqlite3PagerGet(pBtFrom->pPager, iFrom, &pFromPage);
       
  7192           if( rc==SQLITE_OK ){
       
  7193             char *zFrom = sqlite3PagerGetData(pFromPage);
       
  7194             rc = sqlite3OsWrite(pFile, zFrom, nFromPageSize, iOff);
       
  7195             sqlite3PagerUnref(pFromPage);
       
  7196           }
       
  7197         }
       
  7198       }
       
  7199   
       
  7200       /* Sync the database file */
       
  7201       if( rc==SQLITE_OK ){
       
  7202         rc = sqlite3PagerSync(pBtTo->pPager);
       
  7203       }
       
  7204     }else{
       
  7205       rc = sqlite3PagerTruncate(pBtTo->pPager, nNewPage);
       
  7206     }
       
  7207     if( rc==SQLITE_OK ){
       
  7208       pBtTo->pageSizeFixed = 0;
       
  7209     }
       
  7210   }
       
  7211 
       
  7212   if( rc ){
       
  7213     sqlite3BtreeRollback(pTo);
       
  7214   }
       
  7215 
       
  7216   return rc;  
       
  7217 }
       
  7218 int sqlite3BtreeCopyFile(Btree *pTo, Btree *pFrom){
       
  7219   int rc;
       
  7220   sqlite3BtreeEnter(pTo);
       
  7221   sqlite3BtreeEnter(pFrom);
       
  7222   rc = btreeCopyFile(pTo, pFrom);
       
  7223   sqlite3BtreeLeave(pFrom);
       
  7224   sqlite3BtreeLeave(pTo);
       
  7225   return rc;
       
  7226 }
       
  7227 
       
  7228 #endif /* SQLITE_OMIT_VACUUM */
       
  7229 
       
  7230 /*
       
  7231 ** Return non-zero if a transaction is active.
       
  7232 */
       
  7233 int sqlite3BtreeIsInTrans(Btree *p){
       
  7234   assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
       
  7235   return (p && (p->inTrans==TRANS_WRITE));
       
  7236 }
       
  7237 
       
  7238 /*
       
  7239 ** Return non-zero if a statement transaction is active.
       
  7240 */
       
  7241 int sqlite3BtreeIsInStmt(Btree *p){
       
  7242   assert( sqlite3BtreeHoldsMutex(p) );
       
  7243   return (p->pBt && p->pBt->inStmt);
       
  7244 }
       
  7245 
       
  7246 /*
       
  7247 ** Return non-zero if a read (or write) transaction is active.
       
  7248 */
       
  7249 int sqlite3BtreeIsInReadTrans(Btree *p){
       
  7250   assert( sqlite3_mutex_held(p->db->mutex) );
       
  7251   return (p && (p->inTrans!=TRANS_NONE));
       
  7252 }
       
  7253 
       
  7254 /*
       
  7255 ** This function returns a pointer to a blob of memory associated with
       
  7256 ** a single shared-btree. The memory is used by client code for its own
       
  7257 ** purposes (for example, to store a high-level schema associated with 
       
  7258 ** the shared-btree). The btree layer manages reference counting issues.
       
  7259 **
       
  7260 ** The first time this is called on a shared-btree, nBytes bytes of memory
       
  7261 ** are allocated, zeroed, and returned to the caller. For each subsequent 
       
  7262 ** call the nBytes parameter is ignored and a pointer to the same blob
       
  7263 ** of memory returned. 
       
  7264 **
       
  7265 ** If the nBytes parameter is 0 and the blob of memory has not yet been
       
  7266 ** allocated, a null pointer is returned. If the blob has already been
       
  7267 ** allocated, it is returned as normal.
       
  7268 **
       
  7269 ** Just before the shared-btree is closed, the function passed as the 
       
  7270 ** xFree argument when the memory allocation was made is invoked on the 
       
  7271 ** blob of allocated memory. This function should not call sqlite3_free()
       
  7272 ** on the memory, the btree layer does that.
       
  7273 */
       
  7274 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
       
  7275   BtShared *pBt = p->pBt;
       
  7276   sqlite3BtreeEnter(p);
       
  7277   if( !pBt->pSchema && nBytes ){
       
  7278     pBt->pSchema = sqlite3MallocZero(nBytes);
       
  7279     pBt->xFreeSchema = xFree;
       
  7280   }
       
  7281   sqlite3BtreeLeave(p);
       
  7282   return pBt->pSchema;
       
  7283 }
       
  7284 
       
  7285 /*
       
  7286 ** Return true if another user of the same shared btree as the argument
       
  7287 ** handle holds an exclusive lock on the sqlite_master table.
       
  7288 */
       
  7289 int sqlite3BtreeSchemaLocked(Btree *p){
       
  7290   int rc;
       
  7291   assert( sqlite3_mutex_held(p->db->mutex) );
       
  7292   sqlite3BtreeEnter(p);
       
  7293   rc = (queryTableLock(p, MASTER_ROOT, READ_LOCK)!=SQLITE_OK);
       
  7294   sqlite3BtreeLeave(p);
       
  7295   return rc;
       
  7296 }
       
  7297 
       
  7298 
       
  7299 #ifndef SQLITE_OMIT_SHARED_CACHE
       
  7300 /*
       
  7301 ** Obtain a lock on the table whose root page is iTab.  The
       
  7302 ** lock is a write lock if isWritelock is true or a read lock
       
  7303 ** if it is false.
       
  7304 */
       
  7305 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
       
  7306   int rc = SQLITE_OK;
       
  7307   if( p->sharable ){
       
  7308     u8 lockType = READ_LOCK + isWriteLock;
       
  7309     assert( READ_LOCK+1==WRITE_LOCK );
       
  7310     assert( isWriteLock==0 || isWriteLock==1 );
       
  7311     sqlite3BtreeEnter(p);
       
  7312     rc = queryTableLock(p, iTab, lockType);
       
  7313     if( rc==SQLITE_OK ){
       
  7314       rc = lockTable(p, iTab, lockType);
       
  7315     }
       
  7316     sqlite3BtreeLeave(p);
       
  7317   }
       
  7318   return rc;
       
  7319 }
       
  7320 #endif
       
  7321 
       
  7322 #ifndef SQLITE_OMIT_INCRBLOB
       
  7323 /*
       
  7324 ** Argument pCsr must be a cursor opened for writing on an 
       
  7325 ** INTKEY table currently pointing at a valid table entry. 
       
  7326 ** This function modifies the data stored as part of that entry.
       
  7327 ** Only the data content may only be modified, it is not possible
       
  7328 ** to change the length of the data stored.
       
  7329 */
       
  7330 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
       
  7331   assert( cursorHoldsMutex(pCsr) );
       
  7332   assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
       
  7333   assert(pCsr->isIncrblobHandle);
       
  7334 
       
  7335   restoreCursorPosition(pCsr);
       
  7336   assert( pCsr->eState!=CURSOR_REQUIRESEEK );
       
  7337   if( pCsr->eState!=CURSOR_VALID ){
       
  7338     return SQLITE_ABORT;
       
  7339   }
       
  7340 
       
  7341   /* Check some preconditions: 
       
  7342   **   (a) the cursor is open for writing,
       
  7343   **   (b) there is no read-lock on the table being modified and
       
  7344   **   (c) the cursor points at a valid row of an intKey table.
       
  7345   */
       
  7346   if( !pCsr->wrFlag ){
       
  7347     return SQLITE_READONLY;
       
  7348   }
       
  7349   assert( !pCsr->pBt->readOnly 
       
  7350           && pCsr->pBt->inTransaction==TRANS_WRITE );
       
  7351   if( checkReadLocks(pCsr->pBtree, pCsr->pgnoRoot, pCsr, 0) ){
       
  7352     return SQLITE_LOCKED; /* The table pCur points to has a read lock */
       
  7353   }
       
  7354   if( pCsr->eState==CURSOR_INVALID || !pCsr->apPage[pCsr->iPage]->intKey ){
       
  7355     return SQLITE_ERROR;
       
  7356   }
       
  7357 
       
  7358   return accessPayload(pCsr, offset, amt, (unsigned char *)z, 0, 1);
       
  7359 }
       
  7360 
       
  7361 /* 
       
  7362 ** Set a flag on this cursor to cache the locations of pages from the 
       
  7363 ** overflow list for the current row. This is used by cursors opened
       
  7364 ** for incremental blob IO only.
       
  7365 **
       
  7366 ** This function sets a flag only. The actual page location cache
       
  7367 ** (stored in BtCursor.aOverflow[]) is allocated and used by function
       
  7368 ** accessPayload() (the worker function for sqlite3BtreeData() and
       
  7369 ** sqlite3BtreePutData()).
       
  7370 */
       
  7371 void sqlite3BtreeCacheOverflow(BtCursor *pCur){
       
  7372   assert( cursorHoldsMutex(pCur) );
       
  7373   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
       
  7374   assert(!pCur->isIncrblobHandle);
       
  7375   assert(!pCur->aOverflow);
       
  7376   pCur->isIncrblobHandle = 1;
       
  7377 }
       
  7378 #endif