--- libdb/fileops/fop_util.c.orig	Thu Nov 20 23:13:30 2003
+++ libdb/fileops/fop_util.c	Fri Mar 18 20:31:10 2005
@@ -40,7 +40,7 @@
 	u_int32_t __lockval;						\
 									\
 	if (LOCKING_ON((ENV))) {					\
-		__lockval = 0;						\
+		__lockval = 1;						\
 		__dbt.data = &__lockval;				\
 		__dbt.size = sizeof(__lockval);				\
 		if ((ret = (ENV)->lock_get((ENV), (ID),			\
--- libdb/dbinc/mp.h.orig	Thu Nov 20 23:13:17 2003
+++ libdb/dbinc/mp.h	Fri Mar 18 20:31:14 2005
@@ -149,6 +149,13 @@
 	 * region lock).
 	 */
 	DB_MPOOL_STAT stat;		/* Per-cache mpool statistics. */
+
+	/*
+	 * We track page puts so that we can decide when allocation is never
+	 * going to succeed.  We don't lock the field, all we care about is
+	 * if it changes.
+	 */
+	u_int32_t put_counter;		/* Count of page put calls. */
 };
 
 struct __db_mpool_hash {
--- libdb/mp/mp_fput.c.orig	Thu Nov 20 23:13:36 2003
+++ libdb/mp/mp_fput.c	Fri Mar 18 20:31:14 2005
@@ -19,6 +19,8 @@
 #include "dbinc/db_shash.h"
 #include "dbinc/mp.h"
 
+static void __memp_reset_lru __P((DB_ENV *, REGINFO *));
+
 /*
  * __memp_fput --
  *	Mpool file put function.
@@ -198,5 +200,56 @@
 
 	MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
 
+	/*
+	 * On every buffer put we update the buffer generation number and check
+	 * for wraparound.
+	 */
+	if (++c_mp->lru_count == UINT32_T_MAX)
+		__memp_reset_lru(dbenv, dbmp->reginfo);
+
 	return (0);
+}
+
+/*
+ * __memp_reset_lru --
+ *	Reset the cache LRU counter.
+ */
+static void
+__memp_reset_lru(dbenv, memreg)
+	DB_ENV *dbenv;
+	REGINFO *memreg;
+{
+	BH *bhp;
+	DB_MPOOL_HASH *hp;
+	MPOOL *c_mp;
+	int bucket;
+
+	c_mp = memreg->primary;
+
+	/*
+	 * Update the counter so all future allocations will start at the
+	 * bottom.
+	 */
+	c_mp->lru_count -= MPOOL_BASE_DECREMENT;
+
+	/* Adjust the priority of every buffer in the system. */
+	for (hp = R_ADDR(memreg, c_mp->htab),
+	    bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) {
+		/*
+		 * Skip empty buckets.
+		 *
+		 * We can check for empty buckets before locking as we
+		 * only care if the pointer is zero or non-zero.
+		 */
+		if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
+			continue;
+
+		MUTEX_LOCK(dbenv, &hp->hash_mutex);
+		for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
+		    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
+			if (bhp->priority != UINT32_T_MAX &&
+			    bhp->priority > MPOOL_BASE_DECREMENT)
+				bhp->priority -= MPOOL_BASE_DECREMENT;
+		MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+	}
 }
--- libdb/mp/mp_alloc.c.orig	Thu Nov 20 23:13:36 2003
+++ libdb/mp/mp_alloc.c	Fri Mar 18 20:31:14 2005
@@ -25,7 +25,6 @@
 } HS;
 
 static void __memp_bad_buffer __P((DB_MPOOL_HASH *));
-static void __memp_reset_lru __P((DB_ENV *, REGINFO *, MPOOL *));
 
 /*
  * __memp_alloc --
@@ -50,8 +49,9 @@
 	MPOOL *c_mp;
 	MPOOLFILE *bh_mfp;
 	size_t freed_space;
-	u_int32_t buckets, buffers, high_priority, max_na, priority;
-	int aggressive, ret;
+	u_int32_t buckets, buffers, high_priority, priority, put_counter;
+	u_int32_t total_buckets;
+	int aggressive, giveup, ret;
 	void *p;
 
 	dbenv = dbmp->dbenv;
@@ -59,18 +59,13 @@
 	dbht = R_ADDR(memreg, c_mp->htab);
 	hp_end = &dbht[c_mp->htab_buckets];
 
-	buckets = buffers = 0;
-	aggressive = 0;
+	buckets = buffers = put_counter = total_buckets = 0;
+	aggressive = giveup = 0;
+	hp_tmp = NULL;
 
 	c_mp->stat.st_alloc++;
 
 	/*
-	 * Get aggressive if we've tried to flush the number of pages as are
-	 * in the system without finding space.
-	 */
-	max_na = 5 * c_mp->htab_buckets;
-
-	/*
 	 * If we're allocating a buffer, and the one we're discarding is the
 	 * same size, we don't want to waste the time to re-integrate it into
 	 * the shared memory free list. If the DB_MPOOLFILE argument isn't
@@ -81,19 +76,10 @@
 		len = (sizeof(BH) - sizeof(u_int8_t)) + mfp->stat.st_pagesize;
 
 	R_LOCK(dbenv, memreg);
-
-	/*
-	 * On every buffer allocation we update the buffer generation number
-	 * and check for wraparound.
-	 */
-	if (++c_mp->lru_count == UINT32_T_MAX)
-		__memp_reset_lru(dbenv, memreg, c_mp);
-
 	/*
 	 * Anything newer than 1/10th of the buffer pool is ignored during
 	 * allocation (unless allocation starts failing).
 	 */
-	DB_ASSERT(c_mp->lru_count > c_mp->stat.st_pages / 10);
 	high_priority = c_mp->lru_count - c_mp->stat.st_pages / 10;
 
 	/*
@@ -120,10 +106,11 @@
 		 * We're not holding the region locked here, these statistics
 		 * can't be trusted.
 		 */
-		if (buckets != 0) {
-			if (buckets > c_mp->stat.st_alloc_max_buckets)
-				c_mp->stat.st_alloc_max_buckets = buckets;
-			c_mp->stat.st_alloc_buckets += buckets;
+		total_buckets += buckets;
+		if (total_buckets != 0) {
+			if (total_buckets > c_mp->stat.st_alloc_max_buckets)
+				c_mp->stat.st_alloc_max_buckets = total_buckets;
+			c_mp->stat.st_alloc_buckets += total_buckets;
 		}
 		if (buffers != 0) {
 			if (buffers > c_mp->stat.st_alloc_max_pages)
@@ -131,6 +118,12 @@
 			c_mp->stat.st_alloc_pages += buffers;
 		}
 		return (0);
+	} else if (giveup || c_mp->stat.st_pages == 0) {
+		R_UNLOCK(dbenv, memreg);
+
+		__db_err(dbenv,
+		    "unable to allocate space from the buffer cache");
+		return (ret);
 	}
 
 	/*
@@ -138,26 +131,24 @@
 	 * we need. Reset our free-space counter.
 	 */
 	freed_space = 0;
+	total_buckets += buckets;
+	buckets = 0;
 
 	/*
 	 * Walk the hash buckets and find the next two with potentially useful
 	 * buffers. Free the buffer with the lowest priority from the buckets'
 	 * chains.
 	 */
-	for (hp_tmp = NULL;;) {
+	for (;;) {
+		/* All pages have been freed, make one last try */
+		if (c_mp->stat.st_pages == 0)
+			goto alloc;
+
 		/* Check for wrap around. */
 		hp = &dbht[c_mp->last_checked++];
 		if (hp >= hp_end) {
 			c_mp->last_checked = 0;
-
-			/*
-			 * If we've gone through all of the hash buckets, try
-			 * an allocation. If the cache is small, the old page
-			 * size is small, and the new page size is large, we
-			 * might have freed enough memory (but not 3 times the
-			 * memory).
-			 */
-			goto alloc;
+			hp = &dbht[c_mp->last_checked++];
 		}
 
 		/*
@@ -172,39 +163,59 @@
 		/*
 		 * The failure mode is when there are too many buffers we can't
 		 * write or there's not enough memory in the system. We don't
-		 * have a metric for deciding if allocation has no possible way
-		 * to succeed, so we don't ever fail, we assume memory will be
-		 * available if we wait long enough.
+		 * have a way to know that allocation has no way to succeed.
+		 * We fail if there were no pages returned to the cache after
+		 * we've been trying for a relatively long time.
 		 *
-		 * Get aggressive if we've tried to flush 5 times the number of
-		 * hash buckets as are in the system -- it's possible we have
-		 * been repeatedly trying to flush the same buffers, although
-		 * it's unlikely. Aggressive means:
+		 * Get aggressive if we've tried to flush the number of hash
+		 * buckets as are in the system and have not found any more
+		 * space. Aggressive means:
 		 *
 		 * a: set a flag to attempt to flush high priority buffers as
 		 *    well as other buffers.
 		 * b: sync the mpool to force out queue extent pages. While we
 		 *    might not have enough space for what we want and flushing
 		 *    is expensive, why not?
-		 * c: sleep for a second -- hopefully someone else will run and
-		 *    free up some memory. Try to allocate memory too, in case
-		 *    the other thread returns its memory to the region.
-		 * d: look at a buffer in every hash bucket rather than choose
+		 * c: look at a buffer in every hash bucket rather than choose
 		 *    the more preferable of two.
+		 * d: start to think about giving up.
+		 *
+		 * If we get here twice, sleep for a second, hopefully someone
+		 * else will run and free up some memory.
+		 *
+		 * Always try to allocate memory too, in case some other thread
+		 * returns its memory to the region.
 		 *
 		 * !!!
 		 * This test ignores pathological cases like no buffers in the
 		 * system -- that shouldn't be possible.
 		 */
-		if ((++buckets % max_na) == 0) {
-			aggressive = 1;
-
+		if ((++buckets % c_mp->htab_buckets) == 0) {
+			if (freed_space > 0)
+				goto alloc;
 			R_UNLOCK(dbenv, memreg);
 
-			(void)__memp_sync_int(
-			    dbenv, NULL, 0, DB_SYNC_ALLOC, NULL);
-
-			(void)__os_sleep(dbenv, 1, 0);
+			switch (++aggressive) {
+			case 1:
+				break;
+			case 2:
+				put_counter = c_mp->put_counter;
+				/* FALLTHROUGH */
+			case 3:
+			case 4:
+			case 5:
+			case 6:
+				(void)__memp_sync_int(
+				    dbenv, NULL, 0, DB_SYNC_ALLOC, NULL);
+
+				(void)__os_sleep(dbenv, 1, 0);
+				break;
+			default:
+				aggressive = 1;
+				if (put_counter == c_mp->put_counter)
+					giveup = 1;
+				break;
+			}
 
 			R_LOCK(dbenv, memreg);
 			goto alloc;
@@ -277,7 +288,8 @@
 		 * thread may have acquired this buffer and incremented the ref
 		 * count after we wrote it, in which case we can't have it.
 		 *
-		 * If there's a write error, avoid selecting this buffer again
+		 * If there's a write error and we're having problems finding
+		 * something to allocate, avoid selecting this buffer again
 		 * by making it the bucket's least-desirable buffer.
 		 */
 		if (ret != 0 || bhp->ref != 0) {
@@ -301,6 +313,8 @@
 
 		freed_space += __db_shsizeof(bhp);
 		__memp_bhfree(dbmp, hp, bhp, 1);
+		if (aggressive > 1)
+			aggressive = 1;
 
 		/*
 		 * Unlock this hash bucket and re-acquire the region lock. If
@@ -360,54 +374,6 @@
 
 	/* Reset the hash bucket's priority. */
 	hp->hash_priority = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
-}
-
-/*
- * __memp_reset_lru --
- *	Reset the cache LRU counter.
- */
-static void
-__memp_reset_lru(dbenv, memreg, c_mp)
-	DB_ENV *dbenv;
-	REGINFO *memreg;
-	MPOOL *c_mp;
-{
-	BH *bhp;
-	DB_MPOOL_HASH *hp;
-	int bucket;
-
-	/*
-	 * Update the counter so all future allocations will start at the
-	 * bottom.
-	 */
-	c_mp->lru_count -= MPOOL_BASE_DECREMENT;
-
-	/* Release the region lock. */
-	R_UNLOCK(dbenv, memreg);
-
-	/* Adjust the priority of every buffer in the system. */
-	for (hp = R_ADDR(memreg, c_mp->htab),
-	    bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) {
-		/*
-		 * Skip empty buckets.
-		 *
-		 * We can check for empty buckets before locking as we
-		 * only care if the pointer is zero or non-zero.
-		 */
-		if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
-			continue;
-
-		MUTEX_LOCK(dbenv, &hp->hash_mutex);
-		for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
-		    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
-			if (bhp->priority != UINT32_T_MAX &&
-			    bhp->priority > MPOOL_BASE_DECREMENT)
-				bhp->priority -= MPOOL_BASE_DECREMENT;
-		MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
-	}
-
-	/* Reacquire the region lock. */
-	R_LOCK(dbenv, memreg);
 }
 
 #ifdef DIAGNOSTIC
--- libdb/dbreg/dbreg_rec.c.orig	Thu Nov 20 23:13:19 2003
+++ libdb/dbreg/dbreg_rec.c	Fri Mar 18 20:31:14 2005
@@ -174,19 +174,20 @@
 			 * Typically, closes should match an open which means
 			 * that if this is a close, there should be a valid
 			 * entry in the dbentry table when we get here,
-			 * however there is an exception. If this is an
+			 * however there are exceptions. 1. If this is an
 			 * OPENFILES pass, then we may have started from
 			 * a log file other than the first, and the
 			 * corresponding open appears in an earlier file.
-			 * We can ignore that case, but all others are errors.
+			 * 2. If we are undoing an open on an abort or
+			 * recovery, it's possible that we failed after
+			 * the log record, but before we actually entered
+			 * a handle here.
 			 */
 			dbe = &dblp->dbentry[argp->fileid];
 			if (dbe->dbp == NULL && !dbe->deleted) {
 				/* No valid entry here. */
-				if ((argp->opcode != LOG_CLOSE &&
-				    argp->opcode != LOG_RCLOSE) ||
-				    (op != DB_TXN_OPENFILES &&
-				    op !=DB_TXN_POPENFILES)) {
+				if (DB_REDO(op) ||
+				    argp->opcode == LOG_CHECKPOINT) {
 					__db_err(dbenv,
 					    "Improper file close at %lu/%lu",
 					    (u_long)lsnp->file,
--- libdb/env/env_recover.c.orig	Thu Nov 20 23:13:20 2003
+++ libdb/env/env_recover.c	Fri Mar 18 20:31:14 2005
@@ -232,12 +232,9 @@
 	 * we'll still need to do a vtruncate based on information we haven't
 	 * yet collected.
 	 */
-	if (ret == DB_NOTFOUND) {
+	if (ret == DB_NOTFOUND)
 		ret = 0;
-		if (max_lsn == NULL)
-			goto done;
-	}
-	if (ret != 0)
+	else if (ret != 0)
 		goto err;
 
 	hi_txn = txnid;
@@ -331,7 +328,7 @@
 
 	/* Find a low txnid. */
 	ret = 0;
-	do {
+	if (hi_txn != 0) do {
 		/* txnid is after rectype, which is a u_int32. */
 		memcpy(&txnid,
 		    (u_int8_t *)data.data + sizeof(u_int32_t), sizeof(txnid));
@@ -344,11 +341,8 @@
 	 * There are no transactions and we're not recovering to an LSN (see
 	 * above), so there is nothing to do.
 	 */
-	if (ret == DB_NOTFOUND) {
+	if (ret == DB_NOTFOUND)
 		ret = 0;
-		if (max_lsn == NULL)
-			goto done;
-	}
 
 	/* Reset to the first lsn. */
 	if (ret != 0 || (ret = logc->get(logc, &first_lsn, &data, DB_SET)) != 0)
@@ -367,6 +361,10 @@
 	    txninfo, &data, &first_lsn, &last_lsn, nfiles, 1)) != 0)
 		goto err;
 
+	/* If there were no transactions, then we can bail out early. */
+	if (hi_txn == 0 && max_lsn == NULL)
+		goto done;
+
 	/*
 	 * Pass #2.
 	 *
@@ -483,6 +481,7 @@
 	if ((ret = __dbreg_close_files(dbenv)) != 0)
 		goto err;
 
+done:
 	if (max_lsn != NULL) {
 		region->last_ckp = ((DB_TXNHEAD *)txninfo)->ckplsn;
 
@@ -538,7 +537,8 @@
 		__db_err(dbenv, "Recovery complete at %.24s", ctime(&now));
 		__db_err(dbenv, "%s %lx %s [%lu][%lu]",
 		    "Maximum transaction ID",
-		    ((DB_TXNHEAD *)txninfo)->maxid,
+		    txninfo == NULL ? TXN_MINIMUM :
+		    ((DB_TXNHEAD *)txninfo)->maxid,
 		    "Recovery checkpoint",
 		    (u_long)region->last_ckp.file,
 		    (u_long)region->last_ckp.offset);
@@ -550,7 +550,6 @@
 		    (u_long)lsn.file, (u_long)lsn.offset, pass);
 	}
 
-done:
 err:	if (lockid != DB_LOCK_INVALIDID) {
 		if ((t_ret = __rep_unlockpages(dbenv, lockid)) != 0 && ret == 0)
 			ret = t_ret;