--- libdb/fileops/fop_util.c.orig Thu Nov 20 23:13:30 2003
+++ libdb/fileops/fop_util.c Fri Mar 18 20:31:10 2005
@@ -40,7 +40,7 @@
u_int32_t __lockval; \
\
if (LOCKING_ON((ENV))) { \
- __lockval = 0; \
+ __lockval = 1; \
__dbt.data = &__lockval; \
__dbt.size = sizeof(__lockval); \
if ((ret = (ENV)->lock_get((ENV), (ID), \
--- libdb/dbinc/mp.h.orig Thu Nov 20 23:13:17 2003
+++ libdb/dbinc/mp.h Fri Mar 18 20:31:14 2005
@@ -149,6 +149,13 @@
* region lock).
*/
DB_MPOOL_STAT stat; /* Per-cache mpool statistics. */
+
+ /*
+ * We track page puts so that we can decide when allocation is never
+ * going to succeed. We don't lock the field; all we care about is
+ * whether it changes.
+ */
+ u_int32_t put_counter; /* Count of page put calls. */
};
struct __db_mpool_hash {
--- libdb/mp/mp_fput.c.orig Thu Nov 20 23:13:36 2003
+++ libdb/mp/mp_fput.c Fri Mar 18 20:31:14 2005
@@ -19,6 +19,8 @@
#include "dbinc/db_shash.h"
#include "dbinc/mp.h"
+static void __memp_reset_lru __P((DB_ENV *, REGINFO *));
+
/*
* __memp_fput --
* Mpool file put function.
@@ -198,5 +200,56 @@
MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+ /*
+ * On every buffer put we update the buffer generation number and check
+ * for wraparound.
+ */
+ if (++c_mp->lru_count == UINT32_T_MAX)
+ __memp_reset_lru(dbenv, dbmp->reginfo);
+
return (0);
+}
+
+/*
+ * __memp_reset_lru --
+ * Reset the cache LRU counter.
+ */
+static void
+__memp_reset_lru(dbenv, memreg)
+ DB_ENV *dbenv;
+ REGINFO *memreg;
+{
+ BH *bhp;
+ DB_MPOOL_HASH *hp;
+ MPOOL *c_mp;
+ int bucket;
+
+ c_mp = memreg->primary;
+
+ /*
+ * Update the counter so all future allocations will start at the
+ * bottom.
+ */
+ c_mp->lru_count -= MPOOL_BASE_DECREMENT;
+
+ /* Adjust the priority of every buffer in the system. */
+ for (hp = R_ADDR(memreg, c_mp->htab),
+ bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) {
+ /*
+ * Skip empty buckets.
+ *
+ * We can check for empty buckets before locking as we
+ * only care if the pointer is zero or non-zero.
+ */
+ if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
+ continue;
+
+ MUTEX_LOCK(dbenv, &hp->hash_mutex);
+ for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
+ bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
+ if (bhp->priority != UINT32_T_MAX &&
+ bhp->priority > MPOOL_BASE_DECREMENT)
+ bhp->priority -= MPOOL_BASE_DECREMENT;
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+ }
}
--- libdb/mp/mp_alloc.c.orig Thu Nov 20 23:13:36 2003
+++ libdb/mp/mp_alloc.c Fri Mar 18 20:31:14 2005
@@ -25,7 +25,6 @@
} HS;
static void __memp_bad_buffer __P((DB_MPOOL_HASH *));
-static void __memp_reset_lru __P((DB_ENV *, REGINFO *, MPOOL *));
/*
* __memp_alloc --
@@ -50,8 +49,9 @@
MPOOL *c_mp;
MPOOLFILE *bh_mfp;
size_t freed_space;
- u_int32_t buckets, buffers, high_priority, max_na, priority;
- int aggressive, ret;
+ u_int32_t buckets, buffers, high_priority, priority, put_counter;
+ u_int32_t total_buckets;
+ int aggressive, giveup, ret;
void *p;
dbenv = dbmp->dbenv;
@@ -59,18 +59,13 @@
dbht = R_ADDR(memreg, c_mp->htab);
hp_end = &dbht[c_mp->htab_buckets];
- buckets = buffers = 0;
- aggressive = 0;
+ buckets = buffers = put_counter = total_buckets = 0;
+ aggressive = giveup = 0;
+ hp_tmp = NULL;
c_mp->stat.st_alloc++;
/*
- * Get aggressive if we've tried to flush the number of pages as are
- * in the system without finding space.
- */
- max_na = 5 * c_mp->htab_buckets;
-
- /*
* If we're allocating a buffer, and the one we're discarding is the
* same size, we don't want to waste the time to re-integrate it into
* the shared memory free list. If the DB_MPOOLFILE argument isn't
@@ -81,19 +76,10 @@
len = (sizeof(BH) - sizeof(u_int8_t)) + mfp->stat.st_pagesize;
R_LOCK(dbenv, memreg);
-
- /*
- * On every buffer allocation we update the buffer generation number
- * and check for wraparound.
- */
- if (++c_mp->lru_count == UINT32_T_MAX)
- __memp_reset_lru(dbenv, memreg, c_mp);
-
/*
* Anything newer than 1/10th of the buffer pool is ignored during
* allocation (unless allocation starts failing).
*/
- DB_ASSERT(c_mp->lru_count > c_mp->stat.st_pages / 10);
high_priority = c_mp->lru_count - c_mp->stat.st_pages / 10;
/*
@@ -120,10 +106,11 @@
* We're not holding the region locked here, these statistics
* can't be trusted.
*/
- if (buckets != 0) {
- if (buckets > c_mp->stat.st_alloc_max_buckets)
- c_mp->stat.st_alloc_max_buckets = buckets;
- c_mp->stat.st_alloc_buckets += buckets;
+ total_buckets += buckets;
+ if (total_buckets != 0) {
+ if (total_buckets > c_mp->stat.st_alloc_max_buckets)
+ c_mp->stat.st_alloc_max_buckets = total_buckets;
+ c_mp->stat.st_alloc_buckets += total_buckets;
}
if (buffers != 0) {
if (buffers > c_mp->stat.st_alloc_max_pages)
@@ -131,6 +118,12 @@
c_mp->stat.st_alloc_pages += buffers;
}
return (0);
+ } else if (giveup || c_mp->stat.st_pages == 0) {
+ R_UNLOCK(dbenv, memreg);
+
+ __db_err(dbenv,
+ "unable to allocate space from the buffer cache");
+ return (ret);
}
/*
@@ -138,26 +131,24 @@
* we need. Reset our free-space counter.
*/
freed_space = 0;
+ total_buckets += buckets;
+ buckets = 0;
/*
* Walk the hash buckets and find the next two with potentially useful
* buffers. Free the buffer with the lowest priority from the buckets'
* chains.
*/
- for (hp_tmp = NULL;;) {
+ for (;;) {
+ /* All pages have been freed; make one last try. */
+ if (c_mp->stat.st_pages == 0)
+ goto alloc;
+
/* Check for wrap around. */
hp = &dbht[c_mp->last_checked++];
if (hp >= hp_end) {
c_mp->last_checked = 0;
-
- /*
- * If we've gone through all of the hash buckets, try
- * an allocation. If the cache is small, the old page
- * size is small, and the new page size is large, we
- * might have freed enough memory (but not 3 times the
- * memory).
- */
- goto alloc;
+ hp = &dbht[c_mp->last_checked++];
}
/*
@@ -172,39 +163,59 @@
/*
* The failure mode is when there are too many buffers we can't
* write or there's not enough memory in the system. We don't
- * have a metric for deciding if allocation has no possible way
- * to succeed, so we don't ever fail, we assume memory will be
- * available if we wait long enough.
+ * have a way to know that allocation has no way to succeed.
+ * We fail if there were no pages returned to the cache after
+ * we've been trying for a relatively long time.
*
- * Get aggressive if we've tried to flush 5 times the number of
- * hash buckets as are in the system -- it's possible we have
- * been repeatedly trying to flush the same buffers, although
- * it's unlikely. Aggressive means:
+ * Get aggressive if we've tried to flush as many hash buckets
+ * as are in the system and have not found any more space.
+ * Aggressive means:
*
* a: set a flag to attempt to flush high priority buffers as
* well as other buffers.
* b: sync the mpool to force out queue extent pages. While we
* might not have enough space for what we want and flushing
* is expensive, why not?
- * c: sleep for a second -- hopefully someone else will run and
- * free up some memory. Try to allocate memory too, in case
- * the other thread returns its memory to the region.
- * d: look at a buffer in every hash bucket rather than choose
+ * c: look at a buffer in every hash bucket rather than choose
* the more preferable of two.
+ * d: start to think about giving up.
+ *
+ * If we get here twice, sleep for a second; hopefully someone
+ * else will run and free up some memory.
+ *
+ * Always try to allocate memory too, in case some other thread
+ * returns its memory to the region.
*
* !!!
* This test ignores pathological cases like no buffers in the
* system -- that shouldn't be possible.
*/
- if ((++buckets % max_na) == 0) {
- aggressive = 1;
-
+ if ((++buckets % c_mp->htab_buckets) == 0) {
+ if (freed_space > 0)
+ goto alloc;
R_UNLOCK(dbenv, memreg);
- (void)__memp_sync_int(
- dbenv, NULL, 0, DB_SYNC_ALLOC, NULL);
-
- (void)__os_sleep(dbenv, 1, 0);
+ switch (++aggressive) {
+ case 1:
+ break;
+ case 2:
+ put_counter = c_mp->put_counter;
+ /* FALLTHROUGH */
+ case 3:
+ case 4:
+ case 5:
+ case 6:
+ (void)__memp_sync_int(
+ dbenv, NULL, 0, DB_SYNC_ALLOC, NULL);
+
+ (void)__os_sleep(dbenv, 1, 0);
+ break;
+ default:
+ aggressive = 1;
+ if (put_counter == c_mp->put_counter)
+ giveup = 1;
+ break;
+ }
R_LOCK(dbenv, memreg);
goto alloc;
@@ -277,7 +288,8 @@
* thread may have acquired this buffer and incremented the ref
* count after we wrote it, in which case we can't have it.
*
- * If there's a write error, avoid selecting this buffer again
+ * If there's a write error and we're having problems finding
+ * something to allocate, avoid selecting this buffer again
* by making it the bucket's least-desirable buffer.
*/
if (ret != 0 || bhp->ref != 0) {
@@ -301,6 +313,8 @@
freed_space += __db_shsizeof(bhp);
__memp_bhfree(dbmp, hp, bhp, 1);
+ if (aggressive > 1)
+ aggressive = 1;
/*
* Unlock this hash bucket and re-acquire the region lock. If
@@ -360,54 +374,6 @@
/* Reset the hash bucket's priority. */
hp->hash_priority = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
-}
-
-/*
- * __memp_reset_lru --
- * Reset the cache LRU counter.
- */
-static void
-__memp_reset_lru(dbenv, memreg, c_mp)
- DB_ENV *dbenv;
- REGINFO *memreg;
- MPOOL *c_mp;
-{
- BH *bhp;
- DB_MPOOL_HASH *hp;
- int bucket;
-
- /*
- * Update the counter so all future allocations will start at the
- * bottom.
- */
- c_mp->lru_count -= MPOOL_BASE_DECREMENT;
-
- /* Release the region lock. */
- R_UNLOCK(dbenv, memreg);
-
- /* Adjust the priority of every buffer in the system. */
- for (hp = R_ADDR(memreg, c_mp->htab),
- bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) {
- /*
- * Skip empty buckets.
- *
- * We can check for empty buckets before locking as we
- * only care if the pointer is zero or non-zero.
- */
- if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
- continue;
-
- MUTEX_LOCK(dbenv, &hp->hash_mutex);
- for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
- bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
- if (bhp->priority != UINT32_T_MAX &&
- bhp->priority > MPOOL_BASE_DECREMENT)
- bhp->priority -= MPOOL_BASE_DECREMENT;
- MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
- }
-
- /* Reacquire the region lock. */
- R_LOCK(dbenv, memreg);
}
#ifdef DIAGNOSTIC
--- libdb/dbreg/dbreg_rec.c.orig Thu Nov 20 23:13:19 2003
+++ libdb/dbreg/dbreg_rec.c Fri Mar 18 20:31:14 2005
@@ -174,19 +174,20 @@
* Typically, closes should match an open which means
* that if this is a close, there should be a valid
* entry in the dbentry table when we get here,
- * however there is an exception. If this is an
+ * however there are exceptions. 1. If this is an
* OPENFILES pass, then we may have started from
* a log file other than the first, and the
* corresponding open appears in an earlier file.
- * We can ignore that case, but all others are errors.
+ * 2. If we are undoing an open on an abort or
+ * recovery, it's possible that we failed after
+ * the log record, but before we actually entered
+ * a handle here.
*/
dbe = &dblp->dbentry[argp->fileid];
if (dbe->dbp == NULL && !dbe->deleted) {
/* No valid entry here. */
- if ((argp->opcode != LOG_CLOSE &&
- argp->opcode != LOG_RCLOSE) ||
- (op != DB_TXN_OPENFILES &&
- op !=DB_TXN_POPENFILES)) {
+ if (DB_REDO(op) ||
+ argp->opcode == LOG_CHECKPOINT) {
__db_err(dbenv,
"Improper file close at %lu/%lu",
(u_long)lsnp->file,
--- libdb/env/env_recover.c.orig Thu Nov 20 23:13:20 2003
+++ libdb/env/env_recover.c Fri Mar 18 20:31:14 2005
@@ -232,12 +232,9 @@
* we'll still need to do a vtruncate based on information we haven't
* yet collected.
*/
- if (ret == DB_NOTFOUND) {
+ if (ret == DB_NOTFOUND)
ret = 0;
- if (max_lsn == NULL)
- goto done;
- }
- if (ret != 0)
+ else if (ret != 0)
goto err;
hi_txn = txnid;
@@ -331,7 +328,7 @@
/* Find a low txnid. */
ret = 0;
- do {
+ if (hi_txn != 0) do {
/* txnid is after rectype, which is a u_int32. */
memcpy(&txnid,
(u_int8_t *)data.data + sizeof(u_int32_t), sizeof(txnid));
@@ -344,11 +341,8 @@
* There are no transactions and we're not recovering to an LSN (see
* above), so there is nothing to do.
*/
- if (ret == DB_NOTFOUND) {
+ if (ret == DB_NOTFOUND)
ret = 0;
- if (max_lsn == NULL)
- goto done;
- }
/* Reset to the first lsn. */
if (ret != 0 || (ret = logc->get(logc, &first_lsn, &data, DB_SET)) != 0)
@@ -367,6 +361,10 @@
txninfo, &data, &first_lsn, &last_lsn, nfiles, 1)) != 0)
goto err;
+ /* If there were no transactions, then we can bail out early. */
+ if (hi_txn == 0 && max_lsn == NULL)
+ goto done;
+
/*
* Pass #2.
*
@@ -483,6 +481,7 @@
if ((ret = __dbreg_close_files(dbenv)) != 0)
goto err;
+done:
if (max_lsn != NULL) {
region->last_ckp = ((DB_TXNHEAD *)txninfo)->ckplsn;
@@ -538,7 +537,8 @@
__db_err(dbenv, "Recovery complete at %.24s", ctime(&now));
__db_err(dbenv, "%s %lx %s [%lu][%lu]",
"Maximum transaction ID",
- ((DB_TXNHEAD *)txninfo)->maxid,
+ txninfo == NULL ? TXN_MINIMUM :
+ ((DB_TXNHEAD *)txninfo)->maxid,
"Recovery checkpoint",
(u_long)region->last_ckp.file,
(u_long)region->last_ckp.offset);
@@ -550,7 +550,6 @@
(u_long)lsn.file, (u_long)lsn.offset, pass);
}
-done:
err: if (lockid != DB_LOCK_INVALIDID) {
if ((t_ret = __rep_unlockpages(dbenv, lockid)) != 0 && ret == 0)
ret = t_ret;