Magellan Linux

Contents of /trunk/db/patches/patch.4.1.25.2



Revision 144 - Tue May 8 20:06:05 2007 UTC by niro
File size: 18100 byte(s)
-import

*** dbinc/mp.h.orig 2004-02-02 10:24:53.000000000 -0800
--- dbinc/mp.h 2004-02-02 10:26:27.000000000 -0800
***************
*** 149,154 ****
--- 149,161 ----
       * region lock).
       */
      DB_MPOOL_STAT stat;        /* Per-cache mpool statistics. */
+
+     /*
+      * We track page puts so that we can decide when allocation is never
+      * going to succeed. We don't lock the field, all we care about is
+      * if it changes.
+      */
+     u_int32_t put_counter;     /* Count of page put calls. */
  };

  struct __db_mpool_hash {
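For readers skimming the patch: the new put_counter field is a lock-free progress indicator. The allocator snapshots it, and if it has not moved after several full retry cycles, no thread is returning pages and waiting longer cannot help. A minimal standalone sketch of the idea (not Berkeley DB code; the cache struct, cache_put() and alloc_should_give_up() names are invented for illustration):

#include <stdint.h>

struct cache {
    uint32_t put_counter;    /* count of page puts; read without a lock */
};

/* Called whenever a page is returned to the cache. */
static void
cache_put(struct cache *c)
{
    ++c->put_counter;
}

/*
 * Called by an allocator that has failed repeatedly.  Exact equality is
 * all that matters, so no lock is needed: a stale read can only delay
 * the give-up decision by one more cycle.
 */
static int
alloc_should_give_up(const struct cache *c, uint32_t snapshot_before_retries)
{
    return (c->put_counter == snapshot_before_retries);
}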
*** mp/mp_fput.c.orig 2002-08-13 06:26:41.000000000 -0700
--- mp/mp_fput.c 2004-02-02 10:22:35.000000000 -0800
***************
*** 19,24 ****
--- 19,26 ----
  #include "dbinc/db_shash.h"
  #include "dbinc/mp.h"

+ static void __memp_reset_lru __P((DB_ENV *, REGINFO *));
+
  /*
   * __memp_fput --
   *    Mpool file put function.
***************
*** 198,202 ****
--- 200,255 ----

      MUTEX_UNLOCK(dbenv, &hp->hash_mutex);

+     /*
+      * On every buffer put we update the buffer generation number and check
+      * for wraparound.
+      */
+     if (++c_mp->lru_count == UINT32_T_MAX)
+         __memp_reset_lru(dbenv, dbmp->reginfo);
+
      return (0);
  }
+
+ /*
+  * __memp_reset_lru --
+  *    Reset the cache LRU counter.
+  */
+ static void
+ __memp_reset_lru(dbenv, memreg)
+     DB_ENV *dbenv;
+     REGINFO *memreg;
+ {
+     BH *bhp;
+     DB_MPOOL_HASH *hp;
+     MPOOL *c_mp;
+     int bucket;
+
+     c_mp = memreg->primary;
+
+     /*
+      * Update the counter so all future allocations will start at the
+      * bottom.
+      */
+     c_mp->lru_count -= MPOOL_BASE_DECREMENT;
+
+     /* Adjust the priority of every buffer in the system. */
+     for (hp = R_ADDR(memreg, c_mp->htab),
+         bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) {
+         /*
+          * Skip empty buckets.
+          *
+          * We can check for empty buckets before locking as we
+          * only care if the pointer is zero or non-zero.
+          */
+         if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
+             continue;
+
+         MUTEX_LOCK(dbenv, &hp->hash_mutex);
+         for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
+             bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
+             if (bhp->priority != UINT32_T_MAX &&
+                 bhp->priority > MPOOL_BASE_DECREMENT)
+                 bhp->priority -= MPOOL_BASE_DECREMENT;
+         MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+     }
+ }
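The wraparound handling above is easier to see in isolation. A compilable toy model (again not the DB source; BASE_DECREMENT, NBUF and the arrays are invented): priorities are drawn from a monotonically increasing generation counter, and when the counter is about to wrap, both the counter and every stored priority are shifted down by the same constant, preserving the relative order the LRU comparison depends on.

#include <stdint.h>

#define BASE_DECREMENT (UINT32_MAX / 4)    /* assumed scale factor */
#define NBUF           64

static uint32_t lru_count;                 /* global generation number */
static uint32_t priority[NBUF];            /* per-buffer priorities */

/* Shift the generation space down, keeping relative order intact. */
static void
reset_lru(void)
{
    int i;

    lru_count -= BASE_DECREMENT;
    for (i = 0; i < NBUF; ++i)
        if (priority[i] != UINT32_MAX &&   /* UINT32_MAX pins a buffer */
            priority[i] > BASE_DECREMENT)
            priority[i] -= BASE_DECREMENT;
}

/* Stamp a buffer with the current generation on every put. */
static void
buffer_put(int i)
{
    priority[i] = lru_count;
    if (++lru_count == UINT32_MAX)         /* about to wrap: rescale */
        reset_lru();
}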
*** mp/mp_alloc.c.orig 2002-08-17 07:23:25.000000000 -0700
--- mp/mp_alloc.c 2004-02-02 10:28:15.000000000 -0800
***************
*** 25,31 ****
  } HS;

  static void __memp_bad_buffer __P((DB_MPOOL_HASH *));
- static void __memp_reset_lru __P((DB_ENV *, REGINFO *, MPOOL *));

  /*
   * __memp_alloc --
--- 25,30 ----
***************
*** 50,57 ****
      MPOOL *c_mp;
      MPOOLFILE *bh_mfp;
      size_t freed_space;
!     u_int32_t buckets, buffers, high_priority, max_na, priority;
!     int aggressive, ret;
      void *p;

      dbenv = dbmp->dbenv;
--- 49,57 ----
      MPOOL *c_mp;
      MPOOLFILE *bh_mfp;
      size_t freed_space;
!     u_int32_t buckets, buffers, high_priority, priority, put_counter;
!     u_int32_t total_buckets;
!     int aggressive, giveup, ret;
      void *p;

      dbenv = dbmp->dbenv;
***************
*** 59,76 ****
      dbht = R_ADDR(memreg, c_mp->htab);
      hp_end = &dbht[c_mp->htab_buckets];

!     buckets = buffers = 0;
!     aggressive = 0;

      c_mp->stat.st_alloc++;

      /*
-      * Get aggressive if we've tried to flush the number of pages as are
-      * in the system without finding space.
-      */
-     max_na = 5 * c_mp->htab_buckets;
-
-     /*
       * If we're allocating a buffer, and the one we're discarding is the
       * same size, we don't want to waste the time to re-integrate it into
       * the shared memory free list. If the DB_MPOOLFILE argument isn't
--- 59,71 ----
      dbht = R_ADDR(memreg, c_mp->htab);
      hp_end = &dbht[c_mp->htab_buckets];

!     buckets = buffers = put_counter = total_buckets = 0;
!     aggressive = giveup = 0;
!     hp_tmp = NULL;

      c_mp->stat.st_alloc++;

      /*
       * If we're allocating a buffer, and the one we're discarding is the
       * same size, we don't want to waste the time to re-integrate it into
       * the shared memory free list. If the DB_MPOOLFILE argument isn't
***************
*** 81,99 ****
          len = (sizeof(BH) - sizeof(u_int8_t)) + mfp->stat.st_pagesize;

      R_LOCK(dbenv, memreg);
-
-     /*
-      * On every buffer allocation we update the buffer generation number
-      * and check for wraparound.
-      */
-     if (++c_mp->lru_count == UINT32_T_MAX)
-         __memp_reset_lru(dbenv, memreg, c_mp);
-
      /*
       * Anything newer than 1/10th of the buffer pool is ignored during
       * allocation (unless allocation starts failing).
       */
-     DB_ASSERT(c_mp->lru_count > c_mp->stat.st_pages / 10);
      high_priority = c_mp->lru_count - c_mp->stat.st_pages / 10;

      /*
--- 76,85 ----
***************
*** 120,129 ****
       * We're not holding the region locked here, these statistics
       * can't be trusted.
       */
!     if (buckets != 0) {
!         if (buckets > c_mp->stat.st_alloc_max_buckets)
!             c_mp->stat.st_alloc_max_buckets = buckets;
!         c_mp->stat.st_alloc_buckets += buckets;
      }
      if (buffers != 0) {
          if (buffers > c_mp->stat.st_alloc_max_pages)
--- 106,116 ----
       * We're not holding the region locked here, these statistics
       * can't be trusted.
       */
!     total_buckets += buckets;
!     if (total_buckets != 0) {
!         if (total_buckets > c_mp->stat.st_alloc_max_buckets)
!             c_mp->stat.st_alloc_max_buckets = total_buckets;
!         c_mp->stat.st_alloc_buckets += total_buckets;
      }
      if (buffers != 0) {
          if (buffers > c_mp->stat.st_alloc_max_pages)
***************
*** 131,136 ****
--- 118,129 ----
          c_mp->stat.st_alloc_pages += buffers;
      }
      return (0);
+     } else if (giveup || c_mp->stat.st_pages == 0) {
+         R_UNLOCK(dbenv, memreg);
+
+         __db_err(dbenv,
+             "unable to allocate space from the buffer cache");
+         return (ret);
      }

      /*
***************
*** 138,163 ****
       * we need. Reset our free-space counter.
       */
      freed_space = 0;

      /*
       * Walk the hash buckets and find the next two with potentially useful
       * buffers. Free the buffer with the lowest priority from the buckets'
       * chains.
       */
!     for (hp_tmp = NULL;;) {
          /* Check for wrap around. */
          hp = &dbht[c_mp->last_checked++];
          if (hp >= hp_end) {
              c_mp->last_checked = 0;
!
!             /*
!              * If we've gone through all of the hash buckets, try
!              * an allocation. If the cache is small, the old page
!              * size is small, and the new page size is large, we
!              * might have freed enough memory (but not 3 times the
!              * memory).
!              */
!             goto alloc;
          }

          /*
--- 131,154 ----
       * we need. Reset our free-space counter.
       */
      freed_space = 0;
+     total_buckets += buckets;
+     buckets = 0;

      /*
       * Walk the hash buckets and find the next two with potentially useful
       * buffers. Free the buffer with the lowest priority from the buckets'
       * chains.
       */
!     for (;;) {
!         /* All pages have been freed, make one last try */
!         if (c_mp->stat.st_pages == 0)
!             goto alloc;
!
          /* Check for wrap around. */
          hp = &dbht[c_mp->last_checked++];
          if (hp >= hp_end) {
              c_mp->last_checked = 0;
!             hp = &dbht[c_mp->last_checked++];
          }

          /*
***************
*** 172,210 ****
          /*
           * The failure mode is when there are too many buffers we can't
           * write or there's not enough memory in the system. We don't
!          * have a metric for deciding if allocation has no possible way
!          * to succeed, so we don't ever fail, we assume memory will be
!          * available if we wait long enough.
           *
!          * Get aggressive if we've tried to flush 5 times the number of
!          * hash buckets as are in the system -- it's possible we have
!          * been repeatedly trying to flush the same buffers, although
!          * it's unlikely. Aggressive means:
           *
           * a: set a flag to attempt to flush high priority buffers as
           *    well as other buffers.
           * b: sync the mpool to force out queue extent pages. While we
           *    might not have enough space for what we want and flushing
           *    is expensive, why not?
!          * c: sleep for a second -- hopefully someone else will run and
!          *    free up some memory. Try to allocate memory too, in case
!          *    the other thread returns its memory to the region.
!          * d: look at a buffer in every hash bucket rather than choose
           *    the more preferable of two.
           *
           * !!!
           * This test ignores pathological cases like no buffers in the
           * system -- that shouldn't be possible.
           */
!         if ((++buckets % max_na) == 0) {
!             aggressive = 1;
!
              R_UNLOCK(dbenv, memreg);

!             (void)__memp_sync_int(
!                 dbenv, NULL, 0, DB_SYNC_ALLOC, NULL);
!
!             (void)__os_sleep(dbenv, 1, 0);

              R_LOCK(dbenv, memreg);
              goto alloc;
--- 163,221 ----
          /*
           * The failure mode is when there are too many buffers we can't
           * write or there's not enough memory in the system. We don't
!          * have a way to know that allocation has no way to succeed.
!          * We fail if there were no pages returned to the cache after
!          * we've been trying for a relatively long time.
           *
!          * Get aggressive if we've tried to flush the number of hash
!          * buckets as are in the system and have not found any more
!          * space. Aggressive means:
           *
           * a: set a flag to attempt to flush high priority buffers as
           *    well as other buffers.
           * b: sync the mpool to force out queue extent pages. While we
           *    might not have enough space for what we want and flushing
           *    is expensive, why not?
!          * c: look at a buffer in every hash bucket rather than choose
           *    the more preferable of two.
+          * d: start to think about giving up.
+          *
+          * If we get here twice, sleep for a second, hopefully someone
+          * else will run and free up some memory.
+          *
+          * Always try to allocate memory too, in case some other thread
+          * returns its memory to the region.
           *
           * !!!
           * This test ignores pathological cases like no buffers in the
           * system -- that shouldn't be possible.
           */
!         if ((++buckets % c_mp->htab_buckets) == 0) {
!             if (freed_space > 0)
!                 goto alloc;
              R_UNLOCK(dbenv, memreg);

!             switch (++aggressive) {
!             case 1:
!                 break;
!             case 2:
!                 put_counter = c_mp->put_counter;
!                 /* FALLTHROUGH */
!             case 3:
!             case 4:
!             case 5:
!             case 6:
!                 (void)__memp_sync_int(
!                     dbenv, NULL, 0, DB_SYNC_ALLOC, NULL);
!
!                 (void)__os_sleep(dbenv, 1, 0);
!                 break;
!             default:
!                 aggressive = 1;
!                 if (put_counter == c_mp->put_counter)
!                     giveup = 1;
!                 break;
!             }

              R_LOCK(dbenv, memreg);
              goto alloc;
***************
*** 277,283 ****
           * thread may have acquired this buffer and incremented the ref
           * count after we wrote it, in which case we can't have it.
           *
!          * If there's a write error, avoid selecting this buffer again
           * by making it the bucket's least-desirable buffer.
           */
          if (ret != 0 || bhp->ref != 0) {
--- 288,295 ----
           * thread may have acquired this buffer and incremented the ref
           * count after we wrote it, in which case we can't have it.
           *
!          * If there's a write error and we're having problems finding
!          * something to allocate, avoid selecting this buffer again
           * by making it the bucket's least-desirable buffer.
           */
          if (ret != 0 || bhp->ref != 0) {
***************
*** 301,306 ****
--- 313,320 ----

          freed_space += __db_shsizeof(bhp);
          __memp_bhfree(dbmp, hp, bhp, 1);
+         if (aggressive > 1)
+             aggressive = 1;

          /*
           * Unlock this hash bucket and re-acquire the region lock. If
***************
*** 362,415 ****
      hp->hash_priority = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
  }

- /*
-  * __memp_reset_lru --
-  *    Reset the cache LRU counter.
-  */
- static void
- __memp_reset_lru(dbenv, memreg, c_mp)
-     DB_ENV *dbenv;
-     REGINFO *memreg;
-     MPOOL *c_mp;
- {
-     BH *bhp;
-     DB_MPOOL_HASH *hp;
-     int bucket;
-
-     /*
-      * Update the counter so all future allocations will start at the
-      * bottom.
-      */
-     c_mp->lru_count -= MPOOL_BASE_DECREMENT;
-
-     /* Release the region lock. */
-     R_UNLOCK(dbenv, memreg);
-
-     /* Adjust the priority of every buffer in the system. */
-     for (hp = R_ADDR(memreg, c_mp->htab),
-         bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) {
-         /*
-          * Skip empty buckets.
-          *
-          * We can check for empty buckets before locking as we
-          * only care if the pointer is zero or non-zero.
-          */
-         if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
-             continue;
-
-         MUTEX_LOCK(dbenv, &hp->hash_mutex);
-         for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
-             bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
-             if (bhp->priority != UINT32_T_MAX &&
-                 bhp->priority > MPOOL_BASE_DECREMENT)
-                 bhp->priority -= MPOOL_BASE_DECREMENT;
-         MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
-     }
-
-     /* Reacquire the region lock. */
-     R_LOCK(dbenv, memreg);
- }
-
  #ifdef DIAGNOSTIC
  /*
   * __memp_check_order --
--- 376,381 ----
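The heart of the mp_alloc.c change is the escalation ladder in that switch statement. A compilable sketch of the same logic (stand-in names, not the DB source: sync_cache() models __memp_sync_int(), and the counter is the put_counter added in dbinc/mp.h): each failed sweep over the hash buckets bumps an aggressiveness level; level 2 snapshots the put counter, levels 2 through 6 flush the pool and sleep so other threads can run, and past level 6 the allocator gives up only if the snapshot shows no pages came back in the meantime.

#include <stdint.h>
#include <unistd.h>

static uint32_t put_counter_global;  /* bumped elsewhere on each page put */

static void
sync_cache(void)                     /* stand-in for __memp_sync_int() */
{
}

/*
 * Called after each full, fruitless sweep of the hash buckets.
 * Returns 1 once the caller should stop retrying.
 */
static int
sweep_failed(int *aggressive, uint32_t *put_snapshot)
{
    switch (++*aggressive) {
    case 1:                          /* widen the search only */
        break;
    case 2:                          /* remember the put activity level */
        *put_snapshot = put_counter_global;
        /* FALLTHROUGH */
    case 3:
    case 4:
    case 5:
    case 6:                          /* flush, then let other threads run */
        sync_cache();
        (void)sleep(1);
        break;
    default:                         /* a long time with no progress */
        *aggressive = 1;
        if (put_counter_global == *put_snapshot)
            return (1);              /* nothing came back: give up */
        break;
    }
    return (0);
}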
*** dbreg/dbreg_rec.c.orig 2002-08-17 07:22:52.000000000 -0700
--- dbreg/dbreg_rec.c 2003-11-08 10:59:19.000000000 -0800
***************
*** 174,192 ****
           * Typically, closes should match an open which means
           * that if this is a close, there should be a valid
           * entry in the dbentry table when we get here,
!          * however there is an exception. If this is an
           * OPENFILES pass, then we may have started from
           * a log file other than the first, and the
           * corresponding open appears in an earlier file.
!          * We can ignore that case, but all others are errors.
           */
          dbe = &dblp->dbentry[argp->fileid];
          if (dbe->dbp == NULL && !dbe->deleted) {
              /* No valid entry here. */
!             if ((argp->opcode != LOG_CLOSE &&
!                 argp->opcode != LOG_RCLOSE) ||
!                 (op != DB_TXN_OPENFILES &&
!                 op !=DB_TXN_POPENFILES)) {
                  __db_err(dbenv,
                      "Improper file close at %lu/%lu",
                      (u_long)lsnp->file,
--- 174,193 ----
           * Typically, closes should match an open which means
           * that if this is a close, there should be a valid
           * entry in the dbentry table when we get here,
!          * however there are exceptions. 1. If this is an
           * OPENFILES pass, then we may have started from
           * a log file other than the first, and the
           * corresponding open appears in an earlier file.
!          * 2. If we are undoing an open on an abort or
!          * recovery, it's possible that we failed after
!          * the log record, but before we actually entered
!          * a handle here.
           */
          dbe = &dblp->dbentry[argp->fileid];
          if (dbe->dbp == NULL && !dbe->deleted) {
              /* No valid entry here. */
!             if (DB_REDO(op) ||
!                 argp->opcode == LOG_CHECKPOINT) {
                  __db_err(dbenv,
                      "Improper file close at %lu/%lu",
                      (u_long)lsnp->file,
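The dbreg_rec.c hunk narrows when a close without a matching table entry is reported as an error. A condensed model of the before-and-after predicates (stand-in types; DB_REDO() is a real Berkeley DB macro testing whether a pass redoes records, modeled here as a flag, and the two OPENFILES pass types are collapsed into one):

enum opcode { LOG_OPEN, LOG_CLOSE, LOG_RCLOSE, LOG_CHECKPOINT };

/* Old rule: any record that is not a close seen during an OPENFILES
 * pass is an error. */
static int
old_is_error(enum opcode opcode, int is_openfiles_pass)
{
    return ((opcode != LOG_CLOSE && opcode != LOG_RCLOSE) ||
        !is_openfiles_pass);
}

/* New rule: only complain when the record is being redone or is a
 * checkpoint; undoing an open that never installed a handle (an abort,
 * or a crash between writing the log record and updating the table)
 * is now legal. */
static int
new_is_error(enum opcode opcode, int is_redo_pass)
{
    return (is_redo_pass || opcode == LOG_CHECKPOINT);
}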
*** env/env_recover.c.orig.1 2002-08-22 14:52:51.000000000 -0700
--- env/env_recover.c 2003-11-15 08:20:59.000000000 -0800
***************
*** 232,243 ****
       * we'll still need to do a vtruncate based on information we haven't
       * yet collected.
       */
!     if (ret == DB_NOTFOUND) {
          ret = 0;
!         if (max_lsn == NULL)
!             goto done;
!     }
!     if (ret != 0)
          goto err;

      hi_txn = txnid;
--- 232,240 ----
       * we'll still need to do a vtruncate based on information we haven't
       * yet collected.
       */
!     if (ret == DB_NOTFOUND)
          ret = 0;
!     else if (ret != 0)
          goto err;

      hi_txn = txnid;
***************
*** 331,337 ****

      /* Find a low txnid. */
      ret = 0;
!     do {
          /* txnid is after rectype, which is a u_int32. */
          memcpy(&txnid,
              (u_int8_t *)data.data + sizeof(u_int32_t), sizeof(txnid));
--- 328,334 ----

      /* Find a low txnid. */
      ret = 0;
!     if (hi_txn != 0) do {
          /* txnid is after rectype, which is a u_int32. */
          memcpy(&txnid,
              (u_int8_t *)data.data + sizeof(u_int32_t), sizeof(txnid));
***************
*** 344,354 ****
       * There are no transactions and we're not recovering to an LSN (see
       * above), so there is nothing to do.
       */
!     if (ret == DB_NOTFOUND) {
          ret = 0;
-         if (max_lsn == NULL)
-             goto done;
-     }

      /* Reset to the first lsn. */
      if (ret != 0 || (ret = logc->get(logc, &first_lsn, &data, DB_SET)) != 0)
--- 341,348 ----
       * There are no transactions and we're not recovering to an LSN (see
       * above), so there is nothing to do.
       */
!     if (ret == DB_NOTFOUND)
          ret = 0;

      /* Reset to the first lsn. */
      if (ret != 0 || (ret = logc->get(logc, &first_lsn, &data, DB_SET)) != 0)
***************
*** 367,372 ****
--- 361,370 ----
          txninfo, &data, &first_lsn, &last_lsn, nfiles, 1)) != 0)
          goto err;

+     /* If there were no transactions, then we can bail out early. */
+     if (hi_txn == 0 && max_lsn == NULL)
+         goto done;
+
      /*
       * Pass #2.
       *
***************
*** 483,488 ****
--- 481,487 ----
      if ((ret = __dbreg_close_files(dbenv)) != 0)
          goto err;

+ done:
      if (max_lsn != NULL) {
          region->last_ckp = ((DB_TXNHEAD *)txninfo)->ckplsn;

***************
*** 538,544 ****
      __db_err(dbenv, "Recovery complete at %.24s", ctime(&now));
      __db_err(dbenv, "%s %lx %s [%lu][%lu]",
          "Maximum transaction ID",
!         ((DB_TXNHEAD *)txninfo)->maxid,
          "Recovery checkpoint",
          (u_long)region->last_ckp.file,
          (u_long)region->last_ckp.offset);
--- 537,544 ----
      __db_err(dbenv, "Recovery complete at %.24s", ctime(&now));
      __db_err(dbenv, "%s %lx %s [%lu][%lu]",
          "Maximum transaction ID",
!         txninfo == NULL ? TXN_MINIMUM :
!         ((DB_TXNHEAD *)txninfo)->maxid,
          "Recovery checkpoint",
          (u_long)region->last_ckp.file,
          (u_long)region->last_ckp.offset);
***************
*** 550,556 ****
          (u_long)lsn.file, (u_long)lsn.offset, pass);
      }

- done:
  err: if (lockid != DB_LOCK_INVALIDID) {
      if ((t_ret = __rep_unlockpages(dbenv, lockid)) != 0 && ret == 0)
          ret = t_ret;
--- 550,555 ----