Magellan Linux

Contents of /trunk/db/patches/patch.4.1.25.2

Revision 144
Tue May 8 20:06:05 2007 UTC by niro
File size: 18100 byte(s)
Log message: -import

*** dbinc/mp.h.orig 2004-02-02 10:24:53.000000000 -0800
--- dbinc/mp.h 2004-02-02 10:26:27.000000000 -0800
***************
*** 149,154 ****
--- 149,161 ----
* region lock).
*/
DB_MPOOL_STAT stat; /* Per-cache mpool statistics. */
+
+ /*
+ * We track page puts so that we can decide when allocation is never
+ * going to succeed. We don't lock the field, all we care about is
+ * if it changes.
+ */
+ u_int32_t put_counter; /* Count of page put calls. */
};

struct __db_mpool_hash {
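
Note: the put_counter field added above gives the allocator a cheap, lock-free progress signal: putters bump it on every page put, and a stalled allocator can compare two snapshots to decide whether waiting longer can possibly help. A minimal standalone sketch of that idea, with hypothetical names rather than the actual Berkeley DB structures:

    #include <stdint.h>

    /* Shared progress counter; every page put increments it. The field
     * is deliberately unlocked: readers only care whether it changed. */
    volatile uint32_t put_counter;

    void page_put(void)
    {
        ++put_counter;              /* exact value is irrelevant */
    }

    /* Allocator side: fail once an entire retry cycle passes with no
     * puts observed, instead of waiting forever. */
    int alloc_with_giveup(int (*try_alloc)(void), int max_cycles)
    {
        uint32_t snapshot;
        int i;

        snapshot = put_counter;
        for (i = 0; i < max_cycles; ++i) {
            if (try_alloc())
                return (0);         /* space found */
            if (put_counter == snapshot)
                return (-1);        /* no progress: give up */
            snapshot = put_counter; /* progress seen, keep trying */
        }
        return (-1);
    }

Because only "changed or not" matters, a torn or stale read at worst costs one extra retry cycle, which is why the patch can leave the field unlocked.
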
*** mp/mp_fput.c.orig 2002-08-13 06:26:41.000000000 -0700
--- mp/mp_fput.c 2004-02-02 10:22:35.000000000 -0800
***************
*** 19,24 ****
--- 19,26 ----
#include "dbinc/db_shash.h"
#include "dbinc/mp.h"

+ static void __memp_reset_lru __P((DB_ENV *, REGINFO *));
+
/*
* __memp_fput --
* Mpool file put function.
***************
*** 198,202 ****
--- 200,255 ----

MUTEX_UNLOCK(dbenv, &hp->hash_mutex);

+ /*
+ * On every buffer put we update the buffer generation number and check
+ * for wraparound.
+ */
+ if (++c_mp->lru_count == UINT32_T_MAX)
+ __memp_reset_lru(dbenv, dbmp->reginfo);
+
return (0);
}
+
+ /*
+ * __memp_reset_lru --
+ * Reset the cache LRU counter.
+ */
+ static void
+ __memp_reset_lru(dbenv, memreg)
+ DB_ENV *dbenv;
+ REGINFO *memreg;
+ {
+ BH *bhp;
+ DB_MPOOL_HASH *hp;
+ MPOOL *c_mp;
+ int bucket;
+
+ c_mp = memreg->primary;
+
+ /*
+ * Update the counter so all future allocations will start at the
+ * bottom.
+ */
+ c_mp->lru_count -= MPOOL_BASE_DECREMENT;
+
+ /* Adjust the priority of every buffer in the system. */
+ for (hp = R_ADDR(memreg, c_mp->htab),
+ bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) {
+ /*
+ * Skip empty buckets.
+ *
+ * We can check for empty buckets before locking as we
+ * only care if the pointer is zero or non-zero.
+ */
+ if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
+ continue;
+
+ MUTEX_LOCK(dbenv, &hp->hash_mutex);
+ for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
+ bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
+ if (bhp->priority != UINT32_T_MAX &&
+ bhp->priority > MPOOL_BASE_DECREMENT)
+ bhp->priority -= MPOOL_BASE_DECREMENT;
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+ }
+ }
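
Note: the __memp_reset_lru function above (moved here from mp_alloc.c so it runs on the put path) renormalizes the 32-bit LRU clock when it is about to wrap, instead of zeroing it: the clock and every unpinned buffer priority drop by the same constant, so relative ages are preserved. Reduced to a toy array, and assuming a decrement value (MPOOL_BASE_DECREMENT's real value lives in the mpool headers):

    #include <stdint.h>

    #define BASE_DECREMENT (UINT32_MAX / 4) /* assumed stand-in value */

    struct buf {
        uint32_t priority;  /* LRU stamp; UINT32_MAX means "never evict" */
    };

    uint32_t lru_count;     /* the generation clock */

    /* Called when lru_count is about to wrap: shift the clock and all
     * unpinned priorities down together, so eviction ordering survives
     * and the clock regains headroom. */
    void reset_lru(struct buf *bufs, int n)
    {
        int i;

        lru_count -= BASE_DECREMENT;
        for (i = 0; i < n; ++i)
            if (bufs[i].priority != UINT32_MAX &&
                bufs[i].priority > BASE_DECREMENT)
                bufs[i].priority -= BASE_DECREMENT;
    }

The "priority > BASE_DECREMENT" guard keeps already-low priorities from underflowing and jumping to the top of the eviction order.
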
*** mp/mp_alloc.c.orig 2002-08-17 07:23:25.000000000 -0700
--- mp/mp_alloc.c 2004-02-02 10:28:15.000000000 -0800
***************
*** 25,31 ****
} HS;

static void __memp_bad_buffer __P((DB_MPOOL_HASH *));
- static void __memp_reset_lru __P((DB_ENV *, REGINFO *, MPOOL *));

/*
* __memp_alloc --
--- 25,30 ----
***************
*** 50,57 ****
MPOOL *c_mp;
MPOOLFILE *bh_mfp;
size_t freed_space;
! u_int32_t buckets, buffers, high_priority, max_na, priority;
! int aggressive, ret;
void *p;

dbenv = dbmp->dbenv;
--- 49,57 ----
MPOOL *c_mp;
MPOOLFILE *bh_mfp;
size_t freed_space;
! u_int32_t buckets, buffers, high_priority, priority, put_counter;
! u_int32_t total_buckets;
! int aggressive, giveup, ret;
void *p;

dbenv = dbmp->dbenv;
***************
*** 59,76 ****
dbht = R_ADDR(memreg, c_mp->htab);
hp_end = &dbht[c_mp->htab_buckets];

! buckets = buffers = 0;
! aggressive = 0;

c_mp->stat.st_alloc++;

/*
- * Get aggressive if we've tried to flush the number of pages as are
- * in the system without finding space.
- */
- max_na = 5 * c_mp->htab_buckets;
-
- /*
* If we're allocating a buffer, and the one we're discarding is the
* same size, we don't want to waste the time to re-integrate it into
* the shared memory free list. If the DB_MPOOLFILE argument isn't
--- 59,71 ----
dbht = R_ADDR(memreg, c_mp->htab);
hp_end = &dbht[c_mp->htab_buckets];

! buckets = buffers = put_counter = total_buckets = 0;
! aggressive = giveup = 0;
! hp_tmp = NULL;

c_mp->stat.st_alloc++;

/*
* If we're allocating a buffer, and the one we're discarding is the
* same size, we don't want to waste the time to re-integrate it into
* the shared memory free list. If the DB_MPOOLFILE argument isn't
***************
*** 81,99 ****
len = (sizeof(BH) - sizeof(u_int8_t)) + mfp->stat.st_pagesize;

R_LOCK(dbenv, memreg);
-
- /*
- * On every buffer allocation we update the buffer generation number
- * and check for wraparound.
- */
- if (++c_mp->lru_count == UINT32_T_MAX)
- __memp_reset_lru(dbenv, memreg, c_mp);
-
/*
* Anything newer than 1/10th of the buffer pool is ignored during
* allocation (unless allocation starts failing).
*/
- DB_ASSERT(c_mp->lru_count > c_mp->stat.st_pages / 10);
high_priority = c_mp->lru_count - c_mp->stat.st_pages / 10;

/*
--- 76,85 ----
***************
*** 120,129 ****
* We're not holding the region locked here, these statistics
* can't be trusted.
*/
! if (buckets != 0) {
! if (buckets > c_mp->stat.st_alloc_max_buckets)
! c_mp->stat.st_alloc_max_buckets = buckets;
! c_mp->stat.st_alloc_buckets += buckets;
}
if (buffers != 0) {
if (buffers > c_mp->stat.st_alloc_max_pages)
--- 106,116 ----
* We're not holding the region locked here, these statistics
* can't be trusted.
*/
! total_buckets += buckets;
! if (total_buckets != 0) {
! if (total_buckets > c_mp->stat.st_alloc_max_buckets)
! c_mp->stat.st_alloc_max_buckets = total_buckets;
! c_mp->stat.st_alloc_buckets += total_buckets;
}
if (buffers != 0) {
if (buffers > c_mp->stat.st_alloc_max_pages)
***************
*** 131,136 ****
--- 118,129 ----
c_mp->stat.st_alloc_pages += buffers;
}
return (0);
+ } else if (giveup || c_mp->stat.st_pages == 0) {
+ R_UNLOCK(dbenv, memreg);
+
+ __db_err(dbenv,
+ "unable to allocate space from the buffer cache");
+ return (ret);
}

/*
***************
*** 138,163 ****
* we need. Reset our free-space counter.
*/
freed_space = 0;

/*
* Walk the hash buckets and find the next two with potentially useful
* buffers. Free the buffer with the lowest priority from the buckets'
* chains.
*/
! for (hp_tmp = NULL;;) {
/* Check for wrap around. */
hp = &dbht[c_mp->last_checked++];
if (hp >= hp_end) {
c_mp->last_checked = 0;
!
! /*
! * If we've gone through all of the hash buckets, try
! * an allocation. If the cache is small, the old page
! * size is small, and the new page size is large, we
! * might have freed enough memory (but not 3 times the
! * memory).
! */
! goto alloc;
}

/*
--- 131,154 ----
* we need. Reset our free-space counter.
*/
freed_space = 0;
+ total_buckets += buckets;
+ buckets = 0;

/*
* Walk the hash buckets and find the next two with potentially useful
* buffers. Free the buffer with the lowest priority from the buckets'
* chains.
*/
! for (;;) {
! /* All pages have been freed, make one last try */
! if (c_mp->stat.st_pages == 0)
! goto alloc;
!
/* Check for wrap around. */
hp = &dbht[c_mp->last_checked++];
if (hp >= hp_end) {
c_mp->last_checked = 0;
! hp = &dbht[c_mp->last_checked++];
}

/*
***************
*** 172,210 ****
/*
* The failure mode is when there are too many buffers we can't
* write or there's not enough memory in the system. We don't
! * have a metric for deciding if allocation has no possible way
! * to succeed, so we don't ever fail, we assume memory will be
! * available if we wait long enough.
*
! * Get aggressive if we've tried to flush 5 times the number of
! * hash buckets as are in the system -- it's possible we have
! * been repeatedly trying to flush the same buffers, although
! * it's unlikely. Aggressive means:
*
* a: set a flag to attempt to flush high priority buffers as
* well as other buffers.
* b: sync the mpool to force out queue extent pages. While we
* might not have enough space for what we want and flushing
* is expensive, why not?
! * c: sleep for a second -- hopefully someone else will run and
! * free up some memory. Try to allocate memory too, in case
! * the other thread returns its memory to the region.
! * d: look at a buffer in every hash bucket rather than choose
* the more preferable of two.
*
* !!!
* This test ignores pathological cases like no buffers in the
* system -- that shouldn't be possible.
*/
! if ((++buckets % max_na) == 0) {
! aggressive = 1;
!
R_UNLOCK(dbenv, memreg);

! (void)__memp_sync_int(
! dbenv, NULL, 0, DB_SYNC_ALLOC, NULL);
!
! (void)__os_sleep(dbenv, 1, 0);

R_LOCK(dbenv, memreg);
goto alloc;
--- 163,221 ----
/*
* The failure mode is when there are too many buffers we can't
* write or there's not enough memory in the system. We don't
! * have a way to know that allocation has no way to succeed.
! * We fail if there were no pages returned to the cache after
! * we've been trying for a relatively long time.
*
! * Get aggressive if we've tried to flush the number of hash
! * buckets as are in the system and have not found any more
! * space. Aggressive means:
*
* a: set a flag to attempt to flush high priority buffers as
* well as other buffers.
* b: sync the mpool to force out queue extent pages. While we
* might not have enough space for what we want and flushing
* is expensive, why not?
! * c: look at a buffer in every hash bucket rather than choose
* the more preferable of two.
+ * d: start to think about giving up.
+ *
+ * If we get here twice, sleep for a second, hopefully someone
+ * else will run and free up some memory.
+ *
+ * Always try to allocate memory too, in case some other thread
+ * returns its memory to the region.
*
* !!!
* This test ignores pathological cases like no buffers in the
* system -- that shouldn't be possible.
*/
! if ((++buckets % c_mp->htab_buckets) == 0) {
! if (freed_space > 0)
! goto alloc;
R_UNLOCK(dbenv, memreg);

! switch (++aggressive) {
! case 1:
! break;
! case 2:
! put_counter = c_mp->put_counter;
! /* FALLTHROUGH */
! case 3:
! case 4:
! case 5:
! case 6:
! (void)__memp_sync_int(
! dbenv, NULL, 0, DB_SYNC_ALLOC, NULL);
!
! (void)__os_sleep(dbenv, 1, 0);
! break;
! default:
! aggressive = 1;
! if (put_counter == c_mp->put_counter)
! giveup = 1;
! break;
! }

R_LOCK(dbenv, memreg);
goto alloc;
***************
*** 277,283 ****
* thread may have acquired this buffer and incremented the ref
* count after we wrote it, in which case we can't have it.
*
! * If there's a write error, avoid selecting this buffer again
* by making it the bucket's least-desirable buffer.
*/
if (ret != 0 || bhp->ref != 0) {
--- 288,295 ----
* thread may have acquired this buffer and incremented the ref
* count after we wrote it, in which case we can't have it.
*
! * If there's a write error and we're having problems finding
! * something to allocate, avoid selecting this buffer again
! * by making it the bucket's least-desirable buffer.
*/
if (ret != 0 || bhp->ref != 0) {
***************
*** 301,306 ****
--- 313,320 ----

freed_space += __db_shsizeof(bhp);
__memp_bhfree(dbmp, hp, bhp, 1);
+ if (aggressive > 1)
+ aggressive = 1;

/*
* Unlock this hash bucket and re-acquire the region lock. If
***************
*** 362,415 ****
hp->hash_priority = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
}

- /*
- * __memp_reset_lru --
- * Reset the cache LRU counter.
- */
- static void
- __memp_reset_lru(dbenv, memreg, c_mp)
- DB_ENV *dbenv;
- REGINFO *memreg;
- MPOOL *c_mp;
- {
- BH *bhp;
- DB_MPOOL_HASH *hp;
- int bucket;
-
- /*
- * Update the counter so all future allocations will start at the
- * bottom.
- */
- c_mp->lru_count -= MPOOL_BASE_DECREMENT;
-
- /* Release the region lock. */
- R_UNLOCK(dbenv, memreg);
-
- /* Adjust the priority of every buffer in the system. */
- for (hp = R_ADDR(memreg, c_mp->htab),
- bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) {
- /*
- * Skip empty buckets.
- *
- * We can check for empty buckets before locking as we
- * only care if the pointer is zero or non-zero.
- */
- if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
- continue;
-
- MUTEX_LOCK(dbenv, &hp->hash_mutex);
- for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
- bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
- if (bhp->priority != UINT32_T_MAX &&
- bhp->priority > MPOOL_BASE_DECREMENT)
- bhp->priority -= MPOOL_BASE_DECREMENT;
- MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
- }
-
- /* Reacquire the region lock. */
- R_LOCK(dbenv, memreg);
- }
-
#ifdef DIAGNOSTIC
/*
* __memp_check_order --
--- 376,381 ----
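
Note: the net effect of the mp_alloc.c changes is an escalation ladder in place of the old wait-forever loop. Each full sweep of the hash buckets without enough freed space bumps an aggressive level; level 2 snapshots put_counter; levels 2 through 6 sync the pool and sleep a second so other threads can return pages; and when the ladder wraps with put_counter unchanged, the allocation finally fails. A condensed sketch of that control flow, abstracted from the surrounding locking (sync_pool and sleep_one_second are stand-ins for __memp_sync_int and __os_sleep):

    #include <stdint.h>
    #include <unistd.h>

    static void sync_pool(void)        { /* flush dirty buffers */ }
    static void sleep_one_second(void) { sleep(1); }

    /* One escalation step after a fruitless sweep of all hash buckets.
     * Returns 1 to keep searching, 0 to give up on the allocation. */
    int escalate(int *aggressive, uint32_t *snapshot, uint32_t puts_now)
    {
        switch (++*aggressive) {
        case 1:                 /* widen the search to high-priority buffers */
            break;
        case 2:
            *snapshot = puts_now;   /* remember put activity at this point */
            /* FALLTHROUGH */
        case 3:
        case 4:
        case 5:
        case 6:
            sync_pool();            /* force out pages we could then evict */
            sleep_one_second();     /* let other threads free memory */
            break;
        default:
            *aggressive = 1;        /* restart the ladder */
            if (*snapshot == puts_now)
                return (0);         /* no puts the whole time: give up */
            break;
        }
        return (1);
    }

Dropping aggressive back to 1 after a successful free (the "if (aggressive > 1)" hunk above) ensures the expensive sync-and-sleep steps only repeat while the cache is making no progress at all.
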
*** dbreg/dbreg_rec.c.orig 2002-08-17 07:22:52.000000000 -0700
--- dbreg/dbreg_rec.c 2003-11-08 10:59:19.000000000 -0800
***************
*** 174,192 ****
* Typically, closes should match an open which means
* that if this is a close, there should be a valid
* entry in the dbentry table when we get here,
! * however there is an exception. If this is an
* OPENFILES pass, then we may have started from
* a log file other than the first, and the
* corresponding open appears in an earlier file.
! * We can ignore that case, but all others are errors.
*/
dbe = &dblp->dbentry[argp->fileid];
if (dbe->dbp == NULL && !dbe->deleted) {
/* No valid entry here. */
! if ((argp->opcode != LOG_CLOSE &&
! argp->opcode != LOG_RCLOSE) ||
! (op != DB_TXN_OPENFILES &&
! op !=DB_TXN_POPENFILES)) {
__db_err(dbenv,
"Improper file close at %lu/%lu",
(u_long)lsnp->file,
--- 174,193 ----
* Typically, closes should match an open which means
* that if this is a close, there should be a valid
* entry in the dbentry table when we get here,
! * however there are exceptions. 1. If this is an
* OPENFILES pass, then we may have started from
* a log file other than the first, and the
* corresponding open appears in an earlier file.
! * 2. If we are undoing an open on an abort or
! * recovery, it's possible that we failed after
! * the log record, but before we actually entered
! * a handle here.
*/
dbe = &dblp->dbentry[argp->fileid];
if (dbe->dbp == NULL && !dbe->deleted) {
/* No valid entry here. */
! if (DB_REDO(op) ||
! argp->opcode == LOG_CHECKPOINT) {
__db_err(dbenv,
"Improper file close at %lu/%lu",
(u_long)lsnp->file,
*** env/env_recover.c.orig.1 2002-08-22 14:52:51.000000000 -0700
--- env/env_recover.c 2003-11-15 08:20:59.000000000 -0800
***************
*** 232,243 ****
* we'll still need to do a vtruncate based on information we haven't
* yet collected.
*/
! if (ret == DB_NOTFOUND) {
ret = 0;
! if (max_lsn == NULL)
! goto done;
! }
! if (ret != 0)
goto err;

hi_txn = txnid;
--- 232,240 ----
* we'll still need to do a vtruncate based on information we haven't
* yet collected.
*/
! if (ret == DB_NOTFOUND)
ret = 0;
! else if (ret != 0)
goto err;

hi_txn = txnid;
***************
*** 331,337 ****

/* Find a low txnid. */
ret = 0;
! do {
/* txnid is after rectype, which is a u_int32. */
memcpy(&txnid,
(u_int8_t *)data.data + sizeof(u_int32_t), sizeof(txnid));
--- 328,334 ----

/* Find a low txnid. */
ret = 0;
! if (hi_txn != 0) do {
/* txnid is after rectype, which is a u_int32. */
memcpy(&txnid,
(u_int8_t *)data.data + sizeof(u_int32_t), sizeof(txnid));
***************
*** 344,354 ****
* There are no transactions and we're not recovering to an LSN (see
* above), so there is nothing to do.
*/
! if (ret == DB_NOTFOUND) {
ret = 0;
- if (max_lsn == NULL)
- goto done;
- }

/* Reset to the first lsn. */
if (ret != 0 || (ret = logc->get(logc, &first_lsn, &data, DB_SET)) != 0)
--- 341,348 ----
* There are no transactions and we're not recovering to an LSN (see
* above), so there is nothing to do.
*/
! if (ret == DB_NOTFOUND)
ret = 0;

/* Reset to the first lsn. */
if (ret != 0 || (ret = logc->get(logc, &first_lsn, &data, DB_SET)) != 0)
***************
*** 367,372 ****
--- 361,370 ----
txninfo, &data, &first_lsn, &last_lsn, nfiles, 1)) != 0)
goto err;

+ /* If there were no transactions, then we can bail out early. */
+ if (hi_txn == 0 && max_lsn == NULL)
+ goto done;
+
/*
* Pass #2.
*
***************
*** 483,488 ****
--- 481,487 ----
if ((ret = __dbreg_close_files(dbenv)) != 0)
goto err;

+ done:
if (max_lsn != NULL) {
region->last_ckp = ((DB_TXNHEAD *)txninfo)->ckplsn;

***************
*** 538,544 ****
__db_err(dbenv, "Recovery complete at %.24s", ctime(&now));
__db_err(dbenv, "%s %lx %s [%lu][%lu]",
"Maximum transaction ID",
! ((DB_TXNHEAD *)txninfo)->maxid,
"Recovery checkpoint",
(u_long)region->last_ckp.file,
(u_long)region->last_ckp.offset);
--- 537,544 ----
__db_err(dbenv, "Recovery complete at %.24s", ctime(&now));
__db_err(dbenv, "%s %lx %s [%lu][%lu]",
"Maximum transaction ID",
! txninfo == NULL ? TXN_MINIMUM :
! ((DB_TXNHEAD *)txninfo)->maxid,
"Recovery checkpoint",
(u_long)region->last_ckp.file,
(u_long)region->last_ckp.offset);
***************
*** 550,556 ****
(u_long)lsn.file, (u_long)lsn.offset, pass);
}

- done:
err: if (lockid != DB_LOCK_INVALIDID) {
if ((t_ret = __rep_unlockpages(dbenv, lockid)) != 0 && ret == 0)
ret = t_ret;
--- 550,555 ----