bt_put.c

Go to the documentation of this file.
00001 /*-
00002  * See the file LICENSE for redistribution information.
00003  *
00004  * Copyright (c) 1996-2002
00005  *      Sleepycat Software.  All rights reserved.
00006  */
00007 /*
00008  * Copyright (c) 1990, 1993, 1994, 1995, 1996
00009  *      Keith Bostic.  All rights reserved.
00010  */
00011 /*
00012  * Copyright (c) 1990, 1993, 1994, 1995
00013  *      The Regents of the University of California.  All rights reserved.
00014  *
00015  * This code is derived from software contributed to Berkeley by
00016  * Mike Olson.
00017  *
00018  * Redistribution and use in source and binary forms, with or without
00019  * modification, are permitted provided that the following conditions
00020  * are met:
00021  * 1. Redistributions of source code must retain the above copyright
00022  *    notice, this list of conditions and the following disclaimer.
00023  * 2. Redistributions in binary form must reproduce the above copyright
00024  *    notice, this list of conditions and the following disclaimer in the
00025  *    documentation and/or other materials provided with the distribution.
00026  * 3. Neither the name of the University nor the names of its contributors
00027  *    may be used to endorse or promote products derived from this software
00028  *    without specific prior written permission.
00029  *
00030  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
00031  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00032  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00033  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
00034  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00035  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
00036  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00037  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00038  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
00039  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00040  * SUCH DAMAGE.
00041  */
00042 
00043 #include "db_config.h"
00044 
00045 #ifndef lint
00046 static const char revid[] = "$Id: bt_put.c,v 11.69 2002/08/06 06:11:12 bostic Exp $";
00047 #endif /* not lint */
00048 
00049 #ifndef NO_SYSTEM_INCLUDES
00050 #include <sys/types.h>
00051 
00052 #include <string.h>
00053 #endif
00054 
00055 #include "db_int.h"
00056 #include "dbinc/db_page.h"
00057 #include "dbinc/btree.h"
00058 
00059 static int __bam_build
00060                __P((DBC *, u_int32_t, DBT *, PAGE *, u_int32_t, u_int32_t));
00061 static int __bam_dup_convert __P((DBC *, PAGE *, u_int32_t));
00062 static int __bam_ovput
00063                __P((DBC *, u_int32_t, db_pgno_t, PAGE *, u_int32_t, DBT *));
00064 static u_int32_t
00065            __bam_partsize __P((DB *, u_int32_t, DBT *, PAGE *, u_int32_t));
00066 
00067 /*
00068  * __bam_iitem --
00069  *      Insert an item into the tree.
00070  *
00071  * PUBLIC: int __bam_iitem __P((DBC *, DBT *, DBT *, u_int32_t, u_int32_t));
00072  */
00073 int
00074 __bam_iitem(dbc, key, data, op, flags)
00075         DBC *dbc;
00076         DBT *key, *data;
00077         u_int32_t op, flags;
00078 {
00079         BKEYDATA *bk, bk_tmp;
00080         BTREE *t;
00081         BTREE_CURSOR *cp;
00082         DB *dbp;
00083         DBT bk_hdr, tdbt;
00084         DB_MPOOLFILE *mpf;
00085         PAGE *h;
00086         db_indx_t indx;
00087         u_int32_t data_size, have_bytes, need_bytes, needed;
00088         int cmp, bigkey, bigdata, dupadjust, padrec, replace, ret, was_deleted;
00089 
00090         COMPQUIET(bk, NULL);
00091 
00092         dbp = dbc->dbp;
00093         mpf = dbp->mpf;
00094         cp = (BTREE_CURSOR *)dbc->internal;
00095         t = dbp->bt_internal;
00096         h = cp->page;
00097         indx = cp->indx;
00098         dupadjust = replace = was_deleted = 0;
00099 
00100         /*
00101          * Fixed-length records with partial puts: it's an error to specify
00102          * anything other simple overwrite.
00103          */
00104         if (F_ISSET(dbp, DB_AM_FIXEDLEN) &&
00105             F_ISSET(data, DB_DBT_PARTIAL) && data->dlen != data->size) {
00106                 data_size = data->size;
00107                 goto len_err;
00108         }
00109 
00110         /*
00111          * Figure out how much space the data will take, including if it's a
00112          * partial record.
00113          *
00114          * Fixed-length records: it's an error to specify a record that's
00115          * longer than the fixed-length, and we never require less than
00116          * the fixed-length record size.
00117          */
00118         data_size = F_ISSET(data, DB_DBT_PARTIAL) ?
00119             __bam_partsize(dbp, op, data, h, indx) : data->size;
00120         padrec = 0;
00121         if (F_ISSET(dbp, DB_AM_FIXEDLEN)) {
00122                 if (data_size > t->re_len) {
00123 len_err:                __db_err(dbp->dbenv,
00124                             "Length improper for fixed length record %lu",
00125                             (u_long)data_size);
00126                         return (EINVAL);
00127                 }
00128 
00129                 /* Records that are deleted anyway needn't be padded out. */
00130                 if (!LF_ISSET(BI_DELETED) && data_size < t->re_len) {
00131                         padrec = 1;
00132                         data_size = t->re_len;
00133                 }
00134         }
00135 
00136         /*
00137          * Handle partial puts or short fixed-length records: build the
00138          * real record.
00139          */
00140         if (padrec || F_ISSET(data, DB_DBT_PARTIAL)) {
00141                 tdbt = *data;
00142                 if ((ret =
00143                     __bam_build(dbc, op, &tdbt, h, indx, data_size)) != 0)
00144                         return (ret);
00145                 data = &tdbt;
00146         }
00147 
00148         /*
00149          * If the user has specified a duplicate comparison function, return
00150          * an error if DB_CURRENT was specified and the replacement data
00151          * doesn't compare equal to the current data.  This stops apps from
00152          * screwing up the duplicate sort order.  We have to do this after
00153          * we build the real record so that we're comparing the real items.
00154          */
00155         if (op == DB_CURRENT && dbp->dup_compare != NULL) {
00156                 if ((ret = __bam_cmp(dbp, data, h,
00157                     indx + (TYPE(h) == P_LBTREE ? O_INDX : 0),
00158                     dbp->dup_compare, &cmp)) != 0)
00159                         return (ret);
00160                 if (cmp != 0) {
00161                         __db_err(dbp->dbenv,
00162                             "Current data differs from put data");
00163                         return (EINVAL);
00164                 }
00165         }
00166 
00167         /*
00168          * If the key or data item won't fit on a page, we'll have to store
00169          * them on overflow pages.
00170          */
00171         needed = 0;
00172         bigdata = data_size > cp->ovflsize;
00173         switch (op) {
00174         case DB_KEYFIRST:
00175                 /* We're adding a new key and data pair. */
00176                 bigkey = key->size > cp->ovflsize;
00177                 if (bigkey)
00178                         needed += BOVERFLOW_PSIZE;
00179                 else
00180                         needed += BKEYDATA_PSIZE(key->size);
00181                 if (bigdata)
00182                         needed += BOVERFLOW_PSIZE;
00183                 else
00184                         needed += BKEYDATA_PSIZE(data_size);
00185                 break;
00186         case DB_AFTER:
00187         case DB_BEFORE:
00188         case DB_CURRENT:
00189                 /*
00190                  * We're either overwriting the data item of a key/data pair
00191                  * or we're creating a new on-page duplicate and only adding
00192                  * a data item.
00193                  *
00194                  * !!!
00195                  * We're not currently correcting for space reclaimed from
00196                  * already deleted items, but I don't think it's worth the
00197                  * complexity.
00198                  */
00199                 bigkey = 0;
00200                 if (op == DB_CURRENT) {
00201                         bk = GET_BKEYDATA(dbp, h,
00202                             indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
00203                         if (B_TYPE(bk->type) == B_KEYDATA)
00204                                 have_bytes = BKEYDATA_PSIZE(bk->len);
00205                         else
00206                                 have_bytes = BOVERFLOW_PSIZE;
00207                         need_bytes = 0;
00208                 } else {
00209                         have_bytes = 0;
00210                         need_bytes = sizeof(db_indx_t);
00211                 }
00212                 if (bigdata)
00213                         need_bytes += BOVERFLOW_PSIZE;
00214                 else
00215                         need_bytes += BKEYDATA_PSIZE(data_size);
00216 
00217                 if (have_bytes < need_bytes)
00218                         needed += need_bytes - have_bytes;
00219                 break;
00220         default:
00221                 return (__db_unknown_flag(dbp->dbenv, "__bam_iitem", op));
00222         }
00223 
00224         /*
00225          * If there's not enough room, or the user has put a ceiling on the
00226          * number of keys permitted in the page, split the page.
00227          *
00228          * XXX
00229          * The t->bt_maxkey test here may be insufficient -- do we have to
00230          * check in the btree split code, so we don't undo it there!?!?
00231          */
00232         if (P_FREESPACE(dbp, h) < needed ||
00233             (t->bt_maxkey != 0 && NUM_ENT(h) > t->bt_maxkey))
00234                 return (DB_NEEDSPLIT);
00235 
00236         /*
00237          * The code breaks it up into five cases:
00238          *
00239          * 1. Insert a new key/data pair.
00240          * 2. Append a new data item (a new duplicate).
00241          * 3. Insert a new data item (a new duplicate).
00242          * 4. Delete and re-add the data item (overflow item).
00243          * 5. Overwrite the data item.
00244          */
00245         switch (op) {
00246         case DB_KEYFIRST:               /* 1. Insert a new key/data pair. */
00247                 if (bigkey) {
00248                         if ((ret = __bam_ovput(dbc,
00249                             B_OVERFLOW, PGNO_INVALID, h, indx, key)) != 0)
00250                                 return (ret);
00251                 } else
00252                         if ((ret = __db_pitem(dbc, h, indx,
00253                             BKEYDATA_SIZE(key->size), NULL, key)) != 0)
00254                                 return (ret);
00255 
00256                 if ((ret = __bam_ca_di(dbc, PGNO(h), indx, 1)) != 0)
00257                         return (ret);
00258                 ++indx;
00259                 break;
00260         case DB_AFTER:                  /* 2. Append a new data item. */
00261                 if (TYPE(h) == P_LBTREE) {
00262                         /* Copy the key for the duplicate and adjust cursors. */
00263                         if ((ret =
00264                             __bam_adjindx(dbc, h, indx + P_INDX, indx, 1)) != 0)
00265                                 return (ret);
00266                         if ((ret =
00267                             __bam_ca_di(dbc, PGNO(h), indx + P_INDX, 1)) != 0)
00268                                 return (ret);
00269 
00270                         indx += 3;
00271                         dupadjust = 1;
00272 
00273                         cp->indx += 2;
00274                 } else {
00275                         ++indx;
00276                         cp->indx += 1;
00277                 }
00278                 break;
00279         case DB_BEFORE:                 /* 3. Insert a new data item. */
00280                 if (TYPE(h) == P_LBTREE) {
00281                         /* Copy the key for the duplicate and adjust cursors. */
00282                         if ((ret = __bam_adjindx(dbc, h, indx, indx, 1)) != 0)
00283                                 return (ret);
00284                         if ((ret = __bam_ca_di(dbc, PGNO(h), indx, 1)) != 0)
00285                                 return (ret);
00286 
00287                         ++indx;
00288                         dupadjust = 1;
00289                 }
00290                 break;
00291         case DB_CURRENT:
00292                  /*
00293                   * Clear the cursor's deleted flag.  The problem is that if
00294                   * we deadlock or fail while deleting the overflow item or
00295                   * replacing the non-overflow item, a subsequent cursor close
00296                   * will try and remove the item because the cursor's delete
00297                   * flag is set
00298                   */
00299                 (void)__bam_ca_delete(dbp, PGNO(h), indx, 0);
00300 
00301                 if (TYPE(h) == P_LBTREE) {
00302                         ++indx;
00303                         dupadjust = 1;
00304 
00305                         /*
00306                          * In a Btree deleted records aren't counted (deleted
00307                          * records are counted in a Recno because all accesses
00308                          * are based on record number).  If it's a Btree and
00309                          * it's a DB_CURRENT operation overwriting a previously
00310                          * deleted record, increment the record count.
00311                          */
00312                         was_deleted = B_DISSET(bk->type);
00313                 }
00314 
00315                 /*
00316                  * 4. Delete and re-add the data item.
00317                  *
00318                  * If we're changing the type of the on-page structure, or we
00319                  * are referencing offpage items, we have to delete and then
00320                  * re-add the item.  We do not do any cursor adjustments here
00321                  * because we're going to immediately re-add the item into the
00322                  * same slot.
00323                  */
00324                 if (bigdata || B_TYPE(bk->type) != B_KEYDATA) {
00325                         if ((ret = __bam_ditem(dbc, h, indx)) != 0)
00326                                 return (ret);
00327                         break;
00328                 }
00329 
00330                 /* 5. Overwrite the data item. */
00331                 replace = 1;
00332                 break;
00333         default:
00334                 return (__db_unknown_flag(dbp->dbenv, "__bam_iitem", op));
00335         }
00336 
00337         /* Add the data. */
00338         if (bigdata) {
00339                 /*
00340                  * We do not have to handle deleted (BI_DELETED) records
00341                  * in this case; the actual records should never be created.
00342                  */
00343                 DB_ASSERT(!LF_ISSET(BI_DELETED));
00344                 if ((ret = __bam_ovput(dbc,
00345                     B_OVERFLOW, PGNO_INVALID, h, indx, data)) != 0)
00346                         return (ret);
00347         } else {
00348                 if (LF_ISSET(BI_DELETED)) {
00349                         B_TSET(bk_tmp.type, B_KEYDATA, 1);
00350                         bk_tmp.len = data->size;
00351                         bk_hdr.data = &bk_tmp;
00352                         bk_hdr.size = SSZA(BKEYDATA, data);
00353                         ret = __db_pitem(dbc, h, indx,
00354                             BKEYDATA_SIZE(data->size), &bk_hdr, data);
00355                 } else if (replace)
00356                         ret = __bam_ritem(dbc, h, indx, data);
00357                 else
00358                         ret = __db_pitem(dbc, h, indx,
00359                             BKEYDATA_SIZE(data->size), NULL, data);
00360                 if (ret != 0)
00361                         return (ret);
00362         }
00363         if ((ret = mpf->set(mpf, h, DB_MPOOL_DIRTY)) != 0)
00364                 return (ret);
00365 
00366         /*
00367          * Re-position the cursors if necessary and reset the current cursor
00368          * to point to the new item.
00369          */
00370         if (op != DB_CURRENT) {
00371                 if ((ret = __bam_ca_di(dbc, PGNO(h), indx, 1)) != 0)
00372                         return (ret);
00373                 cp->indx = TYPE(h) == P_LBTREE ? indx - O_INDX : indx;
00374         }
00375 
00376         /*
00377          * If we've changed the record count, update the tree.  There's no
00378          * need to adjust the count if the operation not performed on the
00379          * current record or when the current record was previously deleted.
00380          */
00381         if (F_ISSET(cp, C_RECNUM) && (op != DB_CURRENT || was_deleted))
00382                 if ((ret = __bam_adjust(dbc, 1)) != 0)
00383                         return (ret);
00384 
00385         /*
00386          * If a Btree leaf page is at least 50% full and we may have added or
00387          * modified a duplicate data item, see if the set of duplicates takes
00388          * up at least 25% of the space on the page.  If it does, move it onto
00389          * its own page.
00390          */
00391         if (dupadjust && P_FREESPACE(dbp, h) <= dbp->pgsize / 2) {
00392                 if ((ret = __bam_dup_convert(dbc, h, indx - O_INDX)) != 0)
00393                         return (ret);
00394         }
00395 
00396         /* If we've modified a recno file, set the flag. */
00397         if (dbc->dbtype == DB_RECNO)
00398                 t->re_modified = 1;
00399 
00400         return (ret);
00401 }
00402 
00403 /*
00404  * __bam_partsize --
00405  *      Figure out how much space a partial data item is in total.
00406  */
00407 static u_int32_t
00408 __bam_partsize(dbp, op, data, h, indx)
00409         DB *dbp;
00410         u_int32_t op, indx;
00411         DBT *data;
00412         PAGE *h;
00413 {
00414         BKEYDATA *bk;
00415         u_int32_t nbytes;
00416 
00417         /*
00418          * If the record doesn't already exist, it's simply the data we're
00419          * provided.
00420          */
00421         if (op != DB_CURRENT)
00422                 return (data->doff + data->size);
00423 
00424         /*
00425          * Otherwise, it's the data provided plus any already existing data
00426          * that we're not replacing.
00427          */
00428         bk = GET_BKEYDATA(dbp, h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
00429         nbytes =
00430             B_TYPE(bk->type) == B_OVERFLOW ? ((BOVERFLOW *)bk)->tlen : bk->len;
00431 
00432         return (__db_partsize(nbytes, data));
00433 }
00434 
00435 /*
00436  * __bam_build --
00437  *      Build the real record for a partial put, or short fixed-length record.
00438  */
00439 static int
00440 __bam_build(dbc, op, dbt, h, indx, nbytes)
00441         DBC *dbc;
00442         u_int32_t op, indx, nbytes;
00443         DBT *dbt;
00444         PAGE *h;
00445 {
00446         BKEYDATA *bk, tbk;
00447         BOVERFLOW *bo;
00448         BTREE *t;
00449         DB *dbp;
00450         DBT copy, *rdata;
00451         u_int32_t len, tlen;
00452         u_int8_t *p;
00453         int ret;
00454 
00455         COMPQUIET(bo, NULL);
00456 
00457         dbp = dbc->dbp;
00458         t = dbp->bt_internal;
00459 
00460         /* We use the record data return memory, it's only a short-term use. */
00461         rdata = &dbc->my_rdata;
00462         if (rdata->ulen < nbytes) {
00463                 if ((ret = __os_realloc(dbp->dbenv,
00464                     nbytes, &rdata->data)) != 0) {
00465                         rdata->ulen = 0;
00466                         rdata->data = NULL;
00467                         return (ret);
00468                 }
00469                 rdata->ulen = nbytes;
00470         }
00471 
00472         /*
00473          * We use nul or pad bytes for any part of the record that isn't
00474          * specified; get it over with.
00475          */
00476         memset(rdata->data,
00477            F_ISSET(dbp, DB_AM_FIXEDLEN) ? t->re_pad : 0, nbytes);
00478 
00479         /*
00480          * In the next clauses, we need to do three things: a) set p to point
00481          * to the place at which to copy the user's data, b) set tlen to the
00482          * total length of the record, not including the bytes contributed by
00483          * the user, and c) copy any valid data from an existing record.  If
00484          * it's not a partial put (this code is called for both partial puts
00485          * and fixed-length record padding) or it's a new key, we can cut to
00486          * the chase.
00487          */
00488         if (!F_ISSET(dbt, DB_DBT_PARTIAL) || op != DB_CURRENT) {
00489                 p = (u_int8_t *)rdata->data + dbt->doff;
00490                 tlen = dbt->doff;
00491                 goto user_copy;
00492         }
00493 
00494         /* Find the current record. */
00495         if (indx < NUM_ENT(h)) {
00496                 bk = GET_BKEYDATA(dbp, h, indx + (TYPE(h) == P_LBTREE ?
00497                     O_INDX : 0));
00498                 bo = (BOVERFLOW *)bk;
00499         } else {
00500                 bk = &tbk;
00501                 B_TSET(bk->type, B_KEYDATA, 0);
00502                 bk->len = 0;
00503         }
00504         if (B_TYPE(bk->type) == B_OVERFLOW) {
00505                 /*
00506                  * In the case of an overflow record, we shift things around
00507                  * in the current record rather than allocate a separate copy.
00508                  */
00509                 memset(&copy, 0, sizeof(copy));
00510                 if ((ret = __db_goff(dbp, &copy, bo->tlen,
00511                     bo->pgno, &rdata->data, &rdata->ulen)) != 0)
00512                         return (ret);
00513 
00514                 /* Skip any leading data from the original record. */
00515                 tlen = dbt->doff;
00516                 p = (u_int8_t *)rdata->data + dbt->doff;
00517 
00518                 /*
00519                  * Copy in any trailing data from the original record.
00520                  *
00521                  * If the original record was larger than the original offset
00522                  * plus the bytes being deleted, there is trailing data in the
00523                  * original record we need to preserve.  If we aren't deleting
00524                  * the same number of bytes as we're inserting, copy it up or
00525                  * down, into place.
00526                  *
00527                  * Use memmove(), the regions may overlap.
00528                  */
00529                 if (bo->tlen > dbt->doff + dbt->dlen) {
00530                         len = bo->tlen - (dbt->doff + dbt->dlen);
00531                         if (dbt->dlen != dbt->size)
00532                                 memmove(p + dbt->size, p + dbt->dlen, len);
00533                         tlen += len;
00534                 }
00535         } else {
00536                 /* Copy in any leading data from the original record. */
00537                 memcpy(rdata->data,
00538                     bk->data, dbt->doff > bk->len ? bk->len : dbt->doff);
00539                 tlen = dbt->doff;
00540                 p = (u_int8_t *)rdata->data + dbt->doff;
00541 
00542                 /* Copy in any trailing data from the original record. */
00543                 len = dbt->doff + dbt->dlen;
00544                 if (bk->len > len) {
00545                         memcpy(p + dbt->size, bk->data + len, bk->len - len);
00546                         tlen += bk->len - len;
00547                 }
00548         }
00549 
00550 user_copy:
00551         /*
00552          * Copy in the application provided data -- p and tlen must have been
00553          * initialized above.
00554          */
00555         memcpy(p, dbt->data, dbt->size);
00556         tlen += dbt->size;
00557 
00558         /* Set the DBT to reference our new record. */
00559         rdata->size = F_ISSET(dbp, DB_AM_FIXEDLEN) ? t->re_len : tlen;
00560         rdata->dlen = 0;
00561         rdata->doff = 0;
00562         rdata->flags = 0;
00563         *dbt = *rdata;
00564         return (0);
00565 }
00566 
00567 /*
00568  * __bam_ritem --
00569  *      Replace an item on a page.
00570  *
00571  * PUBLIC: int __bam_ritem __P((DBC *, PAGE *, u_int32_t, DBT *));
00572  */
00573 int
00574 __bam_ritem(dbc, h, indx, data)
00575         DBC *dbc;
00576         PAGE *h;
00577         u_int32_t indx;
00578         DBT *data;
00579 {
00580         BKEYDATA *bk;
00581         DB *dbp;
00582         DBT orig, repl;
00583         db_indx_t cnt, lo, ln, min, off, prefix, suffix;
00584         int32_t nbytes;
00585         int ret;
00586         db_indx_t *inp;
00587         u_int8_t *p, *t;
00588 
00589         dbp = dbc->dbp;
00590 
00591         /*
00592          * Replace a single item onto a page.  The logic figuring out where
00593          * to insert and whether it fits is handled in the caller.  All we do
00594          * here is manage the page shuffling.
00595          */
00596         bk = GET_BKEYDATA(dbp, h, indx);
00597 
00598         /* Log the change. */
00599         if (DBC_LOGGING(dbc)) {
00600                 /*
00601                  * We might as well check to see if the two data items share
00602                  * a common prefix and suffix -- it can save us a lot of log
00603                  * message if they're large.
00604                  */
00605                 min = data->size < bk->len ? data->size : bk->len;
00606                 for (prefix = 0,
00607                     p = bk->data, t = data->data;
00608                     prefix < min && *p == *t; ++prefix, ++p, ++t)
00609                         ;
00610 
00611                 min -= prefix;
00612                 for (suffix = 0,
00613                     p = (u_int8_t *)bk->data + bk->len - 1,
00614                     t = (u_int8_t *)data->data + data->size - 1;
00615                     suffix < min && *p == *t; ++suffix, --p, --t)
00616                         ;
00617 
00618                 /* We only log the parts of the keys that have changed. */
00619                 orig.data = (u_int8_t *)bk->data + prefix;
00620                 orig.size = bk->len - (prefix + suffix);
00621                 repl.data = (u_int8_t *)data->data + prefix;
00622                 repl.size = data->size - (prefix + suffix);
00623                 if ((ret = __bam_repl_log(dbp, dbc->txn, &LSN(h), 0, PGNO(h),
00624                     &LSN(h), (u_int32_t)indx, (u_int32_t)B_DISSET(bk->type),
00625                     &orig, &repl, (u_int32_t)prefix, (u_int32_t)suffix)) != 0)
00626                         return (ret);
00627         } else
00628                 LSN_NOT_LOGGED(LSN(h));
00629 
00630         /*
00631          * Set references to the first in-use byte on the page and the
00632          * first byte of the item being replaced.
00633          */
00634         inp = P_INP(dbp, h);
00635         p = (u_int8_t *)h + HOFFSET(h);
00636         t = (u_int8_t *)bk;
00637 
00638         /*
00639          * If the entry is growing in size, shift the beginning of the data
00640          * part of the page down.  If the entry is shrinking in size, shift
00641          * the beginning of the data part of the page up.  Use memmove(3),
00642          * the regions overlap.
00643          */
00644         lo = BKEYDATA_SIZE(bk->len);
00645         ln = (db_indx_t)BKEYDATA_SIZE(data->size);
00646         if (lo != ln) {
00647                 nbytes = lo - ln;               /* Signed difference. */
00648                 if (p == t)                     /* First index is fast. */
00649                         inp[indx] += nbytes;
00650                 else {                          /* Else, shift the page. */
00651                         memmove(p + nbytes, p, t - p);
00652 
00653                         /* Adjust the indices' offsets. */
00654                         off = inp[indx];
00655                         for (cnt = 0; cnt < NUM_ENT(h); ++cnt)
00656                                 if (inp[cnt] <= off)
00657                                         inp[cnt] += nbytes;
00658                 }
00659 
00660                 /* Clean up the page and adjust the item's reference. */
00661                 HOFFSET(h) += nbytes;
00662                 t += nbytes;
00663         }
00664 
00665         /* Copy the new item onto the page. */
00666         bk = (BKEYDATA *)t;
00667         B_TSET(bk->type, B_KEYDATA, 0);
00668         bk->len = data->size;
00669         memcpy(bk->data, data->data, data->size);
00670 
00671         return (0);
00672 }
00673 
00674 /*
00675  * __bam_dup_convert --
00676  *      Check to see if the duplicate set at indx should have its own page.
00677  *      If it should, create it.
00678  */
00679 static int
00680 __bam_dup_convert(dbc, h, indx)
00681         DBC *dbc;
00682         PAGE *h;
00683         u_int32_t indx;
00684 {
00685         BKEYDATA *bk;
00686         DB *dbp;
00687         DBT hdr;
00688         DB_MPOOLFILE *mpf;
00689         PAGE *dp;
00690         db_indx_t cnt, cpindx, dindx, first, *inp, sz;
00691         int ret;
00692 
00693         dbp = dbc->dbp;
00694         mpf = dbp->mpf;
00695         inp = P_INP(dbp, h);
00696 
00697         /*
00698          * Count the duplicate records and calculate how much room they're
00699          * using on the page.
00700          */
00701         while (indx > 0 && inp[indx] == inp[indx - P_INDX])
00702                 indx -= P_INDX;
00703         for (cnt = 0, sz = 0, first = indx;; ++cnt, indx += P_INDX) {
00704                 if (indx >= NUM_ENT(h) || inp[first] != inp[indx])
00705                         break;
00706                 bk = GET_BKEYDATA(dbp, h, indx);
00707                 sz += B_TYPE(bk->type) == B_KEYDATA ?
00708                     BKEYDATA_PSIZE(bk->len) : BOVERFLOW_PSIZE;
00709                 bk = GET_BKEYDATA(dbp, h, indx + O_INDX);
00710                 sz += B_TYPE(bk->type) == B_KEYDATA ?
00711                     BKEYDATA_PSIZE(bk->len) : BOVERFLOW_PSIZE;
00712         }
00713 
00714         /*
00715          * We have to do these checks when the user is replacing the cursor's
00716          * data item -- if the application replaces a duplicate item with a
00717          * larger data item, it can increase the amount of space used by the
00718          * duplicates, requiring this check.  But that means we may have done
00719          * this check when it wasn't a duplicate item after all.
00720          */
00721         if (cnt == 1)
00722                 return (0);
00723 
00724         /*
00725          * If this set of duplicates is using more than 25% of the page, move
00726          * them off.  The choice of 25% is a WAG, but the value must be small
00727          * enough that we can always split a page without putting duplicates
00728          * on two different pages.
00729          */
00730         if (sz < dbp->pgsize / 4)
00731                 return (0);
00732 
00733         /* Get a new page. */
00734         if ((ret = __db_new(dbc,
00735             dbp->dup_compare == NULL ? P_LRECNO : P_LDUP, &dp)) != 0)
00736                 return (ret);
00737         P_INIT(dp, dbp->pgsize, dp->pgno,
00738             PGNO_INVALID, PGNO_INVALID, LEAFLEVEL, TYPE(dp));
00739 
00740         /*
00741          * Move this set of duplicates off the page.  First points to the first
00742          * key of the first duplicate key/data pair, cnt is the number of pairs
00743          * we're dealing with.
00744          */
00745         memset(&hdr, 0, sizeof(hdr));
00746         dindx = first;
00747         indx = first;
00748         cpindx = 0;
00749         do {
00750                 /* Move cursors referencing the old entry to the new entry. */
00751                 if ((ret = __bam_ca_dup(dbc, first,
00752                     PGNO(h), indx, PGNO(dp), cpindx)) != 0)
00753                         goto err;
00754 
00755                 /*
00756                  * Copy the entry to the new page.  If the off-duplicate page
00757                  * If the off-duplicate page is a Btree page (i.e. dup_compare
00758                  * will be non-NULL, we use Btree pages for sorted dups,
00759                  * and Recno pages for unsorted dups), move all entries
00760                  * normally, even deleted ones.  If it's a Recno page,
00761                  * deleted entries are discarded (if the deleted entry is
00762                  * overflow, then free up those pages).
00763                  */
00764                 bk = GET_BKEYDATA(dbp, h, dindx + 1);
00765                 hdr.data = bk;
00766                 hdr.size = B_TYPE(bk->type) == B_KEYDATA ?
00767                     BKEYDATA_SIZE(bk->len) : BOVERFLOW_SIZE;
00768                 if (dbp->dup_compare == NULL && B_DISSET(bk->type)) {
00769                         /*
00770                          * Unsorted dups, i.e. recno page, and we have
00771                          * a deleted entry, don't move it, but if it was
00772                          * an overflow entry, we need to free those pages.
00773                          */
00774                         if (B_TYPE(bk->type) == B_OVERFLOW &&
00775                             (ret = __db_doff(dbc,
00776                             (GET_BOVERFLOW(dbp, h, dindx + 1))->pgno)) != 0)
00777                                 goto err;
00778                 } else {
00779                         if ((ret = __db_pitem(
00780                             dbc, dp, cpindx, hdr.size, &hdr, NULL)) != 0)
00781                                 goto err;
00782                         ++cpindx;
00783                 }
00784                 /* Delete all but the last reference to the key. */
00785                 if (cnt != 1) {
00786                         if ((ret = __bam_adjindx(dbc,
00787                             h, dindx, first + 1, 0)) != 0)
00788                                 goto err;
00789                 } else
00790                         dindx++;
00791 
00792                 /* Delete the data item. */
00793                 if ((ret = __db_ditem(dbc, h, dindx, hdr.size)) != 0)
00794                         goto err;
00795                 indx += P_INDX;
00796         } while (--cnt);
00797 
00798         /* Put in a new data item that points to the duplicates page. */
00799         if ((ret = __bam_ovput(dbc,
00800             B_DUPLICATE, dp->pgno, h, first + 1, NULL)) != 0)
00801                 goto err;
00802 
00803         /* Adjust cursors for all the above movments. */
00804         if ((ret = __bam_ca_di(dbc,
00805             PGNO(h), first + P_INDX, first + P_INDX - indx)) != 0)
00806                 goto err;
00807 
00808         return (mpf->put(mpf, dp, DB_MPOOL_DIRTY));
00809 
00810 err:    (void)mpf->put(mpf, dp, 0);
00811         return (ret);
00812 }
00813 
00814 /*
00815  * __bam_ovput --
00816  *      Build an item for an off-page duplicates page or overflow page and
00817  *      insert it on the page.
00818  */
00819 static int
00820 __bam_ovput(dbc, type, pgno, h, indx, item)
00821         DBC *dbc;
00822         u_int32_t type, indx;
00823         db_pgno_t pgno;
00824         PAGE *h;
00825         DBT *item;
00826 {
00827         BOVERFLOW bo;
00828         DBT hdr;
00829         int ret;
00830 
00831         UMRW_SET(bo.unused1);
00832         B_TSET(bo.type, type, 0);
00833         UMRW_SET(bo.unused2);
00834 
00835         /*
00836          * If we're creating an overflow item, do so and acquire the page
00837          * number for it.  If we're creating an off-page duplicates tree,
00838          * we are giving the page number as an argument.
00839          */
00840         if (type == B_OVERFLOW) {
00841                 if ((ret = __db_poff(dbc, item, &bo.pgno)) != 0)
00842                         return (ret);
00843                 bo.tlen = item->size;
00844         } else {
00845                 bo.pgno = pgno;
00846                 bo.tlen = 0;
00847         }
00848 
00849         /* Store the new record on the page. */
00850         memset(&hdr, 0, sizeof(hdr));
00851         hdr.data = &bo;
00852         hdr.size = BOVERFLOW_SIZE;
00853         return (__db_pitem(dbc, h, indx, BOVERFLOW_SIZE, &hdr, NULL));
00854 }

Generated on Wed Jul 20 21:03:00 2005 for MySQL 5.0.9 Beta by  doxygen 1.4.3