Drizzled Public API Documentation

row0sel.cc
00001 /*****************************************************************************
00002 
00003 Copyright (C) 1997, 2010, Innobase Oy. All Rights Reserved.
00004 Copyright (C) 2008, Google Inc.
00005 
00006 Portions of this file contain modifications contributed and copyrighted by
00007 Google, Inc. Those modifications are gratefully acknowledged and are described
00008 briefly in the InnoDB documentation. The contributions by Google are
00009 incorporated with their permission, and subject to the conditions contained in
00010 the file COPYING.Google.
00011 
00012 This program is free software; you can redistribute it and/or modify it under
00013 the terms of the GNU General Public License as published by the Free Software
00014 Foundation; version 2 of the License.
00015 
00016 This program is distributed in the hope that it will be useful, but WITHOUT
00017 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
00018 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
00019 
00020 You should have received a copy of the GNU General Public License along with
00021 this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
00022 St, Fifth Floor, Boston, MA 02110-1301 USA
00023 
00024 *****************************************************************************/
00025 
00026 /***************************************************/
00033 #include "row0sel.h"
00034 
00035 #ifdef UNIV_NONINL
00036 #include "row0sel.ic"
00037 #endif
00038 
00039 #include "dict0dict.h"
00040 #include "dict0boot.h"
00041 #include "trx0undo.h"
00042 #include "trx0trx.h"
00043 #include "btr0btr.h"
00044 #include "btr0cur.h"
00045 #include "btr0sea.h"
00046 #include "mach0data.h"
00047 #include "que0que.h"
00048 #include "row0upd.h"
00049 #include "row0row.h"
00050 #include "row0vers.h"
00051 #include "rem0cmp.h"
00052 #include "lock0lock.h"
00053 #include "eval0eval.h"
00054 #include "pars0sym.h"
00055 #include "pars0pars.h"
00056 #include "row0mysql.h"
00057 #include "read0read.h"
00058 #include "buf0lru.h"
00059 #include "ha_prototypes.h"
00060 
00061 /* Maximum number of rows to prefetch; MySQL interface has another parameter */
00062 #define SEL_MAX_N_PREFETCH  16
00063 
00064 /* Number of rows fetched, after which to start prefetching; MySQL interface
00065 has another parameter */
00066 #define SEL_PREFETCH_LIMIT  1
00067 
00068 /* When a select has accessed about this many pages, it returns control back
00069 to que_run_threads: this is to allow canceling runaway queries */
00070 
00071 #define SEL_COST_LIMIT  100
00072 
00073 /* Flags for search shortcut */
00074 #define SEL_FOUND 0
00075 #define SEL_EXHAUSTED 1
00076 #define SEL_RETRY 2
00077 
00078 /********************************************************************/
00085 static
00086 ibool
00087 row_sel_sec_rec_is_for_blob(
00088 /*========================*/
00089   ulint   mtype,    /*!< in: main type of the column */
00090   ulint   prtype,   /*!< in: precise type of the column */
00091   ulint   mbminmaxlen,  /*!< in: min and max length of a multi-byte character */
00093   const byte* clust_field,  /*!< in: locally stored part of the clustered index column holding the BLOB pointer */
00099   ulint   clust_len,  /*!< in: length of clust_field */
00100   const byte* sec_field,  /*!< in: column prefix in the secondary index record */
00101   ulint   sec_len,  /*!< in: length of sec_field */
00102   ulint   zip_size) /*!< in: compressed page size, or 0 for uncompressed */
00103 {
00104   ulint len;
00105   byte  buf[DICT_MAX_INDEX_COL_LEN];
00106 
00107   ut_a(clust_len >= BTR_EXTERN_FIELD_REF_SIZE);
00108 
00109   if (UNIV_UNLIKELY
00110       (!memcmp(clust_field + clust_len - BTR_EXTERN_FIELD_REF_SIZE,
00111          field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) {
00112     /* The externally stored field was not written yet.
00113     This record should only be seen by
00114     recv_recovery_rollback_active() or any
00115     TRX_ISO_READ_UNCOMMITTED transactions. */
00116     return(FALSE);
00117   }
00118 
00119   len = btr_copy_externally_stored_field_prefix(buf, sizeof buf,
00120                   zip_size,
00121                   clust_field, clust_len);
00122 
00123   if (UNIV_UNLIKELY(len == 0)) {
00124     /* The BLOB was being deleted as the server crashed.
00125     There should not be any secondary index records
00126     referring to this clustered index record, because
00127     btr_free_externally_stored_field() is called after all
00128     secondary index entries of the row have been purged. */
00129     return(FALSE);
00130   }
00131 
00132   len = dtype_get_at_most_n_mbchars(prtype, mbminmaxlen,
00133             sec_len, len, (const char*) buf);
00134 
00135   return(!cmp_data_data(mtype, prtype, buf, len, sec_field, sec_len));
00136 }
00137 
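The check above compares only a prefix of the externally stored column against the secondary index field. As a rough illustration of that idea (a standalone sketch, not part of row0sel.cc, assuming a single-byte character set so that characters and bytes coincide):

#include <string.h>

/* Hypothetical helper: does the secondary-index prefix match the first
bytes of the (partially fetched) externally stored column? */
static int
blob_prefix_matches(
  const unsigned char* blob_prefix, /* bytes fetched from the start of the BLOB */
  size_t    blob_len,   /* number of bytes fetched */
  const unsigned char* sec_field,   /* column prefix stored in the secondary index */
  size_t    sec_len)    /* its length (the index prefix length) */
{
  /* Truncate the fetched BLOB data to the stored prefix length,
  mirroring the dtype_get_at_most_n_mbchars() call above. */
  size_t cmp_len = (blob_len < sec_len) ? blob_len : sec_len;

  return(cmp_len == sec_len
         && memcmp(blob_prefix, sec_field, cmp_len) == 0);
}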
00138 /********************************************************************/
00147 static
00148 ibool
00149 row_sel_sec_rec_is_for_clust_rec(
00150 /*=============================*/
00151   const rec_t*  sec_rec,  /*!< in: secondary index record */
00152   dict_index_t* sec_index,  /*!< in: the secondary index */
00153   const rec_t*  clust_rec,  /*!< in: clustered index record; must be protected by a page latch */
00157   dict_index_t* clust_index)  /*!< in: the clustered index */
00158 {
00159   const byte* sec_field;
00160   ulint   sec_len;
00161   const byte* clust_field;
00162   ulint   n;
00163   ulint   i;
00164   mem_heap_t* heap    = NULL;
00165   ulint   clust_offsets_[REC_OFFS_NORMAL_SIZE];
00166   ulint   sec_offsets_[REC_OFFS_SMALL_SIZE];
00167   ulint*    clust_offs  = clust_offsets_;
00168   ulint*    sec_offs  = sec_offsets_;
00169   ibool   is_equal  = TRUE;
00170 
00171   rec_offs_init(clust_offsets_);
00172   rec_offs_init(sec_offsets_);
00173 
00174   if (rec_get_deleted_flag(clust_rec,
00175          dict_table_is_comp(clust_index->table))) {
00176 
00177     /* The clustered index record is delete-marked;
00178     it is not visible in the read view.  Besides,
00179     if there are any externally stored columns,
00180     some of them may have already been purged. */
00181     return(FALSE);
00182   }
00183 
00184   clust_offs = rec_get_offsets(clust_rec, clust_index, clust_offs,
00185              ULINT_UNDEFINED, &heap);
00186   sec_offs = rec_get_offsets(sec_rec, sec_index, sec_offs,
00187            ULINT_UNDEFINED, &heap);
00188 
00189   n = dict_index_get_n_ordering_defined_by_user(sec_index);
00190 
00191   for (i = 0; i < n; i++) {
00192     const dict_field_t* ifield;
00193     const dict_col_t* col;
00194     ulint     clust_pos;
00195     ulint     clust_len;
00196     ulint     len;
00197 
00198     ifield = dict_index_get_nth_field(sec_index, i);
00199     col = dict_field_get_col(ifield);
00200     clust_pos = dict_col_get_clust_pos(col, clust_index);
00201 
00202     clust_field = rec_get_nth_field(
00203       clust_rec, clust_offs, clust_pos, &clust_len);
00204     sec_field = rec_get_nth_field(sec_rec, sec_offs, i, &sec_len);
00205 
00206     len = clust_len;
00207 
00208     if (ifield->prefix_len > 0 && len != UNIV_SQL_NULL) {
00209 
00210       if (rec_offs_nth_extern(clust_offs, clust_pos)) {
00211         len -= BTR_EXTERN_FIELD_REF_SIZE;
00212       }
00213 
00214       len = dtype_get_at_most_n_mbchars(
00215         col->prtype, col->mbminmaxlen,
00216         ifield->prefix_len, len, (char*) clust_field);
00217 
00218       if (rec_offs_nth_extern(clust_offs, clust_pos)
00219           && len < sec_len) {
00220         if (!row_sel_sec_rec_is_for_blob(
00221               col->mtype, col->prtype,
00222               col->mbminmaxlen,
00223               clust_field, clust_len,
00224               sec_field, sec_len,
00225               dict_table_zip_size(
00226                 clust_index->table))) {
00227           goto inequal;
00228         }
00229 
00230         continue;
00231       }
00232     }
00233 
00234     if (0 != cmp_data_data(col->mtype, col->prtype,
00235                clust_field, len,
00236                sec_field, sec_len)) {
00237 inequal:
00238       is_equal = FALSE;
00239       goto func_exit;
00240     }
00241   }
00242 
00243 func_exit:
00244   if (UNIV_LIKELY_NULL(heap)) {
00245     mem_heap_free(heap);
00246   }
00247   return(is_equal);
00248 }
00249 
00250 /*********************************************************************/
00253 UNIV_INTERN
00254 sel_node_t*
00255 sel_node_create(
00256 /*============*/
00257   mem_heap_t* heap) 
00258 {
00259   sel_node_t* node;
00260 
00261         node = static_cast<sel_node_t *>(mem_heap_alloc(heap, sizeof(sel_node_t)));
00262   node->common.type = QUE_NODE_SELECT;
00263   node->state = SEL_NODE_OPEN;
00264 
00265   node->plans = NULL;
00266 
00267   return(node);
00268 }
00269 
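A minimal usage sketch for the sel_node_create()/sel_node_free_private() pair (hypothetical caller, assuming the usual InnoDB internal headers; in reality the node is created and wired up by the InnoDB SQL parser):

#include "mem0mem.h"
#include "row0sel.h"

static void
sel_node_lifetime_example(void)
{
  mem_heap_t* heap = mem_heap_create(256);  /* heap that owns the node */
  sel_node_t* node = sel_node_create(heap); /* node->state == SEL_NODE_OPEN */

  /* ... the parser would attach node->plans, node->select_list, etc. ... */

  sel_node_free_private(node);  /* closes cursors, frees old-version heaps */
  mem_heap_free(heap);          /* releases the node memory itself */
}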
00270 /*********************************************************************/
00273 UNIV_INTERN
00274 void
00275 sel_node_free_private(
00276 /*==================*/
00277   sel_node_t* node) 
00278 {
00279   ulint i;
00280   plan_t* plan;
00281 
00282   if (node->plans != NULL) {
00283     for (i = 0; i < node->n_tables; i++) {
00284       plan = sel_node_get_nth_plan(node, i);
00285 
00286       btr_pcur_close(&(plan->pcur));
00287       btr_pcur_close(&(plan->clust_pcur));
00288 
00289       if (plan->old_vers_heap) {
00290         mem_heap_free(plan->old_vers_heap);
00291       }
00292     }
00293   }
00294 }
00295 
00296 /*********************************************************************/
00299 UNIV_INLINE
00300 void
00301 sel_eval_select_list(
00302 /*=================*/
00303   sel_node_t* node) 
00304 {
00305   que_node_t* exp;
00306 
00307   exp = node->select_list;
00308 
00309   while (exp) {
00310     eval_exp(exp);
00311 
00312     exp = que_node_get_next(exp);
00313   }
00314 }
00315 
00316 /*********************************************************************/
00319 UNIV_INLINE
00320 void
00321 sel_assign_into_var_values(
00322 /*=======================*/
00323   sym_node_t* var,  
00324   sel_node_t* node) 
00325 {
00326   que_node_t* exp;
00327 
00328   if (var == NULL) {
00329 
00330     return;
00331   }
00332 
00333   exp = node->select_list;
00334 
00335   while (var) {
00336     ut_ad(exp);
00337 
00338     eval_node_copy_val(var->alias, exp);
00339 
00340     exp = que_node_get_next(exp);
00341                 var = static_cast<sym_node_t *>(que_node_get_next(var));
00342   }
00343 }
00344 
00345 /*********************************************************************/
00348 UNIV_INLINE
00349 void
00350 sel_reset_aggregate_vals(
00351 /*=====================*/
00352   sel_node_t* node) 
00353 {
00354   func_node_t*  func_node;
00355 
00356   ut_ad(node->is_aggregate);
00357 
00358         func_node = static_cast<func_node_t *>(node->select_list);
00359 
00360   while (func_node) {
00361     eval_node_set_int_val(func_node, 0);
00362 
00363                 func_node = static_cast<func_node_t *>(que_node_get_next(func_node));
00364   }
00365 
00366   node->aggregate_already_fetched = FALSE;
00367 }
00368 
00369 /*********************************************************************/
00371 UNIV_INLINE
00372 void
00373 row_sel_copy_input_variable_vals(
00374 /*=============================*/
00375   sel_node_t* node) 
00376 {
00377   sym_node_t* var;
00378 
00379   var = UT_LIST_GET_FIRST(node->copy_variables);
00380 
00381   while (var) {
00382     eval_node_copy_val(var, var->alias);
00383 
00384     var->indirection = NULL;
00385 
00386     var = UT_LIST_GET_NEXT(col_var_list, var);
00387   }
00388 }
00389 
00390 /*********************************************************************/
00392 static
00393 void
00394 row_sel_fetch_columns(
00395 /*==================*/
00396   dict_index_t* index,  /*!< in: record index */
00397   const rec_t*  rec,  /*!< in: record in a clustered or non-clustered index; must be protected by a page latch */
00399   const ulint*  offsets,/*!< in: rec_get_offsets(rec, index) */
00400   sym_node_t* column) /*!< in: first column in a column list, or NULL */
00402 {
00403   dfield_t* val;
00404   ulint   index_type;
00405   ulint   field_no;
00406   const byte* data;
00407   ulint   len;
00408 
00409   ut_ad(rec_offs_validate(rec, index, offsets));
00410 
00411   if (dict_index_is_clust(index)) {
00412     index_type = SYM_CLUST_FIELD_NO;
00413   } else {
00414     index_type = SYM_SEC_FIELD_NO;
00415   }
00416 
00417   while (column) {
00418     mem_heap_t* heap = NULL;
00419     ibool   needs_copy;
00420 
00421     field_no = column->field_nos[index_type];
00422 
00423     if (field_no != ULINT_UNDEFINED) {
00424 
00425       if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets,
00426                     field_no))) {
00427 
00428         /* Copy an externally stored field to the
00429         temporary heap, if possible. */
00430 
00431         heap = mem_heap_create(1);
00432 
00433         data = btr_rec_copy_externally_stored_field(
00434           rec, offsets,
00435           dict_table_zip_size(index->table),
00436           field_no, &len, heap);
00437 
00438         /* data == NULL means that the
00439         externally stored field was not
00440         written yet. This record
00441         should only be seen by
00442         recv_recovery_rollback_active() or any
00443         TRX_ISO_READ_UNCOMMITTED
00444         transactions. The InnoDB SQL parser
00445         (the sole caller of this function)
00446         does not implement READ UNCOMMITTED,
00447         and it is not involved during rollback. */
00448         ut_a(data);
00449         ut_a(len != UNIV_SQL_NULL);
00450 
00451         needs_copy = TRUE;
00452       } else {
00453         data = rec_get_nth_field(rec, offsets,
00454                field_no, &len);
00455 
00456         needs_copy = column->copy_val;
00457       }
00458 
00459       if (needs_copy) {
00460         eval_node_copy_and_alloc_val(column, data,
00461                    len);
00462       } else {
00463         val = que_node_get_val(column);
00464         dfield_set_data(val, data, len);
00465       }
00466 
00467       if (UNIV_LIKELY_NULL(heap)) {
00468         mem_heap_free(heap);
00469       }
00470     }
00471 
00472     column = UT_LIST_GET_NEXT(col_var_list, column);
00473   }
00474 }
00475 
00476 /*********************************************************************/
00478 static
00479 void
00480 sel_col_prefetch_buf_alloc(
00481 /*=======================*/
00482   sym_node_t* column) 
00483 {
00484   sel_buf_t*  sel_buf;
00485   ulint   i;
00486 
00487   ut_ad(que_node_get_type(column) == QUE_NODE_SYMBOL);
00488 
00489         column->prefetch_buf = static_cast<sel_buf_t *>(mem_alloc(SEL_MAX_N_PREFETCH
00490                                                                   * sizeof(sel_buf_t)));
00491   for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
00492     sel_buf = column->prefetch_buf + i;
00493 
00494     sel_buf->data = NULL;
00495 
00496     sel_buf->val_buf_size = 0;
00497   }
00498 }
00499 
00500 /*********************************************************************/
00503 UNIV_INTERN
00504 void
00505 sel_col_prefetch_buf_free(
00506 /*======================*/
00507   sel_buf_t*  prefetch_buf) 
00508 {
00509   sel_buf_t*  sel_buf;
00510   ulint   i;
00511 
00512   for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
00513     sel_buf = prefetch_buf + i;
00514 
00515     if (sel_buf->val_buf_size > 0) {
00516 
00517       mem_free(sel_buf->data);
00518     }
00519   }
00520 }
00521 
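Note that sel_col_prefetch_buf_free() releases only the value buffers held by the SEL_MAX_N_PREFETCH slots; the slot array itself, mem_alloc()ed by sel_col_prefetch_buf_alloc(), is not freed here. A hedged cleanup sketch for a hypothetical owner of a column symbol node:

static void
col_prefetch_cleanup_example(sym_node_t* column) /* hypothetical caller */
{
  if (column->prefetch_buf != NULL) {
    /* Free the per-slot value buffers. */
    sel_col_prefetch_buf_free(column->prefetch_buf);

    /* Free the slot array allocated by sel_col_prefetch_buf_alloc(). */
    mem_free(column->prefetch_buf);
    column->prefetch_buf = NULL;
  }
}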
00522 /*********************************************************************/
00525 static
00526 void
00527 sel_pop_prefetched_row(
00528 /*===================*/
00529   plan_t* plan) 
00530 {
00531   sym_node_t* column;
00532   sel_buf_t*  sel_buf;
00533   dfield_t* val;
00534   byte*   data;
00535   ulint   len;
00536   ulint   val_buf_size;
00537 
00538   ut_ad(plan->n_rows_prefetched > 0);
00539 
00540   column = UT_LIST_GET_FIRST(plan->columns);
00541 
00542   while (column) {
00543     val = que_node_get_val(column);
00544 
00545     if (!column->copy_val) {
00546       /* We did not really push any value for the
00547       column */
00548 
00549       ut_ad(!column->prefetch_buf);
00550       ut_ad(que_node_get_val_buf_size(column) == 0);
00551       ut_d(dfield_set_null(val));
00552 
00553       goto next_col;
00554     }
00555 
00556     ut_ad(column->prefetch_buf);
00557     ut_ad(!dfield_is_ext(val));
00558 
00559     sel_buf = column->prefetch_buf + plan->first_prefetched;
00560 
00561     data = sel_buf->data;
00562     len = sel_buf->len;
00563     val_buf_size = sel_buf->val_buf_size;
00564 
00565     /* We must keep track of the allocated memory for
00566     column values to be able to free it later: therefore
00567     we swap the values for sel_buf and val */
00568 
00569                 sel_buf->data = static_cast<byte *>(dfield_get_data(val));
00570     sel_buf->len = dfield_get_len(val);
00571     sel_buf->val_buf_size = que_node_get_val_buf_size(column);
00572 
00573     dfield_set_data(val, data, len);
00574     que_node_set_val_buf_size(column, val_buf_size);
00575 next_col:
00576     column = UT_LIST_GET_NEXT(col_var_list, column);
00577   }
00578 
00579   plan->n_rows_prefetched--;
00580 
00581   plan->first_prefetched++;
00582 }
00583 
00584 /*********************************************************************/
00587 UNIV_INLINE
00588 void
00589 sel_push_prefetched_row(
00590 /*====================*/
00591   plan_t* plan) 
00592 {
00593   sym_node_t* column;
00594   sel_buf_t*  sel_buf;
00595   dfield_t* val;
00596   byte*   data;
00597   ulint   len;
00598   ulint   pos;
00599   ulint   val_buf_size;
00600 
00601   if (plan->n_rows_prefetched == 0) {
00602     pos = 0;
00603     plan->first_prefetched = 0;
00604   } else {
00605     pos = plan->n_rows_prefetched;
00606 
00607     /* We have the convention that pushing new rows starts only
00608     after the prefetch stack has been emptied: */
00609 
00610     ut_ad(plan->first_prefetched == 0);
00611   }
00612 
00613   plan->n_rows_prefetched++;
00614 
00615   ut_ad(pos < SEL_MAX_N_PREFETCH);
00616 
00617   column = UT_LIST_GET_FIRST(plan->columns);
00618 
00619   while (column) {
00620     if (!column->copy_val) {
00621       /* It makes no sense to push pointers to database
00622       page fields when we do not hold a latch on the page! */
00623 
00624       goto next_col;
00625     }
00626 
00627     if (!column->prefetch_buf) {
00628       /* Allocate a new prefetch buffer */
00629 
00630       sel_col_prefetch_buf_alloc(column);
00631     }
00632 
00633     sel_buf = column->prefetch_buf + pos;
00634 
00635     val = que_node_get_val(column);
00636 
00637                 data = static_cast<byte *>(dfield_get_data(val));
00638     len = dfield_get_len(val);
00639     val_buf_size = que_node_get_val_buf_size(column);
00640 
00641     /* We must keep track of the allocated memory for
00642     column values to be able to free it later: therefore
00643     we swap the values for sel_buf and val */
00644 
00645     dfield_set_data(val, sel_buf->data, sel_buf->len);
00646     que_node_set_val_buf_size(column, sel_buf->val_buf_size);
00647 
00648     sel_buf->data = data;
00649     sel_buf->len = len;
00650     sel_buf->val_buf_size = val_buf_size;
00651 next_col:
00652     column = UT_LIST_GET_NEXT(col_var_list, column);
00653   }
00654 }
00655 
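Both sel_push_prefetched_row() and sel_pop_prefetched_row() avoid copying column values: they swap the (data pointer, length, buffer size) triple between the column's value node and the prefetch slot, so previously allocated buffers keep being reused and can still be freed later. A standalone sketch of that ownership-swap idea (illustration only, not InnoDB code):

typedef struct value_slot_struct {
  unsigned char*  data;         /* value bytes, possibly heap-allocated */
  unsigned long   len;          /* value length */
  unsigned long   val_buf_size; /* allocated buffer size, 0 if none owned */
} value_slot_t;

/* Exchange ownership of the buffers instead of copying their contents. */
static void
value_slot_swap(value_slot_t* a, value_slot_t* b)
{
  value_slot_t tmp = *a;

  *a = *b;
  *b = tmp;
}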
00656 /*********************************************************************/
00659 static
00660 ulint
00661 row_sel_build_prev_vers(
00662 /*====================*/
00663   read_view_t*  read_view,  
00664   dict_index_t* index,    
00665   rec_t*    rec,    
00666   ulint**   offsets,  
00668   mem_heap_t**  offset_heap,  
00670   mem_heap_t**    old_vers_heap,  
00671   rec_t**   old_vers, 
00675   mtr_t*    mtr)    
00676 {
00677   ulint err;
00678 
00679   if (*old_vers_heap) {
00680     mem_heap_empty(*old_vers_heap);
00681   } else {
00682     *old_vers_heap = mem_heap_create(512);
00683   }
00684 
00685   err = row_vers_build_for_consistent_read(
00686     rec, mtr, index, offsets, read_view, offset_heap,
00687     *old_vers_heap, old_vers);
00688   return(err);
00689 }
00690 
00691 /*********************************************************************/
00695 static
00696 ulint
00697 row_sel_build_committed_vers_for_mysql(
00698 /*===================================*/
00699   dict_index_t* clust_index,  
00700   row_prebuilt_t* prebuilt, 
00701   const rec_t*  rec,    
00702   ulint**   offsets,  
00704   mem_heap_t**  offset_heap,  
00706   const rec_t** old_vers, 
00710   mtr_t*    mtr)    
00711 {
00712   ulint err;
00713 
00714   if (prebuilt->old_vers_heap) {
00715     mem_heap_empty(prebuilt->old_vers_heap);
00716   } else {
00717     prebuilt->old_vers_heap = mem_heap_create(200);
00718   }
00719 
00720   err = row_vers_build_for_semi_consistent_read(
00721     rec, mtr, clust_index, offsets, offset_heap,
00722     prebuilt->old_vers_heap, old_vers);
00723   return(err);
00724 }
00725 
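Both version-building helpers above share the same heap-reuse pattern: the old-version heap is created on first use and merely emptied on later calls, so its memory blocks are recycled across rows. Condensed from the code above:

  if (*old_vers_heap) {
    mem_heap_empty(*old_vers_heap);         /* later calls: recycle the blocks */
  } else {
    *old_vers_heap = mem_heap_create(512);  /* first call: allocate the heap */
  }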
00726 /*********************************************************************/
00730 UNIV_INLINE
00731 ibool
00732 row_sel_test_end_conds(
00733 /*===================*/
00734   plan_t* plan) 
00737 {
00738   func_node_t*  cond;
00739 
00740   /* All conditions in end_conds are comparisons of a column to an
00741   expression */
00742 
00743   cond = UT_LIST_GET_FIRST(plan->end_conds);
00744 
00745   while (cond) {
00746     /* Evaluate the left side of the comparison, i.e., get the
00747     column value if there is an indirection */
00748 
00749           eval_sym(static_cast<sym_node_t *>(cond->args));
00750 
00751     /* Do the comparison */
00752 
00753     if (!eval_cmp(cond)) {
00754 
00755       return(FALSE);
00756     }
00757 
00758     cond = UT_LIST_GET_NEXT(cond_list, cond);
00759   }
00760 
00761   return(TRUE);
00762 }
00763 
00764 /*********************************************************************/
00767 UNIV_INLINE
00768 ibool
00769 row_sel_test_other_conds(
00770 /*=====================*/
00771   plan_t* plan) 
00773 {
00774   func_node_t*  cond;
00775 
00776   cond = UT_LIST_GET_FIRST(plan->other_conds);
00777 
00778   while (cond) {
00779     eval_exp(cond);
00780 
00781     if (!eval_node_get_ibool_val(cond)) {
00782 
00783       return(FALSE);
00784     }
00785 
00786     cond = UT_LIST_GET_NEXT(cond_list, cond);
00787   }
00788 
00789   return(TRUE);
00790 }
00791 
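The two condition tests play different roles in row_sel() below: a failed end condition means the index range for this table is exhausted, whereas a failed "other" condition merely disqualifies the current row. Condensed from the record loop later in this file:

  if (!row_sel_test_end_conds(plan)) {

    goto table_exhausted;  /* no further rows in this index range can match */
  }

  if (!row_sel_test_other_conds(plan)) {

    goto next_rec;         /* this row does not qualify; continue the scan */
  }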
00792 /*********************************************************************/
00796 static
00797 ulint
00798 row_sel_get_clust_rec(
00799 /*==================*/
00800   sel_node_t* node, 
00801   plan_t*   plan, 
00802   rec_t*    rec,  
00803   que_thr_t*  thr,  
00804   rec_t**   out_rec,
00808   mtr_t*    mtr)  
00811 {
00812   dict_index_t* index;
00813   rec_t*    clust_rec;
00814   rec_t*    old_vers;
00815   ulint   err;
00816   mem_heap_t* heap    = NULL;
00817   ulint   offsets_[REC_OFFS_NORMAL_SIZE];
00818   ulint*    offsets   = offsets_;
00819   rec_offs_init(offsets_);
00820 
00821   *out_rec = NULL;
00822 
00823   offsets = rec_get_offsets(rec,
00824           btr_pcur_get_btr_cur(&plan->pcur)->index,
00825           offsets, ULINT_UNDEFINED, &heap);
00826 
00827   row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec, offsets);
00828 
00829   index = dict_table_get_first_index(plan->table);
00830 
00831   btr_pcur_open_with_no_init(index, plan->clust_ref, PAGE_CUR_LE,
00832            BTR_SEARCH_LEAF, &plan->clust_pcur,
00833            0, mtr);
00834 
00835   clust_rec = btr_pcur_get_rec(&(plan->clust_pcur));
00836 
00837   /* Note: only if the search ends up on a non-infimum record is the
00838   low_match value the real match to the search tuple */
00839 
00840   if (!page_rec_is_user_rec(clust_rec)
00841       || btr_pcur_get_low_match(&(plan->clust_pcur))
00842       < dict_index_get_n_unique(index)) {
00843 
00844     ut_a(rec_get_deleted_flag(rec,
00845             dict_table_is_comp(plan->table)));
00846     ut_a(node->read_view);
00847 
00848     /* In a rare case it is possible that no clust rec is found
00849     for a delete-marked secondary index record: if in row0umod.c
00850     in row_undo_mod_remove_clust_low() we have already removed
00851     the clust rec, while purge is still cleaning and removing
00852     secondary index records associated with earlier versions of
00853     the clustered index record. In that case we know that the
00854     clustered index record did not exist in the read view of
00855     trx. */
00856 
00857     goto func_exit;
00858   }
00859 
00860   offsets = rec_get_offsets(clust_rec, index, offsets,
00861           ULINT_UNDEFINED, &heap);
00862 
00863   if (!node->read_view) {
00864     /* Try to place a lock on the index record */
00865 
00866     /* If innodb_locks_unsafe_for_binlog option is used
00867     or this session is using READ COMMITTED isolation level
00868     we lock only the record, i.e., next-key locking is
00869     not used. */
00870     ulint lock_type;
00871     trx_t*  trx;
00872 
00873     trx = thr_get_trx(thr);
00874 
00875     if (srv_locks_unsafe_for_binlog
00876         || trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
00877       lock_type = LOCK_REC_NOT_GAP;
00878     } else {
00879       lock_type = LOCK_ORDINARY;
00880     }
00881 
00882     err = lock_clust_rec_read_check_and_lock(
00883       0, btr_pcur_get_block(&plan->clust_pcur),
00884       clust_rec, index, offsets,
00885                         static_cast<lock_mode>(node->row_lock_mode), lock_type, thr);
00886 
00887     switch (err) {
00888     case DB_SUCCESS:
00889     case DB_SUCCESS_LOCKED_REC:
00890       /* Declare the variable uninitialized in Valgrind.
00891       It should be set to DB_SUCCESS at func_exit. */
00892       UNIV_MEM_INVALID(&err, sizeof err);
00893       break;
00894     default:
00895       goto err_exit;
00896     }
00897   } else {
00898     /* This is a non-locking consistent read: if necessary, fetch
00899     a previous version of the record */
00900 
00901     old_vers = NULL;
00902 
00903     if (!lock_clust_rec_cons_read_sees(clust_rec, index, offsets,
00904                node->read_view)) {
00905 
00906       err = row_sel_build_prev_vers(
00907         node->read_view, index, clust_rec,
00908         &offsets, &heap, &plan->old_vers_heap,
00909         &old_vers, mtr);
00910 
00911       if (err != DB_SUCCESS) {
00912 
00913         goto err_exit;
00914       }
00915 
00916       clust_rec = old_vers;
00917 
00918       if (clust_rec == NULL) {
00919         goto func_exit;
00920       }
00921     }
00922 
00923     /* If we had to go to an earlier version of row or the
00924     secondary index record is delete marked, then it may be that
00925     the secondary index record corresponding to clust_rec
00926     (or old_vers) is not rec; in that case we must ignore
00927     such row because in our snapshot rec would not have existed.
00928     Remember that from rec we cannot see directly which transaction
00929     id corresponds to it: we have to go to the clustered index
00930     record. A query that fetches all rows where the
00931     secondary index value is in some interval would return
00932     a wrong result if we did not drop rows that we reach
00933     through secondary index records which do not really
00934     exist in our snapshot. */
00935 
00936     if ((old_vers
00937          || rec_get_deleted_flag(rec, dict_table_is_comp(
00938                  plan->table)))
00939         && !row_sel_sec_rec_is_for_clust_rec(rec, plan->index,
00940                clust_rec, index)) {
00941       goto func_exit;
00942     }
00943   }
00944 
00945   /* Fetch the columns needed in test conditions.  The clustered
00946   index record is protected by a page latch that was acquired
00947   when plan->clust_pcur was positioned.  The latch will not be
00948   released until mtr_commit(mtr). */
00949 
00950   ut_ad(!rec_get_deleted_flag(clust_rec, rec_offs_comp(offsets)));
00951   row_sel_fetch_columns(index, clust_rec, offsets,
00952             UT_LIST_GET_FIRST(plan->columns));
00953   *out_rec = clust_rec;
00954 func_exit:
00955   err = DB_SUCCESS;
00956 err_exit:
00957   if (UNIV_LIKELY_NULL(heap)) {
00958     mem_heap_free(heap);
00959   }
00960   return(err);
00961 }
00962 
00963 /*********************************************************************/
00966 UNIV_INLINE
00967 enum db_err
00968 sel_set_rec_lock(
00969 /*=============*/
00970   const buf_block_t*  block,  /*!< in: buffer block of rec */
00971   const rec_t*    rec,  /*!< in: record */
00972   dict_index_t*   index,  /*!< in: index */
00973   const ulint*    offsets,/*!< in: rec_get_offsets(rec, index) */
00974   ulint     mode, /*!< in: lock mode */
00975   ulint     type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or LOCK_REC_NOT_GAP */
00977   que_thr_t*    thr)  /*!< in: query thread */
00978 {
00979   trx_t*    trx;
00980   enum db_err err;
00981 
00982   trx = thr_get_trx(thr);
00983 
00984   if (UT_LIST_GET_LEN(trx->trx_locks) > 10000) {
00985     if (buf_LRU_buf_pool_running_out()) {
00986 
00987       return(DB_LOCK_TABLE_FULL);
00988     }
00989   }
00990 
00991   if (dict_index_is_clust(index)) {
00992     err = lock_clust_rec_read_check_and_lock(0, block, rec, index,
00993                                                          offsets, static_cast<lock_mode>(mode), type, thr);
00994   } else {
00995     err = lock_sec_rec_read_check_and_lock(0, block, rec, index,
00996                                                        offsets, static_cast<lock_mode>(mode), type, thr);
00997   }
00998 
00999   return(err);
01000 }
01001 
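A condensed view of how row_sel() below calls sel_set_rec_lock() (declarations omitted): the lock type depends on the isolation level (record-only vs. next-key), and DB_SUCCESS_LOCKED_REC is folded into DB_SUCCESS:

  lock_type = (srv_locks_unsafe_for_binlog
               || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
              ? LOCK_REC_NOT_GAP   /* lock only the record itself */
              : LOCK_ORDINARY;     /* next-key lock */

  err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur), rec, index,
                         offsets, node->row_lock_mode, lock_type, thr);

  switch (err) {
  case DB_SUCCESS_LOCKED_REC:
    err = DB_SUCCESS;
    /* fall through */
  case DB_SUCCESS:
    break;
  default:
    goto lock_wait_or_error;
  }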
01002 /*********************************************************************/
01004 static
01005 void
01006 row_sel_open_pcur(
01007 /*==============*/
01008   plan_t*   plan,   
01009   ibool   search_latch_locked,
01013   mtr_t*    mtr)    
01014 {
01015   dict_index_t* index;
01016   func_node_t*  cond;
01017   que_node_t* exp;
01018   ulint   n_fields;
01019   ulint   has_search_latch = 0; /* RW_S_LATCH or 0 */
01020   ulint   i;
01021 
01022   if (search_latch_locked) {
01023     has_search_latch = RW_S_LATCH;
01024   }
01025 
01026   index = plan->index;
01027 
01028   /* Calculate the value of the search tuple: the exact match columns
01029   get their expressions evaluated when we evaluate the right sides of
01030   end_conds */
01031 
01032   cond = UT_LIST_GET_FIRST(plan->end_conds);
01033 
01034   while (cond) {
01035     eval_exp(que_node_get_next(cond->args));
01036 
01037     cond = UT_LIST_GET_NEXT(cond_list, cond);
01038   }
01039 
01040   if (plan->tuple) {
01041     n_fields = dtuple_get_n_fields(plan->tuple);
01042 
01043     if (plan->n_exact_match < n_fields) {
01044       /* There is a non-exact match field which must be
01045       evaluated separately */
01046 
01047       eval_exp(plan->tuple_exps[n_fields - 1]);
01048     }
01049 
01050     for (i = 0; i < n_fields; i++) {
01051       exp = plan->tuple_exps[i];
01052 
01053       dfield_copy_data(dtuple_get_nth_field(plan->tuple, i),
01054            que_node_get_val(exp));
01055     }
01056 
01057     /* Open pcur to the index */
01058 
01059     btr_pcur_open_with_no_init(index, plan->tuple, plan->mode,
01060              BTR_SEARCH_LEAF, &plan->pcur,
01061              has_search_latch, mtr);
01062   } else {
01063     /* Open the cursor to the start or the end of the index
01064     (FALSE: no init) */
01065 
01066     btr_pcur_open_at_index_side(plan->asc, index, BTR_SEARCH_LEAF,
01067               &(plan->pcur), FALSE, mtr);
01068   }
01069 
01070   ut_ad(plan->n_rows_prefetched == 0);
01071   ut_ad(plan->n_rows_fetched == 0);
01072   ut_ad(plan->cursor_at_end == FALSE);
01073 
01074   plan->pcur_is_open = TRUE;
01075 }
01076 
01077 /*********************************************************************/
01083 static
01084 ibool
01085 row_sel_restore_pcur_pos(
01086 /*=====================*/
01087   plan_t*   plan, 
01088   mtr_t*    mtr)  
01089 {
01090   ibool equal_position;
01091   ulint relative_position;
01092 
01093   ut_ad(!plan->cursor_at_end);
01094 
01095   relative_position = btr_pcur_get_rel_pos(&(plan->pcur));
01096 
01097   equal_position = btr_pcur_restore_position(BTR_SEARCH_LEAF,
01098                &(plan->pcur), mtr);
01099 
01100   /* If the cursor is traveling upwards, and relative_position is
01101 
01102   (1) BTR_PCUR_BEFORE: this is not allowed, as we did not have a lock
01103   yet on the successor of the page infimum;
01104   (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
01105   first record GREATER than the predecessor of a page supremum; we have
01106   not yet processed the cursor record: no need to move the cursor to the
01107   next record;
01108   (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
01109   last record LESS or EQUAL to the old stored user record; (a) if
01110   equal_position is FALSE, this means that the cursor is now on a record
01111   less than the old user record, and we must move to the next record;
01112   (b) if equal_position is TRUE, then if
01113   plan->stored_cursor_rec_processed is TRUE, we must move to the next
01114   record, else there is no need to move the cursor. */
01115 
01116   if (plan->asc) {
01117     if (relative_position == BTR_PCUR_ON) {
01118 
01119       if (equal_position) {
01120 
01121         return(plan->stored_cursor_rec_processed);
01122       }
01123 
01124       return(TRUE);
01125     }
01126 
01127     ut_ad(relative_position == BTR_PCUR_AFTER
01128           || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
01129 
01130     return(FALSE);
01131   }
01132 
01133   /* If the cursor is traveling downwards, and relative_position is
01134 
01135   (1) BTR_PCUR_BEFORE: btr_pcur_restore_position placed the cursor on
01136   the last record LESS than the successor of a page infimum; we have not
01137   processed the cursor record: no need to move the cursor;
01138   (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
01139   first record GREATER than the predecessor of a page supremum; we have
01140   processed the cursor record: we should move the cursor to the previous
01141   record;
01142   (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
01143   last record LESS or EQUAL to the old stored user record; (a) if
01144   equal_position is FALSE, this means that the cursor is now on a record
01145   less than the old user record, and we need not move to the previous
01146   record; (b) if equal_position is TRUE, then if
01147   plan->stored_cursor_rec_processed is TRUE, we must move to the previous
01148   record, else there is no need to move the cursor. */
01149 
01150   if (relative_position == BTR_PCUR_BEFORE
01151       || relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE) {
01152 
01153     return(FALSE);
01154   }
01155 
01156   if (relative_position == BTR_PCUR_ON) {
01157 
01158     if (equal_position) {
01159 
01160       return(plan->stored_cursor_rec_processed);
01161     }
01162 
01163     return(FALSE);
01164   }
01165 
01166   ut_ad(relative_position == BTR_PCUR_AFTER
01167         || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
01168 
01169   return(TRUE);
01170 }
01171 
01172 /*********************************************************************/
01174 UNIV_INLINE
01175 void
01176 plan_reset_cursor(
01177 /*==============*/
01178   plan_t* plan) 
01179 {
01180   plan->pcur_is_open = FALSE;
01181   plan->cursor_at_end = FALSE;
01182   plan->n_rows_fetched = 0;
01183   plan->n_rows_prefetched = 0;
01184 }
01185 
01186 /*********************************************************************/
01190 static
01191 ulint
01192 row_sel_try_search_shortcut(
01193 /*========================*/
01194   sel_node_t* node, 
01195   plan_t*   plan, 
01197   mtr_t*    mtr)  
01198 {
01199   dict_index_t* index;
01200   rec_t*    rec;
01201   mem_heap_t* heap    = NULL;
01202   ulint   offsets_[REC_OFFS_NORMAL_SIZE];
01203   ulint*    offsets   = offsets_;
01204   ulint   ret;
01205   rec_offs_init(offsets_);
01206 
01207   index = plan->index;
01208 
01209   ut_ad(node->read_view);
01210   ut_ad(plan->unique_search);
01211   ut_ad(!plan->must_get_clust);
01212 #ifdef UNIV_SYNC_DEBUG
01213   ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
01214 #endif /* UNIV_SYNC_DEBUG */
01215 
01216   row_sel_open_pcur(plan, TRUE, mtr);
01217 
01218   rec = btr_pcur_get_rec(&(plan->pcur));
01219 
01220   if (!page_rec_is_user_rec(rec)) {
01221 
01222     return(SEL_RETRY);
01223   }
01224 
01225   ut_ad(plan->mode == PAGE_CUR_GE);
01226 
01227   /* As the cursor is now placed on a user record after a search with
01228   the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
01229   fields in the user record matched to the search tuple */
01230 
01231   if (btr_pcur_get_up_match(&(plan->pcur)) < plan->n_exact_match) {
01232 
01233     return(SEL_EXHAUSTED);
01234   }
01235 
01236   /* This is a non-locking consistent read: if necessary, fetch
01237   a previous version of the record */
01238 
01239   offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
01240 
01241   if (dict_index_is_clust(index)) {
01242     if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
01243                node->read_view)) {
01244       ret = SEL_RETRY;
01245       goto func_exit;
01246     }
01247   } else if (!lock_sec_rec_cons_read_sees(rec, node->read_view)) {
01248 
01249     ret = SEL_RETRY;
01250     goto func_exit;
01251   }
01252 
01253   /* Test the deleted flag. */
01254 
01255   if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))) {
01256 
01257     ret = SEL_EXHAUSTED;
01258     goto func_exit;
01259   }
01260 
01261   /* Fetch the columns needed in test conditions.  The index
01262   record is protected by a page latch that was acquired when
01263   plan->pcur was positioned.  The latch will not be released
01264   until mtr_commit(mtr). */
01265 
01266   row_sel_fetch_columns(index, rec, offsets,
01267             UT_LIST_GET_FIRST(plan->columns));
01268 
01269   /* Test the rest of search conditions */
01270 
01271   if (!row_sel_test_other_conds(plan)) {
01272 
01273     ret = SEL_EXHAUSTED;
01274     goto func_exit;
01275   }
01276 
01277   ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF);
01278 
01279   plan->n_rows_fetched++;
01280   ret = SEL_FOUND;
01281 func_exit:
01282   if (UNIV_LIKELY_NULL(heap)) {
01283     mem_heap_free(heap);
01284   }
01285   return(ret);
01286 }
01287 
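The shortcut's three return codes are consumed by row_sel() below; condensed, the caller attempts the shortcut only for a unique consistent read while holding btr_search_latch, and falls back to the ordinary cursor path on SEL_RETRY:

  found_flag = row_sel_try_search_shortcut(node, plan, &mtr);

  if (found_flag == SEL_FOUND) {

    goto next_table;        /* the row was fetched and qualified */

  } else if (found_flag == SEL_EXHAUSTED) {

    goto table_exhausted;   /* no matching row in this table */
  }

  ut_ad(found_flag == SEL_RETRY);  /* fall back to the normal search */

  plan_reset_cursor(plan);

  mtr_commit(&mtr);
  mtr_start(&mtr);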
01288 /*********************************************************************/
01291 static
01292 ulint
01293 row_sel(
01294 /*====*/
01295   sel_node_t* node, 
01296   que_thr_t*  thr)  
01297 {
01298   dict_index_t* index;
01299   plan_t*   plan;
01300   mtr_t   mtr;
01301   ibool   moved;
01302   rec_t*    rec;
01303   rec_t*    old_vers;
01304   rec_t*    clust_rec;
01305   ibool   search_latch_locked;
01306   ibool   consistent_read;
01307 
01308   /* The following flag becomes TRUE when we are doing a
01309   consistent read from a non-clustered index and we must look
01310   at the clustered index to find out the previous delete mark
01311   state of the non-clustered record: */
01312 
01313   ibool   cons_read_requires_clust_rec  = FALSE;
01314   ulint   cost_counter      = 0;
01315   ibool   cursor_just_opened;
01316   ibool   must_go_to_next;
01317   ibool   mtr_has_extra_clust_latch = FALSE;
01318   /* TRUE if the search was made using
01319   a non-clustered index, and we had to
01320   access the clustered record: now &mtr
01321   contains a clustered index latch, and
01322   &mtr must be committed before we move
01323   to the next non-clustered record */
01324   ulint   found_flag;
01325   ulint   err;
01326   mem_heap_t* heap        = NULL;
01327   ulint   offsets_[REC_OFFS_NORMAL_SIZE];
01328   ulint*    offsets       = offsets_;
01329   rec_offs_init(offsets_);
01330 
01331   ut_ad(thr->run_node == node);
01332 
01333   search_latch_locked = FALSE;
01334 
01335   if (node->read_view) {
01336     /* In consistent reads, we try to make do with the hash index
01337     and avoid the buffer page get. This is to reduce memory bus
01338     load resulting from semaphore operations. The search latch
01339     will be s-locked when we access an index with a unique search
01340     condition, but not locked when we access an index with a
01341     less selective search condition. */
01342 
01343     consistent_read = TRUE;
01344   } else {
01345     consistent_read = FALSE;
01346   }
01347 
01348 table_loop:
01349   /* TABLE LOOP
01350   ----------
01351   This is the outer major loop in calculating a join. We come here when
01352   node->fetch_table changes, and after adding a row to aggregate totals
01353   and, of course, when this function is called. */
01354 
01355   ut_ad(mtr_has_extra_clust_latch == FALSE);
01356 
01357   plan = sel_node_get_nth_plan(node, node->fetch_table);
01358   index = plan->index;
01359 
01360   if (plan->n_rows_prefetched > 0) {
01361     sel_pop_prefetched_row(plan);
01362 
01363     goto next_table_no_mtr;
01364   }
01365 
01366   if (plan->cursor_at_end) {
01367     /* The cursor has already reached the result set end: no more
01368     rows to process for this table cursor, as the prefetch
01369     stack was also empty */
01370 
01371     ut_ad(plan->pcur_is_open);
01372 
01373     goto table_exhausted_no_mtr;
01374   }
01375 
01376   /* Open a cursor to index, or restore an open cursor position */
01377 
01378   mtr_start(&mtr);
01379 
01380   if (consistent_read && plan->unique_search && !plan->pcur_is_open
01381       && !plan->must_get_clust
01382       && !plan->table->big_rows) {
01383     if (!search_latch_locked) {
01384       rw_lock_s_lock(&btr_search_latch);
01385 
01386       search_latch_locked = TRUE;
01387     } else if (rw_lock_get_writer(&btr_search_latch) == RW_LOCK_WAIT_EX) {
01388 
01389       /* There is an x-latch request waiting: release the
01390       s-latch for a moment; as an s-latch here is often
01391       kept for some 10 searches before being released,
01392       a waiting x-latch request would block other threads
01393       from acquiring an s-latch for a long time, lowering
01394       performance significantly in multiprocessors. */
01395 
01396       rw_lock_s_unlock(&btr_search_latch);
01397       rw_lock_s_lock(&btr_search_latch);
01398     }
01399 
01400     found_flag = row_sel_try_search_shortcut(node, plan, &mtr);
01401 
01402     if (found_flag == SEL_FOUND) {
01403 
01404       goto next_table;
01405 
01406     } else if (found_flag == SEL_EXHAUSTED) {
01407 
01408       goto table_exhausted;
01409     }
01410 
01411     ut_ad(found_flag == SEL_RETRY);
01412 
01413     plan_reset_cursor(plan);
01414 
01415     mtr_commit(&mtr);
01416     mtr_start(&mtr);
01417   }
01418 
01419   if (search_latch_locked) {
01420     rw_lock_s_unlock(&btr_search_latch);
01421 
01422     search_latch_locked = FALSE;
01423   }
01424 
01425   if (!plan->pcur_is_open) {
01426     /* Evaluate the expressions to build the search tuple and
01427     open the cursor */
01428 
01429     row_sel_open_pcur(plan, search_latch_locked, &mtr);
01430 
01431     cursor_just_opened = TRUE;
01432 
01433     /* A new search was made: increment the cost counter */
01434     cost_counter++;
01435   } else {
01436     /* Restore pcur position to the index */
01437 
01438     must_go_to_next = row_sel_restore_pcur_pos(plan, &mtr);
01439 
01440     cursor_just_opened = FALSE;
01441 
01442     if (must_go_to_next) {
01443       /* We have already processed the cursor record: move
01444       to the next */
01445 
01446       goto next_rec;
01447     }
01448   }
01449 
01450 rec_loop:
01451   /* RECORD LOOP
01452   -----------
01453   In this loop we use pcur and try to fetch a qualifying row, and
01454   also fill the prefetch buffer for this table if n_rows_fetched has
01455   exceeded a threshold. While we are inside this loop, the following
01456   holds:
01457   (1) &mtr is started,
01458   (2) pcur is positioned and open.
01459 
01460   NOTE that if cursor_just_opened is TRUE here, it means that we came
01461   to this point right after row_sel_open_pcur. */
01462 
01463   ut_ad(mtr_has_extra_clust_latch == FALSE);
01464 
01465   rec = btr_pcur_get_rec(&(plan->pcur));
01466 
01467   /* PHASE 1: Set a lock if specified */
01468 
01469   if (!node->asc && cursor_just_opened
01470       && !page_rec_is_supremum(rec)) {
01471 
01472     /* When we open a cursor for a descending search, we must set
01473     a next-key lock on the successor record: otherwise it would
01474     be possible to insert new records next to the cursor position,
01475     and it might be that these new records should appear in the
01476     search result set, resulting in the phantom problem. */
01477 
01478     if (!consistent_read) {
01479 
01480       /* If innodb_locks_unsafe_for_binlog option is used
01481       or this session is using READ COMMITTED isolation
01482       level, we lock only the record, i.e., next-key
01483       locking is not used. */
01484 
01485       rec_t*  next_rec = page_rec_get_next(rec);
01486       ulint lock_type;
01487       trx_t*  trx;
01488 
01489       trx = thr_get_trx(thr);
01490 
01491       offsets = rec_get_offsets(next_rec, index, offsets,
01492               ULINT_UNDEFINED, &heap);
01493 
01494       if (srv_locks_unsafe_for_binlog
01495           || trx->isolation_level
01496           <= TRX_ISO_READ_COMMITTED) {
01497 
01498         if (page_rec_is_supremum(next_rec)) {
01499 
01500           goto skip_lock;
01501         }
01502 
01503         lock_type = LOCK_REC_NOT_GAP;
01504       } else {
01505         lock_type = LOCK_ORDINARY;
01506       }
01507 
01508       err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur),
01509                  next_rec, index, offsets,
01510                  node->row_lock_mode,
01511                  lock_type, thr);
01512 
01513       switch (err) {
01514       case DB_SUCCESS_LOCKED_REC:
01515         err = DB_SUCCESS; /* fall through */
01516       case DB_SUCCESS:
01517         break;
01518       default:
01519         /* Note that in this case we will store in pcur
01520         the PREDECESSOR of the record we are waiting
01521         the lock for */
01522         goto lock_wait_or_error;
01523       }
01524     }
01525   }
01526 
01527 skip_lock:
01528   if (page_rec_is_infimum(rec)) {
01529 
01530     /* The infimum record on a page cannot be in the result set,
01531     and neither can a record lock be placed on it: we skip such
01532     a record. We also increment the cost counter as we may have
01533     processed yet another page of index. */
01534 
01535     cost_counter++;
01536 
01537     goto next_rec;
01538   }
01539 
01540   if (!consistent_read) {
01541     /* Try to place a lock on the index record */
01542 
01543     /* If innodb_locks_unsafe_for_binlog option is used
01544     or this session is using READ COMMITTED isolation level,
01545     we lock only the record, i.e., next-key locking is
01546     not used. */
01547 
01548     ulint lock_type;
01549     trx_t*  trx;
01550 
01551     offsets = rec_get_offsets(rec, index, offsets,
01552             ULINT_UNDEFINED, &heap);
01553 
01554     trx = thr_get_trx(thr);
01555 
01556     if (srv_locks_unsafe_for_binlog
01557         || trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
01558 
01559       if (page_rec_is_supremum(rec)) {
01560 
01561         goto next_rec;
01562       }
01563 
01564       lock_type = LOCK_REC_NOT_GAP;
01565     } else {
01566       lock_type = LOCK_ORDINARY;
01567     }
01568 
01569     err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur),
01570                rec, index, offsets,
01571                node->row_lock_mode, lock_type, thr);
01572 
01573     switch (err) {
01574     case DB_SUCCESS_LOCKED_REC:
01575       err = DB_SUCCESS; /* fall through */
01576     case DB_SUCCESS:
01577       break;
01578     default:
01579       goto lock_wait_or_error;
01580     }
01581   }
01582 
01583   if (page_rec_is_supremum(rec)) {
01584 
01585     /* A page supremum record cannot be in the result set: skip
01586     it now when we have placed a possible lock on it */
01587 
01588     goto next_rec;
01589   }
01590 
01591   ut_ad(page_rec_is_user_rec(rec));
01592 
01593   if (cost_counter > SEL_COST_LIMIT) {
01594 
01595     /* Now that we have placed the necessary locks, we can stop
01596     for a while and store the cursor position; NOTE that if we
01597     would store the cursor position BEFORE placing a record lock,
01598     it might happen that the cursor would jump over some records
01599     that another transaction could meanwhile insert adjacent to
01600     the cursor: this would result in the phantom problem. */
01601 
01602     goto stop_for_a_while;
01603   }
01604 
01605   /* PHASE 2: Check a mixed index mix id if needed */
01606 
01607   if (plan->unique_search && cursor_just_opened) {
01608 
01609     ut_ad(plan->mode == PAGE_CUR_GE);
01610 
01611     /* As the cursor is now placed on a user record after a search
01612     with the mode PAGE_CUR_GE, the up_match field in the cursor
01613     tells how many fields in the user record matched to the search
01614     tuple */
01615 
01616     if (btr_pcur_get_up_match(&(plan->pcur))
01617         < plan->n_exact_match) {
01618       goto table_exhausted;
01619     }
01620 
01621     /* Ok, no need to test end_conds or mix id */
01622 
01623   }
01624 
01625   /* We are ready to look at a possible new index entry in the result
01626   set: the cursor is now placed on a user record */
01627 
01628   /* PHASE 3: Get previous version in a consistent read */
01629 
01630   cons_read_requires_clust_rec = FALSE;
01631   offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
01632 
01633   if (consistent_read) {
01634     /* This is a non-locking consistent read: if necessary, fetch
01635     a previous version of the record */
01636 
01637     if (dict_index_is_clust(index)) {
01638 
01639       if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
01640                  node->read_view)) {
01641 
01642         err = row_sel_build_prev_vers(
01643           node->read_view, index, rec,
01644           &offsets, &heap, &plan->old_vers_heap,
01645           &old_vers, &mtr);
01646 
01647         if (err != DB_SUCCESS) {
01648 
01649           goto lock_wait_or_error;
01650         }
01651 
01652         if (old_vers == NULL) {
01653           /* The record does not exist
01654           in our read view. Skip it, but
01655           first attempt to determine
01656           whether the index segment we
01657           are searching through has been
01658           exhausted. */
01659 
01660           offsets = rec_get_offsets(
01661             rec, index, offsets,
01662             ULINT_UNDEFINED, &heap);
01663 
01664           /* Fetch the columns needed in
01665           test conditions. The clustered
01666           index record is protected by a
01667           page latch that was acquired
01668           by row_sel_open_pcur() or
01669           row_sel_restore_pcur_pos().
01670           The latch will not be released
01671           until mtr_commit(mtr). */
01672 
01673           row_sel_fetch_columns(
01674             index, rec, offsets,
01675             UT_LIST_GET_FIRST(
01676               plan->columns));
01677 
01678           if (!row_sel_test_end_conds(plan)) {
01679 
01680             goto table_exhausted;
01681           }
01682 
01683           goto next_rec;
01684         }
01685 
01686         rec = old_vers;
01687       }
01688     } else if (!lock_sec_rec_cons_read_sees(rec,
01689               node->read_view)) {
01690       cons_read_requires_clust_rec = TRUE;
01691     }
01692   }
01693 
01694   /* PHASE 4: Test search end conditions and deleted flag */
01695 
01696   /* Fetch the columns needed in test conditions.  The record is
01697   protected by a page latch that was acquired by
01698   row_sel_open_pcur() or row_sel_restore_pcur_pos().  The latch
01699   will not be released until mtr_commit(mtr). */
01700 
01701   row_sel_fetch_columns(index, rec, offsets,
01702             UT_LIST_GET_FIRST(plan->columns));
01703 
01704   /* Test the selection end conditions: these can only contain columns
01705   which already are found in the index, even though the index might be
01706   non-clustered */
01707 
01708   if (plan->unique_search && cursor_just_opened) {
01709 
01710     /* No test necessary: the test was already made above */
01711 
01712   } else if (!row_sel_test_end_conds(plan)) {
01713 
01714     goto table_exhausted;
01715   }
01716 
01717   if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))
01718       && !cons_read_requires_clust_rec) {
01719 
01720     /* The record is delete marked: we can skip it if this is
01721     not a consistent read which might see an earlier version
01722     of a non-clustered index record */
01723 
01724     if (plan->unique_search) {
01725 
01726       goto table_exhausted;
01727     }
01728 
01729     goto next_rec;
01730   }
01731 
01732   /* PHASE 5: Get the clustered index record, if needed and if we did
01733   not do the search using the clustered index */
01734 
01735   if (plan->must_get_clust || cons_read_requires_clust_rec) {
01736 
01737     /* It was a non-clustered index and we must fetch also the
01738     clustered index record */
01739 
01740     err = row_sel_get_clust_rec(node, plan, rec, thr, &clust_rec,
01741               &mtr);
01742     mtr_has_extra_clust_latch = TRUE;
01743 
01744     switch (err) {
01745     case DB_SUCCESS_LOCKED_REC:
01746       err = DB_SUCCESS; /* fall through */
01747     case DB_SUCCESS:
01748       break;
01749     default:
01750       goto lock_wait_or_error;
01751     }
01752 
01753     /* Retrieving the clustered record required a search:
01754     increment the cost counter */
01755 
01756     cost_counter++;
01757 
01758     if (clust_rec == NULL) {
01759       /* The record did not exist in the read view */
01760       ut_ad(consistent_read);
01761 
01762       goto next_rec;
01763     }
01764 
01765     if (rec_get_deleted_flag(clust_rec,
01766            dict_table_is_comp(plan->table))) {
01767 
01768       /* The record is delete marked: we can skip it */
01769 
01770       goto next_rec;
01771     }
01772 
01773     if (node->can_get_updated) {
01774 
01775       btr_pcur_store_position(&(plan->clust_pcur), &mtr);
01776     }
01777   }
01778 
01779   /* PHASE 6: Test the rest of search conditions */
01780 
01781   if (!row_sel_test_other_conds(plan)) {
01782 
01783     if (plan->unique_search) {
01784 
01785       goto table_exhausted;
01786     }
01787 
01788     goto next_rec;
01789   }
01790 
01791   /* PHASE 7: We found a new qualifying row for the current table; push
01792   the row if prefetch is on, or move to the next table in the join */
01793 
01794   plan->n_rows_fetched++;
01795 
01796   ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF);
01797 
01798   if ((plan->n_rows_fetched <= SEL_PREFETCH_LIMIT)
01799       || plan->unique_search || plan->no_prefetch
01800       || plan->table->big_rows) {
01801 
01802     /* No prefetch in operation: go to the next table */
01803 
01804     goto next_table;
01805   }
01806 
01807   sel_push_prefetched_row(plan);
01808 
01809   if (plan->n_rows_prefetched == SEL_MAX_N_PREFETCH) {
01810 
01811     /* The prefetch buffer is now full */
01812 
01813     sel_pop_prefetched_row(plan);
01814 
01815     goto next_table;
01816   }
01817 
01818 next_rec:
01819   ut_ad(!search_latch_locked);
01820 
01821   if (mtr_has_extra_clust_latch) {
01822 
01823     /* We must commit &mtr if we are moving to the next
01824     non-clustered index record, because we could break the
01825     latching order if we would access a different clustered
01826     index page right away without releasing the previous. */
01827 
01828     goto commit_mtr_for_a_while;
01829   }
01830 
01831   if (node->asc) {
01832     moved = btr_pcur_move_to_next(&(plan->pcur), &mtr);
01833   } else {
01834     moved = btr_pcur_move_to_prev(&(plan->pcur), &mtr);
01835   }
01836 
01837   if (!moved) {
01838 
01839     goto table_exhausted;
01840   }
01841 
01842   cursor_just_opened = FALSE;
01843 
01844   /* END OF RECORD LOOP
01845   ------------------ */
01846   goto rec_loop;
01847 
01848 next_table:
01849   /* We found a record which satisfies the conditions: we can move to
01850   the next table or return a row in the result set */
01851 
01852   ut_ad(btr_pcur_is_on_user_rec(&plan->pcur));
01853 
01854   if (plan->unique_search && !node->can_get_updated) {
01855 
01856     plan->cursor_at_end = TRUE;
01857   } else {
01858     ut_ad(!search_latch_locked);
01859 
01860     plan->stored_cursor_rec_processed = TRUE;
01861 
01862     btr_pcur_store_position(&(plan->pcur), &mtr);
01863   }
01864 
01865   mtr_commit(&mtr);
01866 
01867   mtr_has_extra_clust_latch = FALSE;
01868 
01869 next_table_no_mtr:
01870   /* If we use 'goto' to this label, it means that the row was popped
01871   from the prefetched rows stack, and &mtr is already committed */
01872 
01873   if (node->fetch_table + 1 == node->n_tables) {
01874 
01875     sel_eval_select_list(node);
01876 
01877     if (node->is_aggregate) {
01878 
01879       goto table_loop;
01880     }
01881 
01882     sel_assign_into_var_values(node->into_list, node);
01883 
01884     thr->run_node = que_node_get_parent(node);
01885 
01886     err = DB_SUCCESS;
01887     goto func_exit;
01888   }
01889 
01890   node->fetch_table++;
01891 
01892   /* When we move to the next table, we first reset the plan cursor:
01893   we do not care about resetting it when we backtrack from a table */
01894 
01895   plan_reset_cursor(sel_node_get_nth_plan(node, node->fetch_table));
01896 
01897   goto table_loop;
01898 
01899 table_exhausted:
01900   /* The table cursor pcur reached the result set end: backtrack to the
01901   previous table in the join if we do not have cached prefetched rows */
01902 
01903   plan->cursor_at_end = TRUE;
01904 
01905   mtr_commit(&mtr);
01906 
01907   mtr_has_extra_clust_latch = FALSE;
01908 
01909   if (plan->n_rows_prefetched > 0) {
01910     /* The table became exhausted during a prefetch */
01911 
01912     sel_pop_prefetched_row(plan);
01913 
01914     goto next_table_no_mtr;
01915   }
01916 
01917 table_exhausted_no_mtr:
01918   if (node->fetch_table == 0) {
01919     err = DB_SUCCESS;
01920 
01921     if (node->is_aggregate && !node->aggregate_already_fetched) {
01922 
01923       node->aggregate_already_fetched = TRUE;
01924 
01925       sel_assign_into_var_values(node->into_list, node);
01926 
01927       thr->run_node = que_node_get_parent(node);
01928     } else {
01929       node->state = SEL_NODE_NO_MORE_ROWS;
01930 
01931       thr->run_node = que_node_get_parent(node);
01932     }
01933 
01934     err = DB_SUCCESS;
01935     goto func_exit;
01936   }
01937 
01938   node->fetch_table--;
01939 
01940   goto table_loop;
01941 
01942 stop_for_a_while:
01943   /* Return control for a while to que_run_threads, so that runaway
01944   queries can be canceled. NOTE that when we come here, we must, in a
01945   locking read, have placed the necessary (possibly waiting request)
01946   record lock on the cursor record or its successor: when we reposition
01947   the cursor, this record lock guarantees that nobody can meanwhile have
01948   inserted new records which should have appeared in the result set,
01949   which would result in the phantom problem. */
01950 
01951   ut_ad(!search_latch_locked);
01952 
01953   plan->stored_cursor_rec_processed = FALSE;
01954   btr_pcur_store_position(&(plan->pcur), &mtr);
01955 
01956   mtr_commit(&mtr);
01957 
01958 #ifdef UNIV_SYNC_DEBUG
01959   ut_ad(sync_thread_levels_empty_gen(TRUE));
01960 #endif /* UNIV_SYNC_DEBUG */
01961   err = DB_SUCCESS;
01962   goto func_exit;
01963 
01964 commit_mtr_for_a_while:
01965   /* Stores the cursor position and commits &mtr; this is used if
01966   &mtr may contain latches which would break the latching order if
01967   &mtr would not be committed and the latches released. */
01968 
01969   plan->stored_cursor_rec_processed = TRUE;
01970 
01971   ut_ad(!search_latch_locked);
01972   btr_pcur_store_position(&(plan->pcur), &mtr);
01973 
01974   mtr_commit(&mtr);
01975 
01976   mtr_has_extra_clust_latch = FALSE;
01977 
01978 #ifdef UNIV_SYNC_DEBUG
01979   ut_ad(sync_thread_levels_empty_gen(TRUE));
01980 #endif /* UNIV_SYNC_DEBUG */
01981 
01982   goto table_loop;
01983 
01984 lock_wait_or_error:
01985   /* See the note at stop_for_a_while: the same holds for this case */
01986 
01987   ut_ad(!btr_pcur_is_before_first_on_page(&plan->pcur) || !node->asc);
01988   ut_ad(!search_latch_locked);
01989 
01990   plan->stored_cursor_rec_processed = FALSE;
01991   btr_pcur_store_position(&(plan->pcur), &mtr);
01992 
01993   mtr_commit(&mtr);
01994 
01995 #ifdef UNIV_SYNC_DEBUG
01996   ut_ad(sync_thread_levels_empty_gen(TRUE));
01997 #endif /* UNIV_SYNC_DEBUG */
01998 
01999 func_exit:
02000   if (search_latch_locked) {
02001     rw_lock_s_unlock(&btr_search_latch);
02002   }
02003   if (UNIV_LIKELY_NULL(heap)) {
02004     mem_heap_free(heap);
02005   }
02006   return(err);
02007 }
02008 
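The prefetch path above batches qualifying rows: after SEL_PREFETCH_LIMIT rows have been fetched, a qualifying row is pushed into a fixed-size buffer, and only when SEL_MAX_N_PREFETCH rows have accumulated is the oldest one popped and handed to the next table. A minimal standalone sketch of that flow, assuming a simplified fixed-capacity FIFO; prefetch_buf_t, prefetch_push() and prefetch_pop() are hypothetical names, not InnoDB API:

#include <cstdio>

enum { MAX_N_PREFETCH = 16 };           /* cf. SEL_MAX_N_PREFETCH */

struct prefetch_buf_t {
        int     rows[MAX_N_PREFETCH];   /* stands in for cached row values */
        int     n_cached;               /* cf. plan->n_rows_prefetched */
};

static void prefetch_push(prefetch_buf_t* buf, int row)
{
        buf->rows[buf->n_cached++] = row;
}

static int prefetch_pop(prefetch_buf_t* buf)
{
        /* pop the oldest row; the O(n) shift is kept for clarity */
        int row = buf->rows[0];

        for (int i = 1; i < buf->n_cached; i++) {
                buf->rows[i - 1] = buf->rows[i];
        }

        buf->n_cached--;
        return row;
}

int main()
{
        prefetch_buf_t buf = {{0}, 0};

        for (int row = 0; row < 40; row++) {
                prefetch_push(&buf, row);

                if (buf.n_cached == MAX_N_PREFETCH) {
                        /* buffer full: hand the oldest row onwards, as at
                        the "goto next_table" above */
                        printf("returning row %d\n", prefetch_pop(&buf));
                }
        }

        return 0;
}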
02009 /**********************************************************************/
02013 UNIV_INTERN
02014 que_thr_t*
02015 row_sel_step(
02016 /*=========*/
02017   que_thr_t*  thr)  
02018 {
02019   ulint   i_lock_mode;
02020   sym_node_t* table_node;
02021   sel_node_t* node;
02022   ulint   err;
02023 
02024   ut_ad(thr);
02025 
02026         node = static_cast<sel_node_t *>(thr->run_node);
02027 
02028   ut_ad(que_node_get_type(node) == QUE_NODE_SELECT);
02029 
02030   /* If this is the first time this node is executed (or when execution
02031   resumes after a wait for a table intention lock), set intention locks
02032   on the tables, or assign a read view */
02033 
02034   if (node->into_list && (thr->prev_node == que_node_get_parent(node))) {
02035 
02036     node->state = SEL_NODE_OPEN;
02037   }
02038 
02039   if (node->state == SEL_NODE_OPEN) {
02040 
02041     /* It may be that the current session has not yet started
02042     its transaction, or it has been committed: */
02043 
02044     trx_start_if_not_started(thr_get_trx(thr));
02045 
02046     plan_reset_cursor(sel_node_get_nth_plan(node, 0));
02047 
02048     if (node->consistent_read) {
02049       /* Assign a read view for the query */
02050       node->read_view = trx_assign_read_view(
02051         thr_get_trx(thr));
02052     } else {
02053       if (node->set_x_locks) {
02054         i_lock_mode = LOCK_IX;
02055       } else {
02056         i_lock_mode = LOCK_IS;
02057       }
02058 
02059       table_node = node->table_list;
02060 
02061       while (table_node) {
02062         err = lock_table(0, table_node->table,
02063                                                  static_cast<lock_mode>(i_lock_mode), thr);
02064         if (err != DB_SUCCESS) {
02065           thr_get_trx(thr)->error_state = err;
02066 
02067           return(NULL);
02068         }
02069 
02070                                 table_node = static_cast<sym_node_t *>(que_node_get_next(table_node));
02071       }
02072     }
02073 
02074     /* If this is an explicit cursor, copy stored procedure
02075     variable values, so that the values cannot change between
02076     fetches (currently, we copy them also for non-explicit
02077     cursors) */
02078 
02079     if (node->explicit_cursor
02080         && UT_LIST_GET_FIRST(node->copy_variables)) {
02081 
02082       row_sel_copy_input_variable_vals(node);
02083     }
02084 
02085     node->state = SEL_NODE_FETCH;
02086     node->fetch_table = 0;
02087 
02088     if (node->is_aggregate) {
02089       /* Reset the aggregate total values */
02090       sel_reset_aggregate_vals(node);
02091     }
02092 
02093     err = DB_SUCCESS;
02094   }
02095 
02096   err = row_sel(node, thr);
02097 
02098   /* NOTE! if queries are parallelized, the following assignment may
02099   have problems; the assignment should be made only if thr is the
02100   only top-level thr in the graph: */
02101 
02102   thr->graph->last_sel_node = node;
02103 
02104   if (err != DB_SUCCESS) {
02105     thr_get_trx(thr)->error_state = err;
02106 
02107     return(NULL);
02108   }
02109 
02110   return(thr);
02111 }
02112 
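In the non-consistent-read branch of row_sel_step() above, every table in the FROM list receives an intention lock, LOCK_IX when the select will set exclusive row locks and LOCK_IS otherwise, and the first failure is returned to the caller. A standalone sketch of that loop under simplified, hypothetical types (table_t, intention_lock() and lock_all_tables() are not InnoDB names):

#include <cstddef>

enum lock_mode_t { MODE_IS, MODE_IX };
enum status_t    { STATUS_OK, STATUS_LOCK_WAIT };

struct table_t {
        table_t*        next;   /* singly linked FROM list */
};

/* stand-in for the real lock request; always succeeds here */
static status_t intention_lock(table_t*, lock_mode_t)
{
        return STATUS_OK;
}

static status_t lock_all_tables(table_t* list, bool set_x_locks)
{
        lock_mode_t mode = set_x_locks ? MODE_IX : MODE_IS;

        for (table_t* t = list; t != NULL; t = t->next) {
                status_t err = intention_lock(t, mode);

                if (err != STATUS_OK) {
                        return err;     /* propagate the first failure */
                }
        }

        return STATUS_OK;
}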
02113 /**********************************************************************/
02116 UNIV_INTERN
02117 que_thr_t*
02118 fetch_step(
02119 /*=======*/
02120   que_thr_t*  thr)  
02121 {
02122   sel_node_t* sel_node;
02123   fetch_node_t* node;
02124 
02125   ut_ad(thr);
02126 
02127         node = static_cast<fetch_node_t *>(thr->run_node);
02128   sel_node = node->cursor_def;
02129 
02130   ut_ad(que_node_get_type(node) == QUE_NODE_FETCH);
02131 
02132   if (thr->prev_node != que_node_get_parent(node)) {
02133 
02134     if (sel_node->state != SEL_NODE_NO_MORE_ROWS) {
02135 
02136       if (node->into_list) {
02137         sel_assign_into_var_values(node->into_list,
02138                  sel_node);
02139       } else {
02140         void* ret = (*node->func->func)(
02141           sel_node, node->func->arg);
02142 
02143         if (!ret) {
02144           sel_node->state
02145             = SEL_NODE_NO_MORE_ROWS;
02146         }
02147       }
02148     }
02149 
02150     thr->run_node = que_node_get_parent(node);
02151 
02152     return(thr);
02153   }
02154 
02155   /* Make the fetch node the parent of the cursor definition for
02156   the time of the fetch, so that execution knows to return to this
02157   fetch node after a row has been selected or we know that there is
02158   no row left */
02159 
02160   sel_node->common.parent = node;
02161 
02162   if (sel_node->state == SEL_NODE_CLOSED) {
02163     fprintf(stderr,
02164       "InnoDB: Error: fetch called on a closed cursor\n");
02165 
02166     thr_get_trx(thr)->error_state = DB_ERROR;
02167 
02168     return(NULL);
02169   }
02170 
02171   thr->run_node = sel_node;
02172 
02173   return(thr);
02174 }
02175 
02176 /****************************************************************/
02179 UNIV_INTERN
02180 void*
02181 row_fetch_print(
02182 /*============*/
02183   void* row,    
02184   void* user_arg) 
02185 {
02186         sel_node_t *node = static_cast<sel_node_t *>(row);
02187   que_node_t* exp;
02188   ulint   i = 0;
02189 
02190   UT_NOT_USED(user_arg);
02191 
02192   fprintf(stderr, "row_fetch_print: row %p\n", row);
02193 
02194   exp = node->select_list;
02195 
02196   while (exp) {
02197     dfield_t* dfield = que_node_get_val(exp);
02198     const dtype_t*  type = dfield_get_type(dfield);
02199 
02200     fprintf(stderr, " column %lu:\n", (ulong)i);
02201 
02202     dtype_print(type);
02203     putc('\n', stderr);
02204 
02205     if (dfield_get_len(dfield) != UNIV_SQL_NULL) {
02206       ut_print_buf(stderr, dfield_get_data(dfield),
02207              dfield_get_len(dfield));
02208       putc('\n', stderr);
02209     } else {
02210       fputs(" <NULL>;\n", stderr);
02211     }
02212 
02213     exp = que_node_get_next(exp);
02214     i++;
02215   }
02216 
02217   return((void*)42);
02218 }
02219 
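fetch_step() and row_fetch_print() above illustrate the fetch callback contract: the callback receives the selected row and an opaque argument, returning NULL means stop fetching, and any non-NULL value (row_fetch_print() returns the arbitrary token 42) means fetch the next row. A standalone sketch of the same contract with hypothetical names (fetch_func_t, print_row(), fetch_all()):

#include <cstdio>

typedef void* (*fetch_func_t)(void* row, void* user_arg);

static void* print_row(void* row, void* /* user_arg */)
{
        printf("row %d\n", *(int*) row);
        return (void*) 42;      /* non-NULL: keep fetching */
}

static void fetch_all(int* rows, int n_rows, fetch_func_t cb, void* arg)
{
        for (int i = 0; i < n_rows; i++) {
                if (!cb(&rows[i], arg)) {
                        break;  /* the callback asked us to stop */
                }
        }
}

int main()
{
        int rows[] = {1, 2, 3};

        fetch_all(rows, 3, print_row, NULL);
        return 0;
}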
02220 /***********************************************************/
02223 UNIV_INTERN
02224 que_thr_t*
02225 row_printf_step(
02226 /*============*/
02227   que_thr_t*  thr)  
02228 {
02229   row_printf_node_t*  node;
02230   sel_node_t*   sel_node;
02231   que_node_t*   arg;
02232 
02233   ut_ad(thr);
02234 
02235         node = static_cast<row_printf_node_t *>(thr->run_node);
02236 
02237   sel_node = node->sel_node;
02238 
02239   ut_ad(que_node_get_type(node) == QUE_NODE_ROW_PRINTF);
02240 
02241   if (thr->prev_node == que_node_get_parent(node)) {
02242 
02243     /* Reset the cursor */
02244     sel_node->state = SEL_NODE_OPEN;
02245 
02246     /* Fetch next row to print */
02247 
02248     thr->run_node = sel_node;
02249 
02250     return(thr);
02251   }
02252 
02253   if (sel_node->state != SEL_NODE_FETCH) {
02254 
02255     ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
02256 
02257     /* No more rows to print */
02258 
02259     thr->run_node = que_node_get_parent(node);
02260 
02261     return(thr);
02262   }
02263 
02264   arg = sel_node->select_list;
02265 
02266   while (arg) {
02267     dfield_print_also_hex(que_node_get_val(arg));
02268 
02269     fputs(" ::: ", stderr);
02270 
02271     arg = que_node_get_next(arg);
02272   }
02273 
02274   putc('\n', stderr);
02275 
02276   /* Fetch next row to print */
02277 
02278   thr->run_node = sel_node;
02279 
02280   return(thr);
02281 }
02282 
02283 /****************************************************************/
02290 UNIV_INTERN
02291 void
02292 row_sel_convert_mysql_key_to_innobase(
02293 /*==================================*/
02294   dtuple_t* tuple,    
02298   byte*   buf,    
02300   ulint   buf_len,  
02301   dict_index_t* index,    
02302   const byte* key_ptr,  
02303   ulint   key_len,  
02304   trx_t*    trx)    
02305 {
02306   byte*   original_buf  = buf;
02307   const byte* original_key_ptr = key_ptr;
02308   dict_field_t* field;
02309   dfield_t* dfield;
02310   ulint   data_offset;
02311   ulint   data_len;
02312   ulint   data_field_len;
02313   ibool   is_null;
02314   const byte* key_end;
02315   ulint   n_fields = 0;
02316 
02317   /* For documentation of the key value storage format in MySQL, see
02318   ha_innobase::store_key_val_for_row() in ha_innodb.cc. */
02319 
02320   key_end = key_ptr + key_len;
02321 
02322   /* Permit us to access any field in the tuple (ULINT_MAX): */
02323 
02324   dtuple_set_n_fields(tuple, ULINT_MAX);
02325 
02326   dfield = dtuple_get_nth_field(tuple, 0);
02327   field = dict_index_get_nth_field(index, 0);
02328 
02329   if (UNIV_UNLIKELY(dfield_get_type(dfield)->mtype == DATA_SYS)) {
02330     /* A special case: we are looking for a position in the
02331     generated clustered index which InnoDB automatically added
02332     to a table with no primary key: the first and the only
02333     ordering column is ROW_ID which InnoDB stored to the key_ptr
02334     buffer. */
02335 
02336     ut_a(key_len == DATA_ROW_ID_LEN);
02337 
02338     dfield_set_data(dfield, key_ptr, DATA_ROW_ID_LEN);
02339 
02340     dtuple_set_n_fields(tuple, 1);
02341 
02342     return;
02343   }
02344 
02345   while (key_ptr < key_end) {
02346 
02347     ulint type = dfield_get_type(dfield)->mtype;
02348     ut_a(field->col->mtype == type);
02349 
02350     data_offset = 0;
02351     is_null = FALSE;
02352 
02353     if (!(dfield_get_type(dfield)->prtype & DATA_NOT_NULL)) {
02354       /* The first byte in the field tells if this is
02355       an SQL NULL value */
02356 
02357       data_offset = 1;
02358 
02359       if (*key_ptr != 0) {
02360         dfield_set_null(dfield);
02361 
02362         is_null = TRUE;
02363       }
02364     }
02365 
02366     /* Calculate data length and data field total length */
02367 
02368     if (type == DATA_BLOB) {
02369       /* The key field is a column prefix of a BLOB or
02370       TEXT */
02371 
02372       ut_a(field->prefix_len > 0);
02373 
02374       /* MySQL stores the actual data length to the first 2
02375       bytes after the optional SQL NULL marker byte. The
02376       storage format is little-endian, that is, the most
02377       significant byte at a higher address. In UTF-8, MySQL
02378       seems to reserve field->prefix_len bytes for
02379       storing this field in the key value buffer, even
02380       though the actual value only takes data_len bytes
02381       from the start. */
02382 
02383       data_len = key_ptr[data_offset]
02384         + 256 * key_ptr[data_offset + 1];
02385       data_field_len = data_offset + 2 + field->prefix_len;
02386 
02387       data_offset += 2;
02388 
02389       /* Now that we know the length, we store the column
02390       value like it would be a fixed char field */
02391 
02392     } else if (field->prefix_len > 0) {
02393       /* Looks like MySQL pads unused end bytes in the
02394       prefix with space. Therefore, also in UTF-8, it is ok
02395       to compare with a prefix containing full prefix_len
02396       bytes, and no need to take at most prefix_len / 3
02397       UTF-8 characters from the start.
02398       If the prefix is used as the upper end of a LIKE
02399       'abc%' query, then MySQL pads the end with chars
02400       0xff. TODO: in that case, does it do any harm to compare
02401       with the full prefix_len bytes? How do characters
02402       0xff behave in UTF-8? */
02403 
02404       data_len = field->prefix_len;
02405       data_field_len = data_offset + data_len;
02406     } else {
02407       data_len = dfield_get_type(dfield)->len;
02408       data_field_len = data_offset + data_len;
02409     }
02410 
02411     if (UNIV_UNLIKELY
02412         (dtype_get_mysql_type(dfield_get_type(dfield))
02413          == DATA_MYSQL_TRUE_VARCHAR)
02414         && UNIV_LIKELY(type != DATA_INT)) {
02415       /* In the MySQL key value format, a true VARCHAR is
02416       always preceded by 2 bytes of a length field.
02417       dfield_get_type(dfield)->len returns the maximum
02418       'payload' len in bytes. That does not include the
02419       2 bytes that tell the actual data length.
02420 
02421       We added the check != DATA_INT to make sure we do
02422       not treat MySQL ENUM or SET as a true VARCHAR! */
02423 
02424       data_len += 2;
02425       data_field_len += 2;
02426     }
02427 
02428     /* Storing may use at most data_len bytes of buf */
02429 
02430     if (UNIV_LIKELY(!is_null)) {
02431       row_mysql_store_col_in_innobase_format(
02432         dfield, buf,
02433         FALSE, /* MySQL key value format col */
02434         key_ptr + data_offset, data_len,
02435         dict_table_is_comp(index->table));
02436       buf += data_len;
02437     }
02438 
02439     key_ptr += data_field_len;
02440 
02441     if (UNIV_UNLIKELY(key_ptr > key_end)) {
02442       /* The last field in key was not a complete key field
02443       but a prefix of it.
02444 
02445       Print a warning about this! HA_READ_PREFIX_LAST does
02446       not currently work in InnoDB with partial-field key
02447       value prefixes. Since MySQL currently uses a padding
02448       trick to evaluate LIKE 'abc%' type queries, there
02449       should never be partial-field prefixes in searches. */
02450 
02451       ut_print_timestamp(stderr);
02452 
02453       fputs("  InnoDB: Warning: using a partial-field"
02454             " key prefix in search.\n"
02455             "InnoDB: ", stderr);
02456       dict_index_name_print(stderr, trx, index);
02457       fprintf(stderr, ". Last data field length %lu bytes,\n"
02458         "InnoDB: key ptr now exceeds"
02459         " key end by %lu bytes.\n"
02460         "InnoDB: Key value in the MySQL format:\n",
02461         (ulong) data_field_len,
02462         (ulong) (key_ptr - key_end));
02463       fflush(stderr);
02464       ut_print_buf(stderr, original_key_ptr, key_len);
02465       putc('\n', stderr);
02466 
02467       if (!is_null) {
02468         ulint len = dfield_get_len(dfield);
02469         dfield_set_len(dfield, len
02470                  - (ulint) (key_ptr - key_end));
02471       }
02472     }
02473 
02474     n_fields++;
02475     field++;
02476     dfield++;
02477   }
02478 
02479   ut_a(buf <= original_buf + buf_len);
02480 
02481   /* We set the length of tuple to n_fields: we assume that the memory
02482   area allocated for it is big enough (usually bigger than n_fields). */
02483 
02484   dtuple_set_n_fields(tuple, n_fields);
02485 }
02486 
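A standalone sketch of decoding one nullable, variable-length field in the MySQL key value format handled by row_sel_convert_mysql_key_to_innobase() above: an optional NULL-indicator byte, then a 2-byte little-endian data length, then the column bytes. The sketch ignores the fixed prefix_len reservation made for column-prefix keys and the fixed-length column case; key_field and decode_key_field() are hypothetical names:

#include <cstdint>
#include <cstdio>

struct key_field {
        bool            is_null;
        uint16_t        len;
        const uint8_t*  data;
};

static const uint8_t* decode_key_field(const uint8_t* p, bool nullable,
                                        key_field* out)
{
        out->is_null = false;

        if (nullable) {
                /* a non-zero first byte marks SQL NULL */
                out->is_null = (*p++ != 0);
        }

        /* 2-byte little-endian length, least significant byte first */
        out->len  = (uint16_t)(p[0] | (p[1] << 8));
        out->data = p + 2;

        return out->data + out->len;    /* start of the next field */
}

int main()
{
        /* a nullable VARCHAR holding 'abc': not NULL, length 3, data */
        const uint8_t buf[] = {0x00, 0x03, 0x00, 'a', 'b', 'c'};
        key_field f;

        decode_key_field(buf, true, &f);
        printf("null=%d len=%u first=%c\n",
               (int) f.is_null, (unsigned) f.len, f.data[0]);
        return 0;
}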
02487 /**************************************************************/
02489 static
02490 void
02491 row_sel_store_row_id_to_prebuilt(
02492 /*=============================*/
02493   row_prebuilt_t*   prebuilt, 
02494   const rec_t*    index_rec,  
02495   const dict_index_t* index,    
02496   const ulint*    offsets)  
02498 {
02499   const byte* data;
02500   ulint   len;
02501 
02502   ut_ad(rec_offs_validate(index_rec, index, offsets));
02503 
02504   data = rec_get_nth_field(
02505     index_rec, offsets,
02506     dict_index_get_sys_col_pos(index, DATA_ROW_ID), &len);
02507 
02508   if (UNIV_UNLIKELY(len != DATA_ROW_ID_LEN)) {
02509     fprintf(stderr,
02510       "InnoDB: Error: Row id field is"
02511       " wrong length %lu in ", (ulong) len);
02512     dict_index_name_print(stderr, prebuilt->trx, index);
02513     fprintf(stderr, "\n"
02514       "InnoDB: Field number %lu, record:\n",
02515       (ulong) dict_index_get_sys_col_pos(index,
02516                  DATA_ROW_ID));
02517     rec_print_new(stderr, index_rec, offsets);
02518     putc('\n', stderr);
02519     ut_error;
02520   }
02521 
02522   ut_memcpy(prebuilt->row_id, data, len);
02523 }
02524 
02525 /**************************************************************/
02528 static
02529 void
02530 row_sel_field_store_in_mysql_format(
02531 /*================================*/
02532   byte*   dest, 
02538   const mysql_row_templ_t* templ,
02543   const byte* data, 
02544   ulint   len)  
02545 {
02546   byte* ptr;
02547 
02548   ut_ad(len != UNIV_SQL_NULL);
02549   UNIV_MEM_ASSERT_RW(data, len);
02550 
02551   switch (templ->type) {
02552     const byte* field_end;
02553     byte*   pad;
02554   case DATA_INT:
02555     /* Convert integer data from Innobase to a little-endian
02556     format, sign bit restored to normal */
02557 
02558     ptr = dest + len;
02559 
02560     for (;;) {
02561       ptr--;
02562       *ptr = *data;
02563       if (ptr == dest) {
02564         break;
02565       }
02566       data++;
02567     }
02568 
02569     if (!templ->is_unsigned) {
02570       dest[len - 1] = (byte) (dest[len - 1] ^ 128);
02571     }
02572 
02573     ut_ad(templ->mysql_col_len == len);
02574     break;
02575 
02576   case DATA_VARCHAR:
02577   case DATA_VARMYSQL:
02578   case DATA_BINARY:
02579     field_end = dest + templ->mysql_col_len;
02580 
02581     if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) {
02582       /* This is a >= 5.0.3 type true VARCHAR. Store the
02583       length of the data to the first byte or the first
02584       two bytes of dest. */
02585 
02586       dest = row_mysql_store_true_var_len(
02587         dest, len, templ->mysql_length_bytes);
02588     }
02589 
02590     /* Copy the actual data */
02591     ut_memcpy(dest, data, len);
02592 
02593     /* Pad with trailing spaces. We also pad the unused end of a
02594     >= 5.0.3 true VARCHAR column with spaces, just in case MySQL
02595     expects its contents to be deterministic. */
02596 
02597     pad = dest + len;
02598 
02599     ut_ad(templ->mbminlen <= templ->mbmaxlen);
02600 
02601     /* We treat some Unicode charset strings specially. */
02602     switch (templ->mbminlen) {
02603     case 4:
02604       /* InnoDB should never have stripped partial
02605       UTF-32 characters. */
02606       ut_a(!(len & 3));
02607       break;
02608     case 2:
02609       /* A space char is two bytes,
02610       0x0020 in UCS2 and UTF-16 */
02611 
02612       if (UNIV_UNLIKELY(len & 1)) {
02613         /* A 0x20 has been stripped from the column.
02614         Pad it back. */
02615 
02616         if (pad < field_end) {
02617           *pad++ = 0x20;
02618         }
02619       }
02620     }
02621 
02622     row_mysql_pad_col(templ->mbminlen, pad, field_end - pad);
02623     break;
02624 
02625   case DATA_BLOB:
02626     /* Store a pointer to the BLOB buffer to dest: the BLOB was
02627     already copied to the buffer in row_sel_store_mysql_rec */
02628 
02629     row_mysql_store_blob_ref(dest, templ->mysql_col_len, data,
02630            len);
02631     break;
02632 
02633   case DATA_MYSQL:
02634     memcpy(dest, data, len);
02635 
02636     ut_ad(templ->mysql_col_len >= len);
02637     ut_ad(templ->mbmaxlen >= templ->mbminlen);
02638 
02639     ut_ad(templ->mbmaxlen > templ->mbminlen
02640           || templ->mysql_col_len == len);
02641     /* The following assertion would fail for old tables
02642     containing UTF-8 ENUM columns due to Bug #9526. */
02643     ut_ad(!templ->mbmaxlen
02644           || !(templ->mysql_col_len % templ->mbmaxlen));
02645     ut_ad(len * templ->mbmaxlen >= templ->mysql_col_len);
02646 
02647     if (templ->mbminlen == 1 && templ->mbmaxlen != 1) {
02648       /* Pad with spaces. This undoes the stripping
02649       done in row0mysql.c, function
02650       row_mysql_store_col_in_innobase_format(). */
02651 
02652       memset(dest + len, 0x20, templ->mysql_col_len - len);
02653     }
02654     break;
02655 
02656   default:
02657 #ifdef UNIV_DEBUG
02658   case DATA_SYS_CHILD:
02659   case DATA_SYS:
02660     /* These column types should never be shipped to MySQL. */
02661     ut_ad(0);
02662 
02663   case DATA_CHAR:
02664   case DATA_FIXBINARY:
02665   case DATA_FLOAT:
02666   case DATA_DOUBLE:
02667   case DATA_DECIMAL:
02668     /* Above are the valid column types for MySQL data. */
02669 #endif /* UNIV_DEBUG */
02670     ut_ad(templ->mysql_col_len == len);
02671     memcpy(dest, data, len);
02672   }
02673 }
02674 
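The DATA_INT branch above reverses the byte order (InnoDB stores integers big-endian, with the sign bit flipped, so that memcmp() sorts them correctly) and restores the sign bit for signed columns. A standalone sketch for a 4-byte signed column, assuming a little-endian host; innobase_int4_to_mysql() is a hypothetical name:

#include <cstdint>
#include <cstdio>
#include <cstring>

static int32_t innobase_int4_to_mysql(const uint8_t stored[4])
{
        uint8_t le[4];

        /* reverse the byte order: the stored form is big-endian */
        for (int i = 0; i < 4; i++) {
                le[i] = stored[3 - i];
        }

        /* flip the sign bit back in the most significant byte */
        le[3] ^= 0x80;

        int32_t v;
        memcpy(&v, le, 4);      /* little-endian host assumed */
        return v;
}

int main()
{
        /* -1 is stored by InnoDB as 0x7F 0xFF 0xFF 0xFF */
        const uint8_t stored[4] = {0x7F, 0xFF, 0xFF, 0xFF};

        printf("%d\n", (int) innobase_int4_to_mysql(stored));  /* -1 */
        return 0;
}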
02675 /**************************************************************/
02681 static
02682 #ifdef __GNUC__
02683  __attribute__((warn_unused_result))
02684 #endif
02685 ibool
02686 row_sel_store_mysql_rec(
02687 /*====================*/
02688   byte*   mysql_rec,  
02689   row_prebuilt_t* prebuilt, 
02690   const rec_t*  rec,    
02694   ibool   rec_clust,  
02697   const ulint*  offsets)  
02699 {
02700   mem_heap_t* extern_field_heap = NULL;
02701   mem_heap_t* heap;
02702   ulint   i;
02703 
02704   ut_ad(prebuilt->mysql_template);
02705   ut_ad(prebuilt->default_rec);
02706   ut_ad(rec_offs_validate(rec, NULL, offsets));
02707   ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets)));
02708 
02709   if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) {
02710     mem_heap_free(prebuilt->blob_heap);
02711     prebuilt->blob_heap = NULL;
02712   }
02713 
02714   for (i = 0; i < prebuilt->n_template ; i++) {
02715 
02716     const mysql_row_templ_t*templ = prebuilt->mysql_template + i;
02717     const byte*   data;
02718     ulint     len;
02719     ulint     field_no;
02720 
02721     field_no = rec_clust
02722       ? templ->clust_rec_field_no : templ->rec_field_no;
02723 
02724     if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets, field_no))) {
02725 
02726       /* Copy an externally stored field to the temporary
02727       heap */
02728 
02729       ut_a(!prebuilt->trx->has_search_latch);
02730 
02731       if (UNIV_UNLIKELY(templ->type == DATA_BLOB)) {
02732         if (prebuilt->blob_heap == NULL) {
02733           prebuilt->blob_heap = mem_heap_create(
02734             UNIV_PAGE_SIZE);
02735         }
02736 
02737         heap = prebuilt->blob_heap;
02738       } else {
02739         extern_field_heap
02740           = mem_heap_create(UNIV_PAGE_SIZE);
02741 
02742         heap = extern_field_heap;
02743       }
02744 
02745       /* NOTE: if we are retrieving a big BLOB, we may
02746       already run out of memory in the next call, which
02747       causes an assert */
02748 
02749       data = btr_rec_copy_externally_stored_field(
02750         rec, offsets,
02751         dict_table_zip_size(prebuilt->table),
02752         field_no, &len, heap);
02753 
02754       if (UNIV_UNLIKELY(!data)) {
02755         /* The externally stored field
02756         was not written yet. This
02757         record should only be seen by
02758         recv_recovery_rollback_active()
02759         or any TRX_ISO_READ_UNCOMMITTED
02760         transactions. */
02761 
02762         if (extern_field_heap) {
02763           mem_heap_free(extern_field_heap);
02764         }
02765 
02766         return(FALSE);
02767       }
02768 
02784       ut_a(len != UNIV_SQL_NULL);
02785     } else {
02786       /* Field is stored in the row. */
02787 
02788       data = rec_get_nth_field(rec, offsets, field_no, &len);
02789 
02790       if (UNIV_UNLIKELY(templ->type == DATA_BLOB)
02791           && len != UNIV_SQL_NULL) {
02792 
02793         /* It is a BLOB field locally stored in the
02794         InnoDB record: we MUST copy its contents to
02795         prebuilt->blob_heap here because later code
02796         assumes all BLOB values have been copied to a
02797         safe place. */
02798 
02799         if (prebuilt->blob_heap == NULL) {
02800           prebuilt->blob_heap = mem_heap_create(
02801             UNIV_PAGE_SIZE);
02802         }
02803 
02804                                 data = static_cast<byte *>(memcpy(mem_heap_alloc(
02805             prebuilt->blob_heap, len),
02806                                                                   data, len));
02807       }
02808     }
02809 
02810     if (len != UNIV_SQL_NULL) {
02811       row_sel_field_store_in_mysql_format(
02812         mysql_rec + templ->mysql_col_offset,
02813         templ, data, len);
02814 
02815       /* Cleanup */
02816       if (extern_field_heap) {
02817         mem_heap_free(extern_field_heap);
02818         extern_field_heap = NULL;
02819       }
02820 
02821       if (templ->mysql_null_bit_mask) {
02822         /* It is a nullable column with a non-NULL
02823         value */
02824         mysql_rec[templ->mysql_null_byte_offset]
02825           &= ~(byte) templ->mysql_null_bit_mask;
02826       }
02827     } else {
02828       /* MySQL assumes that the field for an SQL
02829       NULL value is set to the default value. */
02830 
02831       UNIV_MEM_ASSERT_RW(prebuilt->default_rec
02832              + templ->mysql_col_offset,
02833              templ->mysql_col_len);
02834       mysql_rec[templ->mysql_null_byte_offset]
02835         |= (byte) templ->mysql_null_bit_mask;
02836       memcpy(mysql_rec + templ->mysql_col_offset,
02837              (const byte*) prebuilt->default_rec
02838              + templ->mysql_col_offset,
02839              templ->mysql_col_len);
02840     }
02841   }
02842 
02843   return(TRUE);
02844 }
02845 
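row_sel_store_mysql_rec() above records SQL NULL by setting the column's bit in the MySQL NULL bitmap and clears that bit when a real value is stored. A minimal sketch of that bookkeeping, with hypothetical names:

#include <cstdint>

/* each nullable MySQL column has a (byte offset, bit mask) pair in the
row buffer's NULL bitmap; the bit is set for SQL NULL and cleared when
a real value is stored */
static void set_mysql_null_bit(uint8_t* mysql_rec,
                               unsigned null_byte_offset,
                               uint8_t  null_bit_mask,
                               bool     is_null)
{
        if (is_null) {
                mysql_rec[null_byte_offset] |= null_bit_mask;
        } else {
                mysql_rec[null_byte_offset] &= (uint8_t) ~null_bit_mask;
        }
}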
02846 /*********************************************************************/
02849 static
02850 ulint
02851 row_sel_build_prev_vers_for_mysql(
02852 /*==============================*/
02853   read_view_t*  read_view,  
02854   dict_index_t* clust_index,  
02855   row_prebuilt_t* prebuilt, 
02856   const rec_t*  rec,    
02857   ulint**   offsets,  
02859   mem_heap_t**  offset_heap,  
02861   rec_t**   old_vers, 
02865   mtr_t*    mtr)    
02866 {
02867   ulint err;
02868 
02869   if (prebuilt->old_vers_heap) {
02870     mem_heap_empty(prebuilt->old_vers_heap);
02871   } else {
02872     prebuilt->old_vers_heap = mem_heap_create(200);
02873   }
02874 
02875   err = row_vers_build_for_consistent_read(
02876     rec, mtr, clust_index, offsets, read_view, offset_heap,
02877     prebuilt->old_vers_heap, old_vers);
02878   return(err);
02879 }
02880 
02881 /*********************************************************************/
02886 static
02887 enum db_err
02888 row_sel_get_clust_rec_for_mysql(
02889 /*============================*/
02890   row_prebuilt_t* prebuilt,
02891   dict_index_t* sec_index,
02892   const rec_t*  rec,  
02896   que_thr_t*  thr,  
02897   const rec_t** out_rec,
02901   ulint**   offsets,
02905   mem_heap_t**  offset_heap,
02907   mtr_t*    mtr)  
02910 {
02911   dict_index_t* clust_index;
02912   const rec_t*  clust_rec;
02913   rec_t*    old_vers;
02914   enum db_err err;
02915   trx_t*    trx;
02916 
02917   *out_rec = NULL;
02918   trx = thr_get_trx(thr);
02919 
02920   row_build_row_ref_in_tuple(prebuilt->clust_ref, rec,
02921            sec_index, *offsets, trx);
02922 
02923   clust_index = dict_table_get_first_index(sec_index->table);
02924 
02925   btr_pcur_open_with_no_init(clust_index, prebuilt->clust_ref,
02926            PAGE_CUR_LE, BTR_SEARCH_LEAF,
02927            prebuilt->clust_pcur, 0, mtr);
02928 
02929   clust_rec = btr_pcur_get_rec(prebuilt->clust_pcur);
02930 
02931   prebuilt->clust_pcur->trx_if_known = trx;
02932 
02933   /* Note: only if the search ends up on a non-infimum record is the
02934   low_match value the real match to the search tuple */
02935 
02936   if (!page_rec_is_user_rec(clust_rec)
02937       || btr_pcur_get_low_match(prebuilt->clust_pcur)
02938       < dict_index_get_n_unique(clust_index)) {
02939 
02940     /* In a rare case it is possible that no clust rec is found
02941     for a delete-marked secondary index record: if in row0umod.c
02942     in row_undo_mod_remove_clust_low() we have already removed
02943     the clust rec, while purge is still cleaning and removing
02944     secondary index records associated with earlier versions of
02945     the clustered index record. In that case we know that the
02946     clustered index record did not exist in the read view of
02947     trx. */
02948 
02949     if (!rec_get_deleted_flag(rec,
02950             dict_table_is_comp(sec_index->table))
02951         || prebuilt->select_lock_type != LOCK_NONE) {
02952       ut_print_timestamp(stderr);
02953       fputs("  InnoDB: error clustered record"
02954             " for sec rec not found\n"
02955             "InnoDB: ", stderr);
02956       dict_index_name_print(stderr, trx, sec_index);
02957       fputs("\n"
02958             "InnoDB: sec index record ", stderr);
02959       rec_print(stderr, rec, sec_index);
02960       fputs("\n"
02961             "InnoDB: clust index record ", stderr);
02962       rec_print(stderr, clust_rec, clust_index);
02963       putc('\n', stderr);
02964       trx_print(stderr, trx, 600);
02965 
02966       fputs("\n"
02967             "InnoDB: Submit a detailed bug report"
02968             " to http://bugs.mysql.com\n", stderr);
02969     }
02970 
02971     clust_rec = NULL;
02972 
02973     err = DB_SUCCESS;
02974     goto func_exit;
02975   }
02976 
02977   *offsets = rec_get_offsets(clust_rec, clust_index, *offsets,
02978            ULINT_UNDEFINED, offset_heap);
02979 
02980   if (prebuilt->select_lock_type != LOCK_NONE) {
02981     /* Try to place a lock on the index record; we are searching
02982     the clust rec with a unique condition, hence
02983     we set a LOCK_REC_NOT_GAP type lock */
02984 
02985     err = lock_clust_rec_read_check_and_lock(
02986       0, btr_pcur_get_block(prebuilt->clust_pcur),
02987       clust_rec, clust_index, *offsets,
02988                         static_cast<lock_mode>(prebuilt->select_lock_type),
02989                         LOCK_REC_NOT_GAP, thr);
02990     switch (err) {
02991     case DB_SUCCESS:
02992     case DB_SUCCESS_LOCKED_REC:
02993       break;
02994     default:
02995       goto err_exit;
02996     }
02997   } else {
02998     /* This is a non-locking consistent read: if necessary, fetch
02999     a previous version of the record */
03000 
03001     old_vers = NULL;
03002 
03003     /* If the isolation level allows reading of uncommitted data,
03004     then we never look for an earlier version */
03005 
03006     if (trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
03007         && !lock_clust_rec_cons_read_sees(
03008           clust_rec, clust_index, *offsets,
03009           trx->read_view)) {
03010 
03011       /* The following call returns 'offsets' associated with
03012       'old_vers' */
03013                   err = static_cast<db_err>(row_sel_build_prev_vers_for_mysql(
03014         trx->read_view, clust_index, prebuilt,
03015         clust_rec, offsets, offset_heap, &old_vers,
03016                                 mtr));
03017 
03018       if (err != DB_SUCCESS || old_vers == NULL) {
03019 
03020         goto err_exit;
03021       }
03022 
03023       clust_rec = old_vers;
03024     }
03025 
03026     /* If we had to go to an earlier version of row or the
03027     secondary index record is delete marked, then it may be that
03028     the secondary index record corresponding to clust_rec
03029     (or old_vers) is not rec; in that case we must ignore
03030     such row because in our snapshot rec would not have existed.
03031     Remember that from rec we cannot see directly which transaction
03032     id corresponds to it: we have to go to the clustered index
03033     record. A query that fetches all rows where the secondary
03034     index value is in some interval would return a wrong
03035     result if we did not drop rows that we reach through
03036     secondary index records which do not really exist in
03037     our snapshot. */
03038 
03039     if (clust_rec
03040         && (old_vers
03041       || trx->isolation_level <= TRX_ISO_READ_UNCOMMITTED
03042       || rec_get_deleted_flag(rec, dict_table_is_comp(
03043               sec_index->table)))
03044         && !row_sel_sec_rec_is_for_clust_rec(
03045           rec, sec_index, clust_rec, clust_index)) {
03046       clust_rec = NULL;
03047 #ifdef UNIV_SEARCH_DEBUG
03048     } else {
03049       ut_a(clust_rec == NULL
03050            || row_sel_sec_rec_is_for_clust_rec(
03051              rec, sec_index, clust_rec, clust_index));
03052 #endif
03053     }
03054 
03055     err = DB_SUCCESS;
03056   }
03057 
03058 func_exit:
03059   *out_rec = clust_rec;
03060 
03061   if (prebuilt->select_lock_type != LOCK_NONE) {
03062     /* We may use the cursor in update or in unlock_row():
03063     store its position */
03064 
03065     btr_pcur_store_position(prebuilt->clust_pcur, mtr);
03066   }
03067 
03068 err_exit:
03069   return(err);
03070 }
03071 
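The visibility test near the end of the consistent-read branch above can be read as a single predicate: the clustered record reached through a secondary index record is discarded when an older version was used, uncommitted data may be visible, or the secondary record is delete-marked, and the secondary record no longer matches that clustered record. A sketch with hypothetical parameter names:

/* discard the row when an older version was used, uncommitted data may
be visible, or the secondary record is delete-marked -- and the
secondary record no longer matches the clustered record we fetched */
static bool must_ignore_sec_rec(bool used_old_version,
                                bool read_uncommitted,
                                bool sec_rec_delete_marked,
                                bool sec_matches_clust)
{
        return (used_old_version
                || read_uncommitted
                || sec_rec_delete_marked)
                && !sec_matches_clust;
}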
03072 /********************************************************************/
03078 static
03079 ibool
03080 sel_restore_position_for_mysql(
03081 /*===========================*/
03082   ibool*    same_user_rec,  
03086   ulint   latch_mode, 
03088   btr_pcur_t* pcur,   
03090   ibool   moves_up, 
03092   mtr_t*    mtr)    
03094 {
03095   ibool success;
03096   ulint relative_position;
03097 
03098   relative_position = pcur->rel_pos;
03099 
03100   success = btr_pcur_restore_position(latch_mode, pcur, mtr);
03101 
03102   *same_user_rec = success;
03103 
03104   if (relative_position == BTR_PCUR_ON) {
03105     if (success) {
03106       return(FALSE);
03107     }
03108 
03109     if (moves_up) {
03110       btr_pcur_move_to_next(pcur, mtr);
03111     }
03112 
03113     return(TRUE);
03114   }
03115 
03116   if (relative_position == BTR_PCUR_AFTER
03117       || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE) {
03118 
03119     if (moves_up) {
03120       return(TRUE);
03121     }
03122 
03123     if (btr_pcur_is_on_user_rec(pcur)) {
03124       btr_pcur_move_to_prev(pcur, mtr);
03125     }
03126 
03127     return(TRUE);
03128   }
03129 
03130   ut_ad(relative_position == BTR_PCUR_BEFORE
03131         || relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE);
03132 
03133   if (moves_up && btr_pcur_is_on_user_rec(pcur)) {
03134     btr_pcur_move_to_next(pcur, mtr);
03135   }
03136 
03137   return(TRUE);
03138 }
03139 
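sel_restore_position_for_mysql() above tells the caller whether the record under the restored cursor still needs to be processed: only when the stored position was ON a record and the restore landed on that same record can it be skipped, because that record was already returned. A sketch of just that return-value decision; rel_pos_t and need_to_process() are hypothetical names:

enum rel_pos_t { REL_ON, REL_BEFORE, REL_AFTER };

static bool need_to_process(rel_pos_t rel_pos, bool restored_to_same_rec)
{
        if (rel_pos == REL_ON) {
                /* only when we are back on the very record that was
                already returned can the caller skip it */
                return !restored_to_same_rec;
        }

        /* for BEFORE/AFTER positions the record now under the (possibly
        moved) cursor has not been returned yet and must be examined */
        return true;
}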
03140 /********************************************************************/
03142 UNIV_INLINE
03143 void
03144 row_sel_pop_cached_row_for_mysql(
03145 /*=============================*/
03146   byte*   buf,    
03148   row_prebuilt_t* prebuilt) 
03149 {
03150   ulint     i;
03151   const mysql_row_templ_t*templ;
03152   byte*     cached_rec;
03153   ut_ad(prebuilt->n_fetch_cached > 0);
03154   ut_ad(prebuilt->mysql_prefix_len <= prebuilt->mysql_row_len);
03155 
03156   if (UNIV_UNLIKELY(prebuilt->keep_other_fields_on_keyread)) {
03157     /* Copy cache record field by field, don't touch fields that
03158     are not covered by current key */
03159     cached_rec = prebuilt->fetch_cache[
03160       prebuilt->fetch_cache_first];
03161 
03162     for (i = 0; i < prebuilt->n_template; i++) {
03163       templ = prebuilt->mysql_template + i;
03164 #if 0 /* Some of the cached_rec may legitimately be uninitialized. */
03165       UNIV_MEM_ASSERT_RW(cached_rec
03166              + templ->mysql_col_offset,
03167              templ->mysql_col_len);
03168 #endif
03169       ut_memcpy(buf + templ->mysql_col_offset,
03170           cached_rec + templ->mysql_col_offset,
03171           templ->mysql_col_len);
03172       /* Copy NULL bit of the current field from cached_rec
03173       to buf */
03174       if (templ->mysql_null_bit_mask) {
03175         buf[templ->mysql_null_byte_offset]
03176           ^= (buf[templ->mysql_null_byte_offset]
03177               ^ cached_rec[templ->mysql_null_byte_offset])
03178           & (byte)templ->mysql_null_bit_mask;
03179       }
03180     }
03181   }
03182   else {
03183 #if 0 /* Some of the cached_rec may legitimately be uninitialized. */
03184     UNIV_MEM_ASSERT_RW(prebuilt->fetch_cache
03185            [prebuilt->fetch_cache_first],
03186            prebuilt->mysql_prefix_len);
03187 #endif
03188     ut_memcpy(buf,
03189         prebuilt->fetch_cache[prebuilt->fetch_cache_first],
03190         prebuilt->mysql_prefix_len);
03191   }
03192   prebuilt->n_fetch_cached--;
03193   prebuilt->fetch_cache_first++;
03194 
03195   if (prebuilt->n_fetch_cached == 0) {
03196     prebuilt->fetch_cache_first = 0;
03197   }
03198 }
03199 
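The keep_other_fields_on_keyread branch above copies a single column's NULL bit from the cached record into buf with the merge idiom dst ^= (dst ^ src) & mask, which transfers exactly the bits selected by mask and leaves the rest of the byte untouched. A standalone sketch:

#include <cstdint>
#include <cstdio>

/* copy exactly the bits selected by mask from src into dst */
static uint8_t merge_bits(uint8_t dst, uint8_t src, uint8_t mask)
{
        return (uint8_t)(dst ^ ((dst ^ src) & mask));
}

int main()
{
        /* copy only bit 0x04 of src into dst: 0xF0 becomes 0xF4 */
        printf("0x%02x\n", (unsigned) merge_bits(0xF0, 0x0F, 0x04));
        return 0;
}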
03200 /********************************************************************/
03203 UNIV_INLINE
03204 #ifdef __GNUC__
03205 __attribute__((warn_unused_result))
03206 #endif
03207 ibool
03208 row_sel_push_cache_row_for_mysql(
03209 /*=============================*/
03210   row_prebuilt_t* prebuilt, 
03211   const rec_t*  rec,    
03215   ibool   rec_clust,  
03218   const ulint*  offsets)  
03219 {
03220   byte* buf;
03221   ulint i;
03222 
03223   ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
03224   ut_ad(rec_offs_validate(rec, NULL, offsets));
03225   ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets)));
03226   ut_a(!prebuilt->templ_contains_blob);
03227 
03228   if (prebuilt->fetch_cache[0] == NULL) {
03229     /* Allocate memory for the fetch cache */
03230 
03231     for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) {
03232 
03233       /* A user has reported memory corruption in these
03234       buffers in Linux. Put magic numbers there to help
03235       track a possible bug. */
03236 
03237                         buf = static_cast<byte *>(mem_alloc(prebuilt->mysql_row_len + 8));
03238 
03239       prebuilt->fetch_cache[i] = buf + 4;
03240 
03241       mach_write_to_4(buf, ROW_PREBUILT_FETCH_MAGIC_N);
03242       mach_write_to_4(buf + 4 + prebuilt->mysql_row_len,
03243           ROW_PREBUILT_FETCH_MAGIC_N);
03244     }
03245   }
03246 
03247   ut_ad(prebuilt->fetch_cache_first == 0);
03248   UNIV_MEM_INVALID(prebuilt->fetch_cache[prebuilt->n_fetch_cached],
03249        prebuilt->mysql_row_len);
03250 
03251   if (UNIV_UNLIKELY(!row_sel_store_mysql_rec(
03252           prebuilt->fetch_cache[
03253             prebuilt->n_fetch_cached],
03254           prebuilt, rec, rec_clust, offsets))) {
03255     return(FALSE);
03256   }
03257 
03258   prebuilt->n_fetch_cached++;
03259   return(TRUE);
03260 }
03261 
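The fetch-cache allocation above brackets each row buffer with 4-byte magic values so that later checks can detect overruns of the buffer. A standalone sketch of the same layout, using malloc() in place of mem_alloc(); FETCH_MAGIC and the helper names are hypothetical and do not reproduce the real ROW_PREBUILT_FETCH_MAGIC_N value:

#include <cstdint>
#include <cstdlib>
#include <cstring>

#define FETCH_MAGIC 0xDEADBEEFU /* any fixed pattern; not the real value */

static uint8_t* alloc_guarded_row(size_t row_len)
{
        /* error handling omitted: malloc() may return NULL */
        uint8_t* block = (uint8_t*) malloc(row_len + 8);
        uint32_t magic = FETCH_MAGIC;

        memcpy(block, &magic, 4);               /* guard before the row */
        memcpy(block + 4 + row_len, &magic, 4); /* guard after the row */

        return block + 4;                       /* usable row buffer */
}

static bool guards_intact(const uint8_t* row, size_t row_len)
{
        uint32_t before, after;

        memcpy(&before, row - 4, 4);
        memcpy(&after,  row + row_len, 4);

        return before == FETCH_MAGIC && after == FETCH_MAGIC;
}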
03262 /*********************************************************************/
03268 static
03269 ulint
03270 row_sel_try_search_shortcut_for_mysql(
03271 /*==================================*/
03272   const rec_t** out_rec,
03273   row_prebuilt_t* prebuilt,
03274   ulint**   offsets,
03275   mem_heap_t**  heap, 
03276   mtr_t*    mtr)  
03277 {
03278   dict_index_t* index   = prebuilt->index;
03279   const dtuple_t* search_tuple  = prebuilt->search_tuple;
03280   btr_pcur_t* pcur    = prebuilt->pcur;
03281   trx_t*    trx   = prebuilt->trx;
03282   const rec_t*  rec;
03283 
03284   ut_ad(dict_index_is_clust(index));
03285   ut_ad(!prebuilt->templ_contains_blob);
03286 
03287 #ifndef UNIV_SEARCH_DEBUG
03288   btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE,
03289            BTR_SEARCH_LEAF, pcur,
03290            RW_S_LATCH,
03291            mtr);
03292 #else /* UNIV_SEARCH_DEBUG */
03293   btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE,
03294            BTR_SEARCH_LEAF, pcur,
03295            0,
03296            mtr);
03297 #endif /* UNIV_SEARCH_DEBUG */
03298   rec = btr_pcur_get_rec(pcur);
03299 
03300   if (!page_rec_is_user_rec(rec)) {
03301 
03302     return(SEL_RETRY);
03303   }
03304 
03305   /* As the cursor is now placed on a user record after a search with
03306   the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
03307   fields in the user record matched to the search tuple */
03308 
03309   if (btr_pcur_get_up_match(pcur) < dtuple_get_n_fields(search_tuple)) {
03310 
03311     return(SEL_EXHAUSTED);
03312   }
03313 
03314   /* This is a non-locking consistent read: if necessary, fetch
03315   a previous version of the record */
03316 
03317   *offsets = rec_get_offsets(rec, index, *offsets,
03318            ULINT_UNDEFINED, heap);
03319 
03320   if (!lock_clust_rec_cons_read_sees(rec, index,
03321              *offsets, trx->read_view)) {
03322 
03323     return(SEL_RETRY);
03324   }
03325 
03326   if (rec_get_deleted_flag(rec, dict_table_is_comp(index->table))) {
03327 
03328     return(SEL_EXHAUSTED);
03329   }
03330 
03331   *out_rec = rec;
03332 
03333   return(SEL_FOUND);
03334 }
03335 
03336 /********************************************************************/
03344 UNIV_INTERN
03345 ulint
03346 row_search_for_mysql(
03347 /*=================*/
03348   byte*   buf,    
03350   ulint   mode,   
03351   row_prebuilt_t* prebuilt, 
03358   ulint   match_mode, 
03360   ulint   direction)  
03365 {
03366   dict_index_t* index   = prebuilt->index;
03367   ibool   comp    = dict_table_is_comp(index->table);
03368   const dtuple_t* search_tuple  = prebuilt->search_tuple;
03369   btr_pcur_t* pcur    = prebuilt->pcur;
03370   trx_t*    trx   = prebuilt->trx;
03371   dict_index_t* clust_index;
03372   que_thr_t*  thr;
03373   const rec_t*  rec;
03374   const rec_t*  result_rec;
03375   const rec_t*  clust_rec;
03376   ulint   err       = DB_SUCCESS;
03377   ibool   unique_search     = FALSE;
03378   ibool   unique_search_from_clust_index  = FALSE;
03379   ibool   mtr_has_extra_clust_latch = FALSE;
03380   ibool   moves_up      = FALSE;
03381   ibool   set_also_gap_locks    = TRUE;
03382   /* if the query is a plain locking SELECT, and the isolation level
03383   is <= TRX_ISO_READ_COMMITTED, then this is set to FALSE */
03384   ibool   did_semi_consistent_read  = FALSE;
03385   /* if the returned record was locked and we did a semi-consistent
03386   read (fetch the newest committed version), then this is set to
03387   TRUE */
03388 #ifdef UNIV_SEARCH_DEBUG
03389   ulint   cnt       = 0;
03390 #endif /* UNIV_SEARCH_DEBUG */
03391   ulint   next_offs;
03392   ibool   same_user_rec;
03393   mtr_t   mtr;
03394   mem_heap_t* heap        = NULL;
03395   ulint   offsets_[REC_OFFS_NORMAL_SIZE];
03396   ulint*    offsets       = offsets_;
03397   ibool   table_lock_waited   = FALSE;
03398 
03399   rec_offs_init(offsets_);
03400 
03401   ut_ad(index && pcur && search_tuple);
03402   ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
03403 
03404   if (UNIV_UNLIKELY(prebuilt->table->ibd_file_missing)) {
03405     ut_print_timestamp(stderr);
03406     fprintf(stderr, "  InnoDB: Error:\n"
03407       "InnoDB: MySQL is trying to use a table handle"
03408       " but the .ibd file for\n"
03409       "InnoDB: table %s does not exist.\n"
03410       "InnoDB: Have you deleted the .ibd file"
03411       " from the database directory under\n"
03412       "InnoDB: the MySQL datadir, or have you used"
03413       " DISCARD TABLESPACE?\n"
03414       "InnoDB: Look from\n"
03415       "InnoDB: " REFMAN "innodb-troubleshooting.html\n"
03416       "InnoDB: how you can resolve the problem.\n",
03417       prebuilt->table->name);
03418 
03419     return(DB_ERROR);
03420   }
03421 
03422   if (UNIV_UNLIKELY(!prebuilt->index_usable)) {
03423 
03424     return(DB_MISSING_HISTORY);
03425   }
03426 
03427   if (UNIV_UNLIKELY(prebuilt->magic_n != ROW_PREBUILT_ALLOCATED)) {
03428     fprintf(stderr,
03429       "InnoDB: Error: trying to free a corrupt\n"
03430       "InnoDB: table handle. Magic n %lu, table name ",
03431       (ulong) prebuilt->magic_n);
03432     ut_print_name(stderr, trx, TRUE, prebuilt->table->name);
03433     putc('\n', stderr);
03434 
03435     mem_analyze_corruption(prebuilt);
03436 
03437     ut_error;
03438   }
03439 
03440 #if 0
03441   fprintf(stderr, "Match mode %lu\n search tuple ",
03442     (ulong) match_mode);
03443   dtuple_print(search_tuple);
03444   fprintf(stderr, "N tables locked %lu\n",
03445     (ulong) trx->mysql_n_tables_locked);
03446 #endif
03447   /*-------------------------------------------------------------*/
03448   /* PHASE 0: Release a possible s-latch we are holding on the
03449   adaptive hash index latch if there is someone waiting behind */
03450 
03451   if (UNIV_UNLIKELY(rw_lock_get_writer(&btr_search_latch) != RW_LOCK_NOT_LOCKED)
03452       && trx->has_search_latch) {
03453 
03454     /* There is an x-latch request on the adaptive hash index:
03455     release the s-latch to reduce starvation and wait for
03456     BTR_SEA_TIMEOUT rounds before trying to keep it again over
03457     calls from MySQL */
03458 
03459     rw_lock_s_unlock(&btr_search_latch);
03460     trx->has_search_latch = FALSE;
03461 
03462     trx->search_latch_timeout = BTR_SEA_TIMEOUT;
03463   }
03464 
03465   /* Reset the new record lock info if srv_locks_unsafe_for_binlog
03466   is set or the session is using a READ COMMITTED isolation level. Then
03467   we are able to remove the record locks set here on an individual
03468   row. */
03469   prebuilt->new_rec_locks = 0;
03470 
03471   /*-------------------------------------------------------------*/
03472   /* PHASE 1: Try to pop the row from the prefetch cache */
03473 
03474   if (UNIV_UNLIKELY(direction == 0)) {
03475     trx->op_info = "starting index read";
03476 
03477     prebuilt->n_rows_fetched = 0;
03478     prebuilt->n_fetch_cached = 0;
03479     prebuilt->fetch_cache_first = 0;
03480 
03481     if (prebuilt->sel_graph == NULL) {
03482       /* Build a dummy select query graph */
03483       row_prebuild_sel_graph(prebuilt);
03484     }
03485   } else {
03486     trx->op_info = "fetching rows";
03487 
03488     if (prebuilt->n_rows_fetched == 0) {
03489       prebuilt->fetch_direction = direction;
03490     }
03491 
03492     if (UNIV_UNLIKELY(direction != prebuilt->fetch_direction)) {
03493       if (UNIV_UNLIKELY(prebuilt->n_fetch_cached > 0)) {
03494         ut_error;
03495         /* TODO: scrollable cursor: restore cursor to
03496         the place of the latest returned row,
03497         or better: prevent caching for a scroll
03498         cursor! */
03499       }
03500 
03501       prebuilt->n_rows_fetched = 0;
03502       prebuilt->n_fetch_cached = 0;
03503       prebuilt->fetch_cache_first = 0;
03504 
03505     } else if (UNIV_LIKELY(prebuilt->n_fetch_cached > 0)) {
03506       row_sel_pop_cached_row_for_mysql(buf, prebuilt);
03507 
03508       prebuilt->n_rows_fetched++;
03509 
03510       srv_n_rows_read++;
03511       err = DB_SUCCESS;
03512       goto func_exit;
03513     }
03514 
03515     if (prebuilt->fetch_cache_first > 0
03516         && prebuilt->fetch_cache_first < MYSQL_FETCH_CACHE_SIZE) {
03517 
03518       /* The previously returned row was popped from the fetch
03519       cache, but the cache was not full at the time of the
03520       popping: no more rows can exist in the result set */
03521 
03522       err = DB_RECORD_NOT_FOUND;
03523       goto func_exit;
03524     }
03525 
03526     prebuilt->n_rows_fetched++;
03527 
03528     if (prebuilt->n_rows_fetched > 1000000000) {
03529       /* Prevent wrap-over */
03530       prebuilt->n_rows_fetched = 500000000;
03531     }
03532 
03533     mode = pcur->search_mode;
03534   }
03535 
03536   /* In a search where at most one record in the index may match, we
03537   can use a LOCK_REC_NOT_GAP type record lock when locking a
03538   non-delete-marked matching record.
03539 
03540   Note that in a unique secondary index there may be different
03541   delete-marked versions of a record where only the primary key
03542   values differ: thus in a secondary index we must use next-key
03543   locks when locking delete-marked records. */
03544 
03545   if (match_mode == ROW_SEL_EXACT
03546       && dict_index_is_unique(index)
03547       && dtuple_get_n_fields(search_tuple)
03548       == dict_index_get_n_unique(index)
03549       && (dict_index_is_clust(index)
03550     || !dtuple_contains_null(search_tuple))) {
03551 
03552     /* Note above that a UNIQUE secondary index can contain many
03553     rows with the same key value if one of the columns is SQL
03554     NULL. A clustered index under MySQL can never contain null
03555     columns because we demand that all the columns in the primary
03556     key are non-null. */
03557 
03558     unique_search = TRUE;
03559 
03560     /* Even if the condition is unique, MySQL seems to also try
03561     to retrieve a second row if the primary key contains more
03562     than one column. */
03563 
03564     if (UNIV_UNLIKELY(direction != 0)) {
03565 
03566       err = DB_RECORD_NOT_FOUND;
03567       goto func_exit;
03568     }
03569   }
03570 
03571   mtr_start(&mtr);
03572 
03573   /*-------------------------------------------------------------*/
03574   /* PHASE 2: Try fast adaptive hash index search if possible */
03575 
03576   /* Next test if this is the special case where we can use the fast
03577   adaptive hash index to try the search. Since we must release the
03578   search system latch when we retrieve an externally stored field, we
03579   cannot use the adaptive hash index for the search when the row may
03580   be long and contain externally stored fields */
03581 
03582   if (UNIV_UNLIKELY(direction == 0)
03583       && unique_search
03584       && dict_index_is_clust(index)
03585       && !prebuilt->templ_contains_blob
03586       && (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8)) {
03587 
03588     mode = PAGE_CUR_GE;
03589 
03590     unique_search_from_clust_index = TRUE;
03591 
03592     if (trx->mysql_n_tables_locked == 0
03593         && prebuilt->select_lock_type == LOCK_NONE
03594         && trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
03595         && trx->read_view) {
03596 
03597       /* This is a SELECT query done as a consistent read,
03598       and the read view has already been allocated:
03599       let us try a search shortcut through the hash
03600       index.
03601       NOTE that we must also test that
03602       mysql_n_tables_locked == 0, because this might
03603       also be INSERT INTO ... SELECT ... or
03604       CREATE TABLE ... SELECT ... . Our algorithm is
03605       NOT prepared for inserts interleaved with the SELECT,
03606       and if we try that, we can deadlock on the adaptive
03607       hash index semaphore! */
03608 
03609 #ifndef UNIV_SEARCH_DEBUG
03610       if (!trx->has_search_latch) {
03611         rw_lock_s_lock(&btr_search_latch);
03612         trx->has_search_latch = TRUE;
03613       }
03614 #endif
03615       switch (row_sel_try_search_shortcut_for_mysql(
03616           &rec, prebuilt, &offsets, &heap,
03617           &mtr)) {
03618       case SEL_FOUND:
03619 #ifdef UNIV_SEARCH_DEBUG
03620         ut_a(0 == cmp_dtuple_rec(search_tuple,
03621                rec, offsets));
03622 #endif
03623         /* At this point, rec is protected by
03624         a page latch that was acquired by
03625         row_sel_try_search_shortcut_for_mysql().
03626         The latch will not be released until
03627         mtr_commit(&mtr). */
03628         ut_ad(!rec_get_deleted_flag(rec, comp));
03629 
03630         if (!row_sel_store_mysql_rec(buf, prebuilt,
03631                    rec, FALSE,
03632                    offsets)) {
03633           /* Only fresh inserts may contain
03634           incomplete externally stored
03635           columns. Pretend that such
03636           records do not exist. Such
03637           records may only be accessed
03638           at the READ UNCOMMITTED
03639           isolation level or when
03640           rolling back a recovered
03641           transaction. Rollback happens
03642           at a lower level, not here. */
03643           ut_a(trx->isolation_level
03644                == TRX_ISO_READ_UNCOMMITTED);
03645 
03646           /* Proceed as in case SEL_RETRY. */
03647           break;
03648         }
03649 
03650         mtr_commit(&mtr);
03651 
03652         /* ut_print_name(stderr, index->name);
03653         fputs(" shortcut\n", stderr); */
03654 
03655         srv_n_rows_read++;
03656 
03657         err = DB_SUCCESS;
03658         goto release_search_latch_if_needed;
03659 
03660       case SEL_EXHAUSTED:
03661         mtr_commit(&mtr);
03662 
03663         /* ut_print_name(stderr, index->name);
03664         fputs(" record not found 2\n", stderr); */
03665 
03666         err = DB_RECORD_NOT_FOUND;
03667 release_search_latch_if_needed:
03668         if (trx->search_latch_timeout > 0
03669             && trx->has_search_latch) {
03670 
03671           trx->search_latch_timeout--;
03672 
03673           rw_lock_s_unlock(&btr_search_latch);
03674           trx->has_search_latch = FALSE;
03675         }
03676 
03677         /* NOTE that we do NOT store the cursor
03678         position */
03679         goto func_exit;
03680 
03681       case SEL_RETRY:
03682         break;
03683 
03684       default:
03685         ut_ad(0);
03686       }
03687 
03688       mtr_commit(&mtr);
03689       mtr_start(&mtr);
03690     }
03691   }
03692 
03693   /*-------------------------------------------------------------*/
03694   /* PHASE 3: Open or restore index cursor position */
03695 
03696   if (trx->has_search_latch) {
03697     rw_lock_s_unlock(&btr_search_latch);
03698     trx->has_search_latch = FALSE;
03699   }
03700 
03701   ut_ad(prebuilt->sql_stat_start || trx->conc_state == TRX_ACTIVE);
03702   ut_ad(trx->conc_state == TRX_NOT_STARTED
03703         || trx->conc_state == TRX_ACTIVE);
03704   ut_ad(prebuilt->sql_stat_start
03705         || prebuilt->select_lock_type != LOCK_NONE
03706         || trx->read_view);
03707 
03715   trx_start_if_not_started(trx);
03716 
03717   if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
03718       && prebuilt->select_lock_type != LOCK_NONE
03719       && trx->mysql_thd != NULL
03720       && thd_is_select(trx->mysql_thd)) {
03721     /* It is a plain locking SELECT and the isolation
03722     level is low: do not lock gaps */
03723 
03724     set_also_gap_locks = FALSE;
03725   }
03726 
03727   /* Note that if the search mode was GE or G, then the cursor
03728   naturally moves upward (in fetch next) in alphabetical order,
03729   otherwise downward */
03730 
03731   if (UNIV_UNLIKELY(direction == 0)) {
03732     if (mode == PAGE_CUR_GE || mode == PAGE_CUR_G) {
03733       moves_up = TRUE;
03734     }
03735   } else if (direction == ROW_SEL_NEXT) {
03736     moves_up = TRUE;
03737   }
03738 
03739   thr = que_fork_get_first_thr(prebuilt->sel_graph);
03740 
03741   que_thr_move_to_run_state_for_mysql(thr, trx);
03742 
03743   clust_index = dict_table_get_first_index(index->table);
03744 
03745   /* Do some start-of-statement preparations */
03746 
03747   if (!prebuilt->sql_stat_start) {
03748     /* No need to set an intention lock or assign a read view */
03749 
03750     if (trx->read_view == NULL
03751         && prebuilt->select_lock_type == LOCK_NONE) {
03752 
03753       fputs("InnoDB: Error: MySQL is trying to"
03754             " perform a consistent read\n"
03755             "InnoDB: but the read view is not assigned!\n",
03756             stderr);
03757       trx_print(stderr, trx, 600);
03758       fputc('\n', stderr);
03759       ut_error;
03760     }
03761   } else if (prebuilt->select_lock_type == LOCK_NONE) {
03762     /* This is a consistent read */
03763     /* Assign a read view for the query */
03764 
03765     trx_assign_read_view(trx);
03766     prebuilt->sql_stat_start = FALSE;
03767   } else {
03768 wait_table_again:
03769     err = lock_table(0, index->table,
03770          prebuilt->select_lock_type == LOCK_S
03771          ? LOCK_IS : LOCK_IX, thr);
03772 
03773     if (err != DB_SUCCESS) {
03774 
03775       table_lock_waited = TRUE;
03776       goto lock_table_wait;
03777     }
03778     prebuilt->sql_stat_start = FALSE;
03779   }
03780 
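/* Illustrative sketch (not part of row0sel.cc; types are stand-ins) of
the start-of-statement decision above: a non-locking SELECT is given a
read view, while a locking SELECT first takes a table intention lock
whose mode follows the requested row lock mode -- shared row locks need
only LOCK_IS on the table, exclusive row locks need LOCK_IX. */

enum row_lock_mode { ROW_LOCK_NONE, ROW_LOCK_S, ROW_LOCK_X };
enum tab_lock_mode { TAB_LOCK_NONE, TAB_LOCK_IS, TAB_LOCK_IX };

static enum tab_lock_mode
intention_lock_for(enum row_lock_mode row_mode)
{
  switch (row_mode) {
  case ROW_LOCK_NONE:
    return TAB_LOCK_NONE; /* consistent read: assign a read view instead */
  case ROW_LOCK_S:
    return TAB_LOCK_IS;
  default:
    return TAB_LOCK_IX;
  }
}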
03781   /* Open or restore index cursor position */
03782 
03783   if (UNIV_LIKELY(direction != 0)) {
03784     ibool need_to_process = sel_restore_position_for_mysql(
03785       &same_user_rec, BTR_SEARCH_LEAF,
03786       pcur, moves_up, &mtr);
03787 
03788     if (UNIV_UNLIKELY(need_to_process)) {
03789       if (UNIV_UNLIKELY(prebuilt->row_read_type
03790             == ROW_READ_DID_SEMI_CONSISTENT)) {
03791         /* We did a semi-consistent read,
03792         but the record was removed in
03793         the meantime. */
03794         prebuilt->row_read_type
03795           = ROW_READ_TRY_SEMI_CONSISTENT;
03796       }
03797     } else if (UNIV_LIKELY(prebuilt->row_read_type
03798                != ROW_READ_DID_SEMI_CONSISTENT)) {
03799 
03800       /* The cursor was positioned on the record
03801       that we returned previously.  If we need
03802       to repeat a semi-consistent read as a
03803       pessimistic locking read, the record
03804       cannot be skipped. */
03805 
03806       goto next_rec;
03807     }
03808 
03809   } else if (dtuple_get_n_fields(search_tuple) > 0) {
03810 
03811     btr_pcur_open_with_no_init(index, search_tuple, mode,
03812              BTR_SEARCH_LEAF,
03813              pcur, 0, &mtr);
03814 
03815     pcur->trx_if_known = trx;
03816 
03817     rec = btr_pcur_get_rec(pcur);
03818 
03819     if (!moves_up
03820         && !page_rec_is_supremum(rec)
03821         && set_also_gap_locks
03822         && !(srv_locks_unsafe_for_binlog
03823        || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
03824         && prebuilt->select_lock_type != LOCK_NONE) {
03825 
03826       /* Try to place a gap lock on the next index record
03827       to prevent phantoms in ORDER BY ... DESC queries */
03828       const rec_t*  next = page_rec_get_next_const(rec);
03829 
03830       offsets = rec_get_offsets(next, index, offsets,
03831               ULINT_UNDEFINED, &heap);
03832       err = sel_set_rec_lock(btr_pcur_get_block(pcur),
03833                  next, index, offsets,
03834                  prebuilt->select_lock_type,
03835                  LOCK_GAP, thr);
03836 
03837       switch (err) {
03838       case DB_SUCCESS_LOCKED_REC:
03839         err = DB_SUCCESS;
03840       case DB_SUCCESS:
03841         break;
03842       default:
03843         goto lock_wait_or_error;
03844       }
03845     }
03846   } else {
03847     if (mode == PAGE_CUR_G) {
03848       btr_pcur_open_at_index_side(
03849         TRUE, index, BTR_SEARCH_LEAF, pcur, FALSE,
03850         &mtr);
03851     } else if (mode == PAGE_CUR_L) {
03852       btr_pcur_open_at_index_side(
03853         FALSE, index, BTR_SEARCH_LEAF, pcur, FALSE,
03854         &mtr);
03855     }
03856   }
03857 
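/* Illustrative sketch (plain C, hypothetical names) of the three ways
the cursor gets positioned above: a continued fetch restores the stored
position, a search tuple with key fields opens the cursor on that key,
and an empty tuple opens the cursor at the low or high end of the index
depending on the scan direction implied by the search mode. */

enum cursor_open_path {
  OPEN_RESTORE_POSITION, /* sel_restore_position_for_mysql()         */
  OPEN_ON_SEARCH_TUPLE,  /* btr_pcur_open_with_no_init()             */
  OPEN_AT_LOW_END,       /* btr_pcur_open_at_index_side(TRUE, ...)   */
  OPEN_AT_HIGH_END       /* btr_pcur_open_at_index_side(FALSE, ...)  */
};

static enum cursor_open_path
choose_cursor_open_path(int direction, int n_search_fields, int ascending)
{
  if (direction != 0) {
    return OPEN_RESTORE_POSITION;          /* fetch next / fetch prev */
  }
  if (n_search_fields > 0) {
    return OPEN_ON_SEARCH_TUPLE;
  }
  return ascending ? OPEN_AT_LOW_END : OPEN_AT_HIGH_END;
}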
03858 rec_loop:
03859   /*-------------------------------------------------------------*/
03860   /* PHASE 4: Look for matching records in a loop */
03861 
03862   rec = btr_pcur_get_rec(pcur);
03863   ut_ad(!!page_rec_is_comp(rec) == comp);
03864 #ifdef UNIV_SEARCH_DEBUG
03865   /*
03866   fputs("Using ", stderr);
03867   dict_index_name_print(stderr, index);
03868   fprintf(stderr, " cnt %lu ; Page no %lu\n", cnt,
03869   page_get_page_no(page_align(rec)));
03870   rec_print(rec);
03871   */
03872 #endif /* UNIV_SEARCH_DEBUG */
03873 
03874   if (page_rec_is_infimum(rec)) {
03875 
03876     /* The infimum record on a page cannot be in the result set,
03877     and neither can a record lock be placed on it: we skip such
03878     a record. */
03879 
03880     goto next_rec;
03881   }
03882 
03883   if (page_rec_is_supremum(rec)) {
03884 
03885     if (set_also_gap_locks
03886         && !(srv_locks_unsafe_for_binlog
03887        || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
03888         && prebuilt->select_lock_type != LOCK_NONE) {
03889 
03890       /* Try to place a lock on the index record */
03891 
04892       /* If the innodb_locks_unsafe_for_binlog option is used
04893       or this session is using a READ COMMITTED isolation
04894       level, we do not lock gaps. The supremum record is really
03895       a gap and therefore we do not set locks there. */
03896 
03897       offsets = rec_get_offsets(rec, index, offsets,
03898               ULINT_UNDEFINED, &heap);
03899       err = sel_set_rec_lock(btr_pcur_get_block(pcur),
03900                  rec, index, offsets,
03901                  prebuilt->select_lock_type,
03902                  LOCK_ORDINARY, thr);
03903 
03904       switch (err) {
03905       case DB_SUCCESS_LOCKED_REC:
03906         err = DB_SUCCESS;
03907       case DB_SUCCESS:
03908         break;
03909       default:
03910         goto lock_wait_or_error;
03911       }
03912     }
03913     /* A page supremum record cannot be in the result set: skip
03914     it now that we have placed a possible lock on it */
03915 
03916     goto next_rec;
03917   }
03918 
03919   /*-------------------------------------------------------------*/
03920   /* Do sanity checks in case our cursor has bumped into page
03921   corruption */
03922 
03923   if (comp) {
03924     next_offs = rec_get_next_offs(rec, TRUE);
03925     if (UNIV_UNLIKELY(next_offs < PAGE_NEW_SUPREMUM)) {
03926 
03927       goto wrong_offs;
03928     }
03929   } else {
03930     next_offs = rec_get_next_offs(rec, FALSE);
03931     if (UNIV_UNLIKELY(next_offs < PAGE_OLD_SUPREMUM)) {
03932 
03933       goto wrong_offs;
03934     }
03935   }
03936 
03937   if (UNIV_UNLIKELY(next_offs >= UNIV_PAGE_SIZE - PAGE_DIR)) {
03938 
03939 wrong_offs:
03940     if (srv_force_recovery == 0 || moves_up == FALSE) {
03941       ut_print_timestamp(stderr);
03942       buf_page_print(page_align(rec), 0);
03943       fprintf(stderr,
03944         "\nInnoDB: rec address %p,"
03945         " buf block fix count %lu\n",
03946         (void*) rec, (ulong)
03947         btr_cur_get_block(btr_pcur_get_btr_cur(pcur))
03948         ->page.buf_fix_count);
03949       fprintf(stderr,
03950         "InnoDB: Index corruption: rec offs %lu"
03951         " next offs %lu, page no %lu,\n"
03952         "InnoDB: ",
03953         (ulong) page_offset(rec),
03954         (ulong) next_offs,
03955         (ulong) page_get_page_no(page_align(rec)));
03956       dict_index_name_print(stderr, trx, index);
03957       fputs(". Run CHECK TABLE. You may need to\n"
03958             "InnoDB: restore from a backup, or"
03959             " dump + drop + reimport the table.\n",
03960             stderr);
03961 
03962       err = DB_CORRUPTION;
03963 
03964       goto lock_wait_or_error;
03965     } else {
03966       /* The user may be dumping a corrupt table. Jump
03967       over the corruption to recover as much as possible. */
03968 
03969       fprintf(stderr,
03970         "InnoDB: Index corruption: rec offs %lu"
03971         " next offs %lu, page no %lu,\n"
03972         "InnoDB: ",
03973         (ulong) page_offset(rec),
03974         (ulong) next_offs,
03975         (ulong) page_get_page_no(page_align(rec)));
03976       dict_index_name_print(stderr, trx, index);
03977       fputs(". We try to skip the rest of the page.\n",
03978             stderr);
03979 
03980       btr_pcur_move_to_last_on_page(pcur, &mtr);
03981 
03982       goto next_rec;
03983     }
03984   }
03985   /*-------------------------------------------------------------*/
03986 
03987   /* Calculate the 'offsets' associated with 'rec' */
03988 
03989   offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
03990 
03991   if (UNIV_UNLIKELY(srv_force_recovery > 0)) {
03992     if (!rec_validate(rec, offsets)
03993         || !btr_index_rec_validate(rec, index, FALSE)) {
03994       fprintf(stderr,
03995         "InnoDB: Index corruption: rec offs %lu"
03996         " next offs %lu, page no %lu,\n"
03997         "InnoDB: ",
03998         (ulong) page_offset(rec),
03999         (ulong) next_offs,
04000         (ulong) page_get_page_no(page_align(rec)));
04001       dict_index_name_print(stderr, trx, index);
04002       fputs(". We try to skip the record.\n",
04003             stderr);
04004 
04005       goto next_rec;
04006     }
04007   }
04008 
04009   /* Note that we cannot trust the up_match value in the cursor at this
04010   place because we can arrive here after moving the cursor! Thus
04011   we have to recompare rec and search_tuple to determine if they
04012   match enough. */
04013 
04014   if (match_mode == ROW_SEL_EXACT) {
04015     /* Test if the index record matches completely to search_tuple
04016     in prebuilt: if not, then we return with DB_RECORD_NOT_FOUND */
04017 
04018     /* fputs("Comparing rec and search tuple\n", stderr); */
04019 
04020     if (0 != cmp_dtuple_rec(search_tuple, rec, offsets)) {
04021 
04022       if (set_also_gap_locks
04023           && !(srv_locks_unsafe_for_binlog
04024          || trx->isolation_level
04025          <= TRX_ISO_READ_COMMITTED)
04026           && prebuilt->select_lock_type != LOCK_NONE) {
04027 
04028         /* Try to place a gap lock on the index
04029         record only if the innodb_locks_unsafe_for_binlog
04030         option is not set or this session is not
04031         using a READ COMMITTED isolation level. */
04032 
04033         err = sel_set_rec_lock(
04034           btr_pcur_get_block(pcur),
04035           rec, index, offsets,
04036           prebuilt->select_lock_type, LOCK_GAP,
04037           thr);
04038 
04039         switch (err) {
04040         case DB_SUCCESS_LOCKED_REC:
04041         case DB_SUCCESS:
04042           break;
04043         default:
04044           goto lock_wait_or_error;
04045         }
04046       }
04047 
04048       btr_pcur_store_position(pcur, &mtr);
04049 
04050       err = DB_RECORD_NOT_FOUND;
04051       /* ut_print_name(stderr, index->name);
04052       fputs(" record not found 3\n", stderr); */
04053 
04054       goto normal_return;
04055     }
04056 
04057   } else if (match_mode == ROW_SEL_EXACT_PREFIX) {
04058 
04059     if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec, offsets)) {
04060 
04061       if (set_also_gap_locks
04062           && !(srv_locks_unsafe_for_binlog
04063          || trx->isolation_level
04064          <= TRX_ISO_READ_COMMITTED)
04065           && prebuilt->select_lock_type != LOCK_NONE) {
04066 
04067         /* Try to place a gap lock on the index
04068         record only if the innodb_locks_unsafe_for_binlog
04069         option is not set or this session is not
04070         using a READ COMMITTED isolation level. */
04071 
04072         err = sel_set_rec_lock(
04073           btr_pcur_get_block(pcur),
04074           rec, index, offsets,
04075           prebuilt->select_lock_type, LOCK_GAP,
04076           thr);
04077 
04078         switch (err) {
04079         case DB_SUCCESS_LOCKED_REC:
04080         case DB_SUCCESS:
04081           break;
04082         default:
04083           goto lock_wait_or_error;
04084         }
04085       }
04086 
04087       btr_pcur_store_position(pcur, &mtr);
04088 
04089       err = DB_RECORD_NOT_FOUND;
04090       /* ut_print_name(stderr, index->name);
04091       fputs(" record not found 4\n", stderr); */
04092 
04093       goto normal_return;
04094     }
04095   }
04096 
04097   /* We are ready to look at a possible new index entry in the result
04098   set: the cursor is now placed on a user record */
04099 
04100   if (prebuilt->select_lock_type != LOCK_NONE) {
04101     /* Try to place a lock on the index record; note that delete
04102     marked records are a special case in a unique search. If there
04103     is a non-delete marked record, then it is enough to lock its
04104     existence with LOCK_REC_NOT_GAP. */
04105 
04106     /* If the innodb_locks_unsafe_for_binlog option is used
04107     or this session is using a READ COMMITTED isolation
04108     level, we lock only the record, i.e., next-key locking is
04109     not used. */
04110 
04111     ulint lock_type;
04112 
04113     if (!set_also_gap_locks
04114         || srv_locks_unsafe_for_binlog
04115         || trx->isolation_level <= TRX_ISO_READ_COMMITTED
04116         || (unique_search
04117       && !UNIV_UNLIKELY(rec_get_deleted_flag(rec, comp)))) {
04118 
04119       goto no_gap_lock;
04120     } else {
04121       lock_type = LOCK_ORDINARY;
04122     }
04123 
04124     /* If we are doing a 'greater or equal than a primary key
04125     value' search from a clustered index, and we find a record
04126     that has that exact primary key value, then there is no need
04127     to lock the gap before the record, because no insert in the
04128     gap can be in our search range. That is, no phantom row can
04129     appear that way.
04130 
04131     An example: if col1 is the primary key, the search is WHERE
04132     col1 >= 100, and we find a record where col1 = 100, then no
04133     need to lock the gap before that record. */
04134 
04135     if (index == clust_index
04136         && mode == PAGE_CUR_GE
04137         && direction == 0
04138         && dtuple_get_n_fields_cmp(search_tuple)
04139         == dict_index_get_n_unique(index)
04140         && 0 == cmp_dtuple_rec(search_tuple, rec, offsets)) {
04141 no_gap_lock:
04142       lock_type = LOCK_REC_NOT_GAP;
04143     }
04144 
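/* Illustrative sketch (not part of row0sel.cc) of the lock-granularity
decision made above. A full next-key lock (record + preceding gap) is
the default for a locking read at REPEATABLE READ and above; the gap
part can be dropped when gap locking is disabled, when a unique search
hit a live (non-delete-marked) record, or when a >= search on the
clustered index found an exact primary-key match, since no phantom row
can be inserted into that gap within the search range. */

enum rec_lock_choice { REC_LOCK_ORDINARY, REC_LOCK_NOT_GAP };

static enum rec_lock_choice
choose_rec_lock(
  int gap_locks_enabled,          /* FALSE under READ COMMITTED etc.   */
  int unique_match_not_deleted,   /* unique search found a live record */
  int exact_pk_match_in_ge_scan)  /* clustered GE search, key == rec   */
{
  if (!gap_locks_enabled
      || unique_match_not_deleted
      || exact_pk_match_in_ge_scan) {
    return REC_LOCK_NOT_GAP;
  }
  return REC_LOCK_ORDINARY;       /* next-key lock */
}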
04145     err = sel_set_rec_lock(btr_pcur_get_block(pcur),
04146                rec, index, offsets,
04147                prebuilt->select_lock_type,
04148                lock_type, thr);
04149 
04150     switch (err) {
04151       const rec_t*  old_vers;
04152     case DB_SUCCESS_LOCKED_REC:
04153       if (srv_locks_unsafe_for_binlog
04154           || trx->isolation_level
04155           <= TRX_ISO_READ_COMMITTED) {
04156         /* Note that a record of
04157         prebuilt->index was locked. */
04158         prebuilt->new_rec_locks = 1;
04159       }
04160       err = DB_SUCCESS;
04161     case DB_SUCCESS:
04162       break;
04163     case DB_LOCK_WAIT:
04164       /* Never unlock rows that were part of a conflict. */
04165       prebuilt->new_rec_locks = 0;
04166 
04167       if (UNIV_LIKELY(prebuilt->row_read_type
04168           != ROW_READ_TRY_SEMI_CONSISTENT)
04169           || unique_search
04170           || index != clust_index) {
04171 
04172         goto lock_wait_or_error;
04173       }
04174 
04175       /* The following call returns 'offsets'
04176       associated with 'old_vers' */
04177       err = row_sel_build_committed_vers_for_mysql(
04178         clust_index, prebuilt, rec,
04179         &offsets, &heap, &old_vers, &mtr);
04180 
04181       switch (err) {
04182       case DB_SUCCESS_LOCKED_REC:
04183         err = DB_SUCCESS;
04184       case DB_SUCCESS:
04185         break;
04186       default:
04187         goto lock_wait_or_error;
04188       }
04189 
04190       mutex_enter(&kernel_mutex);
04191       if (trx->was_chosen_as_deadlock_victim) {
04192         mutex_exit(&kernel_mutex);
04193         err = DB_DEADLOCK;
04194 
04195         goto lock_wait_or_error;
04196       }
04197       if (UNIV_LIKELY(trx->wait_lock != NULL)) {
04198         lock_cancel_waiting_and_release(
04199           trx->wait_lock);
04200       } else {
04201         mutex_exit(&kernel_mutex);
04202 
04203         /* The lock was granted while we were
04204         searching for the last committed version.
04205         Do a normal locking read. */
04206 
04207         offsets = rec_get_offsets(rec, index, offsets,
04208                 ULINT_UNDEFINED,
04209                 &heap);
04210         err = DB_SUCCESS;
04211         break;
04212       }
04213       mutex_exit(&kernel_mutex);
04214 
04215       if (old_vers == NULL) {
04216         /* The row was not yet committed */
04217 
04218         goto next_rec;
04219       }
04220 
04221       did_semi_consistent_read = TRUE;
04222       rec = old_vers;
04223       break;
04224     default:
04225 
04226       goto lock_wait_or_error;
04227     }
04228   } else {
04229     /* This is a non-locking consistent read: if necessary, fetch
04230     a previous version of the record */
04231 
04232     if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) {
04233 
04234       /* Do nothing: we let a non-locking SELECT read the
04235       latest version of the record */
04236 
04237     } else if (index == clust_index) {
04238 
04239       /* Fetch a previous version of the row if the current
04240       one is not visible in the snapshot; if we have a very
04241       high force recovery level set, we try to avoid crashes
04242       by skipping this lookup */
04243 
04244       if (UNIV_LIKELY(srv_force_recovery < 5)
04245           && !lock_clust_rec_cons_read_sees(
04246             rec, index, offsets, trx->read_view)) {
04247 
04248         rec_t*  old_vers;
04249         /* The following call returns 'offsets'
04250         associated with 'old_vers' */
04251         err = row_sel_build_prev_vers_for_mysql(
04252           trx->read_view, clust_index,
04253           prebuilt, rec, &offsets, &heap,
04254           &old_vers, &mtr);
04255 
04256         switch (err) {
04257         case DB_SUCCESS_LOCKED_REC:
04258         case DB_SUCCESS:
04259           break;
04260         default:
04261           goto lock_wait_or_error;
04262         }
04263 
04264         if (old_vers == NULL) {
04265           /* The row did not exist yet in
04266           the read view */
04267 
04268           goto next_rec;
04269         }
04270 
04271         rec = old_vers;
04272       }
04273     } else {
04274       /* We are looking into a non-clustered index,
04275       and to get the right version of the record we
04276       have to look also into the clustered index: this
04277       is necessary, because we can only get the undo
04278       information via the clustered index record. */
04279 
04280       ut_ad(!dict_index_is_clust(index));
04281       if (!lock_sec_rec_cons_read_sees(
04282             rec, trx->read_view)) {
04283         goto requires_clust_rec;
04284       }
04285     }
04286   }
04287 
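/* Illustrative sketch (plain C, ignoring the high force-recovery
special case) of the non-locking read logic above: READ UNCOMMITTED
uses the record as is; on the clustered index a record that is not
visible in the read view is replaced by an older version rebuilt from
undo; on a secondary index an uncertain record forces a lookup of the
clustered index record, because only the clustered record gives access
to the undo information. */

enum consistent_read_action {
  READ_AS_IS,               /* record is usable directly            */
  READ_BUILD_PREV_VERSION,  /* row_sel_build_prev_vers_for_mysql()  */
  READ_GO_TO_CLUSTERED      /* the "requires_clust_rec" path        */
};

static enum consistent_read_action
choose_consistent_read_action(
  int read_uncommitted,
  int is_clustered_index,
  int visible_in_read_view)
{
  if (read_uncommitted || visible_in_read_view) {
    return READ_AS_IS;
  }
  return is_clustered_index
    ? READ_BUILD_PREV_VERSION
    : READ_GO_TO_CLUSTERED;
}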
04288   /* NOTE that at this point rec can be an old version of a clustered
04289   index record built for a consistent read. We cannot assume after this
04290   point that rec is on a buffer pool page. Functions like
04291   page_rec_is_comp() cannot be used! */
04292 
04293   if (UNIV_UNLIKELY(rec_get_deleted_flag(rec, comp))) {
04294 
04295     /* The record is delete-marked: we can skip it */
04296 
04297     if ((srv_locks_unsafe_for_binlog
04298          || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
04299         && prebuilt->select_lock_type != LOCK_NONE
04300         && !did_semi_consistent_read) {
04301 
04302       /* No need to keep a lock on a delete-marked record
04303       if we do not want to use next-key locking. */
04304 
04305       row_unlock_for_mysql(prebuilt, TRUE);
04306     }
04307 
04308     /* This is an optimization to skip setting the next key lock
04309     on the record that follows this delete-marked record. This
04310     optimization works because the unique search criterion
04311     precludes the presence of a range lock between this
04312     delete marked record and the record following it.
04313 
04314     For now this is applicable only to clustered indexes while
04315     doing a unique search. There is scope for further optimization
04316     applicable to unique secondary indexes. Current behaviour is
04317     to widen the scope of a lock on an already delete marked record
04318     if the same record is deleted twice by the same transaction */
04319     if (index == clust_index && unique_search) {
04320       err = DB_RECORD_NOT_FOUND;
04321 
04322       goto normal_return;
04323     }
04324 
04325     goto next_rec;
04326   }
04327 
04328   /* Get the clustered index record if needed, if we did not do the
04329   search using the clustered index. */
04330 
04331   if (index != clust_index && prebuilt->need_to_access_clustered) {
04332 
04333 requires_clust_rec:
04334     /* We use a 'goto' to the preceding label if a consistent
04335     read of a secondary index record requires us to look up old
04336     versions of the associated clustered index record. */
04337 
04338     ut_ad(rec_offs_validate(rec, index, offsets));
04339 
04340     /* It was a non-clustered index and we must fetch also the
04341     clustered index record */
04342 
04343     mtr_has_extra_clust_latch = TRUE;
04344 
04345     /* The following call returns 'offsets' associated with
04346     'clust_rec'. Note that 'clust_rec' can be an old version
04347     built for a consistent read. */
04348 
04349     err = row_sel_get_clust_rec_for_mysql(prebuilt, index, rec,
04350                   thr, &clust_rec,
04351                   &offsets, &heap, &mtr);
04352     switch (err) {
04353     case DB_SUCCESS:
04354       if (clust_rec == NULL) {
04355         /* The record did not exist in the read view */
04356         ut_ad(prebuilt->select_lock_type == LOCK_NONE);
04357 
04358         goto next_rec;
04359       }
04360       break;
04361     case DB_SUCCESS_LOCKED_REC:
04362       ut_a(clust_rec != NULL);
04363       if (srv_locks_unsafe_for_binlog
04364            || trx->isolation_level
04365           <= TRX_ISO_READ_COMMITTED) {
04366         /* Note that the clustered index record
04367         was locked. */
04368         prebuilt->new_rec_locks = 2;
04369       }
04370       err = DB_SUCCESS;
04371       break;
04372     default:
04373       goto lock_wait_or_error;
04374     }
04375 
04376     if (UNIV_UNLIKELY(rec_get_deleted_flag(clust_rec, comp))) {
04377 
04378       /* The record is delete marked: we can skip it */
04379 
04380       if ((srv_locks_unsafe_for_binlog
04381            || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
04382           && prebuilt->select_lock_type != LOCK_NONE) {
04383 
04384         /* No need to keep a lock on a delete-marked
04385         record if we do not want to use next-key
04386         locking. */
04387 
04388         row_unlock_for_mysql(prebuilt, TRUE);
04389       }
04390 
04391       goto next_rec;
04392     }
04393 
04394     result_rec = clust_rec;
04395     ut_ad(rec_offs_validate(result_rec, clust_index, offsets));
04396   } else {
04397     result_rec = rec;
04398   }
04399 
04400   /* We found a qualifying record 'result_rec'. At this point,
04401   'offsets' are associated with 'result_rec'. */
04402 
04403   ut_ad(rec_offs_validate(result_rec,
04404         result_rec != rec ? clust_index : index,
04405         offsets));
04406   ut_ad(!rec_get_deleted_flag(result_rec, comp));
04407 
04408   /* At this point, the clustered index record is protected
04409   by a page latch that was acquired when pcur was positioned.
04410   The latch will not be released until mtr_commit(&mtr). */
04411 
04412   if ((match_mode == ROW_SEL_EXACT
04413        || prebuilt->n_rows_fetched >= MYSQL_FETCH_CACHE_THRESHOLD)
04414       && prebuilt->select_lock_type == LOCK_NONE
04415       && !prebuilt->templ_contains_blob
04416       && !prebuilt->clust_index_was_generated
04417       && prebuilt->template_type
04418       != ROW_MYSQL_DUMMY_TEMPLATE) {
04419 
04420     /* Inside an update, for example, we do not cache rows,
04421     since we may use the cursor position to do the actual
04422     update; that is why we require ...lock_type == LOCK_NONE.
04423     Since we keep space in prebuilt only for the BLOBs of
04424     a single row, we cannot cache rows in the case there
04425     are BLOBs in the fields to be fetched. In HANDLER we do
04426     not cache rows because there the cursor is a scrollable
04427     cursor. */
04428 
04429     if (!row_sel_push_cache_row_for_mysql(prebuilt, result_rec,
04430                   result_rec != rec,
04431                   offsets)) {
04432       /* Only fresh inserts may contain incomplete
04433       externally stored columns. Pretend that such
04434       records do not exist. Such records may only be
04435       accessed at the READ UNCOMMITTED isolation
04436       level or when rolling back a recovered
04437       transaction. Rollback happens at a lower
04438       level, not here. */
04439       ut_a(trx->isolation_level == TRX_ISO_READ_UNCOMMITTED);
04440     } else if (prebuilt->n_fetch_cached
04441          == MYSQL_FETCH_CACHE_SIZE) {
04442 
04443       goto got_row;
04444     }
04445 
04446     goto next_rec;
04447   } else {
04448     if (UNIV_UNLIKELY
04449         (prebuilt->template_type == ROW_MYSQL_DUMMY_TEMPLATE)) {
04450       /* CHECK TABLE: fetch the row */
04451 
04452       if (result_rec != rec
04453           && !prebuilt->need_to_access_clustered) {
04454         /* We used 'offsets' for the clust
04455         rec, recalculate them for 'rec' */
04456         offsets = rec_get_offsets(rec, index, offsets,
04457                 ULINT_UNDEFINED,
04458                 &heap);
04459         result_rec = rec;
04460       }
04461 
04462       memcpy(buf + 4, result_rec
04463              - rec_offs_extra_size(offsets),
04464              rec_offs_size(offsets));
04465       mach_write_to_4(buf,
04466           rec_offs_extra_size(offsets) + 4);
04467     } else {
04468       /* Returning a row to MySQL */
04469 
04470       if (!row_sel_store_mysql_rec(buf, prebuilt, result_rec,
04471                  result_rec != rec,
04472                  offsets)) {
04473         /* Only fresh inserts may contain
04474         incomplete externally stored
04475         columns. Pretend that such records do
04476         not exist. Such records may only be
04477         accessed at the READ UNCOMMITTED
04478         isolation level or when rolling back a
04479         recovered transaction. Rollback
04480         happens at a lower level, not here. */
04481         ut_a(trx->isolation_level
04482              == TRX_ISO_READ_UNCOMMITTED);
04483         goto next_rec;
04484       }
04485     }
04486 
04487     if (prebuilt->clust_index_was_generated) {
04488       if (result_rec != rec) {
04489         offsets = rec_get_offsets(
04490           rec, index, offsets, ULINT_UNDEFINED,
04491           &heap);
04492       }
04493       row_sel_store_row_id_to_prebuilt(prebuilt, rec,
04494                index, offsets);
04495     }
04496   }
04497 
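/* Illustrative sketch (not part of row0sel.cc, and ignoring the CHECK
TABLE dummy-template case) of the row-cache test above: rows are
batched into the prefetch cache only for non-locking reads whose
template has no BLOB columns and whose table has a user-defined
primary key, and only once the search is exact or enough rows have
already been fetched to make prefetching worthwhile. */

static int
may_use_fetch_cache(
  int           exact_search,         /* match_mode == ROW_SEL_EXACT    */
  unsigned long n_rows_fetched,
  unsigned long prefetch_threshold,   /* MYSQL_FETCH_CACHE_THRESHOLD    */
  int           locking_read,         /* select_lock_type != LOCK_NONE  */
  int           template_has_blob,
  int           row_id_was_generated) /* no user-defined primary key    */
{
  return (exact_search || n_rows_fetched >= prefetch_threshold)
    && !locking_read
    && !template_has_blob
    && !row_id_was_generated;
}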
04498   /* From this point on, 'offsets' are invalid. */
04499 
04500 got_row:
04501   /* We have an optimization to save CPU time: if this is a consistent
04502   read on a unique condition on the clustered index, then we do not
04503   store the pcur position, because any fetch next or prev will anyway
04504   return 'end of file'. Exceptions are locking reads and the MySQL
04505   HANDLER command where the user can move the cursor with PREV or NEXT
04506   even after a unique search. */
04507 
04508   if (!unique_search_from_clust_index
04509       || prebuilt->select_lock_type != LOCK_NONE) {
04510 
04511     /* Inside an update always store the cursor position */
04512 
04513     btr_pcur_store_position(pcur, &mtr);
04514   }
04515 
04516   err = DB_SUCCESS;
04517 
04518   goto normal_return;
04519 
04520 next_rec:
04521   /* Reset the old and new "did semi-consistent read" flags. */
04522   if (UNIV_UNLIKELY(prebuilt->row_read_type
04523         == ROW_READ_DID_SEMI_CONSISTENT)) {
04524     prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
04525   }
04526   did_semi_consistent_read = FALSE;
04527   prebuilt->new_rec_locks = 0;
04528 
04529   /*-------------------------------------------------------------*/
04530   /* PHASE 5: Move the cursor to the next index record */
04531 
04532   if (UNIV_UNLIKELY(mtr_has_extra_clust_latch)) {
04533     /* We must commit mtr if we are moving to the next
04534     non-clustered index record, because we could break the
04535     latching order if we would access a different clustered
04536     index page right away without releasing the previous. */
04537 
04538     btr_pcur_store_position(pcur, &mtr);
04539 
04540     mtr_commit(&mtr);
04541     mtr_has_extra_clust_latch = FALSE;
04542 
04543     mtr_start(&mtr);
04544     if (sel_restore_position_for_mysql(&same_user_rec,
04545                BTR_SEARCH_LEAF,
04546                pcur, moves_up, &mtr)) {
04547 #ifdef UNIV_SEARCH_DEBUG
04548       cnt++;
04549 #endif /* UNIV_SEARCH_DEBUG */
04550 
04551       goto rec_loop;
04552     }
04553   }
04554 
04555   if (moves_up) {
04556     if (UNIV_UNLIKELY(!btr_pcur_move_to_next(pcur, &mtr))) {
04557 not_moved:
04558       btr_pcur_store_position(pcur, &mtr);
04559 
04560       if (match_mode != 0) {
04561         err = DB_RECORD_NOT_FOUND;
04562       } else {
04563         err = DB_END_OF_INDEX;
04564       }
04565 
04566       goto normal_return;
04567     }
04568   } else {
04569     if (UNIV_UNLIKELY(!btr_pcur_move_to_prev(pcur, &mtr))) {
04570       goto not_moved;
04571     }
04572   }
04573 
04574 #ifdef UNIV_SEARCH_DEBUG
04575   cnt++;
04576 #endif /* UNIV_SEARCH_DEBUG */
04577 
04578   goto rec_loop;
04579 
04580 lock_wait_or_error:
04581   /* Reset the old and new "did semi-consistent read" flags. */
04582   if (UNIV_UNLIKELY(prebuilt->row_read_type
04583         == ROW_READ_DID_SEMI_CONSISTENT)) {
04584     prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
04585   }
04586   did_semi_consistent_read = FALSE;
04587 
04588   /*-------------------------------------------------------------*/
04589 
04590   btr_pcur_store_position(pcur, &mtr);
04591 
04592 lock_table_wait:
04593   mtr_commit(&mtr);
04594   mtr_has_extra_clust_latch = FALSE;
04595 
04596   trx->error_state = err;
04597 
04598   /* The following is a patch for MySQL */
04599 
04600   que_thr_stop_for_mysql(thr);
04601 
04602   thr->lock_state = QUE_THR_LOCK_ROW;
04603 
04604   if (row_mysql_handle_errors(&err, trx, thr, NULL)) {
04605     /* It was a lock wait, and it ended */
04606 
04607     thr->lock_state = QUE_THR_LOCK_NOLOCK;
04608     mtr_start(&mtr);
04609 
04610     /* Table lock waited, go try to obtain table lock
04611     again */
04612     if (table_lock_waited) {
04613       table_lock_waited = FALSE;
04614 
04615       goto wait_table_again;
04616     }
04617 
04618     sel_restore_position_for_mysql(&same_user_rec,
04619                  BTR_SEARCH_LEAF, pcur,
04620                  moves_up, &mtr);
04621 
04622     if ((srv_locks_unsafe_for_binlog
04623          || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
04624         && !same_user_rec) {
04625 
04626       /* Since we were not able to restore the cursor
04627       on the same user record, we cannot use
04628       row_unlock_for_mysql() to unlock any records, and
04629       we must thus reset the new rec lock info. Since
04630       in lock0lock.c we have blocked the inheriting of gap
04631       X-locks, we actually do not have any new record locks
04632       set in this case.
04633 
04634       Note that if we were able to restore on the 'same'
04635       user record, it is still possible that we were actually
04636       waiting on a delete-marked record, and meanwhile
04637       it was removed by purge and inserted again by some
04638       other user. But that is no problem, because in
04639       rec_loop we will again try to set a lock, and
04640       new_rec_lock_info in trx will be right at the end. */
04641 
04642       prebuilt->new_rec_locks = 0;
04643     }
04644 
04645     mode = pcur->search_mode;
04646 
04647     goto rec_loop;
04648   }
04649 
04650   thr->lock_state = QUE_THR_LOCK_NOLOCK;
04651 
04652 #ifdef UNIV_SEARCH_DEBUG
04653   /*  fputs("Using ", stderr);
04654   dict_index_name_print(stderr, index);
04655   fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */
04656 #endif /* UNIV_SEARCH_DEBUG */
04657   goto func_exit;
04658 
04659 normal_return:
04660   /*-------------------------------------------------------------*/
04661   que_thr_stop_for_mysql_no_error(thr, trx);
04662 
04663   mtr_commit(&mtr);
04664 
04665   if (prebuilt->n_fetch_cached > 0) {
04666     row_sel_pop_cached_row_for_mysql(buf, prebuilt);
04667 
04668     err = DB_SUCCESS;
04669   }
04670 
04671 #ifdef UNIV_SEARCH_DEBUG
04672   /*  fputs("Using ", stderr);
04673   dict_index_name_print(stderr, index);
04674   fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */
04675 #endif /* UNIV_SEARCH_DEBUG */
04676   if (err == DB_SUCCESS) {
04677     srv_n_rows_read++;
04678   }
04679 
04680 func_exit:
04681   trx->op_info = "";
04682   if (UNIV_LIKELY_NULL(heap)) {
04683     mem_heap_free(heap);
04684   }
04685 
04686   /* Set or reset the "did semi-consistent read" flag on return.
04687   The flag did_semi_consistent_read is set if and only if
04688   the record being returned was fetched with a semi-consistent read. */
04689   ut_ad(prebuilt->row_read_type != ROW_READ_WITH_LOCKS
04690         || !did_semi_consistent_read);
04691 
04692   if (UNIV_UNLIKELY(prebuilt->row_read_type != ROW_READ_WITH_LOCKS)) {
04693     if (UNIV_UNLIKELY(did_semi_consistent_read)) {
04694       prebuilt->row_read_type = ROW_READ_DID_SEMI_CONSISTENT;
04695     } else {
04696       prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
04697     }
04698   }
04699   return(err);
04700 }
04701 
04702 /*******************************************************************/
04706 UNIV_INTERN
04707 ibool
04708 row_search_check_if_query_cache_permitted(
04709 /*======================================*/
04710   trx_t*    trx,    
04711   const char* norm_name)  
04713 {
04714   dict_table_t* table;
04715   ibool   ret = FALSE;
04716 
04717   table = dict_table_get(norm_name, FALSE);
04718 
04719   if (table == NULL) {
04720 
04721     return(FALSE);
04722   }
04723 
04724   mutex_enter(&kernel_mutex);
04725 
04726   /* Start the transaction if it is not started yet */
04727 
04728   trx_start_if_not_started_low(trx);
04729 
04730   /* If there are locks on the table or some trx has invalidated the
04731   cache up to our trx id, then ret = FALSE.
04732   We do not check what type locks there are on the table, though only
04733   IX type locks actually would require ret = FALSE. */
04734 
04735   if (UT_LIST_GET_LEN(table->locks) == 0
04736       && trx->id >= table->query_cache_inv_trx_id) {
04737 
04738     ret = TRUE;
04739 
04740     /* If the isolation level is high, assign a read view for the
04741     transaction if it does not yet have one */
04742 
04743     if (trx->isolation_level >= TRX_ISO_REPEATABLE_READ
04744         && !trx->read_view) {
04745 
04746       trx->read_view = read_view_open_now(
04747         trx->id, trx->global_read_view_heap);
04748       trx->global_read_view = trx->read_view;
04749     }
04750   }
04751 
04752   mutex_exit(&kernel_mutex);
04753 
04754   return(ret);
04755 }
04756 
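/* A compact standalone model (illustrative only, not InnoDB code) of
the test performed above: caching or reusing a cached result is allowed
only when no transaction currently holds a lock on the table and no
transaction newer than ours has invalidated the query cache for it. */

static int
query_cache_permitted_model(
  unsigned long      n_table_locks,    /* UT_LIST_GET_LEN(table->locks)  */
  unsigned long long our_trx_id,       /* trx->id                        */
  unsigned long long cache_inv_trx_id) /* table->query_cache_inv_trx_id  */
{
  return n_table_locks == 0 && our_trx_id >= cache_inv_trx_id;
}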
04757 /*******************************************************************/
04761 static
04762 ib_uint64_t
04763 row_search_autoinc_read_column(
04764 /*===========================*/
04765   dict_index_t* index,    
04766   const rec_t*  rec,    
04767   ulint   col_no,   
04768   ulint   mtype,    
04769   ibool   unsigned_type)  
04770 {
04771   ulint   len;
04772   const byte* data;
04773   ib_uint64_t value;
04774   mem_heap_t* heap = NULL;
04775   ulint   offsets_[REC_OFFS_NORMAL_SIZE];
04776   ulint*    offsets = offsets_;
04777 
04778   rec_offs_init(offsets_);
04779 
04780   offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
04781 
04782   data = rec_get_nth_field(rec, offsets, col_no, &len);
04783 
04784   ut_a(len != UNIV_SQL_NULL);
04785 
04786   switch (mtype) {
04787   case DATA_INT:
04788     ut_a(len <= sizeof value);
04789     value = mach_read_int_type(data, len, unsigned_type);
04790     break;
04791 
04792   case DATA_FLOAT:
04793     ut_a(len == sizeof(float));
04794     value = (ib_uint64_t) mach_float_read(data);
04795     break;
04796 
04797   case DATA_DOUBLE:
04798     ut_a(len == sizeof(double));
04799     value = (ib_uint64_t) mach_double_read(data);
04800     break;
04801 
04802   default:
04803     ut_error;
04804   }
04805 
04806   if (UNIV_LIKELY_NULL(heap)) {
04807     mem_heap_free(heap);
04808   }
04809 
04810   if (!unsigned_type && (ib_int64_t) value < 0) {
04811     value = 0;
04812   }
04813 
04814   return(value);
04815 }
04816 
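/* Standalone sketch of what a helper in the mach_read_int_type() role
has to do, under the assumption that InnoDB stores DATA_INT columns
big-endian with the sign bit of signed values inverted so that plain
byte-wise comparison orders them correctly. Illustrative only, not the
real implementation; assumes 1 <= len <= 8. */

#include <stdint.h>
#include <stddef.h>

static uint64_t
read_int_column_sketch(const unsigned char* data, size_t len, int is_unsigned)
{
  uint64_t v = 0;
  size_t   i;

  for (i = 0; i < len; i++) {           /* big-endian accumulate */
    v = (v << 8) | data[i];
  }

  if (!is_unsigned) {
    v ^= 1ULL << (8 * len - 1);         /* undo the stored sign-bit flip */

    if (len < 8 && (v & (1ULL << (8 * len - 1)))) {
      v |= ~0ULL << (8 * len);          /* sign-extend negative values */
    }
  }

  return v;
}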
04817 /*******************************************************************/
04820 static
04821 const rec_t*
04822 row_search_autoinc_get_rec(
04823 /*=======================*/
04824   btr_pcur_t* pcur,   
04825   mtr_t*    mtr)    
04826 {
04827   do {
04828     const rec_t* rec = btr_pcur_get_rec(pcur);
04829 
04830     if (page_rec_is_user_rec(rec)) {
04831       return(rec);
04832     }
04833   } while (btr_pcur_move_to_prev(pcur, mtr));
04834 
04835   return(NULL);
04836 }
04837 
04838 /*******************************************************************/
04842 UNIV_INTERN
04843 ulint
04844 row_search_max_autoinc(
04845 /*===================*/
04846   dict_index_t* index,    
04847   const char* col_name, 
04848   ib_uint64_t*  value)    
04849 {
04850   ulint   i;
04851   ulint   n_cols;
04852   dict_field_t* dfield = NULL;
04853   ulint   error = DB_SUCCESS;
04854 
04855   n_cols = dict_index_get_n_ordering_defined_by_user(index);
04856 
04857   /* Search the index for the AUTOINC column name */
04858   for (i = 0; i < n_cols; ++i) {
04859     dfield = dict_index_get_nth_field(index, i);
04860 
04861     if (strcmp(col_name, dfield->name) == 0) {
04862       break;
04863     }
04864   }
04865 
04866   *value = 0;
04867 
04868   /* Must find the AUTOINC column name */
04869   if (i < n_cols && dfield) {
04870     mtr_t   mtr;
04871     btr_pcur_t  pcur;
04872 
04873     mtr_start(&mtr);
04874 
04875     /* Open at the high/right end (FALSE), and INIT
04876     cursor (TRUE) */
04877     btr_pcur_open_at_index_side(
04878       FALSE, index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
04879 
04880     if (page_get_n_recs(btr_pcur_get_page(&pcur)) > 0) {
04881       const rec_t*  rec;
04882 
04883       rec = row_search_autoinc_get_rec(&pcur, &mtr);
04884 
04885       if (rec != NULL) {
04886         ibool unsigned_type = (
04887           dfield->col->prtype & DATA_UNSIGNED);
04888 
04889         *value = row_search_autoinc_read_column(
04890           index, rec, i,
04891           dfield->col->mtype, unsigned_type);
04892       }
04893     }
04894 
04895     btr_pcur_close(&pcur);
04896 
04897     mtr_commit(&mtr);
04898   } else {
04899     error = DB_RECORD_NOT_FOUND;
04900   }
04901 
04902   return(error);
04903 }
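/* A hedged usage sketch: how a handler might seed its in-memory
AUTOINC counter from the index when a table is opened, using
row_search_max_autoinc() above. Obtaining the dict_index_t handle and
choosing the next value to hand out are outside this fragment; the
fragment assumes the usual InnoDB headers are available. */

static ib_uint64_t
seed_autoinc_counter(dict_index_t* index, const char* autoinc_col_name)
{
  ib_uint64_t max_value = 0;

  if (row_search_max_autoinc(index, autoinc_col_name, &max_value)
      != DB_SUCCESS) {
    /* The column was not found in the index: fall back to 0 and let
    the caller decide how to initialize the counter. */
    return 0;
  }

  return max_value; /* the caller would typically use max_value + 1 next */
}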