Drizzled Public API Documentation

trx0trx.cc
00001 /*****************************************************************************
00002 
00003 Copyright (C) 1996, 2010, Innobase Oy. All Rights Reserved.
00004 
00005 This program is free software; you can redistribute it and/or modify it under
00006 the terms of the GNU General Public License as published by the Free Software
00007 Foundation; version 2 of the License.
00008 
00009 This program is distributed in the hope that it will be useful, but WITHOUT
00010 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
00011 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
00012 
00013 You should have received a copy of the GNU General Public License along with
00014 this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
00015 St, Fifth Floor, Boston, MA 02110-1301 USA
00016 
00017 *****************************************************************************/
00018 
00019 /**************************************************/
00026 #include "trx0trx.h"
00027 
00028 #ifdef UNIV_NONINL
00029 #include "trx0trx.ic"
00030 #endif
00031 
00032 #include "trx0undo.h"
00033 #include "trx0rseg.h"
00034 #include "log0log.h"
00035 #include "que0que.h"
00036 #include "lock0lock.h"
00037 #include "trx0roll.h"
00038 #include "usr0sess.h"
00039 #include "read0read.h"
00040 #include "srv0srv.h"
00041 #include "thr0loc.h"
00042 #include "btr0sea.h"
00043 #include "os0proc.h"
00044 #include "trx0xa.h"
00045 #include "ha_prototypes.h"
00046 
/** Session object that trx_allocate_for_mysql() and
trx_allocate_for_background() hand to trx_create(). It is initialised to
NULL here; trx_create() asserts a non-NULL session in debug builds, so it
has to be assigned before the first allocation (that assignment is not in
this part of the file). */
00048 UNIV_INTERN sess_t*   trx_dummy_sess = NULL;
00049 
/** Number of trx objects currently allocated through
trx_allocate_for_mysql(): incremented there and decremented in
trx_free_for_mysql(), both while holding kernel_mutex. */
00052 UNIV_INTERN ulint trx_n_mysql_transactions = 0;
00053 
00054 #ifdef UNIV_PFS_MUTEX
00055 /* Key to register the mutex with performance schema */
/* Passed to mutex_create() for trx->undo_mutex in trx_create(). */
00056 UNIV_INTERN mysql_pfs_key_t trx_undo_mutex_key;
00057 #endif /* UNIV_PFS_MUTEX */
00058 
00059 /*************************************************************/
00061 UNIV_INTERN
00062 void
00063 trx_set_detailed_error(
00064 /*===================*/
00065   trx_t*    trx,  
00066   const char* msg)  
00067 {
00068   ut_strlcpy(trx->detailed_error, msg, sizeof(trx->detailed_error));
00069 }
00070 
00071 /*************************************************************/
00074 UNIV_INTERN
00075 void
00076 trx_set_detailed_error_from_file(
00077 /*=============================*/
00078   trx_t*  trx,  
00079   FILE* file) 
00080 {
00081   os_file_read_string(file, trx->detailed_error,
00082           sizeof(trx->detailed_error));
00083 }
00084 
00085 /****************************************************************/
00088 UNIV_INTERN
00089 trx_t*
00090 trx_create(
00091 /*=======*/
00092   sess_t* sess) 
00093 {
00094   trx_t*  trx;
00095 
00096   ut_ad(mutex_own(&kernel_mutex));
00097   ut_ad(sess);
00098 
00099         trx = static_cast<trx_t *>(mem_alloc(sizeof(trx_t)));
00100 
00101   trx->magic_n = TRX_MAGIC_N;
00102 
00103   trx->op_info = "";
00104 
00105   trx->is_purge = 0;
00106   trx->is_recovered = 0;
00107   trx->conc_state = TRX_NOT_STARTED;
00108   trx->start_time = time(NULL);
00109 
00110   trx->isolation_level = TRX_ISO_REPEATABLE_READ;
00111 
00112   trx->id = 0;
00113   trx->no = IB_ULONGLONG_MAX;
00114 
00115   trx->support_xa = TRUE;
00116 
00117   trx->check_foreigns = TRUE;
00118   trx->check_unique_secondary = TRUE;
00119 
00120   trx->flush_log_later = FALSE;
00121   trx->must_flush_log_later = FALSE;
00122 
00123   trx->dict_operation = TRX_DICT_OP_NONE;
00124   trx->table_id = 0;
00125 
00126   trx->mysql_thd = NULL;
00127   trx->duplicates = 0;
00128 
00129   trx->mysql_n_tables_locked = 0;
00130 
00131   trx->mysql_log_file_name = NULL;
00132   trx->mysql_log_offset = 0;
00133 
00134   mutex_create(trx_undo_mutex_key, &trx->undo_mutex, SYNC_TRX_UNDO);
00135 
00136   trx->rseg = NULL;
00137 
00138   trx->undo_no = 0;
00139   trx->last_sql_stat_start.least_undo_no = 0;
00140   trx->insert_undo = NULL;
00141   trx->update_undo = NULL;
00142   trx->undo_no_arr = NULL;
00143 
00144   trx->error_state = DB_SUCCESS;
00145   trx->error_key_num = 0;
00146   trx->detailed_error[0] = '\0';
00147 
00148   trx->sess = sess;
00149   trx->que_state = TRX_QUE_RUNNING;
00150   trx->n_active_thrs = 0;
00151 
00152   trx->handling_signals = FALSE;
00153 
00154   UT_LIST_INIT(trx->signals);
00155   UT_LIST_INIT(trx->reply_signals);
00156 
00157   trx->graph = NULL;
00158 
00159   trx->wait_lock = NULL;
00160   trx->was_chosen_as_deadlock_victim = FALSE;
00161   UT_LIST_INIT(trx->wait_thrs);
00162 
00163   trx->lock_heap = mem_heap_create_in_buffer(256);
00164   UT_LIST_INIT(trx->trx_locks);
00165 
00166   UT_LIST_INIT(trx->trx_savepoints);
00167 
00168   trx->dict_operation_lock_mode = 0;
00169   trx->has_search_latch = FALSE;
00170   trx->search_latch_timeout = BTR_SEA_TIMEOUT;
00171 
00172   trx->declared_to_be_inside_innodb = FALSE;
00173   trx->n_tickets_to_enter_innodb = 0;
00174 
00175   trx->global_read_view_heap = mem_heap_create(256);
00176   trx->global_read_view = NULL;
00177   trx->read_view = NULL;
00178 
00179   /* Set X/Open XA transaction identification to NULL */
00180   memset(&trx->xid, 0, sizeof(trx->xid));
00181   trx->xid.formatID = -1;
00182 
00183   trx->n_autoinc_rows = 0;
00184 
00185   /* Remember to free the vector explicitly. */
00186   trx->autoinc_locks = ib_vector_create(
00187     mem_heap_create(sizeof(ib_vector_t) + sizeof(void*) * 4), 4);
00188 
00189   trx->log_commit_id= FALSE;
00190 
00191   return(trx);
00192 }
00193 
00194 /********************************************************************/
00197 UNIV_INTERN
00198 trx_t*
00199 trx_allocate_for_mysql(void)
00200 /*========================*/
00201 {
00202   trx_t*  trx;
00203 
00204   mutex_enter(&kernel_mutex);
00205 
00206   trx = trx_create(trx_dummy_sess);
00207 
00208   trx_n_mysql_transactions++;
00209 
00210   UT_LIST_ADD_FIRST(mysql_trx_list, trx_sys->mysql_trx_list, trx);
00211 
00212   mutex_exit(&kernel_mutex);
00213 
00214   trx->mysql_thread_id = os_thread_get_curr_id();
00215 
00216   trx->mysql_process_no = os_proc_get_number();
00217 
00218   return(trx);
00219 }
00220 
00221 /********************************************************************/
00224 UNIV_INTERN
00225 trx_t*
00226 trx_allocate_for_background(void)
00227 /*=============================*/
00228 {
00229   trx_t*  trx;
00230 
00231   mutex_enter(&kernel_mutex);
00232 
00233   trx = trx_create(trx_dummy_sess);
00234 
00235   mutex_exit(&kernel_mutex);
00236 
00237   return(trx);
00238 }
00239 
00240 /********************************************************************/
00242 UNIV_INTERN
00243 void
00244 trx_search_latch_release_if_reserved(
00245 /*=================================*/
00246   trx_t*     trx) 
00247 {
00248   if (trx->has_search_latch) {
00249     rw_lock_s_unlock(&btr_search_latch);
00250 
00251     trx->has_search_latch = FALSE;
00252   }
00253 }
00254 
00255 /********************************************************************/
00257 UNIV_INTERN
00258 void
00259 trx_free(
00260 /*=====*/
00261   trx_t*  trx)  
00262 {
00263   ut_ad(mutex_own(&kernel_mutex));
00264 
00265   if (trx->declared_to_be_inside_innodb) {
00266     ut_print_timestamp(stderr);
00267     fputs("  InnoDB: Error: Freeing a trx which is declared"
00268           " to be processing\n"
00269           "InnoDB: inside InnoDB.\n", stderr);
00270     trx_print(stderr, trx, 600);
00271     putc('\n', stderr);
00272 
00273     /* This is an error but not a fatal error. We must keep
00274     the counters like srv_conc_n_threads accurate. */
00275     srv_conc_force_exit_innodb(trx);
00276   }
00277 
00278   if (trx->mysql_n_tables_locked != 0) {
00279 
00280     ut_print_timestamp(stderr);
00281     fprintf(stderr,
00282       "  InnoDB: Error: MySQL is freeing a thd\n"
00283       "InnoDB: and trx->mysql_n_tables_locked is %lu.\n",
00284       (ulong)trx->mysql_n_tables_locked);
00285 
00286     trx_print(stderr, trx, 600);
00287 
00288     ut_print_buf(stderr, trx, sizeof(trx_t));
00289     putc('\n', stderr);
00290   }
00291 
00292   ut_a(trx->magic_n == TRX_MAGIC_N);
00293 
00294   trx->magic_n = 11112222;
00295 
00296   ut_a(trx->conc_state == TRX_NOT_STARTED);
00297 
00298   mutex_free(&(trx->undo_mutex));
00299 
00300   ut_a(trx->insert_undo == NULL);
00301   ut_a(trx->update_undo == NULL);
00302 
00303   if (trx->undo_no_arr) {
00304     trx_undo_arr_free(trx->undo_no_arr);
00305   }
00306 
00307   ut_a(UT_LIST_GET_LEN(trx->signals) == 0);
00308   ut_a(UT_LIST_GET_LEN(trx->reply_signals) == 0);
00309 
00310   ut_a(trx->wait_lock == NULL);
00311   ut_a(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
00312 
00313   ut_a(!trx->has_search_latch);
00314 
00315   ut_a(trx->dict_operation_lock_mode == 0);
00316 
00317   if (trx->lock_heap) {
00318     mem_heap_free(trx->lock_heap);
00319   }
00320 
00321   ut_a(UT_LIST_GET_LEN(trx->trx_locks) == 0);
00322 
00323   if (trx->global_read_view_heap) {
00324     mem_heap_free(trx->global_read_view_heap);
00325   }
00326 
00327   trx->global_read_view = NULL;
00328 
00329   ut_a(trx->read_view == NULL);
00330 
00331   ut_a(ib_vector_is_empty(trx->autoinc_locks));
00332   /* We allocated a dedicated heap for the vector. */
00333   ib_vector_free(trx->autoinc_locks);
00334 
00335   mem_free(trx);
00336 }
00337 
00338 /********************************************************************/
00340 UNIV_INTERN
00341 void
00342 trx_free_for_mysql(
00343 /*===============*/
00344   trx_t*  trx)  
00345 {
00346   mutex_enter(&kernel_mutex);
00347 
00348   UT_LIST_REMOVE(mysql_trx_list, trx_sys->mysql_trx_list, trx);
00349 
00350   trx_free(trx);
00351 
00352   ut_a(trx_n_mysql_transactions > 0);
00353 
00354   trx_n_mysql_transactions--;
00355 
00356   mutex_exit(&kernel_mutex);
00357 }
00358 
00359 /********************************************************************/
00361 UNIV_INTERN
00362 void
00363 trx_free_for_background(
00364 /*====================*/
00365   trx_t*  trx)  
00366 {
00367   mutex_enter(&kernel_mutex);
00368 
00369   trx_free(trx);
00370 
00371   mutex_exit(&kernel_mutex);
00372 }
00373 
00374 /****************************************************************/
00379 static
00380 void
00381 trx_list_insert_ordered(
00382 /*====================*/
00383   trx_t*  trx)  
00384 {
00385   trx_t*  trx2;
00386 
00387   ut_ad(mutex_own(&kernel_mutex));
00388 
00389   trx2 = UT_LIST_GET_FIRST(trx_sys->trx_list);
00390 
00391   while (trx2 != NULL) {
00392     if (trx->id >= trx2->id) {
00393 
00394       ut_ad(trx->id > trx2->id);
00395       break;
00396     }
00397     trx2 = UT_LIST_GET_NEXT(trx_list, trx2);
00398   }
00399 
00400   if (trx2 != NULL) {
00401     trx2 = UT_LIST_GET_PREV(trx_list, trx2);
00402 
00403     if (trx2 == NULL) {
00404       UT_LIST_ADD_FIRST(trx_list, trx_sys->trx_list, trx);
00405     } else {
00406       UT_LIST_INSERT_AFTER(trx_list, trx_sys->trx_list,
00407                trx2, trx);
00408     }
00409   } else {
00410     UT_LIST_ADD_LAST(trx_list, trx_sys->trx_list, trx);
00411   }
00412 }
00413 
00414 /****************************************************************/
00420 UNIV_INTERN
00421 void
00422 trx_lists_init_at_db_start(void)
00423 /*============================*/
00424 {
00425   trx_rseg_t* rseg;
00426   trx_undo_t* undo;
00427   trx_t*    trx;
00428 
00429   ut_ad(mutex_own(&kernel_mutex));
00430   UT_LIST_INIT(trx_sys->trx_list);
00431 
00432   /* Look from the rollback segments if there exist undo logs for
00433   transactions */
00434 
00435   rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
00436 
00437   while (rseg != NULL) {
00438     undo = UT_LIST_GET_FIRST(rseg->insert_undo_list);
00439 
00440     while (undo != NULL) {
00441 
00442       trx = trx_create(trx_dummy_sess);
00443 
00444       trx->is_recovered = TRUE;
00445       trx->id = undo->trx_id;
00446       trx->xid = undo->xid;
00447       trx->insert_undo = undo;
00448       trx->rseg = rseg;
00449 
00450       if (undo->state != TRX_UNDO_ACTIVE) {
00451 
00452         /* Prepared transactions are left in
00453         the prepared state waiting for a
00454         commit or abort decision from MySQL */
00455 
00456         if (undo->state == TRX_UNDO_PREPARED) {
00457 
00458           fprintf(stderr,
00459             "InnoDB: Transaction "
00460             TRX_ID_FMT
00461             " was in the"
00462             " XA prepared state.\n",
00463             trx->id);
00464 
00465           if (srv_force_recovery == 0) {
00466 
00467             trx->conc_state = TRX_PREPARED;
00468           } else {
00469             fprintf(stderr,
00470               "InnoDB: Since"
00471               " innodb_force_recovery"
00472               " > 0, we will"
00473               " rollback it"
00474               " anyway.\n");
00475 
00476             trx->conc_state = TRX_ACTIVE;
00477           }
00478         } else {
00479           trx->conc_state
00480             = TRX_COMMITTED_IN_MEMORY;
00481         }
00482 
00483         /* We give a dummy value for the trx no;
00484         this should have no relevance since purge
00485         is not interested in committed transaction
00486         numbers, unless they are in the history
00487         list, in which case it looks the number
00488         from the disk based undo log structure */
00489 
00490         trx->no = trx->id;
00491       } else {
00492         trx->conc_state = TRX_ACTIVE;
00493 
00494         /* A running transaction always has the number
00495         field inited to IB_ULONGLONG_MAX */
00496 
00497         trx->no = IB_ULONGLONG_MAX;
00498       }
00499 
00500       if (undo->dict_operation) {
00501         trx_set_dict_operation(
00502           trx, TRX_DICT_OP_TABLE);
00503         trx->table_id = undo->table_id;
00504       }
00505 
00506       if (!undo->empty) {
00507         trx->undo_no = undo->top_undo_no + 1;
00508       }
00509 
00510       trx_list_insert_ordered(trx);
00511 
00512       undo = UT_LIST_GET_NEXT(undo_list, undo);
00513     }
00514 
00515     undo = UT_LIST_GET_FIRST(rseg->update_undo_list);
00516 
00517     while (undo != NULL) {
00518       trx = trx_get_on_id(undo->trx_id);
00519 
00520       if (NULL == trx) {
00521         trx = trx_create(trx_dummy_sess);
00522 
00523         trx->is_recovered = TRUE;
00524         trx->id = undo->trx_id;
00525         trx->xid = undo->xid;
00526 
00527         if (undo->state != TRX_UNDO_ACTIVE) {
00528 
00529           /* Prepared transactions are left in
00530           the prepared state waiting for a
00531           commit or abort decision from MySQL */
00532 
00533           if (undo->state == TRX_UNDO_PREPARED) {
00534             fprintf(stderr,
00535               "InnoDB: Transaction "
00536               TRX_ID_FMT " was in the"
00537               " XA prepared state.\n",
00538               trx->id);
00539 
00540             if (srv_force_recovery == 0) {
00541 
00542               trx->conc_state
00543                 = TRX_PREPARED;
00544             } else {
00545               fprintf(stderr,
00546                 "InnoDB: Since"
00547                 " innodb_force_recovery"
00548                 " > 0, we will"
00549                 " rollback it"
00550                 " anyway.\n");
00551 
00552               trx->conc_state
00553                 = TRX_ACTIVE;
00554             }
00555           } else {
00556             trx->conc_state
00557               = TRX_COMMITTED_IN_MEMORY;
00558           }
00559 
00560           /* We give a dummy value for the trx
00561           number */
00562 
00563           trx->no = trx->id;
00564         } else {
00565           trx->conc_state = TRX_ACTIVE;
00566 
00567           /* A running transaction always has
00568           the number field inited to
00569           IB_ULONGLONG_MAX */
00570 
00571           trx->no = IB_ULONGLONG_MAX;
00572         }
00573 
00574         trx->rseg = rseg;
00575         trx_list_insert_ordered(trx);
00576 
00577         if (undo->dict_operation) {
00578           trx_set_dict_operation(
00579             trx, TRX_DICT_OP_TABLE);
00580           trx->table_id = undo->table_id;
00581         }
00582       }
00583 
00584       trx->update_undo = undo;
00585 
00586       if ((!undo->empty)
00587           && undo->top_undo_no >= trx->undo_no) {
00588 
00589         trx->undo_no = undo->top_undo_no + 1;
00590       }
00591 
00592       undo = UT_LIST_GET_NEXT(undo_list, undo);
00593     }
00594 
00595     rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
00596   }
00597 }
00598 
00599 /******************************************************************/
00603 UNIV_INLINE
00604 ulint
00605 trx_assign_rseg(void)
00606 /*=================*/
00607 {
00608   trx_rseg_t* rseg  = trx_sys->latest_rseg;
00609 
00610   ut_ad(mutex_own(&kernel_mutex));
00611 loop:
00612   /* Get next rseg in a round-robin fashion */
00613 
00614   rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
00615 
00616   if (rseg == NULL) {
00617     rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
00618   }
00619 
00620   /* If it is the SYSTEM rollback segment, and there exist others, skip
00621   it */
00622 
00623   if ((rseg->id == TRX_SYS_SYSTEM_RSEG_ID)
00624       && (UT_LIST_GET_LEN(trx_sys->rseg_list) > 1)) {
00625     goto loop;
00626   }
00627 
00628   trx_sys->latest_rseg = rseg;
00629 
00630   return(rseg->id);
00631 }
00632 
00633 /****************************************************************/
00636 UNIV_INTERN
00637 ibool
00638 trx_start_low(
00639 /*==========*/
00640   trx_t*  trx,  
00641   ulint rseg_id)
00644 {
00645   trx_rseg_t* rseg;
00646 
00647   ut_ad(mutex_own(&kernel_mutex));
00648   ut_ad(trx->rseg == NULL);
00649 
00650   if (trx->is_purge) {
00651     trx->id = 0;
00652     trx->conc_state = TRX_ACTIVE;
00653     trx->start_time = time(NULL);
00654 
00655     return(TRUE);
00656   }
00657 
00658   ut_ad(trx->conc_state != TRX_ACTIVE);
00659 
00660   if (rseg_id == ULINT_UNDEFINED) {
00661 
00662     rseg_id = trx_assign_rseg();
00663   }
00664 
00665   rseg = trx_sys_get_nth_rseg(trx_sys, rseg_id);
00666 
00667   trx->id = trx_sys_get_new_trx_id();
00668 
00669   /* The initial value for trx->no: IB_ULONGLONG_MAX is used in
00670   read_view_open_now: */
00671 
00672   trx->no = IB_ULONGLONG_MAX;
00673 
00674   trx->rseg = rseg;
00675 
00676   trx->conc_state = TRX_ACTIVE;
00677   trx->start_time = time(NULL);
00678 
00679   UT_LIST_ADD_FIRST(trx_list, trx_sys->trx_list, trx);
00680 
00681   return(TRUE);
00682 }
00683 
00684 /****************************************************************/
00687 UNIV_INTERN
00688 ibool
00689 trx_start(
00690 /*======*/
00691   trx_t*  trx,  
00692   ulint rseg_id)
00695 {
00696   ibool ret;
00697 
00698   /* Update the info whether we should skip XA steps that eat CPU time
00699   For the duration of the transaction trx->support_xa is not reread
00700   from thd so any changes in the value take effect in the next
00701   transaction. This is to avoid a scenario where some undo
00702   generated by a transaction, has XA stuff, and other undo,
00703   generated by the same transaction, doesn't. */
00704   trx->support_xa = thd_supports_xa(trx->mysql_thd);
00705 
00706   mutex_enter(&kernel_mutex);
00707 
00708   ret = trx_start_low(trx, rseg_id);
00709 
00710   mutex_exit(&kernel_mutex);
00711 
00712   return(ret);
00713 }
00714 
00715 /****************************************************************/
00717 UNIV_INTERN
00718 void
00719 trx_commit_off_kernel(
00720 /*==================*/
00721   trx_t*  trx)  
00722 {
00723   page_t*   update_hdr_page;
00724   ib_uint64_t lsn   = 0;
00725   trx_rseg_t* rseg;
00726   trx_undo_t* undo;
00727   mtr_t   mtr;
00728 
00729   ut_ad(mutex_own(&kernel_mutex));
00730 
00731   trx->must_flush_log_later = FALSE;
00732 
00733   rseg = trx->rseg;
00734 
00735   if (trx->insert_undo != NULL || trx->update_undo != NULL) {
00736 
00737     mutex_exit(&kernel_mutex);
00738 
00739     mtr_start(&mtr);
00740 
00741     /* Change the undo log segment states from TRX_UNDO_ACTIVE
00742     to some other state: these modifications to the file data
00743     structure define the transaction as committed in the file
00744     based world, at the serialization point of the log sequence
00745     number lsn obtained below. */
00746 
00747     mutex_enter(&(rseg->mutex));
00748 
00749     if (trx->insert_undo != NULL) {
00750       trx_undo_set_state_at_finish(trx->insert_undo, &mtr);
00751     }
00752 
00753     undo = trx->update_undo;
00754 
00755     if (undo) {
00756       mutex_enter(&kernel_mutex);
00757       trx->no = trx_sys_get_new_trx_no();
00758       mutex_exit(&kernel_mutex);
00759 
00760       /* It is not necessary to obtain trx->undo_mutex here
00761       because only a single OS thread is allowed to do the
00762       transaction commit for this transaction. */
00763 
00764       update_hdr_page = trx_undo_set_state_at_finish(
00765         undo, &mtr);
00766 
00767       /* We have to do the cleanup for the update log while
00768       holding the rseg mutex because update log headers
00769       have to be put to the history list in the order of
00770       the trx number. */
00771 
00772       trx_undo_update_cleanup(trx, update_hdr_page, &mtr);
00773     }
00774 
00775     mutex_exit(&(rseg->mutex));
00776 
00777     /* Update the highest commit id currently in the system */
00778     if (trx_log_commit_id(trx))
00779     {
00780       mutex_enter(&commit_id_mutex);
00781       trx_sys_flush_commit_id(trx_sys_commit_id,
00782             TRX_SYS_DRIZZLE_LOG_INFO,
00783             &mtr);
00784       mutex_exit(&commit_id_mutex);
00785     }
00786 
00787     /* The following call commits the mini-transaction, making the
00788     whole transaction committed in the file-based world, at this
00789     log sequence number. The transaction becomes 'durable' when
00790     we write the log to disk, but in the logical sense the commit
00791     in the file-based data structures (undo logs etc.) happens
00792     here.
00793 
00794     NOTE that transaction numbers, which are assigned only to
00795     transactions with an update undo log, do not necessarily come
00796     in exactly the same order as commit lsn's, if the transactions
00797     have different rollback segments. To get exactly the same
00798     order we should hold the kernel mutex up to this point,
00799     adding to the contention of the kernel mutex. However, if
00800     a transaction T2 is able to see modifications made by
00801     a transaction T1, T2 will always get a bigger transaction
00802     number and a bigger commit lsn than T1. */
00803 
00804     /*--------------*/
00805     mtr_commit(&mtr);
00806     /*--------------*/
00807     lsn = mtr.end_lsn;
00808 
00809     mutex_enter(&kernel_mutex);
00810   }
00811 
00812   ut_ad(trx->conc_state == TRX_ACTIVE
00813         || trx->conc_state == TRX_PREPARED);
00814   ut_ad(mutex_own(&kernel_mutex));
00815 
00816   /* The following assignment makes the transaction committed in memory
00817   and makes its changes to data visible to other transactions.
00818   NOTE that there is a small discrepancy from the strict formal
00819   visibility rules here: a human user of the database can see
00820   modifications made by another transaction T even before the necessary
00821   log segment has been flushed to the disk. If the database happens to
00822   crash before the flush, the user has seen modifications from T which
00823   will never be a committed transaction. However, any transaction T2
00824   which sees the modifications of the committing transaction T, and
00825   which also itself makes modifications to the database, will get an lsn
00826   larger than the committing transaction T. In the case where the log
00827   flush fails, and T never gets committed, also T2 will never get
00828   committed. */
00829 
00830   /*--------------------------------------*/
00831   trx->conc_state = TRX_COMMITTED_IN_MEMORY;
00832   /*--------------------------------------*/
00833 
00834   /* If we release kernel_mutex below and we are still doing
00835   recovery i.e.: back ground rollback thread is still active
00836   then there is a chance that the rollback thread may see
00837   this trx as COMMITTED_IN_MEMORY and goes adhead to clean it
00838   up calling trx_cleanup_at_db_startup(). This can happen
00839   in the case we are committing a trx here that is left in
00840   PREPARED state during the crash. Note that commit of the
00841   rollback of a PREPARED trx happens in the recovery thread
00842   while the rollback of other transactions happen in the
00843   background thread. To avoid this race we unconditionally
00844   unset the is_recovered flag from the trx. */
00845 
00846   trx->is_recovered = FALSE;
00847 
00848   lock_release_off_kernel(trx);
00849 
00850   if (trx->global_read_view) {
00851     read_view_close(trx->global_read_view);
00852     mem_heap_empty(trx->global_read_view_heap);
00853     trx->global_read_view = NULL;
00854   }
00855 
00856   trx->read_view = NULL;
00857 
00858   if (lsn) {
00859 
00860     mutex_exit(&kernel_mutex);
00861 
00862     if (trx->insert_undo != NULL) {
00863 
00864       trx_undo_insert_cleanup(trx);
00865     }
00866 
00867     /* NOTE that we could possibly make a group commit more
00868     efficient here: call os_thread_yield here to allow also other
00869     trxs to come to commit! */
00870 
00871     /*-------------------------------------*/
00872 
00873     /* Depending on the my.cnf options, we may now write the log
00874     buffer to the log files, making the transaction durable if
00875     the OS does not crash. We may also flush the log files to
00876     disk, making the transaction durable also at an OS crash or a
00877     power outage.
00878 
00879     The idea in InnoDB's group commit is that a group of
00880     transactions gather behind a trx doing a physical disk write
00881     to log files, and when that physical write has been completed,
00882     one of those transactions does a write which commits the whole
00883     group. Note that this group commit will only bring benefit if
00884     there are > 2 users in the database. Then at least 2 users can
00885     gather behind one doing the physical log write to disk.
00886 
00887     If we are calling trx_commit() under prepare_commit_mutex, we
00888     will delay possible log write and flush to a separate function
00889     trx_commit_complete_for_mysql(), which is only called when the
00890     thread has released the mutex. This is to make the
00891     group commit algorithm to work. Otherwise, the prepare_commit
00892     mutex would serialize all commits and prevent a group of
00893     transactions from gathering. */
00894 
00895     if (trx->flush_log_later) {
00896       /* Do nothing yet */
00897       trx->must_flush_log_later = TRUE;
00898     } else if (srv_flush_log_at_trx_commit == 0) {
00899       /* Do nothing */
00900     } else if (srv_flush_log_at_trx_commit == 1) {
00901       if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
00902         /* Write the log but do not flush it to disk */
00903 
00904         log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
00905             FALSE);
00906       } else {
00907         /* Write the log to the log files AND flush
00908         them to disk */
00909 
00910         log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
00911       }
00912     } else if (srv_flush_log_at_trx_commit == 2) {
00913 
00914       /* Write the log but do not flush it to disk */
00915 
00916       log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
00917     } else {
00918       ut_error;
00919     }
00920 
00921     trx->commit_lsn = lsn;
00922 
00923     /*-------------------------------------*/
00924 
00925     mutex_enter(&kernel_mutex);
00926   }
00927 
00928   /* Free all savepoints */
00929   trx_roll_free_all_savepoints(trx);
00930 
00931   trx->conc_state = TRX_NOT_STARTED;
00932   trx->rseg = NULL;
00933   trx->undo_no = 0;
00934   trx->last_sql_stat_start.least_undo_no = 0;
00935 
00936   ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
00937   ut_ad(UT_LIST_GET_LEN(trx->trx_locks) == 0);
00938 
00939   UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx);
00940 }
00941 
00942 /****************************************************************/
00946 UNIV_INTERN
00947 void
00948 trx_cleanup_at_db_startup(
00949 /*======================*/
00950   trx_t*  trx)  
00951 {
00952   if (trx->insert_undo != NULL) {
00953 
00954     trx_undo_insert_cleanup(trx);
00955   }
00956 
00957   trx->conc_state = TRX_NOT_STARTED;
00958   trx->rseg = NULL;
00959   trx->undo_no = 0;
00960   trx->last_sql_stat_start.least_undo_no = 0;
00961 
00962   UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx);
00963 }
00964 
00965 /********************************************************************/
00970 UNIV_INTERN
00971 read_view_t*
00972 trx_assign_read_view(
00973 /*=================*/
00974   trx_t*  trx)  
00975 {
00976   ut_ad(trx->conc_state == TRX_ACTIVE);
00977 
00978   if (trx->read_view) {
00979     return(trx->read_view);
00980   }
00981 
00982   mutex_enter(&kernel_mutex);
00983 
00984   if (!trx->read_view) {
00985     trx->read_view = read_view_open_now(
00986       trx->id, trx->global_read_view_heap);
00987     trx->global_read_view = trx->read_view;
00988   }
00989 
00990   mutex_exit(&kernel_mutex);
00991 
00992   return(trx->read_view);
00993 }
00994 
00995 /****************************************************************/
00997 static
00998 void
00999 trx_handle_commit_sig_off_kernel(
01000 /*=============================*/
01001   trx_t*    trx,    
01002   que_thr_t** next_thr) 
01007 {
01008   trx_sig_t*  sig;
01009   trx_sig_t*  next_sig;
01010 
01011   ut_ad(mutex_own(&kernel_mutex));
01012 
01013   trx->que_state = TRX_QUE_COMMITTING;
01014 
01015   trx_commit_off_kernel(trx);
01016 
01017   ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
01018 
01019   /* Remove all TRX_SIG_COMMIT signals from the signal queue and send
01020   reply messages to them */
01021 
01022   sig = UT_LIST_GET_FIRST(trx->signals);
01023 
01024   while (sig != NULL) {
01025     next_sig = UT_LIST_GET_NEXT(signals, sig);
01026 
01027     if (sig->type == TRX_SIG_COMMIT) {
01028 
01029       trx_sig_reply(sig, next_thr);
01030       trx_sig_remove(trx, sig);
01031     }
01032 
01033     sig = next_sig;
01034   }
01035 
01036   trx->que_state = TRX_QUE_RUNNING;
01037 }
01038 
01039 /***********************************************************/
01043 UNIV_INTERN
01044 void
01045 trx_end_lock_wait(
01046 /*==============*/
01047   trx_t*  trx)  
01048 {
01049   que_thr_t*  thr;
01050 
01051   ut_ad(mutex_own(&kernel_mutex));
01052   ut_ad(trx->que_state == TRX_QUE_LOCK_WAIT);
01053 
01054   thr = UT_LIST_GET_FIRST(trx->wait_thrs);
01055 
01056   while (thr != NULL) {
01057     que_thr_end_wait_no_next_thr(thr);
01058 
01059     UT_LIST_REMOVE(trx_thrs, trx->wait_thrs, thr);
01060 
01061     thr = UT_LIST_GET_FIRST(trx->wait_thrs);
01062   }
01063 
01064   trx->que_state = TRX_QUE_RUNNING;
01065 }
01066 
01067 /***********************************************************/
01070 static
01071 void
01072 trx_lock_wait_to_suspended(
01073 /*=======================*/
01074   trx_t*  trx)  
01075 {
01076   que_thr_t*  thr;
01077 
01078   ut_ad(mutex_own(&kernel_mutex));
01079   ut_ad(trx->que_state == TRX_QUE_LOCK_WAIT);
01080 
01081   thr = UT_LIST_GET_FIRST(trx->wait_thrs);
01082 
01083   while (thr != NULL) {
01084     thr->state = QUE_THR_SUSPENDED;
01085 
01086     UT_LIST_REMOVE(trx_thrs, trx->wait_thrs, thr);
01087 
01088     thr = UT_LIST_GET_FIRST(trx->wait_thrs);
01089   }
01090 
01091   trx->que_state = TRX_QUE_RUNNING;
01092 }
01093 
01094 /***********************************************************/
01097 static
01098 void
01099 trx_sig_reply_wait_to_suspended(
01100 /*============================*/
01101   trx_t*  trx)  
01102 {
01103   trx_sig_t*  sig;
01104   que_thr_t*  thr;
01105 
01106   ut_ad(mutex_own(&kernel_mutex));
01107 
01108   sig = UT_LIST_GET_FIRST(trx->reply_signals);
01109 
01110   while (sig != NULL) {
01111     thr = sig->receiver;
01112 
01113     ut_ad(thr->state == QUE_THR_SIG_REPLY_WAIT);
01114 
01115     thr->state = QUE_THR_SUSPENDED;
01116 
01117     sig->receiver = NULL;
01118 
01119     UT_LIST_REMOVE(reply_signals, trx->reply_signals, sig);
01120 
01121     sig = UT_LIST_GET_FIRST(trx->reply_signals);
01122   }
01123 }
01124 
01125 /*****************************************************************/
01129 static
01130 ibool
01131 trx_sig_is_compatible(
01132 /*==================*/
01133   trx_t*  trx,  
01134   ulint type, 
01135   ulint sender) 
01136 {
01137   trx_sig_t*  sig;
01138 
01139   ut_ad(mutex_own(&kernel_mutex));
01140 
01141   if (UT_LIST_GET_LEN(trx->signals) == 0) {
01142 
01143     return(TRUE);
01144   }
01145 
01146   if (sender == TRX_SIG_SELF) {
01147     if (type == TRX_SIG_ERROR_OCCURRED) {
01148 
01149       return(TRUE);
01150 
01151     } else if (type == TRX_SIG_BREAK_EXECUTION) {
01152 
01153       return(TRUE);
01154     } else {
01155       return(FALSE);
01156     }
01157   }
01158 
01159   ut_ad(sender == TRX_SIG_OTHER_SESS);
01160 
01161   sig = UT_LIST_GET_FIRST(trx->signals);
01162 
01163   if (type == TRX_SIG_COMMIT) {
01164     while (sig != NULL) {
01165 
01166       if (sig->type == TRX_SIG_TOTAL_ROLLBACK) {
01167 
01168         return(FALSE);
01169       }
01170 
01171       sig = UT_LIST_GET_NEXT(signals, sig);
01172     }
01173 
01174     return(TRUE);
01175 
01176   } else if (type == TRX_SIG_TOTAL_ROLLBACK) {
01177     while (sig != NULL) {
01178 
01179       if (sig->type == TRX_SIG_COMMIT) {
01180 
01181         return(FALSE);
01182       }
01183 
01184       sig = UT_LIST_GET_NEXT(signals, sig);
01185     }
01186 
01187     return(TRUE);
01188 
01189   } else if (type == TRX_SIG_BREAK_EXECUTION) {
01190 
01191     return(TRUE);
01192   } else {
01193     ut_error;
01194 
01195     return(FALSE);
01196   }
01197 }
01198 
01199 /****************************************************************/
01201 UNIV_INTERN
01202 void
01203 trx_sig_send(
01204 /*=========*/
01205   trx_t*    trx,    
01206   ulint   type,   
01207   ulint   sender,   
01209   que_thr_t*  receiver_thr, 
01212   trx_savept_t* savept,   
01214   que_thr_t** next_thr) 
01220 {
01221   trx_sig_t*  sig;
01222   trx_t*    receiver_trx;
01223 
01224   ut_ad(trx);
01225   ut_ad(mutex_own(&kernel_mutex));
01226 
01227   if (!trx_sig_is_compatible(trx, type, sender)) {
01228     /* The signal is not compatible with the other signals in
01229     the queue: die */
01230 
01231     ut_error;
01232   }
01233 
01234   /* Queue the signal object */
01235 
01236   if (UT_LIST_GET_LEN(trx->signals) == 0) {
01237 
01238     /* The signal list is empty: the 'sig' slot must be unused
01239     (we improve performance a bit by avoiding mem_alloc) */
01240     sig = &(trx->sig);
01241   } else {
01242     /* It might be that the 'sig' slot is unused also in this
01243     case, but we choose the easy way of using mem_alloc */
01244 
01245           sig = static_cast<trx_sig_t *>(mem_alloc(sizeof(trx_sig_t)));
01246   }
01247 
01248   UT_LIST_ADD_LAST(signals, trx->signals, sig);
01249 
01250   sig->type = type;
01251   sig->sender = sender;
01252   sig->receiver = receiver_thr;
01253 
01254   if (savept) {
01255     sig->savept = *savept;
01256   }
01257 
01258   if (receiver_thr) {
01259     receiver_trx = thr_get_trx(receiver_thr);
01260 
01261     UT_LIST_ADD_LAST(reply_signals, receiver_trx->reply_signals,
01262          sig);
01263   }
01264 
01265   if (trx->sess->state == SESS_ERROR) {
01266 
01267     trx_sig_reply_wait_to_suspended(trx);
01268   }
01269 
01270   if ((sender != TRX_SIG_SELF) || (type == TRX_SIG_BREAK_EXECUTION)) {
01271     ut_error;
01272   }
01273 
01274   /* If there were no other signals ahead in the queue, try to start
01275   handling of the signal */
01276 
01277   if (UT_LIST_GET_FIRST(trx->signals) == sig) {
01278 
01279     trx_sig_start_handle(trx, next_thr);
01280   }
01281 }
01282 
01283 /****************************************************************/
01288 UNIV_INTERN
01289 void
01290 trx_end_signal_handling(
01291 /*====================*/
01292   trx_t*  trx)  
01293 {
01294   ut_ad(mutex_own(&kernel_mutex));
01295   ut_ad(trx->handling_signals == TRUE);
01296 
01297   trx->handling_signals = FALSE;
01298 
01299   trx->graph = trx->graph_before_signal_handling;
01300 
01301   if (trx->graph && (trx->sess->state == SESS_ERROR)) {
01302 
01303     que_fork_error_handle(trx, trx->graph);
01304   }
01305 }
01306 
01307 /****************************************************************/
01309 UNIV_INTERN
01310 void
01311 trx_sig_start_handle(
01312 /*=================*/
01313   trx_t*    trx,    
01314   que_thr_t** next_thr) 
01320 {
01321   trx_sig_t*  sig;
01322   ulint   type;
01323 loop:
01324   /* We loop in this function body as long as there are queued signals
01325   we can process immediately */
01326 
01327   ut_ad(trx);
01328   ut_ad(mutex_own(&kernel_mutex));
01329 
01330   if (trx->handling_signals && (UT_LIST_GET_LEN(trx->signals) == 0)) {
01331 
01332     trx_end_signal_handling(trx);
01333 
01334     return;
01335   }
01336 
01337   if (trx->conc_state == TRX_NOT_STARTED) {
01338 
01339     trx_start_low(trx, ULINT_UNDEFINED);
01340   }
01341 
01342   /* If the trx is in a lock wait state, moves the waiting query threads
01343   to the suspended state */
01344 
01345   if (trx->que_state == TRX_QUE_LOCK_WAIT) {
01346 
01347     trx_lock_wait_to_suspended(trx);
01348   }
01349 
01350   /* If the session is in the error state and this trx has threads
01351   waiting for reply from signals, moves these threads to the suspended
01352   state, canceling wait reservations; note that if the transaction has
01353   sent a commit or rollback signal to itself, and its session is not in
01354   the error state, then nothing is done here. */
01355 
01356   if (trx->sess->state == SESS_ERROR) {
01357     trx_sig_reply_wait_to_suspended(trx);
01358   }
01359 
01360   /* If there are no running query threads, we can start processing of a
01361   signal, otherwise we have to wait until all query threads of this
01362   transaction are aware of the arrival of the signal. */
01363 
01364   if (trx->n_active_thrs > 0) {
01365 
01366     return;
01367   }
01368 
01369   if (trx->handling_signals == FALSE) {
01370     trx->graph_before_signal_handling = trx->graph;
01371 
01372     trx->handling_signals = TRUE;
01373   }
01374 
01375   sig = UT_LIST_GET_FIRST(trx->signals);
01376   type = sig->type;
01377 
01378   if (type == TRX_SIG_COMMIT) {
01379 
01380     trx_handle_commit_sig_off_kernel(trx, next_thr);
01381 
01382   } else if ((type == TRX_SIG_TOTAL_ROLLBACK)
01383        || (type == TRX_SIG_ROLLBACK_TO_SAVEPT)) {
01384 
01385     trx_rollback(trx, sig, next_thr);
01386 
01387     /* No further signals can be handled until the rollback
01388     completes, therefore we return */
01389 
01390     return;
01391 
01392   } else if (type == TRX_SIG_ERROR_OCCURRED) {
01393 
01394     trx_rollback(trx, sig, next_thr);
01395 
01396     /* No further signals can be handled until the rollback
01397     completes, therefore we return */
01398 
01399     return;
01400 
01401   } else if (type == TRX_SIG_BREAK_EXECUTION) {
01402 
01403     trx_sig_reply(sig, next_thr);
01404     trx_sig_remove(trx, sig);
01405   } else {
01406     ut_error;
01407   }
01408 
01409   goto loop;
01410 }
01411 
01412 /****************************************************************/
01415 UNIV_INTERN
01416 void
01417 trx_sig_reply(
01418 /*==========*/
01419   trx_sig_t*  sig,    
01420   que_thr_t** next_thr) 
01425 {
01426   trx_t*  receiver_trx;
01427 
01428   ut_ad(sig);
01429   ut_ad(mutex_own(&kernel_mutex));
01430 
01431   if (sig->receiver != NULL) {
01432     ut_ad((sig->receiver)->state == QUE_THR_SIG_REPLY_WAIT);
01433 
01434     receiver_trx = thr_get_trx(sig->receiver);
01435 
01436     UT_LIST_REMOVE(reply_signals, receiver_trx->reply_signals,
01437              sig);
01438     ut_ad(receiver_trx->sess->state != SESS_ERROR);
01439 
01440     que_thr_end_wait(sig->receiver, next_thr);
01441 
01442     sig->receiver = NULL;
01443 
01444   }
01445 }
01446 
01447 /****************************************************************/
01449 UNIV_INTERN
01450 void
01451 trx_sig_remove(
01452 /*===========*/
01453   trx_t*    trx,  
01454   trx_sig_t*  sig)  
01455 {
01456   ut_ad(trx && sig);
01457   ut_ad(mutex_own(&kernel_mutex));
01458 
01459   ut_ad(sig->receiver == NULL);
01460 
01461   UT_LIST_REMOVE(signals, trx->signals, sig);
01462   sig->type = 0;  /* reset the field to catch possible bugs */
01463 
01464   if (sig != &(trx->sig)) {
01465     mem_free(sig);
01466   }
01467 }
01468 
01469 /*********************************************************************/
01472 UNIV_INTERN
01473 commit_node_t*
01474 commit_node_create(
01475 /*===============*/
01476   mem_heap_t* heap) 
01477 {
01478   commit_node_t*  node;
01479 
01480         node = static_cast<commit_node_t *>(mem_heap_alloc(heap, sizeof(commit_node_t)));
01481   node->common.type  = QUE_NODE_COMMIT;
01482   node->state = COMMIT_NODE_SEND;
01483 
01484   return(node);
01485 }
01486 
01487 /***********************************************************/
01490 UNIV_INTERN
01491 que_thr_t*
01492 trx_commit_step(
01493 /*============*/
01494   que_thr_t*  thr)  
01495 {
01496   commit_node_t*  node;
01497   que_thr_t*  next_thr;
01498 
01499         node = static_cast<commit_node_t *>(thr->run_node);
01500 
01501   ut_ad(que_node_get_type(node) == QUE_NODE_COMMIT);
01502 
01503   if (thr->prev_node == que_node_get_parent(node)) {
01504     node->state = COMMIT_NODE_SEND;
01505   }
01506 
01507   if (node->state == COMMIT_NODE_SEND) {
01508     mutex_enter(&kernel_mutex);
01509 
01510     node->state = COMMIT_NODE_WAIT;
01511 
01512     next_thr = NULL;
01513 
01514     thr->state = QUE_THR_SIG_REPLY_WAIT;
01515 
01516     /* Send the commit signal to the transaction */
01517 
01518     trx_sig_send(thr_get_trx(thr), TRX_SIG_COMMIT, TRX_SIG_SELF,
01519            thr, NULL, &next_thr);
01520 
01521     mutex_exit(&kernel_mutex);
01522 
01523     return(next_thr);
01524   }
01525 
01526   ut_ad(node->state == COMMIT_NODE_WAIT);
01527 
01528   node->state = COMMIT_NODE_SEND;
01529 
01530   thr->run_node = que_node_get_parent(node);
01531 
01532   return(thr);
01533 }
01534 
01535 /**********************************************************************/
01538 UNIV_INTERN
01539 ulint
01540 trx_commit_for_mysql(
01541 /*=================*/
01542   trx_t*  trx)  
01543 {
01544   /* Because we do not do the commit by sending an Innobase
01545   sig to the transaction, we must here make sure that trx has been
01546   started. */
01547 
01548   ut_a(trx);
01549 
01550   trx_start_if_not_started(trx);
01551 
01552   trx->op_info = "committing";
01553 
01554   mutex_enter(&kernel_mutex);
01555 
01556   trx_commit_off_kernel(trx);
01557 
01558   mutex_exit(&kernel_mutex);
01559 
01560   trx->op_info = "";
01561 
01562   return(DB_SUCCESS);
01563 }
01564 
01565 /**********************************************************************/
01569 UNIV_INTERN
01570 ulint
01571 trx_commit_complete_for_mysql(
01572 /*==========================*/
01573   trx_t*  trx)  
01574 {
01575   ib_uint64_t lsn = trx->commit_lsn;
01576 
01577   ut_a(trx);
01578 
01579   trx->op_info = "flushing log";
01580 
01581   if (!trx->must_flush_log_later) {
01582     /* Do nothing */
01583   } else if (srv_flush_log_at_trx_commit == 0) {
01584     /* Do nothing */
01585   } else if (srv_flush_log_at_trx_commit == 1) {
01586     if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
01587       /* Write the log but do not flush it to disk */
01588 
01589       log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
01590     } else {
01591       /* Write the log to the log files AND flush them to
01592       disk */
01593 
01594       log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
01595     }
01596   } else if (srv_flush_log_at_trx_commit == 2) {
01597 
01598     /* Write the log but do not flush it to disk */
01599 
01600     log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
01601   } else {
01602     ut_error;
01603   }
01604 
01605   trx->must_flush_log_later = FALSE;
01606 
01607   trx->op_info = "";
01608 
01609   return(0);
01610 }
01611 
01612 /**********************************************************************/
01614 UNIV_INTERN
01615 void
01616 trx_mark_sql_stat_end(
01617 /*==================*/
01618   trx_t*  trx)  
01619 {
01620   ut_a(trx);
01621 
01622   if (trx->conc_state == TRX_NOT_STARTED) {
01623     trx->undo_no = 0;
01624   }
01625 
01626   trx->last_sql_stat_start.least_undo_no = trx->undo_no;
01627 }
01628 
01629 /**********************************************************************/
01632 UNIV_INTERN
01633 void
01634 trx_print(
01635 /*======*/
01636   FILE* f,    
01637   trx_t*  trx,    
01638   ulint max_query_len)  
01640 {
01641   ibool newline;
01642 
01643   fprintf(f, "TRANSACTION " TRX_ID_FMT, trx->id);
01644 
01645   switch (trx->conc_state) {
01646   case TRX_NOT_STARTED:
01647     fputs(", not started", f);
01648     break;
01649   case TRX_ACTIVE:
01650     fprintf(f, ", ACTIVE %lu sec",
01651       (ulong)difftime(time(NULL), trx->start_time));
01652     break;
01653   case TRX_PREPARED:
01654     fprintf(f, ", ACTIVE (PREPARED) %lu sec",
01655       (ulong)difftime(time(NULL), trx->start_time));
01656     break;
01657   case TRX_COMMITTED_IN_MEMORY:
01658     fputs(", COMMITTED IN MEMORY", f);
01659     break;
01660   default:
01661     fprintf(f, " state %lu", (ulong) trx->conc_state);
01662   }
01663 
01664 #ifdef UNIV_LINUX
01665   fprintf(f, ", process no %lu", trx->mysql_process_no);
01666 #endif
01667   fprintf(f, ", OS thread id %lu",
01668     (ulong) os_thread_pf(trx->mysql_thread_id));
01669 
01670   if (*trx->op_info) {
01671     putc(' ', f);
01672     fputs(trx->op_info, f);
01673   }
01674 
01675   if (trx->is_recovered) {
01676     fputs(" recovered trx", f);
01677   }
01678 
01679   if (trx->is_purge) {
01680     fputs(" purge trx", f);
01681   }
01682 
01683   if (trx->declared_to_be_inside_innodb) {
01684     fprintf(f, ", thread declared inside InnoDB %lu",
01685       (ulong) trx->n_tickets_to_enter_innodb);
01686   }
01687 
01688   putc('\n', f);
01689 
01690   if (trx->mysql_n_tables_locked > 0) {
01691     fprintf(f, "mysql tables in locked %lu\n",
01692       (ulong) trx->mysql_n_tables_locked);
01693   }
01694 
01695   newline = TRUE;
01696 
01697   switch (trx->que_state) {
01698   case TRX_QUE_RUNNING:
01699     newline = FALSE; break;
01700   case TRX_QUE_LOCK_WAIT:
01701     fputs("LOCK WAIT ", f); break;
01702   case TRX_QUE_ROLLING_BACK:
01703     fputs("ROLLING BACK ", f); break;
01704   case TRX_QUE_COMMITTING:
01705     fputs("COMMITTING ", f); break;
01706   default:
01707     fprintf(f, "que state %lu ", (ulong) trx->que_state);
01708   }
01709 
01710   if (0 < UT_LIST_GET_LEN(trx->trx_locks)
01711       || mem_heap_get_size(trx->lock_heap) > 400) {
01712     newline = TRUE;
01713 
01714     fprintf(f, "%lu lock struct(s), heap size %lu,"
01715       " %lu row lock(s)",
01716       (ulong) UT_LIST_GET_LEN(trx->trx_locks),
01717       (ulong) mem_heap_get_size(trx->lock_heap),
01718       (ulong) lock_number_of_rows_locked(trx));
01719   }
01720 
01721   if (trx->has_search_latch) {
01722     newline = TRUE;
01723     fputs(", holds adaptive hash latch", f);
01724   }
01725 
01726   if (trx->undo_no != 0) {
01727     newline = TRUE;
01728     fprintf(f, ", undo log entries %llu",
01729       (ullint) trx->undo_no);
01730   }
01731 
01732   if (newline) {
01733     putc('\n', f);
01734   }
01735 
01736   if (trx->mysql_thd != NULL) {
01737     innobase_mysql_print_thd(f, trx->mysql_thd, max_query_len);
01738   }
01739 }
01740 
01741 /*******************************************************************/
01746 UNIV_INTERN
01747 ibool
01748 trx_weight_ge(
01749 /*==========*/
01750   const trx_t*  a,  
01751   const trx_t*  b)  
01752 {
01753   ibool a_notrans_edit;
01754   ibool b_notrans_edit;
01755 
01756   /* If mysql_thd is NULL for a transaction we assume that it has
01757   not edited non-transactional tables. */
01758 
01759   a_notrans_edit = a->mysql_thd != NULL
01760     && thd_has_edited_nontrans_tables(a->mysql_thd);
01761 
01762   b_notrans_edit = b->mysql_thd != NULL
01763     && thd_has_edited_nontrans_tables(b->mysql_thd);
01764 
01765   if (a_notrans_edit != b_notrans_edit) {
01766 
01767     return(a_notrans_edit);
01768   }
01769 
01770   /* Either both had edited non-transactional tables or both had
01771   not, we fall back to comparing the number of altered/locked
01772   rows. */
01773 
01774 #if 0
01775   fprintf(stderr,
01776     "%s TRX_WEIGHT(a): %lld+%lu, TRX_WEIGHT(b): %lld+%lu\n",
01777     __func__,
01778     a->undo_no, UT_LIST_GET_LEN(a->trx_locks),
01779     b->undo_no, UT_LIST_GET_LEN(b->trx_locks));
01780 #endif
01781 
01782   return(TRX_WEIGHT(a) >= TRX_WEIGHT(b));
01783 }
01784 
01785 /****************************************************************/
01787 UNIV_INLINE
01788 void
01789 trx_prepare_off_kernel(
01790 /*===================*/
01791   trx_t*  trx)  
01792 {
01793   trx_rseg_t* rseg;
01794   ib_uint64_t lsn   = 0;
01795   mtr_t   mtr;
01796 
01797   ut_ad(mutex_own(&kernel_mutex));
01798 
01799   rseg = trx->rseg;
01800 
01801   if (trx->insert_undo != NULL || trx->update_undo != NULL) {
01802 
01803     mutex_exit(&kernel_mutex);
01804 
01805     mtr_start(&mtr);
01806 
01807     /* Change the undo log segment states from TRX_UNDO_ACTIVE
01808     to TRX_UNDO_PREPARED: these modifications to the file data
01809     structure define the transaction as prepared in the
01810     file-based world, at the serialization point of lsn. */
01811 
01812     mutex_enter(&(rseg->mutex));
01813 
01814     if (trx->insert_undo != NULL) {
01815 
01816       /* It is not necessary to obtain trx->undo_mutex here
01817       because only a single OS thread is allowed to do the
01818       transaction prepare for this transaction. */
01819 
01820       trx_undo_set_state_at_prepare(trx, trx->insert_undo,
01821                   &mtr);
01822     }
01823 
01824     if (trx->update_undo) {
01825       trx_undo_set_state_at_prepare(
01826         trx, trx->update_undo, &mtr);
01827     }
01828 
01829     mutex_exit(&(rseg->mutex));
01830 
01831     /*--------------*/
01832     mtr_commit(&mtr); /* This mtr commit makes the
01833           transaction prepared in the file-based
01834           world */
01835     /*--------------*/
01836     lsn = mtr.end_lsn;
01837 
01838     mutex_enter(&kernel_mutex);
01839   }
01840 
01841   ut_ad(mutex_own(&kernel_mutex));
01842 
01843   /*--------------------------------------*/
01844   trx->conc_state = TRX_PREPARED;
01845   /*--------------------------------------*/
01846 
01847   if (lsn) {
01848     /* Depending on the my.cnf options, we may now write the log
01849     buffer to the log files, making the prepared state of the
01850     transaction durable if the OS does not crash. We may also
01851     flush the log files to disk, making the prepared state of the
01852     transaction durable also at an OS crash or a power outage.
01853 
01854     The idea in InnoDB's group prepare is that a group of
01855     transactions gather behind a trx doing a physical disk write
01856     to log files, and when that physical write has been completed,
01857     one of those transactions does a write which prepares the whole
01858     group. Note that this group prepare will only bring benefit if
01859     there are > 2 users in the database. Then at least 2 users can
01860     gather behind one doing the physical log write to disk.
01861 
01862     TODO: find out if MySQL holds some mutex when calling this.
01863     That would spoil our group prepare algorithm. */
01864 
01865     mutex_exit(&kernel_mutex);
01866 
01867     if (srv_flush_log_at_trx_commit == 0) {
01868       /* Do nothing */
01869     } else if (srv_flush_log_at_trx_commit == 1) {
01870       if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
01871         /* Write the log but do not flush it to disk */
01872 
01873         log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
01874             FALSE);
01875       } else {
01876         /* Write the log to the log files AND flush
01877         them to disk */
01878 
01879         log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
01880       }
01881     } else if (srv_flush_log_at_trx_commit == 2) {
01882 
01883       /* Write the log but do not flush it to disk */
01884 
01885       log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
01886     } else {
01887       ut_error;
01888     }
01889 
01890     mutex_enter(&kernel_mutex);
01891   }
01892 }
01893 
01894 /**********************************************************************/
01897 UNIV_INTERN
01898 ulint
01899 trx_prepare_for_mysql(
01900 /*==================*/
01901   trx_t*  trx)  
01902 {
01903   /* Because we do not do the prepare by sending an Innobase
01904   sig to the transaction, we must here make sure that trx has been
01905   started. */
01906 
01907   ut_a(trx);
01908 
01909   trx->op_info = "preparing";
01910 
01911   trx_start_if_not_started(trx);
01912 
01913   mutex_enter(&kernel_mutex);
01914 
01915   trx_prepare_off_kernel(trx);
01916 
01917   mutex_exit(&kernel_mutex);
01918 
01919   trx->op_info = "";
01920 
01921   return(0);
01922 }
01923 
01924 /**********************************************************************/
01928 UNIV_INTERN
01929 int
01930 trx_recover_for_mysql(
01931 /*==================*/
01932   XID*  xid_list, 
01933   ulint len)    
01934 {
01935   trx_t*  trx;
01936   ulint count = 0;
01937 
01938   ut_ad(xid_list);
01939   ut_ad(len);
01940 
01941   /* We should set those transactions which are in the prepared state
01942   to the xid_list */
01943 
01944   mutex_enter(&kernel_mutex);
01945 
01946   trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
01947 
01948   while (trx) {
01949     if (trx->conc_state == TRX_PREPARED) {
01950       xid_list[count] = trx->xid;
01951 
01952       if (count == 0) {
01953         ut_print_timestamp(stderr);
01954         fprintf(stderr,
01955           "  InnoDB: Starting recovery for"
01956           " XA transactions...\n");
01957       }
01958 
01959       ut_print_timestamp(stderr);
01960       fprintf(stderr,
01961         "  InnoDB: Transaction " TRX_ID_FMT " in"
01962         " prepared state after recovery\n",
01963         trx->id);
01964 
01965       ut_print_timestamp(stderr);
01966       fprintf(stderr,
01967         "  InnoDB: Transaction contains changes"
01968         " to %llu rows\n",
01969         (ullint) trx->undo_no);
01970 
01971       count++;
01972 
01973       if (count == len) {
01974         break;
01975       }
01976     }
01977 
01978     trx = UT_LIST_GET_NEXT(trx_list, trx);
01979   }
01980 
01981   mutex_exit(&kernel_mutex);
01982 
01983   if (count > 0){
01984     ut_print_timestamp(stderr);
01985     fprintf(stderr,
01986       "  InnoDB: %lu transactions in prepared state"
01987       " after recovery\n",
01988       (ulong) count);
01989   }
01990 
01991   return ((int) count);
01992 }
01993 
01994 /*******************************************************************/
01998 UNIV_INTERN
01999 trx_t*
02000 trx_get_trx_by_xid(
02001 /*===============*/
02002   XID*  xid)  
02003 {
02004   trx_t*  trx;
02005 
02006   if (xid == NULL) {
02007 
02008     return (NULL);
02009   }
02010 
02011   mutex_enter(&kernel_mutex);
02012 
02013   trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
02014 
02015   while (trx) {
02016     /* Compare two X/Open XA transaction id's: their
02017     length should be the same and binary comparison
02018     of gtrid_length+bqual_length bytes should be
02019     the same */
02020 
02021     if (xid->gtrid_length == trx->xid.gtrid_length
02022         && xid->bqual_length == trx->xid.bqual_length
02023         && memcmp(xid->data, trx->xid.data,
02024             xid->gtrid_length + xid->bqual_length) == 0) {
02025       break;
02026     }
02027 
02028     trx = UT_LIST_GET_NEXT(trx_list, trx);
02029   }
02030 
02031   mutex_exit(&kernel_mutex);
02032 
02033   if (trx) {
02034     if (trx->conc_state != TRX_PREPARED) {
02035 
02036       return(NULL);
02037     }
02038 
02039     return(trx);
02040   } else {
02041     return(NULL);
02042   }
02043 }