00001 /***************************************************************************** 00002 00003 Copyright (C) 1997, 2009, Innobase Oy. All Rights Reserved. 00004 00005 This program is free software; you can redistribute it and/or modify it under 00006 the terms of the GNU General Public License as published by the Free Software 00007 Foundation; version 2 of the License. 00008 00009 This program is distributed in the hope that it will be useful, but WITHOUT 00010 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 00011 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 00012 00013 You should have received a copy of the GNU General Public License along with 00014 this program; if not, write to the Free Software Foundation, Inc., 51 Franklin 00015 St, Fifth Floor, Boston, MA 02110-1301 USA 00016 00017 *****************************************************************************/ 00018 00019 /**************************************************/ 00026 #include "row0undo.h" 00027 00028 #ifdef UNIV_NONINL 00029 #include "row0undo.ic" 00030 #endif 00031 00032 #include "fsp0fsp.h" 00033 #include "mach0data.h" 00034 #include "trx0rseg.h" 00035 #include "trx0trx.h" 00036 #include "trx0roll.h" 00037 #include "trx0undo.h" 00038 #include "trx0purge.h" 00039 #include "trx0rec.h" 00040 #include "que0que.h" 00041 #include "row0row.h" 00042 #include "row0uins.h" 00043 #include "row0umod.h" 00044 #include "row0upd.h" 00045 #include "row0mysql.h" 00046 #include "srv0srv.h" 00047 00048 /* How to undo row operations? 00049 (1) For an insert, we have stored a prefix of the clustered index record 00050 in the undo log. Using it, we look for the clustered record, and using 00051 that we look for the records in the secondary indexes. The insert operation 00052 may have been left incomplete, if the database crashed, for example. 00053 We may have look at the trx id and roll ptr to make sure the record in the 00054 clustered index is really the one for which the undo log record was 00055 written. We can use the framework we get from the original insert op. 00056 (2) Delete marking: We can use the framework we get from the original 00057 delete mark op. We only have to check the trx id. 00058 (3) Update: This may be the most complicated. We have to use the framework 00059 we get from the original update op. 00060 00061 What if the same trx repeatedly deletes and inserts an identical row. 00062 Then the row id changes and also roll ptr. What if the row id was not 00063 part of the ordering fields in the clustered index? Maybe we have to write 00064 it to undo log. Well, maybe not, because if we order the row id and trx id 00065 in descending order, then the only undeleted copy is the first in the 00066 index. Our searches in row operations always position the cursor before 00067 the first record in the result set. But, if there is no key defined for 00068 a table, then it would be desirable that row id is in ascending order. 00069 So, lets store row id in descending order only if it is not an ordering 00070 field in the clustered index. 00071 00072 NOTE: Deletes and inserts may lead to situation where there are identical 00073 records in a secondary index. Is that a problem in the B-tree? Yes. 00074 Also updates can lead to this, unless trx id and roll ptr are included in 00075 ord fields. 00076 (1) Fix in clustered indexes: include row id, trx id, and roll ptr 00077 in node pointers of B-tree. 00078 (2) Fix in secondary indexes: include all fields in node pointers, and 00079 if an entry is inserted, check if it is equal to the right neighbor, 00080 in which case update the right neighbor: the neighbor must be delete 00081 marked, set it unmarked and write the trx id of the current transaction. 00082 00083 What if the same trx repeatedly updates the same row, updating a secondary 00084 index field or not? Updating a clustered index ordering field? 00085 00086 (1) If it does not update the secondary index and not the clustered index 00087 ord field. Then the secondary index record stays unchanged, but the 00088 trx id in the secondary index record may be smaller than in the clustered 00089 index record. This is no problem? 00090 (2) If it updates secondary index ord field but not clustered: then in 00091 secondary index there are delete marked records, which differ in an 00092 ord field. No problem. 00093 (3) Updates clustered ord field but not secondary, and secondary index 00094 is unique. Then the record in secondary index is just updated at the 00095 clustered ord field. 00096 (4) 00097 00098 Problem with duplicate records: 00099 Fix 1: Add a trx op no field to all indexes. A problem: if a trx with a 00100 bigger trx id has inserted and delete marked a similar row, our trx inserts 00101 again a similar row, and a trx with an even bigger id delete marks it. Then 00102 the position of the row should change in the index if the trx id affects 00103 the alphabetical ordering. 00104 00105 Fix 2: If an insert encounters a similar row marked deleted, we turn the 00106 insert into an 'update' of the row marked deleted. Then we must write undo 00107 info on the update. A problem: what if a purge operation tries to remove 00108 the delete marked row? 00109 00110 We can think of the database row versions as a linked list which starts 00111 from the record in the clustered index, and is linked by roll ptrs 00112 through undo logs. The secondary index records are references which tell 00113 what kinds of records can be found in this linked list for a record 00114 in the clustered index. 00115 00116 How to do the purge? A record can be removed from the clustered index 00117 if its linked list becomes empty, i.e., the row has been marked deleted 00118 and its roll ptr points to the record in the undo log we are going through, 00119 doing the purge. Similarly, during a rollback, a record can be removed 00120 if the stored roll ptr in the undo log points to a trx already (being) purged, 00121 or if the roll ptr is NULL, i.e., it was a fresh insert. */ 00122 00123 /********************************************************************/ 00126 UNIV_INTERN 00127 undo_node_t* 00128 row_undo_node_create( 00129 /*=================*/ 00130 trx_t* trx, 00131 que_thr_t* parent, 00132 mem_heap_t* heap) 00133 { 00134 undo_node_t* undo; 00135 00136 ut_ad(trx && parent && heap); 00137 00138 undo = static_cast<undo_node_t *>(mem_heap_alloc(heap, sizeof(undo_node_t))); 00139 00140 undo->common.type = QUE_NODE_UNDO; 00141 undo->common.parent = parent; 00142 00143 undo->state = UNDO_NODE_FETCH_NEXT; 00144 undo->trx = trx; 00145 00146 btr_pcur_init(&(undo->pcur)); 00147 00148 undo->heap = mem_heap_create(256); 00149 00150 return(undo); 00151 } 00152 00153 /***********************************************************/ 00160 UNIV_INTERN 00161 ibool 00162 row_undo_search_clust_to_pcur( 00163 /*==========================*/ 00164 undo_node_t* node) 00165 { 00166 dict_index_t* clust_index; 00167 ibool found; 00168 mtr_t mtr; 00169 ibool ret; 00170 rec_t* rec; 00171 mem_heap_t* heap = NULL; 00172 ulint offsets_[REC_OFFS_NORMAL_SIZE]; 00173 ulint* offsets = offsets_; 00174 rec_offs_init(offsets_); 00175 00176 mtr_start(&mtr); 00177 00178 clust_index = dict_table_get_first_index(node->table); 00179 00180 found = row_search_on_row_ref(&(node->pcur), BTR_MODIFY_LEAF, 00181 node->table, node->ref, &mtr); 00182 00183 rec = btr_pcur_get_rec(&(node->pcur)); 00184 00185 offsets = rec_get_offsets(rec, clust_index, offsets, 00186 ULINT_UNDEFINED, &heap); 00187 00188 if (!found || node->roll_ptr 00189 != row_get_rec_roll_ptr(rec, clust_index, offsets)) { 00190 00191 /* We must remove the reservation on the undo log record 00192 BEFORE releasing the latch on the clustered index page: this 00193 is to make sure that some thread will eventually undo the 00194 modification corresponding to node->roll_ptr. */ 00195 00196 /* fputs("--------------------undoing a previous version\n", 00197 stderr); */ 00198 00199 ret = FALSE; 00200 } else { 00201 row_ext_t** ext; 00202 00203 if (dict_table_get_format(node->table) >= DICT_TF_FORMAT_ZIP) { 00204 /* In DYNAMIC or COMPRESSED format, there is 00205 no prefix of externally stored columns in the 00206 clustered index record. Build a cache of 00207 column prefixes. */ 00208 ext = &node->ext; 00209 } else { 00210 /* REDUNDANT and COMPACT formats store a local 00211 768-byte prefix of each externally stored 00212 column. No cache is needed. */ 00213 ext = NULL; 00214 node->ext = NULL; 00215 } 00216 00217 node->row = row_build(ROW_COPY_DATA, clust_index, rec, 00218 offsets, NULL, ext, node->heap); 00219 if (node->update) { 00220 node->undo_row = dtuple_copy(node->row, node->heap); 00221 row_upd_replace(node->undo_row, &node->undo_ext, 00222 clust_index, node->update, node->heap); 00223 } else { 00224 node->undo_row = NULL; 00225 node->undo_ext = NULL; 00226 } 00227 00228 btr_pcur_store_position(&(node->pcur), &mtr); 00229 00230 ret = TRUE; 00231 } 00232 00233 btr_pcur_commit_specify_mtr(&(node->pcur), &mtr); 00234 00235 if (UNIV_LIKELY_NULL(heap)) { 00236 mem_heap_free(heap); 00237 } 00238 return(ret); 00239 } 00240 00241 /***********************************************************/ 00246 static 00247 ulint 00248 row_undo( 00249 /*=====*/ 00250 undo_node_t* node, 00251 que_thr_t* thr) 00252 { 00253 ulint err; 00254 trx_t* trx; 00255 roll_ptr_t roll_ptr; 00256 ibool locked_data_dict; 00257 00258 ut_ad(node && thr); 00259 00260 trx = node->trx; 00261 00262 if (node->state == UNDO_NODE_FETCH_NEXT) { 00263 00264 node->undo_rec = trx_roll_pop_top_rec_of_trx(trx, 00265 trx->roll_limit, 00266 &roll_ptr, 00267 node->heap); 00268 if (!node->undo_rec) { 00269 /* Rollback completed for this query thread */ 00270 00271 thr->run_node = que_node_get_parent(node); 00272 00273 return(DB_SUCCESS); 00274 } 00275 00276 node->roll_ptr = roll_ptr; 00277 node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec); 00278 00279 if (trx_undo_roll_ptr_is_insert(roll_ptr)) { 00280 00281 node->state = UNDO_NODE_INSERT; 00282 } else { 00283 node->state = UNDO_NODE_MODIFY; 00284 } 00285 00286 } else if (node->state == UNDO_NODE_PREV_VERS) { 00287 00288 /* Undo should be done to the same clustered index record 00289 again in this same rollback, restoring the previous version */ 00290 00291 roll_ptr = node->new_roll_ptr; 00292 00293 node->undo_rec = trx_undo_get_undo_rec_low(roll_ptr, 00294 node->heap); 00295 node->roll_ptr = roll_ptr; 00296 node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec); 00297 00298 if (trx_undo_roll_ptr_is_insert(roll_ptr)) { 00299 00300 node->state = UNDO_NODE_INSERT; 00301 } else { 00302 node->state = UNDO_NODE_MODIFY; 00303 } 00304 } 00305 00306 /* Prevent DROP TABLE etc. while we are rolling back this row. 00307 If we are doing a TABLE CREATE or some other dictionary operation, 00308 then we already have dict_operation_lock locked in x-mode. Do not 00309 try to lock again, because that would cause a hang. */ 00310 00311 locked_data_dict = (trx->dict_operation_lock_mode == 0); 00312 00313 if (locked_data_dict) { 00314 00315 row_mysql_freeze_data_dictionary(trx); 00316 } 00317 00318 if (node->state == UNDO_NODE_INSERT) { 00319 00320 err = row_undo_ins(node); 00321 00322 node->state = UNDO_NODE_FETCH_NEXT; 00323 } else { 00324 ut_ad(node->state == UNDO_NODE_MODIFY); 00325 err = row_undo_mod(node, thr); 00326 } 00327 00328 if (locked_data_dict) { 00329 00330 row_mysql_unfreeze_data_dictionary(trx); 00331 } 00332 00333 /* Do some cleanup */ 00334 btr_pcur_close(&(node->pcur)); 00335 00336 mem_heap_empty(node->heap); 00337 00338 thr->run_node = node; 00339 00340 return(err); 00341 } 00342 00343 /***********************************************************/ 00347 UNIV_INTERN 00348 que_thr_t* 00349 row_undo_step( 00350 /*==========*/ 00351 que_thr_t* thr) 00352 { 00353 ulint err; 00354 undo_node_t* node; 00355 trx_t* trx; 00356 00357 ut_ad(thr); 00358 00359 srv_activity_count++; 00360 00361 trx = thr_get_trx(thr); 00362 00363 node = static_cast<undo_node_t *>(thr->run_node); 00364 00365 ut_ad(que_node_get_type(node) == QUE_NODE_UNDO); 00366 00367 err = row_undo(node, thr); 00368 00369 trx->error_state = err; 00370 00371 if (err != DB_SUCCESS) { 00372 /* SQL error detected */ 00373 00374 fprintf(stderr, "InnoDB: Fatal error %lu in rollback.\n", 00375 (ulong) err); 00376 00377 if (err == DB_OUT_OF_FILE_SPACE) { 00378 fprintf(stderr, 00379 "InnoDB: Error 13 means out of tablespace.\n" 00380 "InnoDB: Consider increasing" 00381 " your tablespace.\n"); 00382 00383 exit(1); 00384 } 00385 00386 ut_error; 00387 00388 return(NULL); 00389 } 00390 00391 return(thr); 00392 }