Drizzled Public API Documentation

buf0rea.cc
1 /*****************************************************************************
2 
3 Copyright (C) 1995, 2010, Innobase Oy. All Rights Reserved.
4 
5 This program is free software; you can redistribute it and/or modify it under
6 the terms of the GNU General Public License as published by the Free Software
7 Foundation; version 2 of the License.
8 
9 This program is distributed in the hope that it will be useful, but WITHOUT
10 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
12 
13 You should have received a copy of the GNU General Public License along with
14 this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
15 St, Fifth Floor, Boston, MA 02110-1301 USA
16 
17 *****************************************************************************/
18 
19 /**************************************************/
26 #include "buf0rea.h"
27 
28 #include "fil0fil.h"
29 #include "mtr0mtr.h"
30 
31 #include "buf0buf.h"
32 #include "buf0flu.h"
33 #include "buf0lru.h"
34 #include "ibuf0ibuf.h"
35 #include "log0recv.h"
36 #include "trx0sys.h"
37 #include "os0file.h"
38 #include "srv0start.h"
39 #include "srv0srv.h"
40 
/** Size of the area used by linear read-ahead. This is just another name
for BUF_READ_AHEAD_AREA; buf_read_ahead_linear() invokes it with the
buffer pool instance as argument to obtain the area size. */
42 #define BUF_READ_AHEAD_LINEAR_AREA BUF_READ_AHEAD_AREA
43 
/** Divisor applied to buf_pool->curr_size to obtain the pending-read
ceiling: when buf_pool->n_pend_reads exceeds curr_size divided by this
value, buf_read_ahead_linear() gives up and buf_read_ibuf_merge_pages()
sleeps until the number of pending reads has dropped. */
47 #define BUF_READ_AHEAD_PEND_LIMIT 2
48 
49 /********************************************************************/
59 UNIV_INTERN
60 ulint
61 buf_read_page_low(
62 /*==============*/
63  ulint* err,
66  ibool sync,
67  ulint mode,
70  ulint space,
71  ulint zip_size,
72  ibool unzip,
73  ib_int64_t tablespace_version,
78  ulint offset)
79 {
80  buf_page_t* bpage;
81  ulint wake_later;
82 
83  *err = DB_SUCCESS;
84 
85  wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
86  mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER;
87 
88  if (trx_doublewrite && space == TRX_SYS_SPACE
89  && ( (offset >= trx_doublewrite->block1
90  && offset < trx_doublewrite->block1
92  || (offset >= trx_doublewrite->block2
93  && offset < trx_doublewrite->block2
95  ut_print_timestamp(stderr);
96  fprintf(stderr,
97  " InnoDB: Warning: trying to read"
98  " doublewrite buffer page %lu\n",
99  (ulong) offset);
100 
101  return(0);
102  }
103 
104  if (ibuf_bitmap_page(zip_size, offset)
105  || trx_sys_hdr_page(space, offset)) {
106 
107  /* Trx sys header is so low in the latching order that we play
108  safe and do not leave the i/o-completion to an asynchronous
109  i/o-thread. Ibuf bitmap pages must always be read with
110  syncronous i/o, to make sure they do not get involved in
111  thread deadlocks. */
112 
113  sync = TRUE;
114  }
115 
116  /* The following call will also check if the tablespace does not exist
117  or is being dropped; if we succeed in initing the page in the buffer
118  pool for read, then DISCARD cannot proceed until the read has
119  completed */
120  bpage = buf_page_init_for_read(err, mode, space, zip_size, unzip,
121  tablespace_version, offset);
122  if (bpage == NULL) {
123 
124  return(0);
125  }
126 
127 #ifdef UNIV_DEBUG
128  if (buf_debug_prints) {
129  fprintf(stderr,
130  "Posting read request for page %lu, sync %lu\n",
131  (ulong) offset,
132  (ulong) sync);
133  }
134 #endif
135 
136  ut_ad(buf_page_in_file(bpage));
137 
138  if (zip_size) {
139  *err = fil_io(OS_FILE_READ | wake_later,
140  sync, space, zip_size, offset, 0, zip_size,
141  bpage->zip.data, bpage);
142  } else {
144 
145  *err = fil_io(OS_FILE_READ | wake_later,
146  sync, space, 0, offset, 0, UNIV_PAGE_SIZE,
147  ((buf_block_t*) bpage)->frame, bpage);
148  }
149  ut_a(*err == DB_SUCCESS);
150 
151  if (sync) {
152  /* The i/o is already completed when we arrive from
153  fil_read */
154  buf_page_io_complete(bpage);
155  }
156 
157  return(1);
158 }
159 
160 /********************************************************************/
166 UNIV_INTERN
167 ibool
168 buf_read_page(
169 /*==========*/
170  ulint space,
171  ulint zip_size,
172  ulint offset)
173 {
174  buf_pool_t* buf_pool = buf_pool_get(space, offset);
175  ib_int64_t tablespace_version;
176  ulint count;
177  ulint err;
178 
179  tablespace_version = fil_space_get_version(space);
180 
181  /* We do the i/o in the synchronous aio mode to save thread
182  switches: hence TRUE */
183 
184  count = buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space,
185  zip_size, FALSE,
186  tablespace_version, offset);
187  srv_buf_pool_reads += count;
188  if (err == DB_TABLESPACE_DELETED) {
189  ut_print_timestamp(stderr);
190  fprintf(stderr,
191  " InnoDB: Error: trying to access"
192  " tablespace %lu page no. %lu,\n"
193  "InnoDB: but the tablespace does not exist"
194  " or is just being dropped.\n",
195  (ulong) space, (ulong) offset);
196  }
197 
198  /* Flush pages from the end of the LRU list if necessary */
199  buf_flush_free_margin(buf_pool);
200 
201  /* Increment number of I/O operations used for LRU policy. */
203 
204  return(count);
205 }
206 
207 /********************************************************************/
231 UNIV_INTERN
232 ulint
233 buf_read_ahead_linear(
234 /*==================*/
235  ulint space,
236  ulint zip_size,
237  ulint offset)
239 {
240  buf_pool_t* buf_pool = buf_pool_get(space, offset);
241  ib_int64_t tablespace_version;
242  buf_page_t* bpage;
243  buf_frame_t* frame;
244  buf_page_t* pred_bpage = NULL;
245  ulint pred_offset;
246  ulint succ_offset;
247  ulint count;
248  int asc_or_desc;
249  ulint new_offset;
250  ulint fail_count;
251  ulint ibuf_mode;
252  ulint low, high;
253  ulint err;
254  ulint i;
255  const ulint buf_read_ahead_linear_area
256  = BUF_READ_AHEAD_LINEAR_AREA(buf_pool);
257  ulint threshold;
258 
259  if ((srv_read_ahead & 2) == false) {
260  return(0);
261  }
262 
263  if (UNIV_UNLIKELY(srv_startup_is_before_trx_rollback_phase)) {
264  /* No read-ahead to avoid thread deadlocks */
265  return(0);
266  }
267 
268  low = (offset / buf_read_ahead_linear_area)
269  * buf_read_ahead_linear_area;
270  high = (offset / buf_read_ahead_linear_area + 1)
271  * buf_read_ahead_linear_area;
272 
273  if ((offset != low) && (offset != high - 1)) {
274  /* This is not a border page of the area: return */
275 
276  return(0);
277  }
278 
279  if (ibuf_bitmap_page(zip_size, offset)
280  || trx_sys_hdr_page(space, offset)) {
281 
282  /* If it is an ibuf bitmap page or trx sys hdr, we do
283  no read-ahead, as that could break the ibuf page access
284  order */
285 
286  return(0);
287  }
288 
289  /* Remember the tablespace version before we ask te tablespace size
290  below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we
291  do not try to read outside the bounds of the tablespace! */
292 
293  tablespace_version = fil_space_get_version(space);
294 
295  buf_pool_mutex_enter(buf_pool);
296 
297  if (high > fil_space_get_size(space)) {
298  buf_pool_mutex_exit(buf_pool);
299  /* The area is not whole, return */
300 
301  return(0);
302  }
303 
304  if (buf_pool->n_pend_reads
305  > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
306  buf_pool_mutex_exit(buf_pool);
307 
308  return(0);
309  }
310 
311  /* Check that almost all pages in the area have been accessed; if
312  offset == low, the accesses must be in a descending order, otherwise,
313  in an ascending order. */
314 
315  asc_or_desc = 1;
316 
317  if (offset == low) {
318  asc_or_desc = -1;
319  }
320 
321  /* How many out of order accessed pages can we ignore
322  when working out the access pattern for linear readahead */
323  threshold = ut_min((64 - srv_read_ahead_threshold),
324  BUF_READ_AHEAD_AREA(buf_pool));
325 
326  fail_count = 0;
327 
328  for (i = low; i < high; i++) {
329  bpage = buf_page_hash_get(buf_pool, space, i);
330 
331  if (bpage == NULL || !buf_page_is_accessed(bpage)) {
332  /* Not accessed */
333  fail_count++;
334 
335  } else if (pred_bpage) {
336  /* Note that buf_page_is_accessed() returns
337  the time of the first access. If some blocks
338  of the extent existed in the buffer pool at
339  the time of a linear access pattern, the first
340  access times may be nonmonotonic, even though
341  the latest access times were linear. The
342  threshold (srv_read_ahead_factor) should help
343  a little against this. */
344  int res = ut_ulint_cmp(
345  buf_page_is_accessed(bpage),
346  buf_page_is_accessed(pred_bpage));
347  /* Accesses not in the right order */
348  if (res != 0 && res != asc_or_desc) {
349  fail_count++;
350  }
351  }
352 
353  if (fail_count > threshold) {
354  /* Too many failures: return */
355  buf_pool_mutex_exit(buf_pool);
356  return(0);
357  }
358 
359  if (bpage && buf_page_is_accessed(bpage)) {
360  pred_bpage = bpage;
361  }
362  }
363 
364  /* If we got this far, we know that enough pages in the area have
365  been accessed in the right order: linear read-ahead can be sensible */
366 
367  bpage = buf_page_hash_get(buf_pool, space, offset);
368 
369  if (bpage == NULL) {
370  buf_pool_mutex_exit(buf_pool);
371 
372  return(0);
373  }
374 
375  switch (buf_page_get_state(bpage)) {
376  case BUF_BLOCK_ZIP_PAGE:
377  frame = bpage->zip.data;
378  break;
379  case BUF_BLOCK_FILE_PAGE:
380  frame = ((buf_block_t*) bpage)->frame;
381  break;
382  default:
383  ut_error;
384  break;
385  }
386 
387  /* Read the natural predecessor and successor page addresses from
388  the page; NOTE that because the calling thread may have an x-latch
389  on the page, we do not acquire an s-latch on the page, this is to
390  prevent deadlocks. Even if we read values which are nonsense, the
391  algorithm will work. */
392 
393  pred_offset = fil_page_get_prev(frame);
394  succ_offset = fil_page_get_next(frame);
395 
396  buf_pool_mutex_exit(buf_pool);
397 
398  if ((offset == low) && (succ_offset == offset + 1)) {
399 
400  /* This is ok, we can continue */
401  new_offset = pred_offset;
402 
403  } else if ((offset == high - 1) && (pred_offset == offset - 1)) {
404 
405  /* This is ok, we can continue */
406  new_offset = succ_offset;
407  } else {
408  /* Successor or predecessor not in the right order */
409 
410  return(0);
411  }
412 
413  low = (new_offset / buf_read_ahead_linear_area)
414  * buf_read_ahead_linear_area;
415  high = (new_offset / buf_read_ahead_linear_area + 1)
416  * buf_read_ahead_linear_area;
417 
418  if ((new_offset != low) && (new_offset != high - 1)) {
419  /* This is not a border page of the area: return */
420 
421  return(0);
422  }
423 
424  if (high > fil_space_get_size(space)) {
425  /* The area is not whole, return */
426 
427  return(0);
428  }
429 
430  /* If we got this far, read-ahead can be sensible: do it */
431 
432  if (ibuf_inside()) {
433  ibuf_mode = BUF_READ_IBUF_PAGES_ONLY;
434  } else {
435  ibuf_mode = BUF_READ_ANY_PAGE;
436  }
437 
438  count = 0;
439 
440  /* Since Windows XP seems to schedule the i/o handler thread
441  very eagerly, and consequently it does not wait for the
442  full read batch to be posted, we use special heuristics here */
443 
445 
446  for (i = low; i < high; i++) {
447  /* It is only sensible to do read-ahead in the non-sync
448  aio mode: hence FALSE as the first parameter */
449 
450  if (!ibuf_bitmap_page(zip_size, i)) {
451  count += buf_read_page_low(
452  &err, FALSE,
453  ibuf_mode | OS_AIO_SIMULATED_WAKE_LATER,
454  space, zip_size, FALSE, tablespace_version, i);
455  if (err == DB_TABLESPACE_DELETED) {
456  ut_print_timestamp(stderr);
457  fprintf(stderr,
458  " InnoDB: Warning: in"
459  " linear readahead trying to access\n"
460  "InnoDB: tablespace %lu page %lu,\n"
461  "InnoDB: but the tablespace does not"
462  " exist or is just being dropped.\n",
463  (ulong) space, (ulong) i);
464  }
465  }
466  }
467 
468  /* In simulated aio we wake the aio handler threads only after
469  queuing all aio requests, in native aio the following call does
470  nothing: */
471 
473 
474  /* Flush pages from the end of the LRU list if necessary */
475  buf_flush_free_margin(buf_pool);
476 
477 #ifdef UNIV_DEBUG
478  if (buf_debug_prints && (count > 0)) {
479  fprintf(stderr,
480  "LINEAR read-ahead space %lu offset %lu pages %lu\n",
481  (ulong) space, (ulong) offset, (ulong) count);
482  }
483 #endif /* UNIV_DEBUG */
484 
485  /* Read ahead is considered one I/O operation for the purpose of
486  LRU policy decision. */
488 
489  buf_pool->stat.n_ra_pages_read += count;
490  return(count);
491 }
492 
493 /********************************************************************/
497 UNIV_INTERN
498 void
499 buf_read_ibuf_merge_pages(
500 /*======================*/
501  ibool sync,
506  const ulint* space_ids,
507  const ib_int64_t* space_versions,
514  const ulint* page_nos,
518  ulint n_stored)
520 {
521  ulint i;
522 
523  ut_ad(!ibuf_inside());
524 #ifdef UNIV_IBUF_DEBUG
525  ut_a(n_stored < UNIV_PAGE_SIZE);
526 #endif
527 
528  for (i = 0; i < n_stored; i++) {
529  ulint err;
530  buf_pool_t* buf_pool;
531  ulint zip_size = fil_space_get_zip_size(space_ids[i]);
532 
533  buf_pool = buf_pool_get(space_ids[i], space_versions[i]);
534 
535  while (buf_pool->n_pend_reads
536  > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
537  os_thread_sleep(500000);
538  }
539 
540  if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
541 
542  goto tablespace_deleted;
543  }
544 
545  buf_read_page_low(&err, sync && (i + 1 == n_stored),
546  BUF_READ_ANY_PAGE, space_ids[i],
547  zip_size, TRUE, space_versions[i],
548  page_nos[i]);
549 
550  if (UNIV_UNLIKELY(err == DB_TABLESPACE_DELETED)) {
551 tablespace_deleted:
552  /* We have deleted or are deleting the single-table
553  tablespace: remove the entries for that page */
554 
555  ibuf_merge_or_delete_for_page(NULL, space_ids[i],
556  page_nos[i],
557  zip_size, FALSE);
558  }
559  }
560 
562 
563  /* Flush pages from the end of all the LRU lists if necessary */
564  buf_flush_free_margins();
565 
566 #ifdef UNIV_DEBUG
567  if (buf_debug_prints) {
568  fprintf(stderr,
569  "Ibuf merge read-ahead space %lu pages %lu\n",
570  (ulong) space_ids[0], (ulong) n_stored);
571  }
572 #endif /* UNIV_DEBUG */
573 }
574 
575 /********************************************************************/
577 UNIV_INTERN
578 void
579 buf_read_recv_pages(
580 /*================*/
581  ibool sync,
586  ulint space,
587  ulint zip_size,
589  const ulint* page_nos,
593  ulint n_stored)
595 {
596  ib_int64_t tablespace_version;
597  ulint count;
598  ulint err;
599  ulint i;
600 
601  zip_size = fil_space_get_zip_size(space);
602 
603  if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
604  /* It is a single table tablespace and the .ibd file is
605  missing: do nothing */
606 
607  return;
608  }
609 
610  tablespace_version = fil_space_get_version(space);
611 
612  for (i = 0; i < n_stored; i++) {
613  buf_pool_t* buf_pool;
614 
615  count = 0;
616 
617  os_aio_print_debug = FALSE;
618  buf_pool = buf_pool_get(space, page_nos[i]);
619  while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) {
620 
622  os_thread_sleep(10000);
623 
624  count++;
625 
626  if (count > 1000) {
627  fprintf(stderr,
628  "InnoDB: Error: InnoDB has waited for"
629  " 10 seconds for pending\n"
630  "InnoDB: reads to the buffer pool to"
631  " be finished.\n"
632  "InnoDB: Number of pending reads %lu,"
633  " pending pread calls %lu\n",
634  (ulong) buf_pool->n_pend_reads,
635  (ulong)os_file_n_pending_preads);
636 
637  os_aio_print_debug = TRUE;
638  }
639  }
640 
641  os_aio_print_debug = FALSE;
642 
643  if ((i + 1 == n_stored) && sync) {
644  buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space,
645  zip_size, TRUE, tablespace_version,
646  page_nos[i]);
647  } else {
648  buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE
650  space, zip_size, TRUE,
651  tablespace_version, page_nos[i]);
652  }
653  }
654 
656 
657  /* Flush pages from the end of all the LRU lists if necessary */
658  buf_flush_free_margins();
659 
660 #ifdef UNIV_DEBUG
661  if (buf_debug_prints) {
662  fprintf(stderr,
663  "Recovery applies read-ahead pages %lu\n",
664  (ulong) n_stored);
665  }
666 #endif /* UNIV_DEBUG */
667 }