Drizzled Public API Documentation

buf0rea.cc
1 /*****************************************************************************
2 
3 Copyright (C) 1995, 2010, Innobase Oy. All Rights Reserved.
4 
5 This program is free software; you can redistribute it and/or modify it under
6 the terms of the GNU General Public License as published by the Free Software
7 Foundation; version 2 of the License.
8 
9 This program is distributed in the hope that it will be useful, but WITHOUT
10 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
12 
13 You should have received a copy of the GNU General Public License along with
14 this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
15 St, Fifth Floor, Boston, MA 02110-1301 USA
16 
17 *****************************************************************************/
18 
19 /**************************************************/
26 #include "buf0rea.h"
27 
28 #include "fil0fil.h"
29 #include "mtr0mtr.h"
30 
31 #include "buf0buf.h"
32 #include "buf0flu.h"
33 #include "buf0lru.h"
34 #include "ibuf0ibuf.h"
35 #include "log0recv.h"
36 #include "trx0sys.h"
37 #include "os0file.h"
38 #include "srv0start.h"
39 #include "srv0srv.h"
40 
/** Size, in pages, of the area that linear read-ahead examines and reads;
defined as an alias of BUF_READ_AHEAD_AREA and invoked with the buffer pool
instance as its argument (see buf_read_ahead_linear()). */
42 #define BUF_READ_AHEAD_LINEAR_AREA BUF_READ_AHEAD_AREA
43 
/** Divisor applied to buf_pool->curr_size to obtain the pending-read limit:
when buf_pool->n_pend_reads exceeds curr_size / BUF_READ_AHEAD_PEND_LIMIT,
linear read-ahead is skipped and ibuf merge reads wait. */
47 #define BUF_READ_AHEAD_PEND_LIMIT 2
48 
49 /********************************************************************/
59 UNIV_INTERN
60 ulint
61 buf_read_page_low(
62 /*==============*/
63  ulint* err,
66  ibool sync,
67  ulint mode,
70  ulint space,
71  ulint zip_size,
72  ibool unzip,
73  ib_int64_t tablespace_version,
78  ulint offset)
79 {
80  buf_page_t* bpage;
81  ulint wake_later;
82 
83  *err = DB_SUCCESS;
84 
85  wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
86  mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER;
87 
88  if (trx_doublewrite && space == TRX_SYS_SPACE
89  && ( (offset >= trx_doublewrite->block1
90  && offset < trx_doublewrite->block1
92  || (offset >= trx_doublewrite->block2
93  && offset < trx_doublewrite->block2
95  ut_print_timestamp(stderr);
96  fprintf(stderr,
97  " InnoDB: Warning: trying to read"
98  " doublewrite buffer page %lu\n",
99  (ulong) offset);
100 
101  return(0);
102  }
103 
104  if (ibuf_bitmap_page(zip_size, offset)
105  || trx_sys_hdr_page(space, offset)) {
106 
107  /* Trx sys header is so low in the latching order that we play
108  safe and do not leave the i/o-completion to an asynchronous
109  i/o-thread. Ibuf bitmap pages must always be read with
110  syncronous i/o, to make sure they do not get involved in
111  thread deadlocks. */
112 
113  sync = TRUE;
114  }
115 
116  /* The following call will also check if the tablespace does not exist
117  or is being dropped; if we succeed in initing the page in the buffer
118  pool for read, then DISCARD cannot proceed until the read has
119  completed */
120  bpage = buf_page_init_for_read(err, mode, space, zip_size, unzip,
121  tablespace_version, offset);
122  if (bpage == NULL) {
123 
124  return(0);
125  }
126 
127 #ifdef UNIV_DEBUG
128  if (buf_debug_prints) {
129  fprintf(stderr,
130  "Posting read request for page %lu, sync %lu\n",
131  (ulong) offset,
132  (ulong) sync);
133  }
134 #endif
135 
136  ut_ad(buf_page_in_file(bpage));
137 
138  if (zip_size) {
139  *err = fil_io(OS_FILE_READ | wake_later,
140  sync, space, zip_size, offset, 0, zip_size,
141  bpage->zip.data, bpage);
142  } else {
144 
145  *err = fil_io(OS_FILE_READ | wake_later,
146  sync, space, 0, offset, 0, UNIV_PAGE_SIZE,
147  ((buf_block_t*) bpage)->frame, bpage);
148  }
149  ut_a(*err == DB_SUCCESS);
150 
151  if (sync) {
152  /* The i/o is already completed when we arrive from
153  fil_read */
154  buf_page_io_complete(bpage);
155  }
156 
157  return(1);
158 }
159 
160 /********************************************************************/
166 UNIV_INTERN
167 ibool
168 buf_read_page(
169 /*==========*/
170  ulint space,
171  ulint zip_size,
172  ulint offset)
173 {
174  buf_pool_t* buf_pool = buf_pool_get(space, offset);
175  ib_int64_t tablespace_version;
176  ulint count;
177  ulint err;
178 
179  tablespace_version = fil_space_get_version(space);
180 
181  /* We do the i/o in the synchronous aio mode to save thread
182  switches: hence TRUE */
183 
184  count = buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space,
185  zip_size, FALSE,
186  tablespace_version, offset);
187  srv_buf_pool_reads += count;
188  if (err == DB_TABLESPACE_DELETED) {
189  ut_print_timestamp(stderr);
190  fprintf(stderr,
191  " InnoDB: Error: trying to access"
192  " tablespace %lu page no. %lu,\n"
193  "InnoDB: but the tablespace does not exist"
194  " or is just being dropped.\n",
195  (ulong) space, (ulong) offset);
196  }
197 
198  /* Flush pages from the end of the LRU list if necessary */
199  buf_flush_free_margin(buf_pool);
200 
201  /* Increment number of I/O operations used for LRU policy. */
203 
204  return(count);
205 }
206 
207 /********************************************************************/
231 UNIV_INTERN
232 ulint
233 buf_read_ahead_linear(
234 /*==================*/
235  ulint space,
236  ulint zip_size,
237  ulint offset,
238  ibool inside_ibuf)
239 {
240  buf_pool_t* buf_pool = buf_pool_get(space, offset);
241  ib_int64_t tablespace_version;
242  buf_page_t* bpage;
243  buf_frame_t* frame;
244  buf_page_t* pred_bpage = NULL;
245  ulint pred_offset;
246  ulint succ_offset;
247  ulint count;
248  int asc_or_desc;
249  ulint new_offset;
250  ulint fail_count;
251  ulint ibuf_mode;
252  ulint low, high;
253  ulint err;
254  ulint i;
255  const ulint buf_read_ahead_linear_area
256  = BUF_READ_AHEAD_LINEAR_AREA(buf_pool);
257  ulint threshold;
258 
259  if ((srv_read_ahead & 2) == false) {
260  return(0);
261  }
262 
263  if (UNIV_UNLIKELY(srv_startup_is_before_trx_rollback_phase)) {
264  /* No read-ahead to avoid thread deadlocks */
265  return(0);
266  }
267 
268  low = (offset / buf_read_ahead_linear_area)
269  * buf_read_ahead_linear_area;
270  high = (offset / buf_read_ahead_linear_area + 1)
271  * buf_read_ahead_linear_area;
272 
273  if ((offset != low) && (offset != high - 1)) {
274  /* This is not a border page of the area: return */
275 
276  return(0);
277  }
278 
279  if (ibuf_bitmap_page(zip_size, offset)
280  || trx_sys_hdr_page(space, offset)) {
281 
282  /* If it is an ibuf bitmap page or trx sys hdr, we do
283  no read-ahead, as that could break the ibuf page access
284  order */
285 
286  return(0);
287  }
288 
289  /* Remember the tablespace version before we ask te tablespace size
290  below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we
291  do not try to read outside the bounds of the tablespace! */
292 
293  tablespace_version = fil_space_get_version(space);
294 
295  buf_pool_mutex_enter(buf_pool);
296 
297  if (high > fil_space_get_size(space)) {
298  buf_pool_mutex_exit(buf_pool);
299  /* The area is not whole, return */
300 
301  return(0);
302  }
303 
304  if (buf_pool->n_pend_reads
305  > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
306  buf_pool_mutex_exit(buf_pool);
307 
308  return(0);
309  }
310 
311  /* Check that almost all pages in the area have been accessed; if
312  offset == low, the accesses must be in a descending order, otherwise,
313  in an ascending order. */
314 
315  asc_or_desc = 1;
316 
317  if (offset == low) {
318  asc_or_desc = -1;
319  }
320 
321  /* How many out of order accessed pages can we ignore
322  when working out the access pattern for linear readahead */
323  threshold = ut_min((64 - srv_read_ahead_threshold),
324  BUF_READ_AHEAD_AREA(buf_pool));
325 
326  fail_count = 0;
327 
328  for (i = low; i < high; i++) {
329  bpage = buf_page_hash_get(buf_pool, space, i);
330 
331  if (bpage == NULL || !buf_page_is_accessed(bpage)) {
332  /* Not accessed */
333  fail_count++;
334 
335  } else if (pred_bpage) {
336  /* Note that buf_page_is_accessed() returns
337  the time of the first access. If some blocks
338  of the extent existed in the buffer pool at
339  the time of a linear access pattern, the first
340  access times may be nonmonotonic, even though
341  the latest access times were linear. The
342  threshold (srv_read_ahead_factor) should help
343  a little against this. */
344  int res = ut_ulint_cmp(
345  buf_page_is_accessed(bpage),
346  buf_page_is_accessed(pred_bpage));
347  /* Accesses not in the right order */
348  if (res != 0 && res != asc_or_desc) {
349  fail_count++;
350  }
351  }
352 
353  if (fail_count > threshold) {
354  /* Too many failures: return */
355  buf_pool_mutex_exit(buf_pool);
356  return(0);
357  }
358 
359  if (bpage && buf_page_is_accessed(bpage)) {
360  pred_bpage = bpage;
361  }
362  }
363 
364  /* If we got this far, we know that enough pages in the area have
365  been accessed in the right order: linear read-ahead can be sensible */
366 
367  bpage = buf_page_hash_get(buf_pool, space, offset);
368 
369  if (bpage == NULL) {
370  buf_pool_mutex_exit(buf_pool);
371 
372  return(0);
373  }
374 
375  switch (buf_page_get_state(bpage)) {
376  case BUF_BLOCK_ZIP_PAGE:
377  frame = bpage->zip.data;
378  break;
379  case BUF_BLOCK_FILE_PAGE:
380  frame = ((buf_block_t*) bpage)->frame;
381  break;
382  default:
383  ut_error;
384  break;
385  }
386 
387  /* Read the natural predecessor and successor page addresses from
388  the page; NOTE that because the calling thread may have an x-latch
389  on the page, we do not acquire an s-latch on the page, this is to
390  prevent deadlocks. Even if we read values which are nonsense, the
391  algorithm will work. */
392 
393  pred_offset = fil_page_get_prev(frame);
394  succ_offset = fil_page_get_next(frame);
395 
396  buf_pool_mutex_exit(buf_pool);
397 
398  if ((offset == low) && (succ_offset == offset + 1)) {
399 
400  /* This is ok, we can continue */
401  new_offset = pred_offset;
402 
403  } else if ((offset == high - 1) && (pred_offset == offset - 1)) {
404 
405  /* This is ok, we can continue */
406  new_offset = succ_offset;
407  } else {
408  /* Successor or predecessor not in the right order */
409 
410  return(0);
411  }
412 
413  low = (new_offset / buf_read_ahead_linear_area)
414  * buf_read_ahead_linear_area;
415  high = (new_offset / buf_read_ahead_linear_area + 1)
416  * buf_read_ahead_linear_area;
417 
418  if ((new_offset != low) && (new_offset != high - 1)) {
419  /* This is not a border page of the area: return */
420 
421  return(0);
422  }
423 
424  if (high > fil_space_get_size(space)) {
425  /* The area is not whole, return */
426 
427  return(0);
428  }
429 
430  /* If we got this far, read-ahead can be sensible: do it */
431 
432  ibuf_mode = inside_ibuf
435 
436  count = 0;
437 
438  /* Since Windows XP seems to schedule the i/o handler thread
439  very eagerly, and consequently it does not wait for the
440  full read batch to be posted, we use special heuristics here */
441 
443 
444  for (i = low; i < high; i++) {
445  /* It is only sensible to do read-ahead in the non-sync
446  aio mode: hence FALSE as the first parameter */
447 
448  if (!ibuf_bitmap_page(zip_size, i)) {
449  count += buf_read_page_low(
450  &err, FALSE,
451  ibuf_mode,
452  space, zip_size, FALSE, tablespace_version, i);
453  if (err == DB_TABLESPACE_DELETED) {
454  ut_print_timestamp(stderr);
455  fprintf(stderr,
456  " InnoDB: Warning: in"
457  " linear readahead trying to access\n"
458  "InnoDB: tablespace %lu page %lu,\n"
459  "InnoDB: but the tablespace does not"
460  " exist or is just being dropped.\n",
461  (ulong) space, (ulong) i);
462  }
463  }
464  }
465 
466  /* In simulated aio we wake the aio handler threads only after
467  queuing all aio requests, in native aio the following call does
468  nothing: */
469 
471 
472  /* Flush pages from the end of the LRU list if necessary */
473  buf_flush_free_margin(buf_pool);
474 
475 #ifdef UNIV_DEBUG
476  if (buf_debug_prints && (count > 0)) {
477  fprintf(stderr,
478  "LINEAR read-ahead space %lu offset %lu pages %lu\n",
479  (ulong) space, (ulong) offset, (ulong) count);
480  }
481 #endif /* UNIV_DEBUG */
482 
483  /* Read ahead is considered one I/O operation for the purpose of
484  LRU policy decision. */
486 
487  buf_pool->stat.n_ra_pages_read += count;
488  return(count);
489 }
490 
491 /********************************************************************/
495 UNIV_INTERN
496 void
497 buf_read_ibuf_merge_pages(
498 /*======================*/
499  ibool sync,
504  const ulint* space_ids,
505  const ib_int64_t* space_versions,
512  const ulint* page_nos,
516  ulint n_stored)
518 {
519  ulint i;
520 
521 #ifdef UNIV_IBUF_DEBUG
522  ut_a(n_stored < UNIV_PAGE_SIZE);
523 #endif
524 
525  for (i = 0; i < n_stored; i++) {
526  ulint err;
527  buf_pool_t* buf_pool;
528  ulint zip_size = fil_space_get_zip_size(space_ids[i]);
529 
530  buf_pool = buf_pool_get(space_ids[i], page_nos[i]);
531 
532  while (buf_pool->n_pend_reads
533  > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
534  os_thread_sleep(500000);
535  }
536 
537  if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
538 
539  goto tablespace_deleted;
540  }
541 
542  buf_read_page_low(&err, sync && (i + 1 == n_stored),
543  BUF_READ_ANY_PAGE, space_ids[i],
544  zip_size, TRUE, space_versions[i],
545  page_nos[i]);
546 
547  if (UNIV_UNLIKELY(err == DB_TABLESPACE_DELETED)) {
548 tablespace_deleted:
549  /* We have deleted or are deleting the single-table
550  tablespace: remove the entries for that page */
551 
552  ibuf_merge_or_delete_for_page(NULL, space_ids[i],
553  page_nos[i],
554  zip_size, FALSE);
555  }
556  }
557 
559 
560  /* Flush pages from the end of all the LRU lists if necessary */
561  buf_flush_free_margins();
562 
563 #ifdef UNIV_DEBUG
564  if (buf_debug_prints) {
565  fprintf(stderr,
566  "Ibuf merge read-ahead space %lu pages %lu\n",
567  (ulong) space_ids[0], (ulong) n_stored);
568  }
569 #endif /* UNIV_DEBUG */
570 }
571 
572 /********************************************************************/
574 UNIV_INTERN
575 void
576 buf_read_recv_pages(
577 /*================*/
578  ibool sync,
583  ulint space,
584  ulint zip_size,
586  const ulint* page_nos,
590  ulint n_stored)
592 {
593  ib_int64_t tablespace_version;
594  ulint count;
595  ulint err;
596  ulint i;
597 
598  zip_size = fil_space_get_zip_size(space);
599 
600  if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
601  /* It is a single table tablespace and the .ibd file is
602  missing: do nothing */
603 
604  return;
605  }
606 
607  tablespace_version = fil_space_get_version(space);
608 
609  for (i = 0; i < n_stored; i++) {
610  buf_pool_t* buf_pool;
611 
612  count = 0;
613 
614  os_aio_print_debug = FALSE;
615  buf_pool = buf_pool_get(space, page_nos[i]);
616  while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) {
617 
619  os_thread_sleep(10000);
620 
621  count++;
622 
623  if (count > 1000) {
624  fprintf(stderr,
625  "InnoDB: Error: InnoDB has waited for"
626  " 10 seconds for pending\n"
627  "InnoDB: reads to the buffer pool to"
628  " be finished.\n"
629  "InnoDB: Number of pending reads %lu,"
630  " pending pread calls %lu\n",
631  (ulong) buf_pool->n_pend_reads,
632  (ulong)os_file_n_pending_preads);
633 
634  os_aio_print_debug = TRUE;
635  }
636  }
637 
638  os_aio_print_debug = FALSE;
639 
640  if ((i + 1 == n_stored) && sync) {
641  buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space,
642  zip_size, TRUE, tablespace_version,
643  page_nos[i]);
644  } else {
645  buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE
647  space, zip_size, TRUE,
648  tablespace_version, page_nos[i]);
649  }
650  }
651 
653 
654  /* Flush pages from the end of all the LRU lists if necessary */
655  buf_flush_free_margins();
656 
657 #ifdef UNIV_DEBUG
658  if (buf_debug_prints) {
659  fprintf(stderr,
660  "Recovery applies read-ahead pages %lu\n",
661  (ulong) n_stored);
662  }
663 #endif /* UNIV_DEBUG */
664 }