Drizzled Public API Documentation

buf0flu.cc
1 /*****************************************************************************
2 
3 Copyright (C) 1995, 2010, Innobase Oy. All Rights Reserved.
4 
5 This program is free software; you can redistribute it and/or modify it under
6 the terms of the GNU General Public License as published by the Free Software
7 Foundation; version 2 of the License.
8 
9 This program is distributed in the hope that it will be useful, but WITHOUT
10 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
12 
13 You should have received a copy of the GNU General Public License along with
14 this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
15 St, Fifth Floor, Boston, MA 02110-1301 USA
16 
17 *****************************************************************************/
18 
19 /**************************************************//**
20 @file buf0flu.cc
21 The database buffer buf_pool flush algorithm
22 
23 Created 11/11/1995 Heikki Tuuri
24 *******************************************************/
25 
26 #include "buf0flu.h"
27 
28 #ifdef UNIV_NONINL
29 #include "buf0flu.ic"
30 #endif
31 
32 #include "buf0buf.h"
33 #include "srv0srv.h"
34 #include "page0zip.h"
35 #ifndef UNIV_HOTBACKUP
36 #include "ut0byte.h"
37 #include "ut0lst.h"
38 #include "page0page.h"
39 #include "fil0fil.h"
40 #include "buf0lru.h"
41 #include "buf0rea.h"
42 #include "ibuf0ibuf.h"
43 #include "log0log.h"
44 #include "os0file.h"
45 #include "trx0sys.h"
46 
47 /**********************************************************************
48 These statistics are generated for heuristics used in estimating the
49 rate at which we should flush the dirty blocks to avoid bursty IO
50 activity. Note that the rate of flushing not only depends on how many
51 dirty pages we have in the buffer pool but it is also a function of
52 how much redo the workload is generating and at what rate. */
53 /* @{ */
54 
55 /** Number of intervals for which we keep the history of these stats.
56 Each interval is 1 second, defined by the rate at which
57 srv_error_monitor_thread() calls buf_flush_stat_update(). */
58 #define BUF_FLUSH_STAT_N_INTERVAL 20
59 
60 /** Sampled values of buf_flush_stat_cur, one slot per interval.
61 Not protected by any mutex; updated once per second. */
62 static buf_flush_stat_t buf_flush_stat_arr[BUF_FLUSH_STAT_N_INTERVAL];
63 
64 /** Cursor into buf_flush_stat_arr[], advanced round-robin. */
65 static ulint buf_flush_stat_arr_ind;
66 
67 /** Values at the start of the current interval; reset by
68 buf_flush_stat_update(). */
69 static buf_flush_stat_t buf_flush_stat_cur;
70 
71 /** Running sum of the values stored in buf_flush_stat_arr[];
72 maintained by buf_flush_stat_update(). Not protected by any mutex. */
73 static buf_flush_stat_t buf_flush_stat_sum;
74 
75 /** Number of pages flushed as part of LRU (non flush_list) flushes. */
76 static ulint buf_lru_flush_page_count = 0;
77 
78 /* @} */
79 
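The declarations above implement a 20-slot ring of per-second samples plus a running sum, so the flush-rate heuristic can compute averages over the last BUF_FLUSH_STAT_N_INTERVAL seconds in constant time. A minimal standalone sketch of that ring-buffer bookkeeping (editor's illustration in plain C with made-up numbers; buf_flush_stat_update() further below is the real implementation):

#include <stdio.h>

#define N_INTERVAL 20

/* One sample per interval: redo bytes generated and LRU pages flushed. */
struct sample { unsigned long redo; unsigned long n_flushed; };

static struct sample arr[N_INTERVAL]; /* ring of the last N_INTERVAL samples */
static struct sample sum;             /* running totals over the ring */
static unsigned long ind;             /* next slot to overwrite */

/* Record one interval: subtract the slot being overwritten from the
running sum, add the new values, then advance the cursor, so the sum
always covers exactly N_INTERVAL samples. */
static void stat_update(unsigned long redo, unsigned long n_flushed)
{
	sum.redo      += redo      - arr[ind].redo;
	sum.n_flushed += n_flushed - arr[ind].n_flushed;
	arr[ind].redo      = redo;
	arr[ind].n_flushed = n_flushed;
	ind = (ind + 1) % N_INTERVAL;
}

int main(void)
{
	int i;
	for (i = 0; i < 40; i++) {
		stat_update(1000, 10);
	}
	printf("avg redo/interval %lu, avg pages/interval %lu\n",
	       sum.redo / N_INTERVAL, sum.n_flushed / N_INTERVAL);
	return 0;
}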
80 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
81 /******************************************************************/
84 static
85 ibool
86 buf_flush_validate_low(
87 /*===================*/
88  buf_pool_t* buf_pool);
89 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
90 
91 /******************************************************************/
92 /** Inserts a block in the flush_rbt and returns a pointer to its
93 predecessor or NULL if no predecessor. The ordering is maintained
94 on the basis of the <oldest_modification, space, offset> key.
95 @return	pointer to the predecessor or NULL if no predecessor */
96 static
97 buf_page_t*
98 buf_flush_insert_in_flush_rbt(
99 /*==========================*/
100  buf_page_t* bpage)
101 {
102  const ib_rbt_node_t* c_node;
103  const ib_rbt_node_t* p_node;
104  buf_page_t* prev = NULL;
105  buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
106 
107  ut_ad(buf_flush_list_mutex_own(buf_pool));
108 
109  /* Insert this buffer into the rbt. */
110  c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage);
111  ut_a(c_node != NULL);
112 
113  /* Get the predecessor. */
114  p_node = rbt_prev(buf_pool->flush_rbt, c_node);
115 
116  if (p_node != NULL) {
117  buf_page_t** value;
118  value = rbt_value(buf_page_t*, p_node);
119  prev = *value;
120  ut_a(prev != NULL);
121  }
122 
123  return(prev);
124 }
125 
126 /*********************************************************/
128 static
129 void
130 buf_flush_delete_from_flush_rbt(
131 /*============================*/
132  buf_page_t* bpage)
133 {
134 #ifdef UNIV_DEBUG
135  ibool ret = FALSE;
136 #endif /* UNIV_DEBUG */
137  buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
138 
139  ut_ad(buf_flush_list_mutex_own(buf_pool));
140 
141 #ifdef UNIV_DEBUG
142  ret =
143 #endif /* UNIV_DEBUG */
144  rbt_delete(buf_pool->flush_rbt, &bpage);
145  ut_ad(ret);
146 }
147 
148 /*****************************************************************/
158 static
159 int
160 buf_flush_block_cmp(
161 /*================*/
162  const void* p1,
163  const void* p2)
164 {
165  int ret;
166  const buf_page_t* b1 = *(const buf_page_t**) p1;
167  const buf_page_t* b2 = *(const buf_page_t**) p2;
168 #ifdef UNIV_DEBUG
169  buf_pool_t* buf_pool = buf_pool_from_bpage(b1);
170 #endif /* UNIV_DEBUG */
171 
172  ut_ad(b1 != NULL);
173  ut_ad(b2 != NULL);
174 
175  ut_ad(buf_flush_list_mutex_own(buf_pool));
176 
177  ut_ad(b1->in_flush_list);
178  ut_ad(b2->in_flush_list);
179 
180  if (b2->oldest_modification > b1->oldest_modification) {
181  return(1);
182  } else if (b2->oldest_modification < b1->oldest_modification) {
183  return(-1);
184  }
185 
186  /* If oldest_modification is same then decide on the space. */
187  ret = (int)(b2->space - b1->space);
188 
189  /* Or else decide ordering on the offset field. */
190  return(ret ? ret : (int)(b2->offset - b1->offset));
191 }
192 
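buf_flush_block_cmp() above keys blocks on <oldest_modification, space, offset>: the LSN of the oldest change orders the tree, and the space id and page offset only make keys unique. A standalone model of the same three-level comparison, shown with qsort() (editor's illustration; hypothetical struct, not the real buf_page_t layout):

#include <stdio.h>
#include <stdlib.h>

struct blk {
	unsigned long long oldest_modification; /* LSN of the oldest change */
	unsigned long      space;               /* tablespace id */
	unsigned long      offset;              /* page number */
};

/* Same key order as buf_flush_block_cmp(): LSN first, then space and
offset as tie-breakers. With qsort() this places larger LSNs first;
the direction is irrelevant here, only the total order matters. */
static int blk_cmp(const void* p1, const void* p2)
{
	const struct blk* b1 = p1;
	const struct blk* b2 = p2;

	if (b2->oldest_modification > b1->oldest_modification) return 1;
	if (b2->oldest_modification < b1->oldest_modification) return -1;
	if (b2->space != b1->space) return (int)(b2->space - b1->space);
	return (int)(b2->offset - b1->offset);
}

int main(void)
{
	struct blk b[3] = { {30, 0, 5}, {10, 0, 7}, {10, 1, 2} };
	int i;

	qsort(b, 3, sizeof(b[0]), blk_cmp);
	for (i = 0; i < 3; i++) {
		printf("lsn=%llu space=%lu page=%lu\n",
		       b[i].oldest_modification, b[i].space, b[i].offset);
	}
	return 0;
}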
193 /********************************************************************/
194 /** Initializes the red-black tree to speed up insertions into the
195 flush_list during the recovery process. Should be called at the start
196 of the recovery process, before any page has been read or written. */
197 UNIV_INTERN
198 void
199 buf_flush_init_flush_rbt(void)
200 /*==========================*/
201 {
202  ulint i;
203 
204  for (i = 0; i < srv_buf_pool_instances; i++) {
205  buf_pool_t* buf_pool;
206 
207  buf_pool = buf_pool_from_array(i);
208 
209  buf_flush_list_mutex_enter(buf_pool);
210 
211  /* Create red black tree for speedy insertions in flush list. */
212  buf_pool->flush_rbt = rbt_create(
213  sizeof(buf_page_t*), buf_flush_block_cmp);
214 
215  buf_flush_list_mutex_exit(buf_pool);
216  }
217 }
218 
219 /********************************************************************/
220 /** Frees up the red-black tree. */
221 UNIV_INTERN
222 void
223 buf_flush_free_flush_rbt(void)
224 /*==========================*/
225 {
226  ulint i;
227 
228  for (i = 0; i < srv_buf_pool_instances; i++) {
229  buf_pool_t* buf_pool;
230 
231  buf_pool = buf_pool_from_array(i);
232 
233  buf_flush_list_mutex_enter(buf_pool);
234 
246 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
247  ut_a(buf_flush_validate_low(buf_pool));
248 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
249 
250  rbt_free(buf_pool->flush_rbt);
251  buf_pool->flush_rbt = NULL;
252 
253  buf_flush_list_mutex_exit(buf_pool);
254  }
255 }
256 
257 /********************************************************************/
258 /** Inserts a modified block into the flush list. */
259 UNIV_INTERN
260 void
261 buf_flush_insert_into_flush_list(
262 /*=============================*/
263  buf_pool_t* buf_pool,
264  buf_block_t* block,
265  ib_uint64_t lsn)
266 {
267  ut_ad(!buf_pool_mutex_own(buf_pool));
269  ut_ad(mutex_own(&block->mutex));
270 
271  buf_flush_list_mutex_enter(buf_pool);
272 
273  ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
274  || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
275  <= lsn));
276 
277  /* If we are in the recovery then we need to update the flush
278  red-black tree as well. */
279  if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
280  buf_flush_list_mutex_exit(buf_pool);
281  buf_flush_insert_sorted_into_flush_list(buf_pool, block, lsn);
282  return;
283  }
284 
286  ut_ad(!block->page.in_flush_list);
287 
288  ut_d(block->page.in_flush_list = TRUE);
289  block->page.oldest_modification = lsn;
290  UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);
291 
292 #ifdef UNIV_DEBUG_VALGRIND
293  {
294  ulint zip_size = buf_block_get_zip_size(block);
295 
296  if (UNIV_UNLIKELY(zip_size)) {
297  UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
298  } else {
299  UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
300  }
301  }
302 #endif /* UNIV_DEBUG_VALGRIND */
303 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
304  ut_a(buf_flush_validate_low(buf_pool));
305 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
306 
307  buf_flush_list_mutex_exit(buf_pool);
308 }
309 
310 /********************************************************************/
311 /** Inserts a modified block into the flush list in the right sorted
312 position. This function is used by recovery, because there the
313 modifications do not necessarily come in the order of lsn's. */
314 UNIV_INTERN
315 void
316 buf_flush_insert_sorted_into_flush_list(
317 /*====================================*/
318  buf_pool_t* buf_pool,
319  buf_block_t* block,
320  ib_uint64_t lsn)
321 {
322  buf_page_t* prev_b;
323  buf_page_t* b;
324 
325  ut_ad(!buf_pool_mutex_own(buf_pool));
327  ut_ad(mutex_own(&block->mutex));
329 
330  buf_flush_list_mutex_enter(buf_pool);
331 
332  /* The field in_LRU_list is protected by buf_pool->mutex, which
333  we are not holding. However, while a block is in the flush
334  list, it is dirty and cannot be discarded, not from the
335  page_hash or from the LRU list. At most, the uncompressed
336  page frame of a compressed block may be discarded or created
337  (copying the block->page to or from a buf_page_t that is
338  dynamically allocated from buf_buddy_alloc()). Because those
339  transitions hold block->mutex and the flush list mutex (via
340  buf_flush_relocate_on_flush_list()), there is no possibility
341  of a race condition in the assertions below. */
342  ut_ad(block->page.in_LRU_list);
343  ut_ad(block->page.in_page_hash);
344  /* buf_buddy_block_register() will take a block in the
345  BUF_BLOCK_MEMORY state, not a file page. */
346  ut_ad(!block->page.in_zip_hash);
347 
348  ut_ad(!block->page.in_flush_list);
349  ut_d(block->page.in_flush_list = TRUE);
350  block->page.oldest_modification = lsn;
351 
352 #ifdef UNIV_DEBUG_VALGRIND
353  {
354  ulint zip_size = buf_block_get_zip_size(block);
355 
356  if (UNIV_UNLIKELY(zip_size)) {
357  UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
358  } else {
359  UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
360  }
361  }
362 #endif /* UNIV_DEBUG_VALGRIND */
363 
375 
376  prev_b = NULL;
377 
378  /* For the most part when this function is called the flush_rbt
379  should not be NULL. In a very rare boundary case it is possible
380  that the flush_rbt has already been freed by the recovery thread
381  before the last page was hooked up in the flush_list by the
382  io-handler thread. In that case we'll just do a simple
383  linear search in the else block. */
384  if (buf_pool->flush_rbt) {
385 
386  prev_b = buf_flush_insert_in_flush_rbt(&block->page);
387 
388  } else {
389 
390  b = UT_LIST_GET_FIRST(buf_pool->flush_list);
391 
392  while (b && b->oldest_modification
393  > block->page.oldest_modification) {
394  ut_ad(b->in_flush_list);
395  prev_b = b;
396  b = UT_LIST_GET_NEXT(list, b);
397  }
398  }
399 
400  if (prev_b == NULL) {
401  UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);
402  } else {
403  UT_LIST_INSERT_AFTER(list, buf_pool->flush_list,
404  prev_b, &block->page);
405  }
406 
407 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
408  ut_a(buf_flush_validate_low(buf_pool));
409 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
410 
411  buf_flush_list_mutex_exit(buf_pool);
412 }
413 
414 /********************************************************************/
415 /** Returns TRUE if the file page block is immediately suitable for
416 replacement, i.e., the transition FILE_PAGE => NOT_USED is allowed.
417 @return	TRUE if can replace immediately */
418 UNIV_INTERN
419 ibool
420 buf_flush_ready_for_replace(
421 /*========================*/
422  buf_page_t* bpage)
424 {
425 #ifdef UNIV_DEBUG
426  buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
427  ut_ad(buf_pool_mutex_own(buf_pool));
428 #endif
429  ut_ad(mutex_own(buf_page_get_mutex(bpage)));
430  ut_ad(bpage->in_LRU_list);
431 
432  if (UNIV_LIKELY(buf_page_in_file(bpage))) {
433 
434  return(bpage->oldest_modification == 0
435  && buf_page_get_io_fix(bpage) == BUF_IO_NONE
436  && bpage->buf_fix_count == 0);
437  }
438 
439  ut_print_timestamp(stderr);
440  fprintf(stderr,
441  " InnoDB: Error: buffer block state %lu"
442  " in the LRU list!\n",
443  (ulong) buf_page_get_state(bpage));
444  ut_print_buf(stderr, bpage, sizeof(buf_page_t));
445  putc('\n', stderr);
446 
447  return(FALSE);
448 }
449 
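buf_flush_ready_for_replace() above answers a single question: a page may leave the LRU list only if it is clean (oldest_modification == 0), unpinned (buf_fix_count == 0) and has no I/O in progress. A standalone model of that predicate (editor's illustration; the field names mirror the ones used above, but this is not the real buf_page_t):

#include <stdbool.h>
#include <stdio.h>

enum io_fix { IO_NONE, IO_READ, IO_WRITE };

struct page {
	unsigned long long oldest_modification; /* 0 means the page is clean */
	unsigned long      buf_fix_count;       /* > 0 means some thread pins it */
	enum io_fix        io_fix;              /* pending I/O, if any */
};

/* Mirrors the three conditions tested by buf_flush_ready_for_replace(). */
static bool ready_for_replace(const struct page* p)
{
	return p->oldest_modification == 0
	    && p->io_fix == IO_NONE
	    && p->buf_fix_count == 0;
}

int main(void)
{
	struct page clean = { 0, 0, IO_NONE };
	struct page dirty = { 12345, 0, IO_NONE };

	printf("clean replaceable: %d, dirty replaceable: %d\n",
	       ready_for_replace(&clean), ready_for_replace(&dirty));
	return 0;
}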
450 /********************************************************************/
453 UNIV_INLINE
454 ibool
455 buf_flush_ready_for_flush(
456 /*======================*/
457  buf_page_t* bpage,
459  enum buf_flush flush_type)
460 {
461 #ifdef UNIV_DEBUG
462  buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
463  ut_ad(buf_pool_mutex_own(buf_pool));
464 #endif
465  ut_a(buf_page_in_file(bpage));
466  ut_ad(mutex_own(buf_page_get_mutex(bpage)));
467  ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
468 
469  if (bpage->oldest_modification != 0
470  && buf_page_get_io_fix(bpage) == BUF_IO_NONE) {
471  ut_ad(bpage->in_flush_list);
472 
473  if (flush_type != BUF_FLUSH_LRU) {
474 
475  return(TRUE);
476 
477  } else if (bpage->buf_fix_count == 0) {
478 
479  /* If we are flushing the LRU list, to avoid deadlocks
480  we require the block not to be bufferfixed, and hence
481  not latched. */
482 
483  return(TRUE);
484  }
485  }
486 
487  return(FALSE);
488 }
489 
490 /********************************************************************/
491 /** Removes a block from the flush list of modified blocks. */
492 UNIV_INTERN
493 void
494 buf_flush_remove(
495 /*=============*/
496  buf_page_t* bpage)
497 {
498  buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
499 
500  ut_ad(buf_pool_mutex_own(buf_pool));
501  ut_ad(mutex_own(buf_page_get_mutex(bpage)));
502  ut_ad(bpage->in_flush_list);
503 
504  buf_flush_list_mutex_enter(buf_pool);
505 
506  switch (buf_page_get_state(bpage)) {
507  case BUF_BLOCK_ZIP_PAGE:
508  /* Clean compressed pages should not be on the flush list */
509  case BUF_BLOCK_ZIP_FREE:
510  case BUF_BLOCK_NOT_USED:
511  case BUF_BLOCK_READY_FOR_USE:
512  case BUF_BLOCK_MEMORY:
513  case BUF_BLOCK_REMOVE_HASH:
514  ut_error;
515  return;
516  case BUF_BLOCK_ZIP_DIRTY:
517  buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
518  UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
519  buf_LRU_insert_zip_clean(bpage);
520  break;
521  case BUF_BLOCK_FILE_PAGE:
522  UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
523  break;
524  }
525 
526  /* If the flush_rbt is active then delete from there as well. */
527  if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
528  buf_flush_delete_from_flush_rbt(bpage);
529  }
530 
531  /* Must be done after we have removed it from the flush_rbt
532  because we assert on in_flush_list in comparison function. */
533  ut_d(bpage->in_flush_list = FALSE);
534 
535  bpage->oldest_modification = 0;
536 
537 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
538  ut_a(buf_flush_validate_low(buf_pool));
539 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
540 
541  buf_flush_list_mutex_exit(buf_pool);
542 }
543 
544 /*******************************************************************/
545 /** Relocates a buffer control block on the flush_list.
546 Note that it is assumed that the contents of bpage have already been
547 copied to dpage. The current list node (bpage) is used for the list
548 manipulation, because the ::list pointers in dpage may be stale. */
555 UNIV_INTERN
556 void
557 buf_flush_relocate_on_flush_list(
558 /*=============================*/
559  buf_page_t* bpage,
560  buf_page_t* dpage)
561 {
562  buf_page_t* prev;
563  buf_page_t* prev_b = NULL;
564  buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
565 
566  ut_ad(buf_pool_mutex_own(buf_pool));
567  /* Must reside in the same buffer pool. */
568  ut_ad(buf_pool == buf_pool_from_bpage(dpage));
569 
570  ut_ad(mutex_own(buf_page_get_mutex(bpage)));
571 
572  buf_flush_list_mutex_enter(buf_pool);
573 
574  /* FIXME: At this point we have both buf_pool and flush_list
575  mutexes. Theoretically removal of a block from flush list is
576  only covered by flush_list mutex but currently we do
577  have buf_pool mutex in buf_flush_remove() therefore this block
578  is guaranteed to be in the flush list. We need to check if
579  this will work without the assumption of block removing code
580  having the buf_pool mutex. */
581  ut_ad(bpage->in_flush_list);
582  ut_ad(dpage->in_flush_list);
583 
584  /* If recovery is active we must swap the control blocks in
585  the flush_rbt as well. */
586  if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
587  buf_flush_delete_from_flush_rbt(bpage);
588  prev_b = buf_flush_insert_in_flush_rbt(dpage);
589  }
590 
591  /* Must be done after we have removed it from the flush_rbt
592  because we assert on in_flush_list in comparison function. */
593  ut_d(bpage->in_flush_list = FALSE);
594 
595  prev = UT_LIST_GET_PREV(list, bpage);
596  UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
597 
598  if (prev) {
599  ut_ad(prev->in_flush_list);
600  UT_LIST_INSERT_AFTER(
601  list,
602  buf_pool->flush_list,
603  prev, dpage);
604  } else {
605  UT_LIST_ADD_FIRST(
606  list,
607  buf_pool->flush_list,
608  dpage);
609  }
610 
611  /* Just an extra check. Previous in flush_list
612  should be the same control block as in flush_rbt. */
613  ut_a(!buf_pool->flush_rbt || prev_b == prev);
614 
615 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
616  ut_a(buf_flush_validate_low(buf_pool));
617 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
618 
619  buf_flush_list_mutex_exit(buf_pool);
620 }
621 
622 /********************************************************************/
623 /** Updates the flush system data structures when a write is completed. */
624 UNIV_INTERN
625 void
626 buf_flush_write_complete(
627 /*=====================*/
628  buf_page_t* bpage)
629 {
630  enum buf_flush flush_type;
631  buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
632 
633  ut_ad(bpage);
634 
635  buf_flush_remove(bpage);
636 
637  flush_type = buf_page_get_flush_type(bpage);
638  buf_pool->n_flush[flush_type]--;
639 
640  if (flush_type == BUF_FLUSH_LRU) {
641  /* Put the block to the end of the LRU list to wait to be
642  moved to the free list */
643 
644  buf_LRU_make_block_old(bpage);
645 
646  buf_pool->LRU_flush_ended++;
647  }
648 
649  /* fprintf(stderr, "n pending flush %lu\n",
650  buf_pool->n_flush[flush_type]); */
651 
652  if (buf_pool->n_flush[flush_type] == 0
653  && buf_pool->init_flush[flush_type] == FALSE) {
654 
655  /* The running flush batch has ended */
656 
657  os_event_set(buf_pool->no_flush[flush_type]);
658  }
659 }
660 
661 /********************************************************************/
664 static
665 void
666 buf_flush_sync_datafiles(void)
667 /*==========================*/
668 {
669  /* Wake possible simulated aio thread to actually post the
670  writes to the operating system */
671  os_aio_simulated_wake_handler_threads();
672 
673  /* Wait that all async writes to tablespaces have been posted to
674  the OS */
675  os_aio_wait_until_no_pending_writes();
676 
677  /* Now we flush the data to disk (for example, with fsync) */
678  fil_flush_file_spaces(FIL_TABLESPACE);
679 
680  return;
681 }
682 
683 /********************************************************************/
684 /** Flushes possible buffered writes from the doublewrite memory buffer
685 to disk, and also wakes up the aio thread if simulated aio is used.
686 It is very important to call this function after a batch of writes has
687 been posted, and also when we may have to wait for a page latch!
688 Otherwise a deadlock of threads can occur. */
689 static
690 void
691 buf_flush_buffered_writes(void)
692 /*===========================*/
693 {
694  byte* write_buf;
695  ulint len;
696  ulint len2;
697  ulint i;
698 
699  if (!srv_use_doublewrite_buf || trx_doublewrite == NULL) {
700  /* Sync the writes to the disk. */
701  buf_flush_sync_datafiles();
702  return;
703  }
704 
705  mutex_enter(&(trx_doublewrite->mutex));
706 
707  /* Write first to doublewrite buffer blocks. We use synchronous
708  aio and thus know that file write has been completed when the
709  control returns. */
710 
711  if (trx_doublewrite->first_free == 0) {
712 
713  mutex_exit(&(trx_doublewrite->mutex));
714 
715  return;
716  }
717 
718  for (i = 0; i < trx_doublewrite->first_free; i++) {
719 
720  const buf_block_t* block;
721 
722  block = (buf_block_t*) trx_doublewrite->buf_block_arr[i];
723 
724  if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
725  || block->page.zip.data) {
726  /* No simple validate for compressed pages exists. */
727  continue;
728  }
729 
730  if (UNIV_UNLIKELY
731  (memcmp(block->frame + (FIL_PAGE_LSN + 4),
732  block->frame + (UNIV_PAGE_SIZE
733  - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
734  4))) {
735  ut_print_timestamp(stderr);
736  fprintf(stderr,
737  " InnoDB: ERROR: The page to be written"
738  " seems corrupt!\n"
739  "InnoDB: The lsn fields do not match!"
740  " Noticed in the buffer pool\n"
741  "InnoDB: before posting to the"
742  " doublewrite buffer.\n");
743  }
744 
745  if (!block->check_index_page_at_flush) {
746  } else if (page_is_comp(block->frame)) {
747  if (UNIV_UNLIKELY
748  (!page_simple_validate_new(block->frame))) {
749 corrupted_page:
750  buf_page_print(block->frame, 0);
751 
752  ut_print_timestamp(stderr);
753  fprintf(stderr,
754  " InnoDB: Apparent corruption of an"
755  " index page n:o %lu in space %lu\n"
756  "InnoDB: to be written to data file."
757  " We intentionally crash server\n"
758  "InnoDB: to prevent corrupt data"
759  " from ending up in data\n"
760  "InnoDB: files.\n",
761  (ulong) buf_block_get_page_no(block),
762  (ulong) buf_block_get_space(block));
763 
764  ut_error;
765  }
766  } else if (UNIV_UNLIKELY
767  (!page_simple_validate_old(block->frame))) {
768 
769  goto corrupted_page;
770  }
771  }
772 
773  /* increment the doublewrite flushed pages counter */
774  srv_dblwr_pages_written+= trx_doublewrite->first_free;
775  srv_dblwr_writes++;
776 
777  len = ut_min(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE,
778  trx_doublewrite->first_free) * UNIV_PAGE_SIZE;
779 
780  write_buf = trx_doublewrite->write_buf;
781  i = 0;
782 
783  fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
784  trx_doublewrite->block1, 0, len,
785  (void*) write_buf, NULL);
786 
787  for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
788  len2 += UNIV_PAGE_SIZE, i++) {
789  const buf_block_t* block = (buf_block_t*)
790  trx_doublewrite->buf_block_arr[i];
791 
792  if (UNIV_LIKELY(!block->page.zip.data)
793  && UNIV_LIKELY(buf_block_get_state(block)
794  == BUF_BLOCK_FILE_PAGE)
795  && UNIV_UNLIKELY
796  (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
797  write_buf + len2
798  + (UNIV_PAGE_SIZE
799  - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
800  ut_print_timestamp(stderr);
801  fprintf(stderr,
802  " InnoDB: ERROR: The page to be written"
803  " seems corrupt!\n"
804  "InnoDB: The lsn fields do not match!"
805  " Noticed in the doublewrite block1.\n");
806  }
807  }
808 
809  if (trx_doublewrite->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
810  goto flush;
811  }
812 
813  len = (trx_doublewrite->first_free - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
814  * UNIV_PAGE_SIZE;
815 
816  write_buf = trx_doublewrite->write_buf
817  + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
819 
820  fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
821  trx_doublewrite->block2, 0, len,
822  (void*) write_buf, NULL);
823 
824  for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
825  len2 += UNIV_PAGE_SIZE, i++) {
826  const buf_block_t* block = (buf_block_t*)
827  trx_doublewrite->buf_block_arr[i];
828 
829  if (UNIV_LIKELY(!block->page.zip.data)
830  && UNIV_LIKELY(buf_block_get_state(block)
831  == BUF_BLOCK_FILE_PAGE)
832  && UNIV_UNLIKELY
833  (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
834  write_buf + len2
835  + (UNIV_PAGE_SIZE
836  - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
837  ut_print_timestamp(stderr);
838  fprintf(stderr,
839  " InnoDB: ERROR: The page to be"
840  " written seems corrupt!\n"
841  "InnoDB: The lsn fields do not match!"
842  " Noticed in"
843  " the doublewrite block2.\n");
844  }
845  }
846 
847 flush:
848  /* Now flush the doublewrite buffer data to disk */
849 
850  fil_flush(TRX_SYS_SPACE);
851 
852  /* We know that the writes have been flushed to disk now
853  and in recovery we will find them in the doublewrite buffer
854  blocks. Next do the writes to the intended positions. */
855 
856  for (i = 0; i < trx_doublewrite->first_free; i++) {
857  const buf_block_t* block = (buf_block_t*)
858  trx_doublewrite->buf_block_arr[i];
859 
860  ut_a(buf_page_in_file(&block->page));
861  if (UNIV_LIKELY_NULL(block->page.zip.data)) {
862  fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
863  FALSE, buf_page_get_space(&block->page),
864  buf_page_get_zip_size(&block->page),
865  buf_page_get_page_no(&block->page), 0,
866  buf_page_get_zip_size(&block->page),
867  (void*)block->page.zip.data,
868  (void*)block);
869 
870  /* Increment the counter of I/O operations used
871  for selecting LRU policy. */
872  buf_LRU_stat_inc_io();
873 
874  continue;
875  }
876 
877  ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
878 
879  if (UNIV_UNLIKELY(memcmp(block->frame + (FIL_PAGE_LSN + 4),
880  block->frame
881  + (UNIV_PAGE_SIZE
882  - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
883  4))) {
884  ut_print_timestamp(stderr);
885  fprintf(stderr,
886  " InnoDB: ERROR: The page to be written"
887  " seems corrupt!\n"
888  "InnoDB: The lsn fields do not match!"
889  " Noticed in the buffer pool\n"
890  "InnoDB: after posting and flushing"
891  " the doublewrite buffer.\n"
892  "InnoDB: Page buf fix count %lu,"
893  " io fix %lu, state %lu\n",
894  (ulong)block->page.buf_fix_count,
895  (ulong)buf_block_get_io_fix(block),
896  (ulong)buf_block_get_state(block));
897  }
898 
899  fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
900  FALSE, buf_block_get_space(block), 0,
901  buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE,
902  (void*)block->frame, (void*)block);
903 
904  /* Increment the counter of I/O operations used
905  for selecting LRU policy. */
906  buf_LRU_stat_inc_io();
907  }
908 
909  /* Sync the writes to the disk. */
910  buf_flush_sync_datafiles();
911 
912  /* We can now reuse the doublewrite memory buffer: */
913  trx_doublewrite->first_free = 0;
914 
915  mutex_exit(&(trx_doublewrite->mutex));
916 }
917 
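buf_flush_buffered_writes() above is the doublewrite protocol in action: the whole batch is first written and fsynced into a fixed doublewrite area in the system tablespace, and only then written to the real page locations, so a torn in-place write can always be repaired from the doublewrite copy during recovery. A standalone sketch of that two-phase ordering (editor's illustration with stand-in write/flush helpers, not the fil_io()/fil_flush() API):

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096
#define BATCH     4

static char doublewrite_area[BATCH][PAGE_SIZE]; /* stand-in for the fixed dblwr pages */
static char datafile[BATCH][PAGE_SIZE];         /* stand-in for the real page locations */

/* Stand-in for fil_flush()/fsync: the point at which the preceding
writes become durable. */
static void flush_to_disk(const char* what)
{
	printf("fsync %s\n", what);
}

static void flush_batch(char pages[BATCH][PAGE_SIZE])
{
	int i;

	/* Phase 1: copy the whole batch into the doublewrite area and
	make it durable before touching any real page. */
	for (i = 0; i < BATCH; i++) {
		memcpy(doublewrite_area[i], pages[i], PAGE_SIZE);
	}
	flush_to_disk("doublewrite area");

	/* Phase 2: only now write the pages in place; if one of these
	writes is torn by a crash, recovery restores it from phase 1. */
	for (i = 0; i < BATCH; i++) {
		memcpy(datafile[i], pages[i], PAGE_SIZE);
	}
	flush_to_disk("datafiles");
}

int main(void)
{
	static char dirty[BATCH][PAGE_SIZE];

	memset(dirty, 0xAB, sizeof(dirty));
	flush_batch(dirty);
	return 0;
}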
918 /********************************************************************/
922 static
923 void
924 buf_flush_post_to_doublewrite_buf(
925 /*==============================*/
926  buf_page_t* bpage)
927 {
928  ulint zip_size;
929 try_again:
930  mutex_enter(&(trx_doublewrite->mutex));
931 
932  ut_a(buf_page_in_file(bpage));
933 
934  if (trx_doublewrite->first_free
935  >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
936  mutex_exit(&(trx_doublewrite->mutex));
937 
938  buf_flush_buffered_writes();
939 
940  goto try_again;
941  }
942 
943  zip_size = buf_page_get_zip_size(bpage);
944 
945  if (UNIV_UNLIKELY(zip_size)) {
946  UNIV_MEM_ASSERT_RW(bpage->zip.data, zip_size);
947  /* Copy the compressed page and clear the rest. */
948  memcpy(trx_doublewrite->write_buf
949  + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
950  bpage->zip.data, zip_size);
951  memset(trx_doublewrite->write_buf
952  + UNIV_PAGE_SIZE * trx_doublewrite->first_free
953  + zip_size, 0, UNIV_PAGE_SIZE - zip_size);
954  } else {
956  UNIV_MEM_ASSERT_RW(((buf_block_t*) bpage)->frame,
957  UNIV_PAGE_SIZE);
958 
959  memcpy(trx_doublewrite->write_buf
960  + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
961  ((buf_block_t*) bpage)->frame, UNIV_PAGE_SIZE);
962  }
963 
964  trx_doublewrite->buf_block_arr[trx_doublewrite->first_free] = bpage;
965 
966  trx_doublewrite->first_free++;
967 
968  if (trx_doublewrite->first_free
969  >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
970  mutex_exit(&(trx_doublewrite->mutex));
971 
972  buf_flush_buffered_writes();
973 
974  return;
975  }
976 
977  mutex_exit(&(trx_doublewrite->mutex));
978 }
979 #endif /* !UNIV_HOTBACKUP */
980 
981 /********************************************************************/
982 /** Initializes a page for writing to the tablespace. */
983 UNIV_INTERN
984 void
985 buf_flush_init_for_writing(
986 /*=======================*/
987  byte* page,
988  void* page_zip_,
989  ib_uint64_t newest_lsn)
991 {
992  ut_ad(page);
993 
994  if (page_zip_) {
995  page_zip_des_t* page_zip = static_cast<page_zip_des_t *>(page_zip_);
996  ulint zip_size = page_zip_get_size(page_zip);
997  ut_ad(zip_size);
998  ut_ad(ut_is_2pow(zip_size));
999  ut_ad(zip_size <= UNIV_PAGE_SIZE);
1000 
1001  switch (UNIV_EXPECT(fil_page_get_type(page), FIL_PAGE_INDEX)) {
1002  case FIL_PAGE_TYPE_ALLOCATED:
1003  case FIL_PAGE_INODE:
1004  case FIL_PAGE_IBUF_BITMAP:
1005  case FIL_PAGE_TYPE_FSP_HDR:
1006  case FIL_PAGE_TYPE_XDES:
1007  /* These are essentially uncompressed pages. */
1008  memcpy(page_zip->data, page, zip_size);
1009  /* fall through */
1010  case FIL_PAGE_TYPE_ZBLOB:
1011  case FIL_PAGE_TYPE_ZBLOB2:
1012  case FIL_PAGE_INDEX:
1013  mach_write_to_8(page_zip->data
1014  + FIL_PAGE_LSN, newest_lsn);
1015  memset(page_zip->data + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
1016  mach_write_to_4(page_zip->data
1017  + FIL_PAGE_SPACE_OR_CHKSUM,
1018  srv_use_checksums
1019  ? page_zip_calc_checksum(
1020  page_zip->data, zip_size)
1021  : BUF_NO_CHECKSUM_MAGIC);
1022  return;
1023  }
1024 
1025  ut_print_timestamp(stderr);
1026  fputs(" InnoDB: ERROR: The compressed page to be written"
1027  " seems corrupt:", stderr);
1028  ut_print_buf(stderr, page, zip_size);
1029  fputs("\nInnoDB: Possibly older version of the page:", stderr);
1030  ut_print_buf(stderr, page_zip->data, zip_size);
1031  putc('\n', stderr);
1032  ut_error;
1033  }
1034 
1035  /* Write the newest modification lsn to the page header and trailer */
1036  mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn);
1037 
1038  mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
1039  newest_lsn);
1040 
1041  /* Store the new formula checksum */
1042 
1043  mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
1044  srv_use_checksums
1045  ? buf_calc_page_new_checksum(page)
1046  : BUF_NO_CHECKSUM_MAGIC);
1047 
1048  /* We overwrite the first 4 bytes of the end lsn field to store
1049  the old formula checksum. Since it depends also on the field
1050  FIL_PAGE_SPACE_OR_CHKSUM, it has to be calculated after storing the
1051  new formula checksum. */
1052 
1053  mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
1054  srv_use_checksums
1055  ? buf_calc_page_old_checksum(page)
1056  : BUF_NO_CHECKSUM_MAGIC);
1057 }
1058 
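For an uncompressed page, buf_flush_init_for_writing() above stamps four fields before the write: the full 8-byte LSN at FIL_PAGE_LSN in the header, the same LSN in the last 8 bytes of the page, the new-formula checksum in the first header field, and finally the old-formula checksum over the first half of the trailer, leaving [old checksum][low 32 bits of LSN] at the page end. A standalone sketch of that layout (editor's illustration; toy checksums, offsets chosen to mirror the usual fil0fil.h values):

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE               16384 /* UNIV_PAGE_SIZE in a default build */
#define PAGE_CHKSUM_OFFSET      0     /* mirrors FIL_PAGE_SPACE_OR_CHKSUM */
#define PAGE_LSN_OFFSET         16    /* mirrors FIL_PAGE_LSN */
#define PAGE_END_LSN_OLD_CHKSUM 8     /* trailer size, counted from page end */

/* Big-endian stores, the byte order used by InnoDB's mach_write_to_*(). */
static void write_be4(unsigned char* p, unsigned long v)
{
	p[0] = (unsigned char)(v >> 24); p[1] = (unsigned char)(v >> 16);
	p[2] = (unsigned char)(v >> 8);  p[3] = (unsigned char) v;
}
static void write_be8(unsigned char* p, unsigned long long v)
{
	write_be4(p, (unsigned long)(v >> 32));
	write_be4(p + 4, (unsigned long) v);
}

/* Toy stand-ins for buf_calc_page_new_checksum()/_old_checksum(). */
static unsigned long new_checksum(const unsigned char* page) { return page[PAGE_LSN_OFFSET] * 31u; }
static unsigned long old_checksum(const unsigned char* page) { return page[PAGE_LSN_OFFSET] + 7u; }

static void init_for_writing(unsigned char* page, unsigned long long newest_lsn)
{
	/* Full LSN in the header... */
	write_be8(page + PAGE_LSN_OFFSET, newest_lsn);
	/* ...and again in the last 8 bytes of the page. */
	write_be8(page + PAGE_SIZE - PAGE_END_LSN_OLD_CHKSUM, newest_lsn);
	/* New-formula checksum in the first header field. */
	write_be4(page + PAGE_CHKSUM_OFFSET, new_checksum(page));
	/* Old-formula checksum overwrites the high half of the trailer
	LSN, so the trailer ends as [old checksum][low 32 bits of LSN]. */
	write_be4(page + PAGE_SIZE - PAGE_END_LSN_OLD_CHKSUM, old_checksum(page));
}

int main(void)
{
	static unsigned char page[PAGE_SIZE];

	init_for_writing(page, 0x1122334455ULL);
	printf("last LSN byte in trailer: %02x\n", (unsigned) page[PAGE_SIZE - 1]);
	return 0;
}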
1059 #ifndef UNIV_HOTBACKUP
1060 /********************************************************************/
1064 static
1065 void
1066 buf_flush_write_block_low(
1067 /*======================*/
1068  buf_page_t* bpage)
1069 {
1070  ulint zip_size = buf_page_get_zip_size(bpage);
1071  page_t* frame = NULL;
1072 
1073 #ifdef UNIV_DEBUG
1074  buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
1075  ut_ad(!buf_pool_mutex_own(buf_pool));
1076 #endif
1077 
1078 #ifdef UNIV_LOG_DEBUG
1079  static ibool univ_log_debug_warned;
1080 #endif /* UNIV_LOG_DEBUG */
1081 
1082  ut_ad(buf_page_in_file(bpage));
1083 
1084  /* We are not holding buf_pool->mutex or block_mutex here.
1085  Nevertheless, it is safe to access bpage, because it is
1086  io_fixed and oldest_modification != 0. Thus, it cannot be
1087  relocated in the buffer pool or removed from flush_list or
1088  LRU_list. */
1089  ut_ad(!buf_pool_mutex_own(buf_pool));
1090  ut_ad(!buf_flush_list_mutex_own(buf_pool));
1091  ut_ad(!mutex_own(buf_page_get_mutex(bpage)));
1093  ut_ad(bpage->oldest_modification != 0);
1094 
1095 #ifdef UNIV_IBUF_COUNT_DEBUG
1096  ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
1097 #endif
1098  ut_ad(bpage->newest_modification != 0);
1099 
1100 #ifdef UNIV_LOG_DEBUG
1101  if (!univ_log_debug_warned) {
1102  univ_log_debug_warned = TRUE;
1103  fputs("Warning: cannot force log to disk if"
1104  " UNIV_LOG_DEBUG is defined!\n"
1105  "Crash recovery will not work!\n",
1106  stderr);
1107  }
1108 #else
1109  /* Force the log to the disk before writing the modified block */
1110  log_write_up_to(bpage->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);
1111 #endif
1112  switch (buf_page_get_state(bpage)) {
1113  case BUF_BLOCK_ZIP_FREE:
1114  case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
1115  case BUF_BLOCK_NOT_USED:
1116  case BUF_BLOCK_READY_FOR_USE:
1117  case BUF_BLOCK_MEMORY:
1118  case BUF_BLOCK_REMOVE_HASH:
1119  ut_error;
1120  break;
1121  case BUF_BLOCK_ZIP_DIRTY:
1122  frame = bpage->zip.data;
1123  if (UNIV_LIKELY(srv_use_checksums)) {
1124  ut_a(mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM)
1125  == page_zip_calc_checksum(frame, zip_size));
1126  }
1127  mach_write_to_8(frame + FIL_PAGE_LSN,
1128  bpage->newest_modification);
1129  memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
1130  break;
1131  case BUF_BLOCK_FILE_PAGE:
1132  frame = bpage->zip.data;
1133  if (!frame) {
1134  frame = ((buf_block_t*) bpage)->frame;
1135  }
1136 
1137  buf_flush_init_for_writing(((buf_block_t*) bpage)->frame,
1138  bpage->zip.data
1139  ? &bpage->zip : NULL,
1140  bpage->newest_modification);
1141  break;
1142  }
1143 
1144  if (!srv_use_doublewrite_buf || !trx_doublewrite) {
1145  fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
1146  FALSE, buf_page_get_space(bpage), zip_size,
1147  buf_page_get_page_no(bpage), 0,
1148  zip_size ? zip_size : UNIV_PAGE_SIZE,
1149  frame, bpage);
1150  } else {
1151  buf_flush_post_to_doublewrite_buf(bpage);
1152  }
1153 }
1154 
1155 # if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
1156 /********************************************************************/
1162 UNIV_INTERN
1163 ibool
1164 buf_flush_page_try(
1165 /*===============*/
1166  buf_pool_t* buf_pool,
1167  buf_block_t* block)
1168 {
1169  ut_ad(buf_pool_mutex_own(buf_pool));
1171  ut_ad(mutex_own(&block->mutex));
1172 
1173  if (!buf_flush_ready_for_flush(&block->page, BUF_FLUSH_LRU)) {
1174  return(FALSE);
1175  }
1176 
1177  if (buf_pool->n_flush[BUF_FLUSH_LRU] > 0
1178  || buf_pool->init_flush[BUF_FLUSH_LRU]) {
1179  /* There is already a flush batch of the same type running */
1180  return(FALSE);
1181  }
1182 
1183  buf_pool->init_flush[BUF_FLUSH_LRU] = TRUE;
1184 
1185  buf_page_set_io_fix(&block->page, BUF_IO_WRITE);
1186 
1187  buf_page_set_flush_type(&block->page, BUF_FLUSH_LRU);
1188 
1189  if (buf_pool->n_flush[BUF_FLUSH_LRU]++ == 0) {
1190 
1191  os_event_reset(buf_pool->no_flush[BUF_FLUSH_LRU]);
1192  }
1193 
1194  /* VERY IMPORTANT:
1195  Because any thread may call the LRU flush, even when owning
1196  locks on pages, to avoid deadlocks, we must make sure that the
1197  s-lock is acquired on the page without waiting: this is
1198  accomplished because buf_flush_ready_for_flush() must hold,
1199  and that requires the page not to be bufferfixed. */
1200 
1201  rw_lock_s_lock_gen(&block->lock, BUF_IO_WRITE);
1202 
1203  /* Note that the s-latch is acquired before releasing the
1204  buf_pool mutex: this ensures that the latch is acquired
1205  immediately. */
1206 
1207  mutex_exit(&block->mutex);
1208  buf_pool_mutex_exit(buf_pool);
1209 
1210  /* Even though block is not protected by any mutex at this
1211  point, it is safe to access block, because it is io_fixed and
1212  oldest_modification != 0. Thus, it cannot be relocated in the
1213  buffer pool or removed from flush_list or LRU_list. */
1214 
1215  buf_flush_write_block_low(&block->page);
1216 
1217  buf_pool_mutex_enter(buf_pool);
1218  buf_pool->init_flush[BUF_FLUSH_LRU] = FALSE;
1219 
1220  if (buf_pool->n_flush[BUF_FLUSH_LRU] == 0) {
1221  /* The running flush batch has ended */
1222  os_event_set(buf_pool->no_flush[BUF_FLUSH_LRU]);
1223  }
1224 
1225  buf_pool_mutex_exit(buf_pool);
1226  buf_flush_buffered_writes();
1227 
1228  return(TRUE);
1229 }
1230 # endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
1231 
1232 /********************************************************************/
1239 static
1240 void
1241 buf_flush_page(
1242 /*===========*/
1243  buf_pool_t* buf_pool,
1244  buf_page_t* bpage,
1245  enum buf_flush flush_type)
1247 {
1248  mutex_t* block_mutex;
1249  ibool is_uncompressed;
1250 
1251  ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1252  ut_ad(buf_pool_mutex_own(buf_pool));
1253  ut_ad(buf_page_in_file(bpage));
1254 
1255  block_mutex = buf_page_get_mutex(bpage);
1256  ut_ad(mutex_own(block_mutex));
1257 
1258  ut_ad(buf_flush_ready_for_flush(bpage, flush_type));
1259 
1260  buf_page_set_io_fix(bpage, BUF_IO_WRITE);
1261 
1262  buf_page_set_flush_type(bpage, flush_type);
1263 
1264  if (buf_pool->n_flush[flush_type] == 0) {
1265 
1266  os_event_reset(buf_pool->no_flush[flush_type]);
1267  }
1268 
1269  buf_pool->n_flush[flush_type]++;
1270 
1271  is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
1272  ut_ad(is_uncompressed == (block_mutex != &buf_pool->zip_mutex));
1273 
1274  switch (flush_type) {
1275  ibool is_s_latched;
1276  case BUF_FLUSH_LIST:
1277  /* If the simulated aio thread is not running, we must
1278  not wait for any latch, as we may end up in a deadlock:
1279  if buf_fix_count == 0, then we know we need not wait */
1280 
1281  is_s_latched = (bpage->buf_fix_count == 0);
1282  if (is_s_latched && is_uncompressed) {
1283  rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
1284  BUF_IO_WRITE);
1285  }
1286 
1287  mutex_exit(block_mutex);
1288  buf_pool_mutex_exit(buf_pool);
1289 
1290  /* Even though bpage is not protected by any mutex at
1291  this point, it is safe to access bpage, because it is
1292  io_fixed and oldest_modification != 0. Thus, it
1293  cannot be relocated in the buffer pool or removed from
1294  flush_list or LRU_list. */
1295 
1296  if (!is_s_latched) {
1297  buf_flush_buffered_writes();
1298 
1299  if (is_uncompressed) {
1300  rw_lock_s_lock_gen(&((buf_block_t*) bpage)
1301  ->lock, BUF_IO_WRITE);
1302  }
1303  }
1304 
1305  break;
1306 
1307  case BUF_FLUSH_LRU:
1308  /* VERY IMPORTANT:
1309  Because any thread may call the LRU flush, even when owning
1310  locks on pages, to avoid deadlocks, we must make sure that the
1311  s-lock is acquired on the page without waiting: this is
1312  accomplished because buf_flush_ready_for_flush() must hold,
1313  and that requires the page not to be bufferfixed. */
1314 
1315  if (is_uncompressed) {
1316  rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
1317  BUF_IO_WRITE);
1318  }
1319 
1320  /* Note that the s-latch is acquired before releasing the
1321  buf_pool mutex: this ensures that the latch is acquired
1322  immediately. */
1323 
1324  mutex_exit(block_mutex);
1325  buf_pool_mutex_exit(buf_pool);
1326  break;
1327 
1328  default:
1329  ut_error;
1330  }
1331 
1332  /* Even though bpage is not protected by any mutex at this
1333  point, it is safe to access bpage, because it is io_fixed and
1334  oldest_modification != 0. Thus, it cannot be relocated in the
1335  buffer pool or removed from flush_list or LRU_list. */
1336 
1337 #ifdef UNIV_DEBUG
1338  if (buf_debug_prints) {
1339  fprintf(stderr,
1340  "Flushing %u space %u page %u\n",
1341  flush_type, bpage->space, bpage->offset);
1342  }
1343 #endif /* UNIV_DEBUG */
1344  buf_flush_write_block_low(bpage);
1345 }
1346 
1347 /***********************************************************/
1350 static
1351 ulint
1352 buf_flush_try_neighbors(
1353 /*====================*/
1354  ulint space,
1355  ulint offset,
1356  enum buf_flush flush_type,
1358  ulint n_flushed,
1360  ulint n_to_flush)
1362 {
1363  ulint i;
1364  ulint low;
1365  ulint high;
1366  ulint count = 0;
1367  buf_pool_t* buf_pool = buf_pool_get(space, offset);
1368 
1369  ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1370 
1371  if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN
1372  || !srv_flush_neighbor_pages) {
1373  /* If there is little space, it is better not to flush
1374  any block except from the end of the LRU list */
1375 
1376  low = offset;
1377  high = offset + 1;
1378  } else {
1379  /* When flushed, dirty blocks are searched in
1380  neighborhoods of this size, and flushed along with the
1381  original page. */
1382 
1383  ulint buf_flush_area;
1384 
1385  buf_flush_area = ut_min(
1386  BUF_READ_AHEAD_AREA(buf_pool),
1387  buf_pool->curr_size / 16);
1388 
1389  low = (offset / buf_flush_area) * buf_flush_area;
1390  high = (offset / buf_flush_area + 1) * buf_flush_area;
1391  }
1392 
1393  /* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */
1394 
1395  if (high > fil_space_get_size(space)) {
1396  high = fil_space_get_size(space);
1397  }
1398 
1399  for (i = low; i < high; i++) {
1400 
1401  buf_page_t* bpage;
1402 
1403  if ((count + n_flushed) >= n_to_flush) {
1404 
1405  /* We have already flushed enough pages and
1406  should call it a day. There is, however, one
1407  exception. If the page whose neighbors we
1408  are flushing has not been flushed yet then
1409  we'll try to flush the victim that we
1410  selected originally. */
1411  if (i <= offset) {
1412  i = offset;
1413  } else {
1414  break;
1415  }
1416  }
1417 
1418  buf_pool = buf_pool_get(space, i);
1419 
1420  buf_pool_mutex_enter(buf_pool);
1421 
1422  /* We only want to flush pages from this buffer pool. */
1423  bpage = buf_page_hash_get(buf_pool, space, i);
1424 
1425  if (!bpage) {
1426 
1427  buf_pool_mutex_exit(buf_pool);
1428  continue;
1429  }
1430 
1431  ut_a(buf_page_in_file(bpage));
1432 
1433  /* We avoid flushing 'non-old' blocks in an LRU flush,
1434  because the flushed blocks are soon freed */
1435 
1436  if (flush_type != BUF_FLUSH_LRU
1437  || i == offset
1438  || buf_page_is_old(bpage)) {
1439  mutex_t* block_mutex = buf_page_get_mutex(bpage);
1440 
1441  mutex_enter(block_mutex);
1442 
1443  if (buf_flush_ready_for_flush(bpage, flush_type)
1444  && (i == offset || !bpage->buf_fix_count)) {
1445  /* We only try to flush those
1446  neighbors != offset where the buf fix
1447  count is zero, as we then know that we
1448  probably can latch the page without a
1449  semaphore wait. Semaphore waits are
1450  expensive because we must flush the
1451  doublewrite buffer before we start
1452  waiting. */
1453 
1454  buf_flush_page(buf_pool, bpage, flush_type);
1455  ut_ad(!mutex_own(block_mutex));
1456  ut_ad(!buf_pool_mutex_own(buf_pool));
1457  count++;
1458  continue;
1459  } else {
1460  mutex_exit(block_mutex);
1461  }
1462  }
1463  buf_pool_mutex_exit(buf_pool);
1464  }
1465 
1466  return(count);
1467 }
1468 
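The neighborhood used by buf_flush_try_neighbors() above is the aligned block of buf_flush_area pages containing the victim: low is the offset rounded down to a multiple of buf_flush_area, high is the next boundary, and high is then capped at the tablespace size. For example, buf_flush_area = 64 and offset = 200 give [192, 256) before the cap. A standalone check of that arithmetic (editor's illustration):

#include <stdio.h>

int main(void)
{
	unsigned long buf_flush_area = 64;  /* e.g. min(read-ahead area, curr_size / 16) */
	unsigned long offset         = 200; /* victim page picked from the LRU tail */
	unsigned long space_size     = 230; /* pages in the tablespace */

	unsigned long low  = (offset / buf_flush_area) * buf_flush_area;
	unsigned long high = (offset / buf_flush_area + 1) * buf_flush_area;

	if (high > space_size) {
		high = space_size; /* never scan past the end of the file */
	}

	printf("flush area: [%lu, %lu)\n", low, high); /* prints [192, 230) */
	return 0;
}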
1469 /********************************************************************/
1476 static
1477 ibool
1478 buf_flush_page_and_try_neighbors(
1479 /*=============================*/
1480  buf_page_t* bpage,
1483  enum buf_flush flush_type,
1485  ulint n_to_flush,
1487  ulint* count)
1489 {
1490  mutex_t* block_mutex;
1491  ibool flushed = FALSE;
1492 #ifdef UNIV_DEBUG
1493  buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
1494 #endif /* UNIV_DEBUG */
1495 
1496  ut_ad(buf_pool_mutex_own(buf_pool));
1497 
1498  block_mutex = buf_page_get_mutex(bpage);
1499  mutex_enter(block_mutex);
1500 
1501  ut_a(buf_page_in_file(bpage));
1502 
1503  if (buf_flush_ready_for_flush(bpage, flush_type)) {
1504  ulint space;
1505  ulint offset;
1506  buf_pool_t* buf_pool;
1507 
1508  buf_pool = buf_pool_from_bpage(bpage);
1509 
1510  buf_pool_mutex_exit(buf_pool);
1511 
1512  /* These fields are protected by both the
1513  buffer pool mutex and block mutex. */
1514  space = buf_page_get_space(bpage);
1515  offset = buf_page_get_page_no(bpage);
1516 
1517  mutex_exit(block_mutex);
1518 
1519  /* Try to flush also all the neighbors */
1520  *count += buf_flush_try_neighbors(space,
1521  offset,
1522  flush_type,
1523  *count,
1524  n_to_flush);
1525 
1526  buf_pool_mutex_enter(buf_pool);
1527  flushed = TRUE;
1528  } else {
1529  mutex_exit(block_mutex);
1530  }
1531 
1532  ut_ad(buf_pool_mutex_own(buf_pool));
1533 
1534  return(flushed);
1535 }
1536 
1537 /*******************************************************************/
1543 static
1544 ulint
1545 buf_flush_LRU_list_batch(
1546 /*=====================*/
1547  buf_pool_t* buf_pool,
1548  ulint max)
1549 {
1550  buf_page_t* bpage;
1551  ulint count = 0;
1552 
1553  ut_ad(buf_pool_mutex_own(buf_pool));
1554 
1555  do {
1556  /* Start from the end of the list looking for a
1557  suitable block to be flushed. */
1558  bpage = UT_LIST_GET_LAST(buf_pool->LRU);
1559 
1560  /* Iterate backwards over the LRU list till we find
1561  a page that isn't ready for flushing. */
1562  while (bpage != NULL
1563  && !buf_flush_page_and_try_neighbors(
1564  bpage, BUF_FLUSH_LRU, max, &count)) {
1565 
1566  bpage = UT_LIST_GET_PREV(LRU, bpage);
1567  }
1568  } while (bpage != NULL && count < max);
1569 
1570  /* We keep track of all flushes happening as part of LRU
1571  flush. When estimating the desired rate at which flush_list
1572  should be flushed, we factor in this value. */
1573  buf_lru_flush_page_count += count;
1574 
1575  ut_ad(buf_pool_mutex_own(buf_pool));
1576 
1577  return(count);
1578 }
1579 
1580 /*******************************************************************/
1586 static
1587 ulint
1588 buf_flush_flush_list_batch(
1589 /*=======================*/
1590  buf_pool_t* buf_pool,
1591  ulint min_n,
1595  ib_uint64_t lsn_limit)
1600 {
1601  ulint len;
1602  buf_page_t* bpage;
1603  ulint count = 0;
1604 
1605  ut_ad(buf_pool_mutex_own(buf_pool));
1606 
1607  /* If we have flushed enough, leave the loop */
1608  do {
1609  /* Start from the end of the list looking for a suitable
1610  block to be flushed. */
1611 
1612  buf_flush_list_mutex_enter(buf_pool);
1613 
1614  /* We use len here because theoretically insertions can
1615  happen in the flush_list below while we are traversing
1616  it for a suitable candidate for flushing. We'd like to
1617  set a limit on how farther we are willing to traverse
1618  the list. */
1619  len = UT_LIST_GET_LEN(buf_pool->flush_list);
1620  bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
1621 
1622  if (bpage) {
1623  ut_a(bpage->oldest_modification > 0);
1624  }
1625 
1626  if (!bpage || bpage->oldest_modification >= lsn_limit) {
1627 
1628  /* We have flushed enough */
1629  buf_flush_list_mutex_exit(buf_pool);
1630  break;
1631  }
1632 
1633  ut_a(bpage->oldest_modification > 0);
1634 
1635  ut_ad(bpage->in_flush_list);
1636 
1637  buf_flush_list_mutex_exit(buf_pool);
1638 
1639  /* The list may change during the flushing and we cannot
1640  safely preserve within this function a pointer to a
1641  block in the list! */
1642  while (bpage != NULL
1643  && len > 0
1644  && !buf_flush_page_and_try_neighbors(
1645  bpage, BUF_FLUSH_LIST, min_n, &count)) {
1646 
1647  buf_flush_list_mutex_enter(buf_pool);
1648 
1649  /* If we are here that means that buf_pool->mutex
1650  was not released in buf_flush_page_and_try_neighbors()
1651  above and this guarantees that bpage didn't get
1652  relocated since we released the flush_list
1653  mutex above. There is a chance, however, that
1654  the bpage got removed from flush_list (not
1655  currently possible because flush_list_remove()
1656  also obtains buf_pool mutex but that may change
1657  in future). To avoid this scenario we check
1658  the oldest_modification and if it is zero
1659  we start all over again. */
1660  if (bpage->oldest_modification == 0) {
1661  buf_flush_list_mutex_exit(buf_pool);
1662  break;
1663  }
1664 
1665  bpage = UT_LIST_GET_PREV(list, bpage);
1666 
1667  ut_ad(!bpage || bpage->in_flush_list);
1668 
1669  buf_flush_list_mutex_exit(buf_pool);
1670 
1671  --len;
1672  }
1673 
1674  } while (count < min_n && bpage != NULL && len > 0);
1675 
1676  ut_ad(buf_pool_mutex_own(buf_pool));
1677 
1678  return(count);
1679 }
1680 
1681 /*******************************************************************/
1689 static
1690 ulint
1691 buf_flush_batch(
1692 /*============*/
1693  buf_pool_t* buf_pool,
1694  enum buf_flush flush_type,
1698  ulint min_n,
1701  ib_uint64_t lsn_limit)
1706 {
1707  ulint count = 0;
1708 
1709  ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1710 #ifdef UNIV_SYNC_DEBUG
1711  ut_ad((flush_type != BUF_FLUSH_LIST)
1712  || sync_thread_levels_empty_gen(TRUE));
1713 #endif /* UNIV_SYNC_DEBUG */
1714 
1715  buf_pool_mutex_enter(buf_pool);
1716 
1717  /* Note: The buffer pool mutex is released and reacquired within
1718  the flush functions. */
1719  switch(flush_type) {
1720  case BUF_FLUSH_LRU:
1721  count = buf_flush_LRU_list_batch(buf_pool, min_n);
1722  break;
1723  case BUF_FLUSH_LIST:
1724  count = buf_flush_flush_list_batch(buf_pool, min_n, lsn_limit);
1725  break;
1726  default:
1727  ut_error;
1728  }
1729 
1730  buf_pool_mutex_exit(buf_pool);
1731 
1732  buf_flush_buffered_writes();
1733 
1734 #ifdef UNIV_DEBUG
1735  if (buf_debug_prints && count > 0) {
1736  fprintf(stderr, flush_type == BUF_FLUSH_LRU
1737  ? "Flushed %lu pages in LRU flush\n"
1738  : "Flushed %lu pages in flush list flush\n",
1739  (ulong) count);
1740  }
1741 #endif /* UNIV_DEBUG */
1742 
1743  srv_buf_pool_flushed += count;
1744 
1745  return(count);
1746 }
1747 
1748 /******************************************************************/
1750 static
1751 void
1752 buf_flush_common(
1753 /*=============*/
1754  enum buf_flush flush_type,
1755  ulint page_count)
1756 {
1757  buf_flush_buffered_writes();
1758 
1759  ut_a(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1760 
1761 #ifdef UNIV_DEBUG
1762  if (buf_debug_prints && page_count > 0) {
1763  fprintf(stderr, flush_type == BUF_FLUSH_LRU
1764  ? "Flushed %lu pages in LRU flush\n"
1765  : "Flushed %lu pages in flush list flush\n",
1766  (ulong) page_count);
1767  }
1768 #endif /* UNIV_DEBUG */
1769 
1770  srv_buf_pool_flushed += page_count;
1771 
1772  if (flush_type == BUF_FLUSH_LRU) {
1773  /* We keep track of all flushes happening as part of LRU
1774  flush. When estimating the desired rate at which flush_list
1775  should be flushed we factor in this value. */
1776  buf_lru_flush_page_count += page_count;
1777  }
1778 }
1779 
1780 /******************************************************************/
1782 static
1783 ibool
1784 buf_flush_start(
1785 /*============*/
1786  buf_pool_t* buf_pool,
1787  enum buf_flush flush_type)
1789 {
1790  buf_pool_mutex_enter(buf_pool);
1791 
1792  if (buf_pool->n_flush[flush_type] > 0
1793  || buf_pool->init_flush[flush_type] == TRUE) {
1794 
1795  /* There is already a flush batch of the same type running */
1796 
1797  buf_pool_mutex_exit(buf_pool);
1798 
1799  return(FALSE);
1800  }
1801 
1802  buf_pool->init_flush[flush_type] = TRUE;
1803 
1804  buf_pool_mutex_exit(buf_pool);
1805 
1806  return(TRUE);
1807 }
1808 
1809 /******************************************************************/
1811 static
1812 void
1813 buf_flush_end(
1814 /*==========*/
1815  buf_pool_t* buf_pool,
1816  enum buf_flush flush_type)
1818 {
1819  buf_pool_mutex_enter(buf_pool);
1820 
1821  buf_pool->init_flush[flush_type] = FALSE;
1822 
1823  if (buf_pool->n_flush[flush_type] == 0) {
1824 
1825  /* The running flush batch has ended */
1826 
1827  os_event_set(buf_pool->no_flush[flush_type]);
1828  }
1829 
1830  buf_pool_mutex_exit(buf_pool);
1831 }
1832 
1833 /******************************************************************/
1834 /** Waits until a flush batch of the given type ends. */
1835 UNIV_INTERN
1836 void
1837 buf_flush_wait_batch_end(
1838 /*=====================*/
1839  buf_pool_t* buf_pool,
1840  enum buf_flush type)
1842 {
1843  ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);
1844 
1845  if (buf_pool == NULL) {
1846  ulint i;
1847 
1848  for (i = 0; i < srv_buf_pool_instances; ++i) {
1849  buf_pool_t* i_buf_pool = buf_pool_from_array(i);
1850 
1851  os_event_wait(i_buf_pool->no_flush[type]);
1852  }
1853  } else {
1854  os_event_wait(buf_pool->no_flush[type]);
1855  }
1856 }
1857 
1858 /*******************************************************************/
1859 /** This utility flushes dirty blocks from the end of the LRU list.
1860 NOTE: The calling thread may own latches on pages: to avoid deadlocks,
1861 this function must be written so that it cannot end up waiting for
1862 these latches!
1863 @return	number of blocks for which the write request was queued;
1864 ULINT_UNDEFINED if a flush of the same type was already running */
1865 UNIV_INTERN
1866 ulint
1867 buf_flush_LRU(
1868 /*==========*/
1869  buf_pool_t* buf_pool,
1870  ulint min_n)
1873 {
1874  ulint page_count;
1875 
1876  if (!buf_flush_start(buf_pool, BUF_FLUSH_LRU)) {
1877  return(ULINT_UNDEFINED);
1878  }
1879 
1880  page_count = buf_flush_batch(buf_pool, BUF_FLUSH_LRU, min_n, 0);
1881 
1882  buf_flush_end(buf_pool, BUF_FLUSH_LRU);
1883 
1884  buf_flush_common(BUF_FLUSH_LRU, page_count);
1885 
1886  return(page_count);
1887 }
1888 
1889 /*******************************************************************/
1890 /** This utility flushes dirty blocks from the end of the flush list
1891 of all buffer pool instances.
1892 NOTE: The calling thread is not allowed to own any latches on pages!
1893 @return	number of blocks for which the write request was queued;
1894 ULINT_UNDEFINED if an lsn_limit was given and some instance was skipped */
1895 UNIV_INTERN
1896 ulint
1897 buf_flush_list(
1898 /*===========*/
1899  ulint min_n,
1902  ib_uint64_t lsn_limit)
1907 {
1908  ulint i;
1909  ulint total_page_count = 0;
1910  ibool skipped = FALSE;
1911 
1912  if (min_n != ULINT_MAX) {
1913  /* Ensure that flushing is spread evenly amongst the
1914  buffer pool instances. When min_n is ULINT_MAX
1915  we need to flush everything up to the lsn limit
1916  so no limit here. */
1917  min_n = (min_n + srv_buf_pool_instances - 1)
1918  / srv_buf_pool_instances;
1919  }
1920 
1921  /* Flush to lsn_limit in all buffer pool instances */
1922  for (i = 0; i < srv_buf_pool_instances; i++) {
1923  buf_pool_t* buf_pool;
1924  ulint page_count = 0;
1925 
1926  buf_pool = buf_pool_from_array(i);
1927 
1928  if (!buf_flush_start(buf_pool, BUF_FLUSH_LIST)) {
1929  /* We have two choices here. If lsn_limit was
1930  specified then skipping an instance of buffer
1931  pool means we cannot guarantee that all pages
1932  up to lsn_limit has been flushed. We can
1933  return right now with failure or we can try
1934  to flush remaining buffer pools up to the
1935  lsn_limit. We attempt to flush other buffer
1936  pools based on the assumption that it will
1937  help in the retry which will follow the
1938  failure. */
1939  skipped = TRUE;
1940 
1941  continue;
1942  }
1943 
1944  page_count = buf_flush_batch(
1945  buf_pool, BUF_FLUSH_LIST, min_n, lsn_limit);
1946 
1947  buf_flush_end(buf_pool, BUF_FLUSH_LIST);
1948 
1949  buf_flush_common(BUF_FLUSH_LIST, page_count);
1950 
1951  total_page_count += page_count;
1952  }
1953 
1954  return(lsn_limit != IB_ULONGLONG_MAX && skipped
1955  ? ULINT_UNDEFINED : total_page_count);
1956 }
1957 
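A typical caller that needs every modification older than some LSN on disk loops on buf_flush_list(): with a finite lsn_limit the function returns ULINT_UNDEFINED when it had to skip an instance that was already running a flush-list batch, so the caller backs off and retries. A hedged usage sketch (editor's illustration of a hypothetical caller; assumes the usual InnoDB headers and that os_thread_sleep() is available):

#include "buf0flu.h"
#include "os0thread.h"

/* Hypothetical helper: make sure all changes older than target_lsn are
flushed, retrying while other flush-list batches are in progress. */
void
flush_up_to(ib_uint64_t target_lsn)
{
	for (;;) {
		ulint	n = buf_flush_list(ULINT_MAX, target_lsn);

		if (n != ULINT_UNDEFINED) {
			/* Every buffer pool instance accepted the batch;
			wait for the posted writes to complete. */
			buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
			return;
		}

		/* Some instance was skipped because a flush-list batch
		was already running there; back off and retry. */
		os_thread_sleep(10000);
	}
}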
1958 /******************************************************************/
1964 static
1965 ulint
1966 buf_flush_LRU_recommendation(
1967 /*=========================*/
1968  buf_pool_t* buf_pool)
1969 {
1970  buf_page_t* bpage;
1971  ulint n_replaceable;
1972  ulint distance = 0;
1973 
1974  buf_pool_mutex_enter(buf_pool);
1975 
1976  n_replaceable = UT_LIST_GET_LEN(buf_pool->free);
1977 
1978  bpage = UT_LIST_GET_LAST(buf_pool->LRU);
1979 
1980  while ((bpage != NULL)
1981  && (n_replaceable < BUF_FLUSH_FREE_BLOCK_MARGIN(buf_pool)
1982  + BUF_FLUSH_EXTRA_MARGIN(buf_pool))
1983  && (distance < BUF_LRU_FREE_SEARCH_LEN(buf_pool))) {
1984 
1985  mutex_t* block_mutex = buf_page_get_mutex(bpage);
1986 
1987  mutex_enter(block_mutex);
1988 
1989  if (buf_flush_ready_for_replace(bpage)) {
1990  n_replaceable++;
1991  }
1992 
1993  mutex_exit(block_mutex);
1994 
1995  distance++;
1996 
1997  bpage = UT_LIST_GET_PREV(LRU, bpage);
1998  }
1999 
2000  buf_pool_mutex_exit(buf_pool);
2001 
2002  if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN(buf_pool)) {
2003 
2004  return(0);
2005  }
2006 
2007  return(BUF_FLUSH_FREE_BLOCK_MARGIN(buf_pool)
2008  + BUF_FLUSH_EXTRA_MARGIN(buf_pool)
2009  - n_replaceable);
2010 }
2011 
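buf_flush_LRU_recommendation() above counts the free pages plus the immediately replaceable pages near the LRU tail; if that count is below the free-block margin it asks for enough flushes to rebuild margin plus extra. A small numeric check of that rule (editor's illustration; the margin values are made up, the real ones are derived from the pool size):

#include <stdio.h>

int main(void)
{
	unsigned long free_block_margin = 69;  /* stand-in for BUF_FLUSH_FREE_BLOCK_MARGIN */
	unsigned long extra_margin      = 117; /* stand-in for BUF_FLUSH_EXTRA_MARGIN */
	unsigned long n_replaceable     = 40;  /* free pages + replaceable pages near the tail */

	if (n_replaceable >= free_block_margin) {
		printf("no flush needed\n");
	} else {
		/* 69 + 117 - 40 = 146 pages recommended */
		printf("flush %lu pages from the LRU tail\n",
		       free_block_margin + extra_margin - n_replaceable);
	}
	return 0;
}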
2012 /*********************************************************************/
2013 /** Flushes pages from the end of the LRU list if there is too small a
2014 margin of replaceable pages there or in the free list. VERY IMPORTANT:
2015 this function is called also by threads which have locks on pages. To
2016 avoid deadlocks, we flush only pages such that the s-lock required for
2017 flushing can be acquired immediately, without waiting. */
2018 UNIV_INTERN
2019 void
2020 buf_flush_free_margin(
2021 /*==================*/
2022  buf_pool_t* buf_pool)
2023 {
2024  ulint n_to_flush;
2025 
2026  n_to_flush = buf_flush_LRU_recommendation(buf_pool);
2027 
2028  if (n_to_flush > 0) {
2029  ulint n_flushed;
2030 
2031  n_flushed = buf_flush_LRU(buf_pool, n_to_flush);
2032 
2033  if (n_flushed == ULINT_UNDEFINED) {
2034  /* There was an LRU type flush batch already running;
2035  let us wait for it to end */
2036 
2037  buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU);
2038  }
2039  }
2040 }
2041 
2042 /*********************************************************************/
2043 /** Flushes pages from the end of all the LRU lists. */
2044 UNIV_INTERN
2045 void
2046 buf_flush_free_margins(void)
2047 /*========================*/
2048 {
2049  ulint i;
2050 
2051  for (i = 0; i < srv_buf_pool_instances; i++) {
2052  buf_pool_t* buf_pool;
2053 
2054  buf_pool = buf_pool_from_array(i);
2055 
2056  buf_flush_free_margin(buf_pool);
2057  }
2058 }
2059 
2060 /*********************************************************************
2061 Update the historical stats that we are collecting for flush rate
2062 heuristics at the end of each interval.
2063 Flush rate heuristic depends on (a) rate of redo log generation and
2064 (b) the rate at which LRU flush is happening. */
2065 UNIV_INTERN
2066 void
2067 buf_flush_stat_update(void)
2068 /*=======================*/
2069 {
2070  buf_flush_stat_t* item;
2071  ib_uint64_t lsn_diff;
2072  ib_uint64_t lsn;
2073  ulint n_flushed;
2074 
2075  lsn = log_get_lsn();
2076  if (buf_flush_stat_cur.redo == 0) {
2077  /* First time around. Just update the current LSN
2078  and return. */
2079  buf_flush_stat_cur.redo = lsn;
2080  return;
2081  }
2082 
2083  item = &buf_flush_stat_arr[buf_flush_stat_arr_ind];
2084 
2085  /* values for this interval */
2086  lsn_diff = lsn - buf_flush_stat_cur.redo;
2087  n_flushed = buf_lru_flush_page_count
2088  - buf_flush_stat_cur.n_flushed;
2089 
2090  /* add the current value and subtract the obsolete entry. */
2091  buf_flush_stat_sum.redo += lsn_diff - item->redo;
2092  buf_flush_stat_sum.n_flushed += n_flushed - item->n_flushed;
2093 
2094  /* put current entry in the array. */
2095  item->redo = lsn_diff;
2096  item->n_flushed = n_flushed;
2097 
2098  /* update the index */
2099  buf_flush_stat_arr_ind++;
2100  buf_flush_stat_arr_ind %= BUF_FLUSH_STAT_N_INTERVAL;
2101 
2102  /* reset the current entry. */
2103  buf_flush_stat_cur.redo = lsn;
2104  buf_flush_stat_cur.n_flushed = buf_lru_flush_page_count;
2105 }
2106 
2107 /*********************************************************************
2108 Determines the fraction of dirty pages that need to be flushed based
2109 on the speed at which we generate redo log. Note that if redo log
2110 is generated at a significant rate without corresponding increase
2111 in the number of dirty pages (for example, an in-memory workload)
2112 it can cause IO bursts of flushing. This function implements heuristics
2113 to avoid this burstiness.
2114 @return number of dirty pages to be flushed / second */
2115 UNIV_INTERN
2116 ulint
2117 buf_flush_get_desired_flush_rate(void)
2118 /*==================================*/
2119 {
2120  ulint i;
2121  lint rate;
2122  ulint redo_avg;
2123  ulint n_dirty = 0;
2124  ulint n_flush_req;
2125  ulint lru_flush_avg;
2126  ib_uint64_t lsn = log_get_lsn();
2127  ulint log_capacity = log_get_capacity();
2128 
2129  /* log_capacity should never be zero after the initialization
2130  of log subsystem. */
2131  ut_ad(log_capacity != 0);
2132 
2133  /* Get total number of dirty pages. It is OK to access
2134  flush_list without holding any mutex as we are using this
2135  only for heuristics. */
2136  for (i = 0; i < srv_buf_pool_instances; i++) {
2137  buf_pool_t* buf_pool;
2138 
2139  buf_pool = buf_pool_from_array(i);
2140  n_dirty += UT_LIST_GET_LEN(buf_pool->flush_list);
2141  }
2142 
2143  /* An overflow can happen if we generate more than 2^32 bytes
2144  of redo in this interval i.e.: 4G of redo in 1 second. We can
2145  safely consider this as infinity because if we ever come close
2146  to 4G we'll start a synchronous flush of dirty pages. */
2147  /* redo_avg below is average at which redo is generated in
2148  past BUF_FLUSH_STAT_N_INTERVAL + redo generated in the current
2149  interval. */
2150  redo_avg = (ulint) (buf_flush_stat_sum.redo
2151  / BUF_FLUSH_STAT_N_INTERVAL
2152  + (lsn - buf_flush_stat_cur.redo));
2153 
2154  /* An overflow can happen possibly if we flush more than 2^32
2155  pages in BUF_FLUSH_STAT_N_INTERVAL. This is a very very
2156  unlikely scenario. Even when this happens it means that our
2157  flush rate will be off the mark. It won't affect correctness
2158  of any subsystem. */
2159  /* lru_flush_avg below is rate at which pages are flushed as
2160  part of LRU flush in past BUF_FLUSH_STAT_N_INTERVAL + the
2161  number of pages flushed in the current interval. */
2162  lru_flush_avg = buf_flush_stat_sum.n_flushed
2163  / BUF_FLUSH_STAT_N_INTERVAL
2164  + (buf_lru_flush_page_count
2165  - buf_flush_stat_cur.n_flushed);
2166 
2167  n_flush_req = (n_dirty * redo_avg) / log_capacity;
2168 
2169  /* The number of pages that we want to flush from the flush
2170  list is the difference between the required rate and the
2171  number of pages that we are historically flushing from the
2172  LRU list */
2173  rate = n_flush_req - lru_flush_avg;
2174  return(rate > 0 ? (ulint) rate : 0);
2175 }
2176 
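The heuristic above reduces to n_flush_req = n_dirty * redo_avg / log_capacity, minus the pages the LRU flush is already writing per second. A small numeric check of that formula (editor's illustration with made-up numbers):

#include <stdio.h>

int main(void)
{
	unsigned long long n_dirty       = 10000;                /* dirty pages in all pools */
	unsigned long long redo_avg      = 2ULL * 1024 * 1024;   /* redo bytes per second */
	unsigned long long log_capacity  = 256ULL * 1024 * 1024; /* usable redo log space */
	unsigned long long lru_flush_avg = 20;                   /* pages/s already flushed by LRU */

	unsigned long long n_flush_req = n_dirty * redo_avg / log_capacity;
	long long          rate        = (long long) n_flush_req - (long long) lru_flush_avg;

	/* 10000 * 2 MiB / 256 MiB = 78 pages/s requested; 78 - 20 = 58
	pages/s to take from the flush_list. */
	printf("flush %lld pages/s from the flush list\n", rate > 0 ? rate : 0);
	return 0;
}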
2177 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
2178 /******************************************************************/
2181 static
2182 ibool
2183 buf_flush_validate_low(
2184 /*===================*/
2185  buf_pool_t* buf_pool)
2186 {
2187  buf_page_t* bpage;
2188  const ib_rbt_node_t* rnode = NULL;
2189 
2190  ut_ad(buf_flush_list_mutex_own(buf_pool));
2191 
2192  UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list,
2193  ut_ad(ut_list_node_313->in_flush_list));
2194 
2195  bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
2196 
2197  /* If we are in recovery mode i.e.: flush_rbt != NULL
2198  then each block in the flush_list must also be present
2199  in the flush_rbt. */
2200  if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
2201  rnode = rbt_first(buf_pool->flush_rbt);
2202  }
2203 
2204  while (bpage != NULL) {
2205  const ib_uint64_t om = bpage->oldest_modification;
2206 
2207  ut_ad(buf_pool_from_bpage(bpage) == buf_pool);
2208 
2209  ut_ad(bpage->in_flush_list);
2210 
2211  /* A page in buf_pool->flush_list can be in
2212  BUF_BLOCK_REMOVE_HASH state. This happens when a page
2213  is in the middle of being relocated. In that case the
2214  original descriptor can have this state and still be
2215  in the flush list waiting to acquire the
2216  buf_pool->flush_list_mutex to complete the relocation. */
2217  ut_a(buf_page_in_file(bpage)
2218  || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH);
2219  ut_a(om > 0);
2220 
2221  if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
2222  buf_page_t** prpage;
2223 
2224  ut_a(rnode);
2225  prpage = rbt_value(buf_page_t*, rnode);
2226 
2227  ut_a(*prpage);
2228  ut_a(*prpage == bpage);
2229  rnode = rbt_next(buf_pool->flush_rbt, rnode);
2230  }
2231 
2232  bpage = UT_LIST_GET_NEXT(list, bpage);
2233 
2234  ut_a(!bpage || om >= bpage->oldest_modification);
2235  }
2236 
2237  /* By this time we must have exhausted the traversal of
2238  flush_rbt (if active) as well. */
2239  ut_a(rnode == NULL);
2240 
2241  return(TRUE);
2242 }
2243 
2244 /******************************************************************/
2245 /** Validates the flush list.
2246 @return	TRUE if ok */
2247 UNIV_INTERN
2248 ibool
2249 buf_flush_validate(
2250 /*===============*/
2251  buf_pool_t* buf_pool)
2252 {
2253  ibool ret;
2254 
2255  buf_flush_list_mutex_enter(buf_pool);
2256 
2257  ret = buf_flush_validate_low(buf_pool);
2258 
2259  buf_flush_list_mutex_exit(buf_pool);
2260 
2261  return(ret);
2262 }
2263 #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
2264 #endif /* !UNIV_HOTBACKUP */