diff --git a/.gitignore b/.gitignore
index 427bdb67f..bc5ee0f99 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 build
 .idea
+*.iml
 cmake-build-debug
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a6cacc0fd..43556c5bb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,7 +25,7 @@ add_definitions(-DDUCKDB_EXTENSION_AUTOLOAD_DEFAULT=1 -DDUCKDB_EXTENSION_AUTOINS
 file(GLOB_RECURSE JAVA_SRC_FILES src/main/java/org/duckdb/*.java)
 file(GLOB_RECURSE JAVA_TEST_FILES src/test/java/org/duckdb/*.java)
-set(DUCKDB_SRC_FILES src/duckdb/ub_src_catalog.cpp src/duckdb/ub_src_catalog_catalog_entry.cpp src/duckdb/ub_src_catalog_catalog_entry_dependency.cpp src/duckdb/ub_src_catalog_default.cpp src/duckdb/ub_src_common_adbc.cpp src/duckdb/ub_src_common_adbc_nanoarrow.cpp src/duckdb/ub_src_common.cpp src/duckdb/ub_src_common_arrow_appender.cpp src/duckdb/ub_src_common_arrow.cpp src/duckdb/ub_src_common_crypto.cpp src/duckdb/ub_src_common_enums.cpp src/duckdb/ub_src_common_exception.cpp src/duckdb/ub_src_common_operator.cpp src/duckdb/ub_src_common_progress_bar.cpp src/duckdb/ub_src_common_row_operations.cpp src/duckdb/ub_src_common_serializer.cpp src/duckdb/ub_src_common_sort.cpp src/duckdb/ub_src_common_tree_renderer.cpp src/duckdb/ub_src_common_types.cpp src/duckdb/ub_src_common_types_column.cpp src/duckdb/ub_src_common_types_row.cpp src/duckdb/ub_src_common_value_operations.cpp src/duckdb/src/common/vector_operations/boolean_operators.cpp src/duckdb/src/common/vector_operations/comparison_operators.cpp src/duckdb/src/common/vector_operations/generators.cpp src/duckdb/src/common/vector_operations/is_distinct_from.cpp src/duckdb/src/common/vector_operations/null_operations.cpp src/duckdb/src/common/vector_operations/numeric_inplace_operators.cpp src/duckdb/src/common/vector_operations/vector_cast.cpp src/duckdb/src/common/vector_operations/vector_copy.cpp src/duckdb/src/common/vector_operations/vector_hash.cpp src/duckdb/src/common/vector_operations/vector_storage.cpp src/duckdb/ub_src_execution.cpp src/duckdb/ub_src_execution_expression_executor.cpp src/duckdb/ub_src_execution_index_art.cpp src/duckdb/ub_src_execution_index.cpp src/duckdb/ub_src_execution_nested_loop_join.cpp src/duckdb/ub_src_execution_operator_aggregate.cpp src/duckdb/ub_src_execution_operator_csv_scanner_buffer_manager.cpp src/duckdb/ub_src_execution_operator_csv_scanner_encode.cpp src/duckdb/ub_src_execution_operator_csv_scanner_scanner.cpp src/duckdb/ub_src_execution_operator_csv_scanner_sniffer.cpp src/duckdb/ub_src_execution_operator_csv_scanner_state_machine.cpp src/duckdb/ub_src_execution_operator_csv_scanner_table_function.cpp src/duckdb/ub_src_execution_operator_csv_scanner_util.cpp src/duckdb/ub_src_execution_operator_filter.cpp src/duckdb/ub_src_execution_operator_helper.cpp src/duckdb/ub_src_execution_operator_join.cpp src/duckdb/ub_src_execution_operator_order.cpp src/duckdb/ub_src_execution_operator_persistent.cpp src/duckdb/ub_src_execution_operator_projection.cpp src/duckdb/ub_src_execution_operator_scan.cpp src/duckdb/ub_src_execution_operator_schema.cpp src/duckdb/ub_src_execution_operator_set.cpp src/duckdb/ub_src_execution_physical_plan.cpp src/duckdb/ub_src_execution_sample.cpp src/duckdb/ub_src_function_aggregate_distributive.cpp src/duckdb/ub_src_function_aggregate.cpp src/duckdb/ub_src_function.cpp src/duckdb/ub_src_function_cast.cpp src/duckdb/ub_src_function_cast_union.cpp src/duckdb/ub_src_function_pragma.cpp src/duckdb/ub_src_function_scalar_compressed_materialization.cpp
src/duckdb/ub_src_function_scalar.cpp src/duckdb/ub_src_function_scalar_date.cpp src/duckdb/ub_src_function_scalar_generic.cpp src/duckdb/ub_src_function_scalar_list.cpp src/duckdb/ub_src_function_scalar_map.cpp src/duckdb/ub_src_function_scalar_operator.cpp src/duckdb/ub_src_function_scalar_sequence.cpp src/duckdb/ub_src_function_scalar_string.cpp src/duckdb/ub_src_function_scalar_string_regexp.cpp src/duckdb/ub_src_function_scalar_struct.cpp src/duckdb/ub_src_function_scalar_system.cpp src/duckdb/ub_src_function_table_arrow.cpp src/duckdb/ub_src_function_table.cpp src/duckdb/ub_src_function_table_system.cpp src/duckdb/ub_src_function_table_version.cpp src/duckdb/ub_src_function_window.cpp src/duckdb/ub_src_main.cpp src/duckdb/ub_src_main_buffered_data.cpp src/duckdb/ub_src_main_capi.cpp src/duckdb/ub_src_main_capi_cast.cpp src/duckdb/ub_src_main_chunk_scan_state.cpp src/duckdb/ub_src_main_extension.cpp src/duckdb/ub_src_main_relation.cpp src/duckdb/ub_src_main_secret.cpp src/duckdb/ub_src_main_settings.cpp src/duckdb/ub_src_optimizer.cpp src/duckdb/ub_src_optimizer_compressed_materialization.cpp src/duckdb/ub_src_optimizer_join_order.cpp src/duckdb/ub_src_optimizer_matcher.cpp src/duckdb/ub_src_optimizer_pullup.cpp src/duckdb/ub_src_optimizer_pushdown.cpp src/duckdb/ub_src_optimizer_rule.cpp src/duckdb/ub_src_optimizer_statistics_expression.cpp src/duckdb/ub_src_optimizer_statistics_operator.cpp src/duckdb/ub_src_parallel.cpp src/duckdb/ub_src_parser.cpp src/duckdb/ub_src_parser_constraints.cpp src/duckdb/ub_src_parser_expression.cpp src/duckdb/ub_src_parser_parsed_data.cpp src/duckdb/ub_src_parser_query_node.cpp src/duckdb/ub_src_parser_statement.cpp src/duckdb/ub_src_parser_tableref.cpp src/duckdb/ub_src_parser_transform_constraint.cpp src/duckdb/ub_src_parser_transform_expression.cpp src/duckdb/ub_src_parser_transform_helpers.cpp src/duckdb/ub_src_parser_transform_statement.cpp src/duckdb/ub_src_parser_transform_tableref.cpp src/duckdb/ub_src_planner.cpp src/duckdb/ub_src_planner_binder_expression.cpp src/duckdb/ub_src_planner_binder_query_node.cpp src/duckdb/ub_src_planner_binder_statement.cpp src/duckdb/ub_src_planner_binder_tableref.cpp src/duckdb/ub_src_planner_expression.cpp src/duckdb/ub_src_planner_expression_binder.cpp src/duckdb/ub_src_planner_filter.cpp src/duckdb/ub_src_planner_operator.cpp src/duckdb/ub_src_planner_subquery.cpp src/duckdb/ub_src_storage.cpp src/duckdb/ub_src_storage_buffer.cpp src/duckdb/ub_src_storage_checkpoint.cpp src/duckdb/ub_src_storage_compression_alp.cpp src/duckdb/ub_src_storage_compression.cpp src/duckdb/ub_src_storage_compression_chimp.cpp src/duckdb/ub_src_storage_compression_dictionary.cpp src/duckdb/ub_src_storage_compression_roaring.cpp src/duckdb/ub_src_storage_metadata.cpp src/duckdb/ub_src_storage_serialization.cpp src/duckdb/ub_src_storage_statistics.cpp src/duckdb/ub_src_storage_table.cpp src/duckdb/ub_src_transaction.cpp src/duckdb/src/verification/copied_statement_verifier.cpp src/duckdb/src/verification/deserialized_statement_verifier.cpp src/duckdb/src/verification/external_statement_verifier.cpp src/duckdb/src/verification/fetch_row_verifier.cpp src/duckdb/src/verification/no_operator_caching_verifier.cpp src/duckdb/src/verification/parsed_statement_verifier.cpp src/duckdb/src/verification/prepared_statement_verifier.cpp src/duckdb/src/verification/statement_verifier.cpp src/duckdb/src/verification/unoptimized_statement_verifier.cpp src/duckdb/third_party/fmt/format.cc src/duckdb/third_party/fsst/libfsst.cpp 
src/duckdb/third_party/miniz/miniz.cpp src/duckdb/third_party/re2/re2/bitmap256.cc src/duckdb/third_party/re2/re2/bitstate.cc src/duckdb/third_party/re2/re2/compile.cc src/duckdb/third_party/re2/re2/dfa.cc src/duckdb/third_party/re2/re2/filtered_re2.cc src/duckdb/third_party/re2/re2/mimics_pcre.cc src/duckdb/third_party/re2/re2/nfa.cc src/duckdb/third_party/re2/re2/onepass.cc src/duckdb/third_party/re2/re2/parse.cc src/duckdb/third_party/re2/re2/perl_groups.cc src/duckdb/third_party/re2/re2/prefilter.cc src/duckdb/third_party/re2/re2/prefilter_tree.cc src/duckdb/third_party/re2/re2/prog.cc src/duckdb/third_party/re2/re2/re2.cc src/duckdb/third_party/re2/re2/regexp.cc src/duckdb/third_party/re2/re2/set.cc src/duckdb/third_party/re2/re2/simplify.cc src/duckdb/third_party/re2/re2/stringpiece.cc src/duckdb/third_party/re2/re2/tostring.cc src/duckdb/third_party/re2/re2/unicode_casefold.cc src/duckdb/third_party/re2/re2/unicode_groups.cc src/duckdb/third_party/re2/util/rune.cc src/duckdb/third_party/re2/util/strutil.cc src/duckdb/third_party/hyperloglog/hyperloglog.cpp src/duckdb/third_party/hyperloglog/sds.cpp src/duckdb/third_party/skiplist/SkipList.cpp src/duckdb/third_party/fastpforlib/bitpacking.cpp src/duckdb/third_party/utf8proc/utf8proc.cpp src/duckdb/third_party/utf8proc/utf8proc_wrapper.cpp src/duckdb/third_party/libpg_query/pg_functions.cpp src/duckdb/third_party/libpg_query/postgres_parser.cpp src/duckdb/third_party/libpg_query/src_backend_nodes_list.cpp src/duckdb/third_party/libpg_query/src_backend_nodes_makefuncs.cpp src/duckdb/third_party/libpg_query/src_backend_nodes_value.cpp src/duckdb/third_party/libpg_query/src_backend_parser_gram.cpp src/duckdb/third_party/libpg_query/src_backend_parser_parser.cpp src/duckdb/third_party/libpg_query/src_backend_parser_scan.cpp src/duckdb/third_party/libpg_query/src_backend_parser_scansup.cpp src/duckdb/third_party/libpg_query/src_common_keywords.cpp src/duckdb/third_party/mbedtls/library/aes.cpp src/duckdb/third_party/mbedtls/library/aria.cpp src/duckdb/third_party/mbedtls/library/asn1parse.cpp src/duckdb/third_party/mbedtls/library/base64.cpp src/duckdb/third_party/mbedtls/library/bignum.cpp src/duckdb/third_party/mbedtls/library/camellia.cpp src/duckdb/third_party/mbedtls/library/cipher.cpp src/duckdb/third_party/mbedtls/library/cipher_wrap.cpp src/duckdb/third_party/mbedtls/library/constant_time.cpp src/duckdb/third_party/mbedtls/library/entropy.cpp src/duckdb/third_party/mbedtls/library/entropy_poll.cpp src/duckdb/third_party/mbedtls/library/gcm.cpp src/duckdb/third_party/mbedtls/library/md.cpp src/duckdb/third_party/mbedtls/library/oid.cpp src/duckdb/third_party/mbedtls/library/pem.cpp src/duckdb/third_party/mbedtls/library/pk.cpp src/duckdb/third_party/mbedtls/library/pk_wrap.cpp src/duckdb/third_party/mbedtls/library/pkparse.cpp src/duckdb/third_party/mbedtls/library/platform_util.cpp src/duckdb/third_party/mbedtls/library/rsa.cpp src/duckdb/third_party/mbedtls/library/rsa_alt_helpers.cpp src/duckdb/third_party/mbedtls/library/sha1.cpp src/duckdb/third_party/mbedtls/library/sha256.cpp src/duckdb/third_party/mbedtls/library/sha512.cpp src/duckdb/third_party/mbedtls/mbedtls_wrapper.cpp src/duckdb/third_party/yyjson/yyjson.cpp src/duckdb/third_party/zstd/common/debug.cpp src/duckdb/third_party/zstd/common/entropy_common.cpp src/duckdb/third_party/zstd/common/error_private.cpp src/duckdb/third_party/zstd/common/fse_decompress.cpp src/duckdb/third_party/zstd/common/pool.cpp src/duckdb/third_party/zstd/common/threading.cpp 
src/duckdb/third_party/zstd/common/xxhash.cpp src/duckdb/third_party/zstd/common/zstd_common.cpp src/duckdb/third_party/zstd/compress/fse_compress.cpp src/duckdb/third_party/zstd/compress/hist.cpp src/duckdb/third_party/zstd/compress/huf_compress.cpp src/duckdb/third_party/zstd/compress/zstd_compress.cpp src/duckdb/third_party/zstd/compress/zstd_compress_literals.cpp src/duckdb/third_party/zstd/compress/zstd_compress_sequences.cpp src/duckdb/third_party/zstd/compress/zstd_compress_superblock.cpp src/duckdb/third_party/zstd/compress/zstd_double_fast.cpp src/duckdb/third_party/zstd/compress/zstd_fast.cpp src/duckdb/third_party/zstd/compress/zstd_lazy.cpp src/duckdb/third_party/zstd/compress/zstd_ldm.cpp src/duckdb/third_party/zstd/compress/zstd_opt.cpp src/duckdb/third_party/zstd/compress/zstdmt_compress.cpp src/duckdb/third_party/zstd/decompress/huf_decompress.cpp src/duckdb/third_party/zstd/decompress/zstd_ddict.cpp src/duckdb/third_party/zstd/decompress/zstd_decompress.cpp src/duckdb/third_party/zstd/decompress/zstd_decompress_block.cpp src/duckdb/third_party/zstd/deprecated/zbuff_common.cpp src/duckdb/third_party/zstd/deprecated/zbuff_compress.cpp src/duckdb/third_party/zstd/deprecated/zbuff_decompress.cpp src/duckdb/third_party/zstd/dict/cover.cpp src/duckdb/third_party/zstd/dict/divsufsort.cpp src/duckdb/third_party/zstd/dict/fastcover.cpp src/duckdb/third_party/zstd/dict/zdict.cpp src/duckdb/extension/core_functions/core_functions_extension.cpp src/duckdb/extension/core_functions/lambda_functions.cpp src/duckdb/extension/core_functions/function_list.cpp src/duckdb/ub_extension_core_functions_aggregate_regression.cpp src/duckdb/ub_extension_core_functions_aggregate_algebraic.cpp src/duckdb/ub_extension_core_functions_aggregate_nested.cpp src/duckdb/ub_extension_core_functions_aggregate_holistic.cpp src/duckdb/ub_extension_core_functions_aggregate_distributive.cpp src/duckdb/ub_extension_core_functions_scalar_random.cpp src/duckdb/ub_extension_core_functions_scalar_string.cpp src/duckdb/ub_extension_core_functions_scalar_math.cpp src/duckdb/ub_extension_core_functions_scalar_generic.cpp src/duckdb/ub_extension_core_functions_scalar_enum.cpp src/duckdb/ub_extension_core_functions_scalar_map.cpp src/duckdb/ub_extension_core_functions_scalar_operators.cpp src/duckdb/ub_extension_core_functions_scalar_date.cpp src/duckdb/ub_extension_core_functions_scalar_list.cpp src/duckdb/ub_extension_core_functions_scalar_blob.cpp src/duckdb/ub_extension_core_functions_scalar_debug.cpp src/duckdb/ub_extension_core_functions_scalar_array.cpp src/duckdb/ub_extension_core_functions_scalar_union.cpp src/duckdb/ub_extension_core_functions_scalar_struct.cpp src/duckdb/ub_extension_core_functions_scalar_bit.cpp src/duckdb/extension/parquet/column_reader.cpp src/duckdb/extension/parquet/column_writer.cpp src/duckdb/extension/parquet/parquet_crypto.cpp src/duckdb/extension/parquet/parquet_extension.cpp src/duckdb/extension/parquet/parquet_metadata.cpp src/duckdb/extension/parquet/parquet_reader.cpp src/duckdb/extension/parquet/parquet_statistics.cpp src/duckdb/extension/parquet/parquet_timestamp.cpp src/duckdb/extension/parquet/parquet_writer.cpp src/duckdb/extension/parquet/serialize_parquet.cpp src/duckdb/extension/parquet/zstd_file_system.cpp src/duckdb/extension/parquet/geo_parquet.cpp src/duckdb/third_party/parquet/parquet_types.cpp src/duckdb/third_party/thrift/thrift/protocol/TProtocol.cpp src/duckdb/third_party/thrift/thrift/transport/TTransportException.cpp 
src/duckdb/third_party/thrift/thrift/transport/TBufferTransports.cpp src/duckdb/third_party/snappy/snappy.cc src/duckdb/third_party/snappy/snappy-sinksource.cc src/duckdb/third_party/lz4/lz4.cpp src/duckdb/third_party/brotli/common/constants.cpp src/duckdb/third_party/brotli/common/context.cpp src/duckdb/third_party/brotli/common/dictionary.cpp src/duckdb/third_party/brotli/common/platform.cpp src/duckdb/third_party/brotli/common/shared_dictionary.cpp src/duckdb/third_party/brotli/common/transform.cpp src/duckdb/third_party/brotli/dec/bit_reader.cpp src/duckdb/third_party/brotli/dec/decode.cpp src/duckdb/third_party/brotli/dec/huffman.cpp src/duckdb/third_party/brotli/dec/state.cpp src/duckdb/third_party/brotli/enc/backward_references.cpp src/duckdb/third_party/brotli/enc/backward_references_hq.cpp src/duckdb/third_party/brotli/enc/bit_cost.cpp src/duckdb/third_party/brotli/enc/block_splitter.cpp src/duckdb/third_party/brotli/enc/brotli_bit_stream.cpp src/duckdb/third_party/brotli/enc/cluster.cpp src/duckdb/third_party/brotli/enc/command.cpp src/duckdb/third_party/brotli/enc/compound_dictionary.cpp src/duckdb/third_party/brotli/enc/compress_fragment.cpp src/duckdb/third_party/brotli/enc/compress_fragment_two_pass.cpp src/duckdb/third_party/brotli/enc/dictionary_hash.cpp src/duckdb/third_party/brotli/enc/encode.cpp src/duckdb/third_party/brotli/enc/encoder_dict.cpp src/duckdb/third_party/brotli/enc/entropy_encode.cpp src/duckdb/third_party/brotli/enc/fast_log.cpp src/duckdb/third_party/brotli/enc/histogram.cpp src/duckdb/third_party/brotli/enc/literal_cost.cpp src/duckdb/third_party/brotli/enc/memory.cpp src/duckdb/third_party/brotli/enc/metablock.cpp src/duckdb/third_party/brotli/enc/static_dict.cpp src/duckdb/third_party/brotli/enc/utf8_util.cpp src/duckdb/extension/icu/./icu-table-range.cpp src/duckdb/extension/icu/./icu-datefunc.cpp src/duckdb/extension/icu/./icu-datepart.cpp src/duckdb/extension/icu/./icu-datetrunc.cpp src/duckdb/extension/icu/./icu_extension.cpp src/duckdb/extension/icu/./icu-timezone.cpp src/duckdb/extension/icu/./icu-timebucket.cpp src/duckdb/extension/icu/./icu-makedate.cpp src/duckdb/extension/icu/./icu-datesub.cpp src/duckdb/extension/icu/./icu-dateadd.cpp src/duckdb/extension/icu/./icu-list-range.cpp src/duckdb/extension/icu/./icu-strptime.cpp src/duckdb/ub_extension_icu_third_party_icu_common.cpp src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp src/duckdb/extension/icu/third_party/icu/stubdata/stubdata.cpp src/duckdb/extension/json/buffered_json_reader.cpp src/duckdb/extension/json/json_enums.cpp src/duckdb/extension/json/json_extension.cpp src/duckdb/extension/json/json_common.cpp src/duckdb/extension/json/json_functions.cpp src/duckdb/extension/json/json_scan.cpp src/duckdb/extension/json/json_serializer.cpp src/duckdb/extension/json/json_deserializer.cpp src/duckdb/extension/json/serialize_json.cpp src/duckdb/ub_extension_json_json_functions.cpp)
+set(DUCKDB_SRC_FILES src/duckdb/ub_src_catalog.cpp src/duckdb/ub_src_catalog_catalog_entry.cpp src/duckdb/ub_src_catalog_catalog_entry_dependency.cpp src/duckdb/ub_src_catalog_default.cpp src/duckdb/ub_src_common_adbc.cpp src/duckdb/ub_src_common_adbc_nanoarrow.cpp src/duckdb/ub_src_common.cpp src/duckdb/ub_src_common_arrow_appender.cpp src/duckdb/ub_src_common_arrow.cpp src/duckdb/ub_src_common_crypto.cpp src/duckdb/ub_src_common_enums.cpp src/duckdb/ub_src_common_exception.cpp src/duckdb/ub_src_common_operator.cpp src/duckdb/ub_src_common_progress_bar.cpp src/duckdb/ub_src_common_row_operations.cpp
src/duckdb/ub_src_common_serializer.cpp src/duckdb/ub_src_common_sort.cpp src/duckdb/ub_src_common_tree_renderer.cpp src/duckdb/ub_src_common_types.cpp src/duckdb/ub_src_common_types_column.cpp src/duckdb/ub_src_common_types_row.cpp src/duckdb/ub_src_common_value_operations.cpp src/duckdb/src/common/vector_operations/boolean_operators.cpp src/duckdb/src/common/vector_operations/comparison_operators.cpp src/duckdb/src/common/vector_operations/generators.cpp src/duckdb/src/common/vector_operations/is_distinct_from.cpp src/duckdb/src/common/vector_operations/null_operations.cpp src/duckdb/src/common/vector_operations/numeric_inplace_operators.cpp src/duckdb/src/common/vector_operations/vector_cast.cpp src/duckdb/src/common/vector_operations/vector_copy.cpp src/duckdb/src/common/vector_operations/vector_hash.cpp src/duckdb/src/common/vector_operations/vector_storage.cpp src/duckdb/ub_src_execution.cpp src/duckdb/ub_src_execution_expression_executor.cpp src/duckdb/ub_src_execution_index_art.cpp src/duckdb/ub_src_execution_index.cpp src/duckdb/ub_src_execution_nested_loop_join.cpp src/duckdb/ub_src_execution_operator_aggregate.cpp src/duckdb/ub_src_execution_operator_csv_scanner_buffer_manager.cpp src/duckdb/ub_src_execution_operator_csv_scanner_encode.cpp src/duckdb/ub_src_execution_operator_csv_scanner_scanner.cpp src/duckdb/ub_src_execution_operator_csv_scanner_sniffer.cpp src/duckdb/ub_src_execution_operator_csv_scanner_state_machine.cpp src/duckdb/ub_src_execution_operator_csv_scanner_table_function.cpp src/duckdb/ub_src_execution_operator_csv_scanner_util.cpp src/duckdb/ub_src_execution_operator_filter.cpp src/duckdb/ub_src_execution_operator_helper.cpp src/duckdb/ub_src_execution_operator_join.cpp src/duckdb/ub_src_execution_operator_order.cpp src/duckdb/ub_src_execution_operator_persistent.cpp src/duckdb/ub_src_execution_operator_projection.cpp src/duckdb/ub_src_execution_operator_scan.cpp src/duckdb/ub_src_execution_operator_schema.cpp src/duckdb/ub_src_execution_operator_set.cpp src/duckdb/ub_src_execution_physical_plan.cpp src/duckdb/ub_src_execution_sample.cpp src/duckdb/ub_src_function_aggregate_distributive.cpp src/duckdb/ub_src_function_aggregate.cpp src/duckdb/ub_src_function.cpp src/duckdb/ub_src_function_cast.cpp src/duckdb/ub_src_function_cast_union.cpp src/duckdb/ub_src_function_pragma.cpp src/duckdb/ub_src_function_scalar_compressed_materialization.cpp src/duckdb/ub_src_function_scalar.cpp src/duckdb/ub_src_function_scalar_date.cpp src/duckdb/ub_src_function_scalar_generic.cpp src/duckdb/ub_src_function_scalar_list.cpp src/duckdb/ub_src_function_scalar_map.cpp src/duckdb/ub_src_function_scalar_operator.cpp src/duckdb/ub_src_function_scalar_sequence.cpp src/duckdb/ub_src_function_scalar_string.cpp src/duckdb/ub_src_function_scalar_string_regexp.cpp src/duckdb/ub_src_function_scalar_struct.cpp src/duckdb/ub_src_function_scalar_system.cpp src/duckdb/ub_src_function_table_arrow.cpp src/duckdb/ub_src_function_table.cpp src/duckdb/ub_src_function_table_system.cpp src/duckdb/ub_src_function_table_version.cpp src/duckdb/ub_src_function_window.cpp src/duckdb/ub_src_main.cpp src/duckdb/ub_src_main_buffered_data.cpp src/duckdb/ub_src_main_capi.cpp src/duckdb/ub_src_main_capi_cast.cpp src/duckdb/ub_src_main_chunk_scan_state.cpp src/duckdb/ub_src_main_extension.cpp src/duckdb/ub_src_main_relation.cpp src/duckdb/ub_src_main_secret.cpp src/duckdb/ub_src_main_settings.cpp src/duckdb/ub_src_optimizer.cpp src/duckdb/ub_src_optimizer_compressed_materialization.cpp 
src/duckdb/ub_src_optimizer_join_order.cpp src/duckdb/ub_src_optimizer_matcher.cpp src/duckdb/ub_src_optimizer_pullup.cpp src/duckdb/ub_src_optimizer_pushdown.cpp src/duckdb/ub_src_optimizer_rule.cpp src/duckdb/ub_src_optimizer_statistics_expression.cpp src/duckdb/ub_src_optimizer_statistics_operator.cpp src/duckdb/ub_src_parallel.cpp src/duckdb/ub_src_parser.cpp src/duckdb/ub_src_parser_constraints.cpp src/duckdb/ub_src_parser_expression.cpp src/duckdb/ub_src_parser_parsed_data.cpp src/duckdb/ub_src_parser_query_node.cpp src/duckdb/ub_src_parser_statement.cpp src/duckdb/ub_src_parser_tableref.cpp src/duckdb/ub_src_parser_transform_constraint.cpp src/duckdb/ub_src_parser_transform_expression.cpp src/duckdb/ub_src_parser_transform_helpers.cpp src/duckdb/ub_src_parser_transform_statement.cpp src/duckdb/ub_src_parser_transform_tableref.cpp src/duckdb/ub_src_planner.cpp src/duckdb/ub_src_planner_binder_expression.cpp src/duckdb/ub_src_planner_binder_query_node.cpp src/duckdb/ub_src_planner_binder_statement.cpp src/duckdb/ub_src_planner_binder_tableref.cpp src/duckdb/ub_src_planner_expression.cpp src/duckdb/ub_src_planner_expression_binder.cpp src/duckdb/ub_src_planner_filter.cpp src/duckdb/ub_src_planner_operator.cpp src/duckdb/ub_src_planner_subquery.cpp src/duckdb/ub_src_storage.cpp src/duckdb/ub_src_storage_buffer.cpp src/duckdb/ub_src_storage_checkpoint.cpp src/duckdb/ub_src_storage_compression_alp.cpp src/duckdb/ub_src_storage_compression.cpp src/duckdb/ub_src_storage_compression_chimp.cpp src/duckdb/ub_src_storage_compression_dictionary.cpp src/duckdb/ub_src_storage_compression_roaring.cpp src/duckdb/ub_src_storage_metadata.cpp src/duckdb/ub_src_storage_serialization.cpp src/duckdb/ub_src_storage_statistics.cpp src/duckdb/ub_src_storage_table.cpp src/duckdb/ub_src_transaction.cpp src/duckdb/src/verification/copied_statement_verifier.cpp src/duckdb/src/verification/deserialized_statement_verifier.cpp src/duckdb/src/verification/external_statement_verifier.cpp src/duckdb/src/verification/fetch_row_verifier.cpp src/duckdb/src/verification/no_operator_caching_verifier.cpp src/duckdb/src/verification/parsed_statement_verifier.cpp src/duckdb/src/verification/prepared_statement_verifier.cpp src/duckdb/src/verification/statement_verifier.cpp src/duckdb/src/verification/unoptimized_statement_verifier.cpp src/duckdb/third_party/fmt/format.cc src/duckdb/third_party/fsst/libfsst.cpp src/duckdb/third_party/miniz/miniz.cpp src/duckdb/third_party/re2/re2/bitmap256.cc src/duckdb/third_party/re2/re2/bitstate.cc src/duckdb/third_party/re2/re2/compile.cc src/duckdb/third_party/re2/re2/dfa.cc src/duckdb/third_party/re2/re2/filtered_re2.cc src/duckdb/third_party/re2/re2/mimics_pcre.cc src/duckdb/third_party/re2/re2/nfa.cc src/duckdb/third_party/re2/re2/onepass.cc src/duckdb/third_party/re2/re2/parse.cc src/duckdb/third_party/re2/re2/perl_groups.cc src/duckdb/third_party/re2/re2/prefilter.cc src/duckdb/third_party/re2/re2/prefilter_tree.cc src/duckdb/third_party/re2/re2/prog.cc src/duckdb/third_party/re2/re2/re2.cc src/duckdb/third_party/re2/re2/regexp.cc src/duckdb/third_party/re2/re2/set.cc src/duckdb/third_party/re2/re2/simplify.cc src/duckdb/third_party/re2/re2/stringpiece.cc src/duckdb/third_party/re2/re2/tostring.cc src/duckdb/third_party/re2/re2/unicode_casefold.cc src/duckdb/third_party/re2/re2/unicode_groups.cc src/duckdb/third_party/re2/util/rune.cc src/duckdb/third_party/re2/util/strutil.cc src/duckdb/third_party/hyperloglog/hyperloglog.cpp src/duckdb/third_party/hyperloglog/sds.cpp 
src/duckdb/third_party/skiplist/SkipList.cpp src/duckdb/third_party/fastpforlib/bitpacking.cpp src/duckdb/third_party/utf8proc/utf8proc.cpp src/duckdb/third_party/utf8proc/utf8proc_wrapper.cpp src/duckdb/third_party/libpg_query/pg_functions.cpp src/duckdb/third_party/libpg_query/postgres_parser.cpp src/duckdb/third_party/libpg_query/src_backend_nodes_list.cpp src/duckdb/third_party/libpg_query/src_backend_nodes_makefuncs.cpp src/duckdb/third_party/libpg_query/src_backend_nodes_value.cpp src/duckdb/third_party/libpg_query/src_backend_parser_gram.cpp src/duckdb/third_party/libpg_query/src_backend_parser_parser.cpp src/duckdb/third_party/libpg_query/src_backend_parser_scan.cpp src/duckdb/third_party/libpg_query/src_backend_parser_scansup.cpp src/duckdb/third_party/libpg_query/src_common_keywords.cpp src/duckdb/third_party/mbedtls/library/aes.cpp src/duckdb/third_party/mbedtls/library/aria.cpp src/duckdb/third_party/mbedtls/library/asn1parse.cpp src/duckdb/third_party/mbedtls/library/base64.cpp src/duckdb/third_party/mbedtls/library/bignum.cpp src/duckdb/third_party/mbedtls/library/camellia.cpp src/duckdb/third_party/mbedtls/library/cipher.cpp src/duckdb/third_party/mbedtls/library/cipher_wrap.cpp src/duckdb/third_party/mbedtls/library/constant_time.cpp src/duckdb/third_party/mbedtls/library/entropy.cpp src/duckdb/third_party/mbedtls/library/entropy_poll.cpp src/duckdb/third_party/mbedtls/library/gcm.cpp src/duckdb/third_party/mbedtls/library/md.cpp src/duckdb/third_party/mbedtls/library/oid.cpp src/duckdb/third_party/mbedtls/library/pem.cpp src/duckdb/third_party/mbedtls/library/pk.cpp src/duckdb/third_party/mbedtls/library/pk_wrap.cpp src/duckdb/third_party/mbedtls/library/pkparse.cpp src/duckdb/third_party/mbedtls/library/platform_util.cpp src/duckdb/third_party/mbedtls/library/rsa.cpp src/duckdb/third_party/mbedtls/library/rsa_alt_helpers.cpp src/duckdb/third_party/mbedtls/library/sha1.cpp src/duckdb/third_party/mbedtls/library/sha256.cpp src/duckdb/third_party/mbedtls/library/sha512.cpp src/duckdb/third_party/mbedtls/mbedtls_wrapper.cpp src/duckdb/third_party/yyjson/yyjson.cpp src/duckdb/third_party/zstd/common/debug.cpp src/duckdb/third_party/zstd/common/entropy_common.cpp src/duckdb/third_party/zstd/common/error_private.cpp src/duckdb/third_party/zstd/common/fse_decompress.cpp src/duckdb/third_party/zstd/common/pool.cpp src/duckdb/third_party/zstd/common/threading.cpp src/duckdb/third_party/zstd/common/xxhash.cpp src/duckdb/third_party/zstd/common/zstd_common.cpp src/duckdb/third_party/zstd/compress/fse_compress.cpp src/duckdb/third_party/zstd/compress/hist.cpp src/duckdb/third_party/zstd/compress/huf_compress.cpp src/duckdb/third_party/zstd/compress/zstd_compress.cpp src/duckdb/third_party/zstd/compress/zstd_compress_literals.cpp src/duckdb/third_party/zstd/compress/zstd_compress_sequences.cpp src/duckdb/third_party/zstd/compress/zstd_compress_superblock.cpp src/duckdb/third_party/zstd/compress/zstd_double_fast.cpp src/duckdb/third_party/zstd/compress/zstd_fast.cpp src/duckdb/third_party/zstd/compress/zstd_lazy.cpp src/duckdb/third_party/zstd/compress/zstd_ldm.cpp src/duckdb/third_party/zstd/compress/zstd_opt.cpp src/duckdb/third_party/zstd/compress/zstdmt_compress.cpp src/duckdb/third_party/zstd/decompress/huf_decompress.cpp src/duckdb/third_party/zstd/decompress/zstd_ddict.cpp src/duckdb/third_party/zstd/decompress/zstd_decompress.cpp src/duckdb/third_party/zstd/decompress/zstd_decompress_block.cpp src/duckdb/third_party/zstd/deprecated/zbuff_common.cpp 
src/duckdb/third_party/zstd/deprecated/zbuff_compress.cpp src/duckdb/third_party/zstd/deprecated/zbuff_decompress.cpp src/duckdb/third_party/zstd/dict/cover.cpp src/duckdb/third_party/zstd/dict/divsufsort.cpp src/duckdb/third_party/zstd/dict/fastcover.cpp src/duckdb/third_party/zstd/dict/zdict.cpp src/duckdb/extension/core_functions/function_list.cpp src/duckdb/extension/core_functions/core_functions_extension.cpp src/duckdb/extension/core_functions/lambda_functions.cpp src/duckdb/ub_extension_core_functions_aggregate_holistic.cpp src/duckdb/ub_extension_core_functions_aggregate_distributive.cpp src/duckdb/ub_extension_core_functions_aggregate_algebraic.cpp src/duckdb/ub_extension_core_functions_aggregate_regression.cpp src/duckdb/ub_extension_core_functions_aggregate_nested.cpp src/duckdb/ub_extension_core_functions_scalar_bit.cpp src/duckdb/ub_extension_core_functions_scalar_operators.cpp src/duckdb/ub_extension_core_functions_scalar_array.cpp src/duckdb/ub_extension_core_functions_scalar_date.cpp src/duckdb/ub_extension_core_functions_scalar_enum.cpp src/duckdb/ub_extension_core_functions_scalar_math.cpp src/duckdb/ub_extension_core_functions_scalar_struct.cpp src/duckdb/ub_extension_core_functions_scalar_map.cpp src/duckdb/ub_extension_core_functions_scalar_list.cpp src/duckdb/ub_extension_core_functions_scalar_union.cpp src/duckdb/ub_extension_core_functions_scalar_generic.cpp src/duckdb/ub_extension_core_functions_scalar_string.cpp src/duckdb/ub_extension_core_functions_scalar_random.cpp src/duckdb/ub_extension_core_functions_scalar_blob.cpp src/duckdb/ub_extension_core_functions_scalar_debug.cpp src/duckdb/extension/parquet/column_reader.cpp src/duckdb/extension/parquet/column_writer.cpp src/duckdb/extension/parquet/parquet_crypto.cpp src/duckdb/extension/parquet/parquet_extension.cpp src/duckdb/extension/parquet/parquet_metadata.cpp src/duckdb/extension/parquet/parquet_reader.cpp src/duckdb/extension/parquet/parquet_statistics.cpp src/duckdb/extension/parquet/parquet_timestamp.cpp src/duckdb/extension/parquet/parquet_writer.cpp src/duckdb/extension/parquet/serialize_parquet.cpp src/duckdb/extension/parquet/zstd_file_system.cpp src/duckdb/extension/parquet/geo_parquet.cpp src/duckdb/third_party/parquet/parquet_types.cpp src/duckdb/third_party/thrift/thrift/protocol/TProtocol.cpp src/duckdb/third_party/thrift/thrift/transport/TTransportException.cpp src/duckdb/third_party/thrift/thrift/transport/TBufferTransports.cpp src/duckdb/third_party/snappy/snappy.cc src/duckdb/third_party/snappy/snappy-sinksource.cc src/duckdb/third_party/lz4/lz4.cpp src/duckdb/third_party/brotli/common/constants.cpp src/duckdb/third_party/brotli/common/context.cpp src/duckdb/third_party/brotli/common/dictionary.cpp src/duckdb/third_party/brotli/common/platform.cpp src/duckdb/third_party/brotli/common/shared_dictionary.cpp src/duckdb/third_party/brotli/common/transform.cpp src/duckdb/third_party/brotli/dec/bit_reader.cpp src/duckdb/third_party/brotli/dec/decode.cpp src/duckdb/third_party/brotli/dec/huffman.cpp src/duckdb/third_party/brotli/dec/state.cpp src/duckdb/third_party/brotli/enc/backward_references.cpp src/duckdb/third_party/brotli/enc/backward_references_hq.cpp src/duckdb/third_party/brotli/enc/bit_cost.cpp src/duckdb/third_party/brotli/enc/block_splitter.cpp src/duckdb/third_party/brotli/enc/brotli_bit_stream.cpp src/duckdb/third_party/brotli/enc/cluster.cpp src/duckdb/third_party/brotli/enc/command.cpp src/duckdb/third_party/brotli/enc/compound_dictionary.cpp 
src/duckdb/third_party/brotli/enc/compress_fragment.cpp src/duckdb/third_party/brotli/enc/compress_fragment_two_pass.cpp src/duckdb/third_party/brotli/enc/dictionary_hash.cpp src/duckdb/third_party/brotli/enc/encode.cpp src/duckdb/third_party/brotli/enc/encoder_dict.cpp src/duckdb/third_party/brotli/enc/entropy_encode.cpp src/duckdb/third_party/brotli/enc/fast_log.cpp src/duckdb/third_party/brotli/enc/histogram.cpp src/duckdb/third_party/brotli/enc/literal_cost.cpp src/duckdb/third_party/brotli/enc/memory.cpp src/duckdb/third_party/brotli/enc/metablock.cpp src/duckdb/third_party/brotli/enc/static_dict.cpp src/duckdb/third_party/brotli/enc/utf8_util.cpp src/duckdb/extension/icu/./icu-table-range.cpp src/duckdb/extension/icu/./icu-makedate.cpp src/duckdb/extension/icu/./icu-list-range.cpp src/duckdb/extension/icu/./icu-timebucket.cpp src/duckdb/extension/icu/./icu-timezone.cpp src/duckdb/extension/icu/./icu-dateadd.cpp src/duckdb/extension/icu/./icu-datetrunc.cpp src/duckdb/extension/icu/./icu-datesub.cpp src/duckdb/extension/icu/./icu_extension.cpp src/duckdb/extension/icu/./icu-strptime.cpp src/duckdb/extension/icu/./icu-datefunc.cpp src/duckdb/extension/icu/./icu-datepart.cpp src/duckdb/ub_extension_icu_third_party_icu_common.cpp src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp src/duckdb/extension/icu/third_party/icu/stubdata/stubdata.cpp src/duckdb/extension/json/buffered_json_reader.cpp src/duckdb/extension/json/json_enums.cpp src/duckdb/extension/json/json_extension.cpp src/duckdb/extension/json/json_common.cpp src/duckdb/extension/json/json_functions.cpp src/duckdb/extension/json/json_scan.cpp src/duckdb/extension/json/json_serializer.cpp src/duckdb/extension/json/json_deserializer.cpp src/duckdb/extension/json/serialize_json.cpp src/duckdb/ub_extension_json_json_functions.cpp)
 set(CMAKE_JAVA_COMPILE_FLAGS -source 1.8 -target 1.8 -encoding utf-8)
diff --git a/src/duckdb/extension/core_functions/scalar/random/random.cpp b/src/duckdb/extension/core_functions/scalar/random/random.cpp
index c093f0f00..3054170ff 100644
--- a/src/duckdb/extension/core_functions/scalar/random/random.cpp
+++ b/src/duckdb/extension/core_functions/scalar/random/random.cpp
@@ -9,7 +9,8 @@ namespace duckdb {
 
 struct RandomLocalState : public FunctionLocalState {
-	explicit RandomLocalState(uint32_t seed) : random_engine(seed) {
+	explicit RandomLocalState(uint64_t seed) : random_engine(0) {
+		random_engine.SetSeed(seed);
 	}
 
 	RandomEngine random_engine;
 };
@@ -30,7 +31,7 @@ static unique_ptr<FunctionLocalState> RandomInitLocalState(ExpressionState &state, const BoundFunctionExpression &expr,
                                                            FunctionData *bind_data) {
 	auto &random_engine = RandomEngine::Get(state.GetContext());
 	lock_guard<mutex> guard(random_engine.lock);
-	return make_uniq<RandomLocalState>(random_engine.NextRandomInteger());
+	return make_uniq<RandomLocalState>(random_engine.NextRandomInteger64());
 }
 
 ScalarFunction RandomFun::GetFunction() {
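The random.cpp hunk above widens the per-execution seed from 32 to 64 bits: the shared engine is still consulted under its lock, but each local state now receives a full 64-bit seed via SetSeed(). A minimal self-contained sketch of the pattern, with std::mt19937_64 standing in for DuckDB's RandomEngine (an assumption for illustration only):

    // Sketch: a lock-guarded shared engine hands out 64-bit seeds so each
    // local state gets its own stream. With only 32-bit seeds, duplicate
    // streams appear after roughly 2^16 states (birthday bound).
    #include <cstdint>
    #include <mutex>
    #include <random>

    struct LocalRandomState {
        explicit LocalRandomState(uint64_t seed) : engine(seed) {
        }
        std::mt19937_64 engine;
    };

    std::mutex global_lock;
    std::mt19937_64 global_engine {0x9E3779B97F4A7C15ULL};

    LocalRandomState MakeLocalState() {
        std::lock_guard<std::mutex> guard(global_lock);
        return LocalRandomState(global_engine()); // draw a full 64-bit seed
    }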
diff --git a/src/duckdb/extension/jemalloc/include/jemalloc_extension.hpp b/src/duckdb/extension/jemalloc/include/jemalloc_extension.hpp
new file mode 100644
index 000000000..a5ef49671
--- /dev/null
+++ b/src/duckdb/extension/jemalloc/include/jemalloc_extension.hpp
@@ -0,0 +1,32 @@
+//===----------------------------------------------------------------------===//
+// DuckDB
+//
+// jemalloc_extension.hpp
+//
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "duckdb.hpp"
+
+namespace duckdb {
+
+class JemallocExtension : public Extension {
+public:
+    void Load(DuckDB &db) override;
+    std::string Name() override;
+    std::string Version() const override;
+
+    static data_ptr_t Allocate(PrivateAllocatorData *private_data, idx_t size);
+    static void Free(PrivateAllocatorData *private_data, data_ptr_t pointer, idx_t size);
+    static data_ptr_t Reallocate(PrivateAllocatorData *private_data, data_ptr_t pointer, idx_t old_size, idx_t size);
+
+    static int64_t DecayDelay();
+    static void ThreadFlush(idx_t threshold);
+    static void ThreadIdle();
+    static void FlushAll();
+    static void SetBackgroundThreads(bool enable);
+};
+
+} // namespace duckdb
diff --git a/src/duckdb/extension/jemalloc/include/malloc_ncpus.h b/src/duckdb/extension/jemalloc/include/malloc_ncpus.h
new file mode 100644
index 000000000..18b044a44
--- /dev/null
+++ b/src/duckdb/extension/jemalloc/include/malloc_ncpus.h
@@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+// DuckDB
+//
+// malloc_ncpus.h
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MALLOC_NCPUS_H
+#define MALLOC_NCPUS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned duckdb_malloc_ncpus();
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // MALLOC_NCPUS_H
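malloc_ncpus.h above exposes a C-linkage hook so the embedding code, rather than jemalloc's own probing, supplies the CPU count. The definition is not part of this hunk; a plausible sketch of such a definition (hypothetical, not the actual implementation) could look like:

    // Hypothetical definition for illustration only; the real duckdb_malloc_ncpus
    // lives elsewhere in the tree. hardware_concurrency() may return 0 when the
    // count cannot be determined, so fall back to 1.
    #include <thread>

    extern "C" unsigned duckdb_malloc_ncpus() {
        unsigned n = std::thread::hardware_concurrency();
        return n == 0 ? 1 : n;
    }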
diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/activity_callback.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/activity_callback.h
new file mode 100644
index 000000000..0f4f39622
--- /dev/null
+++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/activity_callback.h
@@ -0,0 +1,25 @@
+#ifndef JEMALLOC_INTERNAL_ACTIVITY_CALLBACK_H
+#define JEMALLOC_INTERNAL_ACTIVITY_CALLBACK_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+
+/*
+ * The callback to be executed "periodically", in response to some amount of
+ * allocator activity.
+ *
+ * This callback need not be computing any sort of peak (although that's the
+ * intended first use case), but we drive it from the peak counter, so it
+ * keeps things tidy to keep it here.
+ *
+ * The calls to this thunk get driven by the peak_event module.
+ */
+#define ACTIVITY_CALLBACK_THUNK_INITIALIZER {NULL, NULL}
+typedef void (*activity_callback_t)(void *uctx, uint64_t allocated,
+    uint64_t deallocated);
+typedef struct activity_callback_thunk_s activity_callback_thunk_t;
+struct activity_callback_thunk_s {
+    activity_callback_t callback;
+    void *uctx;
+};
+
+#endif /* JEMALLOC_INTERNAL_ACTIVITY_CALLBACK_H */
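The thunk declared in activity_callback.h is a plain callback-plus-context pair, invoked by jemalloc's peak_event module with running allocated/deallocated byte counts. A self-contained illustration of the same callback/uctx pattern (generic C++, not jemalloc's actual driver):

    // Demonstrates the callback/uctx shape used by activity_callback_thunk_t.
    #include <cstdint>
    #include <cstdio>

    typedef void (*activity_callback_t)(void *uctx, uint64_t allocated,
        uint64_t deallocated);

    struct activity_thunk {
        activity_callback_t callback;
        void *uctx;
    };

    static void log_activity(void *uctx, uint64_t allocated, uint64_t deallocated) {
        std::printf("[%s] allocated=%llu deallocated=%llu\n",
            static_cast<const char *>(uctx), (unsigned long long)allocated,
            (unsigned long long)deallocated);
    }

    int main() {
        activity_thunk thunk = {log_activity, (void *)"peak"};
        // The driver would invoke this after some amount of allocator activity:
        thunk.callback(thunk.uctx, 4096, 1024);
        return 0;
    }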
diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/arena_externs.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/arena_externs.h
new file mode 100644
index 000000000..3d0329fce
--- /dev/null
+++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/arena_externs.h
@@ -0,0 +1,124 @@
+#ifndef JEMALLOC_INTERNAL_ARENA_EXTERNS_H
+#define JEMALLOC_INTERNAL_ARENA_EXTERNS_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/arena_stats.h"
+#include "jemalloc/internal/bin.h"
+#include "jemalloc/internal/div.h"
+#include "jemalloc/internal/emap.h"
+#include "jemalloc/internal/extent_dss.h"
+#include "jemalloc/internal/hook.h"
+#include "jemalloc/internal/pages.h"
+#include "jemalloc/internal/stats.h"
+
+/*
+ * When the number of pages to be purged exceeds this threshold, deferred
+ * purge should happen.
+ */
+#define ARENA_DEFERRED_PURGE_NPAGES_THRESHOLD UINT64_C(1024)
+
+extern ssize_t opt_dirty_decay_ms;
+extern ssize_t opt_muzzy_decay_ms;
+
+extern percpu_arena_mode_t opt_percpu_arena;
+extern const char *const percpu_arena_mode_names[];
+
+extern div_info_t arena_binind_div_info[SC_NBINS];
+
+extern emap_t arena_emap_global;
+
+extern size_t opt_oversize_threshold;
+extern size_t oversize_threshold;
+
+/*
+ * arena_bin_offsets[binind] is the offset of the first bin shard for size class
+ * binind.
+ */
+extern uint32_t arena_bin_offsets[SC_NBINS];
+
+void arena_basic_stats_merge(tsdn_t *tsdn, arena_t *arena,
+    unsigned *nthreads, const char **dss, ssize_t *dirty_decay_ms,
+    ssize_t *muzzy_decay_ms, size_t *nactive, size_t *ndirty, size_t *nmuzzy);
+void arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
+    const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms,
+    size_t *nactive, size_t *ndirty, size_t *nmuzzy, arena_stats_t *astats,
+    bin_stats_data_t *bstats, arena_stats_large_t *lstats,
+    pac_estats_t *estats, hpa_shard_stats_t *hpastats, sec_stats_t *secstats);
+void arena_handle_deferred_work(tsdn_t *tsdn, arena_t *arena);
+edata_t *arena_extent_alloc_large(tsdn_t *tsdn, arena_t *arena,
+    size_t usize, size_t alignment, bool zero);
+void arena_extent_dalloc_large_prep(tsdn_t *tsdn, arena_t *arena,
+    edata_t *edata);
+void arena_extent_ralloc_large_shrink(tsdn_t *tsdn, arena_t *arena,
+    edata_t *edata, size_t oldusize);
+void arena_extent_ralloc_large_expand(tsdn_t *tsdn, arena_t *arena,
+    edata_t *edata, size_t oldusize);
+bool arena_decay_ms_set(tsdn_t *tsdn, arena_t *arena, extent_state_t state,
+    ssize_t decay_ms);
+ssize_t arena_decay_ms_get(arena_t *arena, extent_state_t state);
+void arena_decay(tsdn_t *tsdn, arena_t *arena, bool is_background_thread,
+    bool all);
+uint64_t arena_time_until_deferred(tsdn_t *tsdn, arena_t *arena);
+void arena_do_deferred_work(tsdn_t *tsdn, arena_t *arena);
+void arena_reset(tsd_t *tsd, arena_t *arena);
+void arena_destroy(tsd_t *tsd, arena_t *arena);
+void arena_cache_bin_fill_small(tsdn_t *tsdn, arena_t *arena,
+    cache_bin_t *cache_bin, szind_t binind, const cache_bin_sz_t nfill);
+
+void *arena_malloc_hard(tsdn_t *tsdn, arena_t *arena, size_t size,
+    szind_t ind, bool zero, bool slab);
+void *arena_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize,
+    size_t alignment, bool zero, bool slab, tcache_t *tcache);
+void arena_prof_promote(tsdn_t *tsdn, void *ptr, size_t usize,
+    size_t bumped_usize);
+void arena_dalloc_promoted(tsdn_t *tsdn, void *ptr, tcache_t *tcache,
+    bool slow_path);
+void arena_slab_dalloc(tsdn_t *tsdn, arena_t *arena, edata_t *slab);
+
+void arena_dalloc_bin_locked_handle_newly_empty(tsdn_t *tsdn, arena_t *arena,
+    edata_t *slab, bin_t *bin);
+void arena_dalloc_bin_locked_handle_newly_nonempty(tsdn_t *tsdn, arena_t *arena,
+    edata_t *slab, bin_t *bin);
+void arena_dalloc_small(tsdn_t *tsdn, void *ptr);
+bool arena_ralloc_no_move(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size,
+    size_t extra, bool zero, size_t *newsize);
+void *arena_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t oldsize,
+    size_t size, size_t alignment, bool zero, bool slab, tcache_t *tcache,
+    hook_ralloc_args_t *hook_args);
+dss_prec_t arena_dss_prec_get(arena_t *arena);
+ehooks_t *arena_get_ehooks(arena_t *arena);
+extent_hooks_t *arena_set_extent_hooks(tsd_t *tsd, arena_t *arena,
+    extent_hooks_t *extent_hooks);
+bool arena_dss_prec_set(arena_t *arena, dss_prec_t dss_prec);
+void arena_name_get(arena_t *arena, char *name);
+void arena_name_set(arena_t *arena, const char *name);
+ssize_t arena_dirty_decay_ms_default_get(void);
+bool arena_dirty_decay_ms_default_set(ssize_t decay_ms);
+ssize_t arena_muzzy_decay_ms_default_get(void);
+bool arena_muzzy_decay_ms_default_set(ssize_t decay_ms);
+bool arena_retain_grow_limit_get_set(tsd_t *tsd, arena_t *arena,
+    size_t *old_limit, size_t *new_limit);
+unsigned arena_nthreads_get(arena_t *arena, bool internal);
+void arena_nthreads_inc(arena_t *arena, bool internal);
+void arena_nthreads_dec(arena_t *arena, bool internal);
+arena_t *arena_new(tsdn_t *tsdn, unsigned ind, const arena_config_t *config);
+bool arena_init_huge(arena_t *a0);
+arena_t *arena_choose_huge(tsd_t *tsd);
+bin_t *arena_bin_choose(tsdn_t *tsdn, arena_t *arena, szind_t binind,
+    unsigned *binshard);
+size_t arena_fill_small_fresh(tsdn_t *tsdn, arena_t *arena, szind_t binind,
+    void **ptrs, size_t nfill, bool zero);
+bool arena_boot(sc_data_t *sc_data, base_t *base, bool hpa);
+void arena_prefork0(tsdn_t *tsdn, arena_t *arena);
+void arena_prefork1(tsdn_t *tsdn, arena_t *arena);
+void arena_prefork2(tsdn_t *tsdn, arena_t *arena);
+void arena_prefork3(tsdn_t *tsdn, arena_t *arena);
+void arena_prefork4(tsdn_t *tsdn, arena_t *arena);
+void arena_prefork5(tsdn_t *tsdn, arena_t *arena);
+void arena_prefork6(tsdn_t *tsdn, arena_t *arena);
+void arena_prefork7(tsdn_t *tsdn, arena_t *arena);
+void arena_prefork8(tsdn_t *tsdn, arena_t *arena);
+void arena_postfork_parent(tsdn_t *tsdn, arena_t *arena);
+void arena_postfork_child(tsdn_t *tsdn, arena_t *arena);
+
+#endif /* JEMALLOC_INTERNAL_ARENA_EXTERNS_H */
diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/arena_inlines_a.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/arena_inlines_a.h
new file mode 100644
index 000000000..214ce80b2
--- /dev/null
+++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/arena_inlines_a.h
@@ -0,0 +1,27 @@
+#ifndef JEMALLOC_INTERNAL_ARENA_INLINES_A_H
+#define JEMALLOC_INTERNAL_ARENA_INLINES_A_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/arena_structs.h"
+
+static inline unsigned
+arena_ind_get(const arena_t *arena) {
+    return arena->ind;
+}
+
+static inline void
+arena_internal_add(arena_t *arena, size_t size) {
+    atomic_fetch_add_zu(&arena->stats.internal, size, ATOMIC_RELAXED);
+}
+
+static inline void
+arena_internal_sub(arena_t *arena, size_t size) {
+    atomic_fetch_sub_zu(&arena->stats.internal, size, ATOMIC_RELAXED);
+}
+
+static inline size_t
+arena_internal_get(arena_t *arena) {
+    return atomic_load_zu(&arena->stats.internal, ATOMIC_RELAXED);
+}
+
+#endif /* JEMALLOC_INTERNAL_ARENA_INLINES_A_H */
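arena_inlines_a.h above updates the arena's internal-allocation counter with relaxed atomics: the value is a statistic, nothing else is published through it, so no acquire/release ordering is needed. The equivalent pattern in standard C++ (a sketch, not jemalloc's atomic_zu wrappers):

    // Relaxed-ordering statistics counter, mirroring arena_internal_add/sub/get.
    #include <atomic>
    #include <cstddef>

    struct arena_stats {
        std::atomic<size_t> internal {0};
    };

    inline void arena_internal_add(arena_stats &stats, size_t size) {
        stats.internal.fetch_add(size, std::memory_order_relaxed);
    }

    inline void arena_internal_sub(arena_stats &stats, size_t size) {
        stats.internal.fetch_sub(size, std::memory_order_relaxed);
    }

    inline size_t arena_internal_get(const arena_stats &stats) {
        return stats.internal.load(std::memory_order_relaxed);
    }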
+#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/prof_externs.h" +#include "jemalloc/internal/prof_structs.h" +#include "jemalloc/internal/rtree.h" +#include "jemalloc/internal/safety_check.h" +#include "jemalloc/internal/sc.h" +#include "jemalloc/internal/sz.h" +#include "jemalloc/internal/tcache_inlines.h" +#include "jemalloc/internal/ticker.h" + +static inline arena_t * +arena_get_from_edata(edata_t *edata) { + return (arena_t *)atomic_load_p(&arenas[edata_arena_ind_get(edata)], + ATOMIC_RELAXED); +} + +JEMALLOC_ALWAYS_INLINE arena_t * +arena_choose_maybe_huge(tsd_t *tsd, arena_t *arena, size_t size) { + if (arena != NULL) { + return arena; + } + + /* + * For huge allocations, use the dedicated huge arena if both are true: + * 1) is using auto arena selection (i.e. arena == NULL), and 2) the + * thread is not assigned to a manual arena. + */ + arena_t *tsd_arena = tsd_arena_get(tsd); + if (tsd_arena == NULL) { + tsd_arena = arena_choose(tsd, NULL); + } + + size_t threshold = atomic_load_zu( + &tsd_arena->pa_shard.pac.oversize_threshold, ATOMIC_RELAXED); + if (unlikely(size >= threshold) && arena_is_auto(tsd_arena)) { + return arena_choose_huge(tsd); + } + + return tsd_arena; +} + +JEMALLOC_ALWAYS_INLINE bool +large_dalloc_safety_checks(edata_t *edata, const void *ptr, szind_t szind) { + if (!config_opt_safety_checks) { + return false; + } + + /* + * Eagerly detect double free and sized dealloc bugs for large sizes. + * The cost is low enough (as edata will be accessed anyway) to be + * enabled all the time. + */ + if (unlikely(edata == NULL || + edata_state_get(edata) != extent_state_active)) { + safety_check_fail("Invalid deallocation detected: " + "pages being freed (%p) not currently active, " + "possibly caused by double free bugs.", ptr); + return true; + } + size_t input_size = sz_index2size(szind); + if (unlikely(input_size != edata_usize_get(edata))) { + safety_check_fail_sized_dealloc(/* current_dealloc */ true, ptr, + /* true_size */ edata_usize_get(edata), input_size); + return true; + } + + return false; +} + +JEMALLOC_ALWAYS_INLINE void +arena_prof_info_get(tsd_t *tsd, const void *ptr, emap_alloc_ctx_t *alloc_ctx, + prof_info_t *prof_info, bool reset_recent) { + cassert(config_prof); + assert(ptr != NULL); + assert(prof_info != NULL); + + edata_t *edata = NULL; + bool is_slab; + + /* Static check. */ + if (alloc_ctx == NULL) { + edata = emap_edata_lookup(tsd_tsdn(tsd), &arena_emap_global, + ptr); + is_slab = edata_slab_get(edata); + } else if (unlikely(!(is_slab = alloc_ctx->slab))) { + edata = emap_edata_lookup(tsd_tsdn(tsd), &arena_emap_global, + ptr); + } + + if (unlikely(!is_slab)) { + /* edata must have been initialized at this point. */ + assert(edata != NULL); + if (reset_recent && + large_dalloc_safety_checks(edata, ptr, + edata_szind_get(edata))) { + prof_info->alloc_tctx = PROF_TCTX_SENTINEL; + return; + } + large_prof_info_get(tsd, edata, prof_info, reset_recent); + } else { + prof_info->alloc_tctx = PROF_TCTX_SENTINEL; + /* + * No need to set other fields in prof_info; they will never be + * accessed if alloc_tctx == PROF_TCTX_SENTINEL. + */ + } +} + +JEMALLOC_ALWAYS_INLINE void +arena_prof_tctx_reset(tsd_t *tsd, const void *ptr, + emap_alloc_ctx_t *alloc_ctx) { + cassert(config_prof); + assert(ptr != NULL); + + /* Static check. 
+    if (alloc_ctx == NULL) {
+        edata_t *edata = emap_edata_lookup(tsd_tsdn(tsd),
+            &arena_emap_global, ptr);
+        if (unlikely(!edata_slab_get(edata))) {
+            large_prof_tctx_reset(edata);
+        }
+    } else {
+        if (unlikely(!alloc_ctx->slab)) {
+            edata_t *edata = emap_edata_lookup(tsd_tsdn(tsd),
+                &arena_emap_global, ptr);
+            large_prof_tctx_reset(edata);
+        }
+    }
+}
+
+JEMALLOC_ALWAYS_INLINE void
+arena_prof_tctx_reset_sampled(tsd_t *tsd, const void *ptr) {
+    cassert(config_prof);
+    assert(ptr != NULL);
+
+    edata_t *edata = emap_edata_lookup(tsd_tsdn(tsd), &arena_emap_global,
+        ptr);
+    assert(!edata_slab_get(edata));
+
+    large_prof_tctx_reset(edata);
+}
+
+JEMALLOC_ALWAYS_INLINE void
+arena_prof_info_set(tsd_t *tsd, edata_t *edata, prof_tctx_t *tctx,
+    size_t size) {
+    cassert(config_prof);
+
+    assert(!edata_slab_get(edata));
+    large_prof_info_set(edata, tctx, size);
+}
+
+JEMALLOC_ALWAYS_INLINE void
+arena_decay_ticks(tsdn_t *tsdn, arena_t *arena, unsigned nticks) {
+    if (unlikely(tsdn_null(tsdn))) {
+        return;
+    }
+    tsd_t *tsd = tsdn_tsd(tsdn);
+    /*
+     * We use the ticker_geom_t to avoid having per-arena state in the tsd.
+     * Instead of having a countdown-until-decay timer running for every
+     * arena in every thread, we flip a coin once per tick, whose
+     * probability of coming up heads is 1/nticks; this is effectively the
+     * operation of the ticker_geom_t. Each arena has the same chance of a
+     * coinflip coming up heads (1/ARENA_DECAY_NTICKS_PER_UPDATE), so we can
+     * use a single ticker for all of them.
+     */
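+    /*
+     * Concretely: if nticks-per-update is 1000, each tick fires the decay
+     * path with probability 1/1000, i.e. once per ~1000 ticks in
+     * expectation -- the same rate as a countdown ticker, but with no
+     * per-arena countdown stored in the tsd.
+     */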
+    ticker_geom_t *decay_ticker = tsd_arena_decay_tickerp_get(tsd);
+    uint64_t *prng_state = tsd_prng_statep_get(tsd);
+    if (unlikely(ticker_geom_ticks(decay_ticker, prng_state, nticks,
+        tsd_reentrancy_level_get(tsd) > 0))) {
+        arena_decay(tsdn, arena, false, false);
+    }
+}
+
+JEMALLOC_ALWAYS_INLINE void
+arena_decay_tick(tsdn_t *tsdn, arena_t *arena) {
+    arena_decay_ticks(tsdn, arena, 1);
+}
+
+JEMALLOC_ALWAYS_INLINE void *
+arena_malloc(tsdn_t *tsdn, arena_t *arena, size_t size, szind_t ind, bool zero,
+    bool slab, tcache_t *tcache, bool slow_path) {
+    assert(!tsdn_null(tsdn) || tcache == NULL);
+
+    if (likely(tcache != NULL)) {
+        if (likely(slab)) {
+            assert(sz_can_use_slab(size));
+            return tcache_alloc_small(tsdn_tsd(tsdn), arena,
+                tcache, size, ind, zero, slow_path);
+        } else if (likely(
+            ind < tcache_nbins_get(tcache->tcache_slow) &&
+            !tcache_bin_disabled(ind, &tcache->bins[ind],
+            tcache->tcache_slow))) {
+            return tcache_alloc_large(tsdn_tsd(tsdn), arena,
+                tcache, size, ind, zero, slow_path);
+        }
+        /* (size > tcache_max) case falls through. */
+    }
+
+    return arena_malloc_hard(tsdn, arena, size, ind, zero, slab);
+}
+
+JEMALLOC_ALWAYS_INLINE arena_t *
+arena_aalloc(tsdn_t *tsdn, const void *ptr) {
+    edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global, ptr);
+    unsigned arena_ind = edata_arena_ind_get(edata);
+    return (arena_t *)atomic_load_p(&arenas[arena_ind], ATOMIC_RELAXED);
+}
+
+JEMALLOC_ALWAYS_INLINE size_t
+arena_salloc(tsdn_t *tsdn, const void *ptr) {
+    assert(ptr != NULL);
+    emap_alloc_ctx_t alloc_ctx;
+    emap_alloc_ctx_lookup(tsdn, &arena_emap_global, ptr, &alloc_ctx);
+    assert(alloc_ctx.szind != SC_NSIZES);
+
+    return sz_index2size(alloc_ctx.szind);
+}
+
+JEMALLOC_ALWAYS_INLINE size_t
+arena_vsalloc(tsdn_t *tsdn, const void *ptr) {
+    /*
+     * Return 0 if ptr is not within an extent managed by jemalloc. This
+     * function has two extra costs relative to isalloc():
+     * - The rtree calls cannot claim to be dependent lookups, which induces
+     *   rtree lookup load dependencies.
+     * - The lookup may fail, so there is an extra branch to check for
+     *   failure.
+     */
+
+    emap_full_alloc_ctx_t full_alloc_ctx;
+    bool missing = emap_full_alloc_ctx_try_lookup(tsdn, &arena_emap_global,
+        ptr, &full_alloc_ctx);
+    if (missing) {
+        return 0;
+    }
+
+    if (full_alloc_ctx.edata == NULL) {
+        return 0;
+    }
+    assert(edata_state_get(full_alloc_ctx.edata) == extent_state_active);
+    /* Only slab members should be looked up via interior pointers. */
+    assert(edata_addr_get(full_alloc_ctx.edata) == ptr
+        || edata_slab_get(full_alloc_ctx.edata));
+
+    assert(full_alloc_ctx.szind != SC_NSIZES);
+
+    return sz_index2size(full_alloc_ctx.szind);
+}
+
+static inline void
+arena_dalloc_large_no_tcache(tsdn_t *tsdn, void *ptr, szind_t szind) {
+    if (config_prof && unlikely(szind < SC_NBINS)) {
+        arena_dalloc_promoted(tsdn, ptr, NULL, true);
+    } else {
+        edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global,
+            ptr);
+        if (large_dalloc_safety_checks(edata, ptr, szind)) {
+            /* See the comment in isfree. */
+            return;
+        }
+        large_dalloc(tsdn, edata);
+    }
+}
+
+static inline void
+arena_dalloc_no_tcache(tsdn_t *tsdn, void *ptr) {
+    assert(ptr != NULL);
+
+    emap_alloc_ctx_t alloc_ctx;
+    emap_alloc_ctx_lookup(tsdn, &arena_emap_global, ptr, &alloc_ctx);
+
+    if (config_debug) {
+        edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global,
+            ptr);
+        assert(alloc_ctx.szind == edata_szind_get(edata));
+        assert(alloc_ctx.szind < SC_NSIZES);
+        assert(alloc_ctx.slab == edata_slab_get(edata));
+    }
+
+    if (likely(alloc_ctx.slab)) {
+        /* Small allocation. */
+        arena_dalloc_small(tsdn, ptr);
+    } else {
+        arena_dalloc_large_no_tcache(tsdn, ptr, alloc_ctx.szind);
+    }
+}
+
+JEMALLOC_ALWAYS_INLINE void
+arena_dalloc_large(tsdn_t *tsdn, void *ptr, tcache_t *tcache, szind_t szind,
+    bool slow_path) {
+    assert(!tsdn_null(tsdn) && tcache != NULL);
+    bool is_sample_promoted = config_prof && szind < SC_NBINS;
+    if (unlikely(is_sample_promoted)) {
+        arena_dalloc_promoted(tsdn, ptr, tcache, slow_path);
+    } else {
+        if (szind < tcache_nbins_get(tcache->tcache_slow) &&
+            !tcache_bin_disabled(szind, &tcache->bins[szind],
+            tcache->tcache_slow)) {
+            tcache_dalloc_large(tsdn_tsd(tsdn), tcache, ptr, szind,
+                slow_path);
+        } else {
+            edata_t *edata = emap_edata_lookup(tsdn,
+                &arena_emap_global, ptr);
+            if (large_dalloc_safety_checks(edata, ptr, szind)) {
+                /* See the comment in isfree. */
+                return;
+            }
+            large_dalloc(tsdn, edata);
+        }
+    }
+}
+
+/* Find the region index of a pointer. */
+JEMALLOC_ALWAYS_INLINE size_t
+arena_slab_regind_impl(div_info_t* div_info, szind_t binind,
+    edata_t *slab, const void *ptr) {
+    size_t diff, regind;
+
+    /* Freeing a pointer outside the slab can cause assertion failure. */
+    assert((uintptr_t)ptr >= (uintptr_t)edata_addr_get(slab));
+    assert((uintptr_t)ptr < (uintptr_t)edata_past_get(slab));
+    /* Freeing an interior pointer can cause assertion failure. */
+    assert(((uintptr_t)ptr - (uintptr_t)edata_addr_get(slab)) %
+        (uintptr_t)bin_infos[binind].reg_size == 0);
+
+    diff = (size_t)((uintptr_t)ptr - (uintptr_t)edata_addr_get(slab));
+
+    /* Avoid doing division with a variable divisor. */
+    regind = div_compute(div_info, diff);
+    assert(regind < bin_infos[binind].nregs);
+    return regind;
+}
+
+/* Checks whether ptr is currently active in the arena. */
+JEMALLOC_ALWAYS_INLINE bool
+arena_tcache_dalloc_small_safety_check(tsdn_t *tsdn, void *ptr) {
+    if (!config_debug) {
+        return false;
+    }
+    edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global, ptr);
+    szind_t binind = edata_szind_get(edata);
+    div_info_t div_info = arena_binind_div_info[binind];
+    /*
+     * Calls the internal function arena_slab_regind_impl because the
+     * safety check does not require a lock.
+     */
+    size_t regind = arena_slab_regind_impl(&div_info, binind, edata, ptr);
+    slab_data_t *slab_data = edata_slab_data_get(edata);
+    const bin_info_t *bin_info = &bin_infos[binind];
+    assert(edata_nfree_get(edata) < bin_info->nregs);
+    if (unlikely(!bitmap_get(slab_data->bitmap, &bin_info->bitmap_info,
+        regind))) {
+        safety_check_fail(
+            "Invalid deallocation detected: the pointer being freed (%p) not "
+            "currently active, possibly caused by double free bugs.\n", ptr);
+        return true;
+    }
+    return false;
+}
+
+JEMALLOC_ALWAYS_INLINE void
+arena_dalloc(tsdn_t *tsdn, void *ptr, tcache_t *tcache,
+    emap_alloc_ctx_t *caller_alloc_ctx, bool slow_path) {
+    assert(!tsdn_null(tsdn) || tcache == NULL);
+    assert(ptr != NULL);
+
+    if (unlikely(tcache == NULL)) {
+        arena_dalloc_no_tcache(tsdn, ptr);
+        return;
+    }
+
+    emap_alloc_ctx_t alloc_ctx;
+    if (caller_alloc_ctx != NULL) {
+        alloc_ctx = *caller_alloc_ctx;
+    } else {
+        util_assume(tsdn != NULL);
+        emap_alloc_ctx_lookup(tsdn, &arena_emap_global, ptr,
+            &alloc_ctx);
+    }
+
+    if (config_debug) {
+        edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global,
+            ptr);
+        assert(alloc_ctx.szind == edata_szind_get(edata));
+        assert(alloc_ctx.szind < SC_NSIZES);
+        assert(alloc_ctx.slab == edata_slab_get(edata));
+    }
+
+    if (likely(alloc_ctx.slab)) {
+        /* Small allocation. */
+        if (arena_tcache_dalloc_small_safety_check(tsdn, ptr)) {
+            return;
+        }
+        tcache_dalloc_small(tsdn_tsd(tsdn), tcache, ptr,
+            alloc_ctx.szind, slow_path);
+    } else {
+        arena_dalloc_large(tsdn, ptr, tcache, alloc_ctx.szind,
+            slow_path);
+    }
+}
+
+static inline void
+arena_sdalloc_no_tcache(tsdn_t *tsdn, void *ptr, size_t size) {
+    assert(ptr != NULL);
+    assert(size <= SC_LARGE_MAXCLASS);
+
+    emap_alloc_ctx_t alloc_ctx;
+    if (!config_prof || !opt_prof) {
+        /*
+         * There is no risk of being confused by a promoted sampled
+         * object, so base szind and slab on the given size.
+         */
+        alloc_ctx.szind = sz_size2index(size);
+        alloc_ctx.slab = (alloc_ctx.szind < SC_NBINS);
+    }
+
+    if ((config_prof && opt_prof) || config_debug) {
+        emap_alloc_ctx_lookup(tsdn, &arena_emap_global, ptr,
+            &alloc_ctx);
+
+        assert(alloc_ctx.szind == sz_size2index(size));
+        assert((config_prof && opt_prof)
+            || alloc_ctx.slab == (alloc_ctx.szind < SC_NBINS));
+
+        if (config_debug) {
+            edata_t *edata = emap_edata_lookup(tsdn,
+                &arena_emap_global, ptr);
+            assert(alloc_ctx.szind == edata_szind_get(edata));
+            assert(alloc_ctx.slab == edata_slab_get(edata));
+        }
+    }
+
+    if (likely(alloc_ctx.slab)) {
+        /* Small allocation. */
*/ + arena_dalloc_small(tsdn, ptr); + } else { + arena_dalloc_large_no_tcache(tsdn, ptr, alloc_ctx.szind); + } +} + +JEMALLOC_ALWAYS_INLINE void +arena_sdalloc(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache, + emap_alloc_ctx_t *caller_alloc_ctx, bool slow_path) { + assert(!tsdn_null(tsdn) || tcache == NULL); + assert(ptr != NULL); + assert(size <= SC_LARGE_MAXCLASS); + + if (unlikely(tcache == NULL)) { + arena_sdalloc_no_tcache(tsdn, ptr, size); + return; + } + + emap_alloc_ctx_t alloc_ctx; + if (config_prof && opt_prof) { + if (caller_alloc_ctx == NULL) { + /* Uncommon case and should be a static check. */ + emap_alloc_ctx_lookup(tsdn, &arena_emap_global, ptr, + &alloc_ctx); + assert(alloc_ctx.szind == sz_size2index(size)); + } else { + alloc_ctx = *caller_alloc_ctx; + } + } else { + /* + * There is no risk of being confused by a promoted sampled + * object, so base szind and slab on the given size. + */ + alloc_ctx.szind = sz_size2index(size); + alloc_ctx.slab = (alloc_ctx.szind < SC_NBINS); + } + + if (config_debug) { + edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global, + ptr); + assert(alloc_ctx.szind == edata_szind_get(edata)); + assert(alloc_ctx.slab == edata_slab_get(edata)); + } + + if (likely(alloc_ctx.slab)) { + /* Small allocation. */ + if (arena_tcache_dalloc_small_safety_check(tsdn, ptr)) { + return; + } + tcache_dalloc_small(tsdn_tsd(tsdn), tcache, ptr, + alloc_ctx.szind, slow_path); + } else { + arena_dalloc_large(tsdn, ptr, tcache, alloc_ctx.szind, + slow_path); + } +} + +static inline void +arena_cache_oblivious_randomize(tsdn_t *tsdn, arena_t *arena, edata_t *edata, + size_t alignment) { + assert(edata_base_get(edata) == edata_addr_get(edata)); + + if (alignment < PAGE) { + unsigned lg_range = LG_PAGE - + lg_floor(CACHELINE_CEILING(alignment)); + size_t r; + if (!tsdn_null(tsdn)) { + tsd_t *tsd = tsdn_tsd(tsdn); + r = (size_t)prng_lg_range_u64( + tsd_prng_statep_get(tsd), lg_range); + } else { + uint64_t stack_value = (uint64_t)(uintptr_t)&r; + r = (size_t)prng_lg_range_u64(&stack_value, lg_range); + } + uintptr_t random_offset = ((uintptr_t)r) << (LG_PAGE - + lg_range); + edata->e_addr = (void *)((byte_t *)edata->e_addr + + random_offset); + assert(ALIGNMENT_ADDR2BASE(edata->e_addr, alignment) == + edata->e_addr); + } +} + +/* + * The dalloc bin info contains just the information that the common paths need + * during tcache flushes. By force-inlining these paths, and using local copies + * of data (so that the compiler knows it's constant), we avoid a whole bunch of + * redundant loads and stores by leaving this information in registers. + */ +typedef struct arena_dalloc_bin_locked_info_s arena_dalloc_bin_locked_info_t; +struct arena_dalloc_bin_locked_info_s { + div_info_t div_info; + uint32_t nregs; + uint64_t ndalloc; +}; + +JEMALLOC_ALWAYS_INLINE size_t +arena_slab_regind(arena_dalloc_bin_locked_info_t *info, szind_t binind, + edata_t *slab, const void *ptr) { + size_t regind = arena_slab_regind_impl(&info->div_info, binind, slab, ptr); + return regind; +} + +JEMALLOC_ALWAYS_INLINE void +arena_dalloc_bin_locked_begin(arena_dalloc_bin_locked_info_t *info, + szind_t binind) { + info->div_info = arena_binind_div_info[binind]; + info->nregs = bin_infos[binind].nregs; + info->ndalloc = 0; +} + +/* + * Does the deallocation work associated with freeing a single pointer (a + * "step") between an arena_dalloc_bin_locked begin and end call. + * + * Slabs that become empty are recorded in dalloc_slabs (or appended to + * dalloc_slabs_extra once that array fills) so the caller can free them after + * the bin lock is dropped.
Doesn't do + * stats updates, which happen during finish (this lets running counts get left + * in a register). + */ +JEMALLOC_ALWAYS_INLINE void +arena_dalloc_bin_locked_step(tsdn_t *tsdn, arena_t *arena, bin_t *bin, + arena_dalloc_bin_locked_info_t *info, szind_t binind, edata_t *slab, + void *ptr, edata_t **dalloc_slabs, unsigned ndalloc_slabs, + unsigned *dalloc_slabs_count, edata_list_active_t *dalloc_slabs_extra) { + const bin_info_t *bin_info = &bin_infos[binind]; + size_t regind = arena_slab_regind(info, binind, slab, ptr); + slab_data_t *slab_data = edata_slab_data_get(slab); + + assert(edata_nfree_get(slab) < bin_info->nregs); + /* Freeing an unallocated pointer can cause assertion failure. */ + assert(bitmap_get(slab_data->bitmap, &bin_info->bitmap_info, regind)); + + bitmap_unset(slab_data->bitmap, &bin_info->bitmap_info, regind); + edata_nfree_inc(slab); + + if (config_stats) { + info->ndalloc++; + } + + unsigned nfree = edata_nfree_get(slab); + if (nfree == bin_info->nregs) { + arena_dalloc_bin_locked_handle_newly_empty(tsdn, arena, slab, + bin); + + if (*dalloc_slabs_count < ndalloc_slabs) { + dalloc_slabs[*dalloc_slabs_count] = slab; + (*dalloc_slabs_count)++; + } else { + edata_list_active_append(dalloc_slabs_extra, slab); + } + } else if (nfree == 1 && slab != bin->slabcur) { + arena_dalloc_bin_locked_handle_newly_nonempty(tsdn, arena, slab, + bin); + } +} + +JEMALLOC_ALWAYS_INLINE void +arena_dalloc_bin_locked_finish(tsdn_t *tsdn, arena_t *arena, bin_t *bin, + arena_dalloc_bin_locked_info_t *info) { + if (config_stats) { + bin->stats.ndalloc += info->ndalloc; + assert(bin->stats.curregs >= (size_t)info->ndalloc); + bin->stats.curregs -= (size_t)info->ndalloc; + } +} + +JEMALLOC_ALWAYS_INLINE void +arena_bin_flush_batch_impl(tsdn_t *tsdn, arena_t *arena, bin_t *bin, + arena_dalloc_bin_locked_info_t *dalloc_bin_info, unsigned binind, + edata_t **dalloc_slabs, unsigned ndalloc_slabs, unsigned *dalloc_count, + edata_list_active_t *dalloc_slabs_extra) { + assert(binind < bin_info_nbatched_sizes); + bin_with_batch_t *batched_bin = (bin_with_batch_t *)bin; + size_t nelems_to_pop = batcher_pop_begin(tsdn, + &batched_bin->remote_frees); + + bin_batching_test_mid_pop(nelems_to_pop); + if (nelems_to_pop == BATCHER_NO_IDX) { + malloc_mutex_assert_not_owner(tsdn, + &batched_bin->remote_frees.mtx); + return; + } else { + malloc_mutex_assert_owner(tsdn, + &batched_bin->remote_frees.mtx); + } + + size_t npushes = batcher_pop_get_pushes(tsdn, + &batched_bin->remote_frees); + bin_remote_free_data_t remote_free_data[BIN_REMOTE_FREE_ELEMS_MAX]; + for (size_t i = 0; i < nelems_to_pop; i++) { + remote_free_data[i] = batched_bin->remote_free_data[i]; + } + batcher_pop_end(tsdn, &batched_bin->remote_frees); + + for (size_t i = 0; i < nelems_to_pop; i++) { + arena_dalloc_bin_locked_step(tsdn, arena, bin, dalloc_bin_info, + binind, remote_free_data[i].slab, remote_free_data[i].ptr, + dalloc_slabs, ndalloc_slabs, dalloc_count, + dalloc_slabs_extra); + } + + bin->stats.batch_pops++; + bin->stats.batch_pushes += npushes; + bin->stats.batch_pushed_elems += nelems_to_pop; +} + +typedef struct arena_bin_flush_batch_state_s arena_bin_flush_batch_state_t; +struct arena_bin_flush_batch_state_s { + arena_dalloc_bin_locked_info_t info; + + /* + * Bin batching is subtle in that there are unusual edge cases in which + * it can trigger the deallocation of more slabs than there were items + * flushed (say, if every original deallocation triggered a slab + * deallocation, and so did every batched one). 
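+ * In the worst case the number of emptied slabs is thus bounded by the flush + * size plus BIN_REMOTE_FREE_ELEMS_MAX, not by the flush size alone.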
So we keep a small + * backup array for any "extra" slabs, as well as a list to allow a + * dynamic number of ones exceeding that array. + */ + edata_t *dalloc_slabs[8]; + unsigned dalloc_slab_count; + edata_list_active_t dalloc_slabs_extra; +}; + +JEMALLOC_ALWAYS_INLINE unsigned +arena_bin_batch_get_ndalloc_slabs(unsigned preallocated_slabs) { + if (preallocated_slabs > bin_batching_test_ndalloc_slabs_max) { + return bin_batching_test_ndalloc_slabs_max; + } + return preallocated_slabs; +} + +JEMALLOC_ALWAYS_INLINE void +arena_bin_flush_batch_after_lock(tsdn_t *tsdn, arena_t *arena, bin_t *bin, + unsigned binind, arena_bin_flush_batch_state_t *state) { + if (binind >= bin_info_nbatched_sizes) { + return; + } + + arena_dalloc_bin_locked_begin(&state->info, binind); + state->dalloc_slab_count = 0; + edata_list_active_init(&state->dalloc_slabs_extra); + + unsigned preallocated_slabs = (unsigned)(sizeof(state->dalloc_slabs) + / sizeof(state->dalloc_slabs[0])); + unsigned ndalloc_slabs = arena_bin_batch_get_ndalloc_slabs( + preallocated_slabs); + + arena_bin_flush_batch_impl(tsdn, arena, bin, &state->info, binind, + state->dalloc_slabs, ndalloc_slabs, + &state->dalloc_slab_count, &state->dalloc_slabs_extra); +} + +JEMALLOC_ALWAYS_INLINE void +arena_bin_flush_batch_before_unlock(tsdn_t *tsdn, arena_t *arena, bin_t *bin, + unsigned binind, arena_bin_flush_batch_state_t *state) { + if (binind >= bin_info_nbatched_sizes) { + return; + } + + arena_dalloc_bin_locked_finish(tsdn, arena, bin, &state->info); +} + +static inline bool +arena_bin_has_batch(szind_t binind) { + return binind < bin_info_nbatched_sizes; +} + +JEMALLOC_ALWAYS_INLINE void +arena_bin_flush_batch_after_unlock(tsdn_t *tsdn, arena_t *arena, bin_t *bin, + unsigned binind, arena_bin_flush_batch_state_t *state) { + if (!arena_bin_has_batch(binind)) { + return; + } + /* + * The initialization of dalloc_slabs_extra is guarded by an + * arena_bin_has_batch check higher up the stack. But the clang + * analyzer forgets this down the stack, triggering a spurious error + * reported here.
+ */ + JEMALLOC_CLANG_ANALYZER_SUPPRESS { + bin_batching_test_after_unlock(state->dalloc_slab_count, + edata_list_active_empty(&state->dalloc_slabs_extra)); + } + for (unsigned i = 0; i < state->dalloc_slab_count; i++) { + edata_t *slab = state->dalloc_slabs[i]; + arena_slab_dalloc(tsdn, arena_get_from_edata(slab), slab); + } + while (!edata_list_active_empty(&state->dalloc_slabs_extra)) { + edata_t *slab = edata_list_active_first( + &state->dalloc_slabs_extra); + edata_list_active_remove(&state->dalloc_slabs_extra, slab); + arena_slab_dalloc(tsdn, arena_get_from_edata(slab), slab); + } +} + +static inline bin_t * +arena_get_bin(arena_t *arena, szind_t binind, unsigned binshard) { + bin_t *shard0 = (bin_t *)((byte_t *)arena + arena_bin_offsets[binind]); + bin_t *ret; + if (arena_bin_has_batch(binind)) { + ret = (bin_t *)((bin_with_batch_t *)shard0 + binshard); + } else { + ret = shard0 + binshard; + } + assert(binind >= SC_NBINS - 1 + || (uintptr_t)ret < (uintptr_t)arena + + arena_bin_offsets[binind + 1]); + + return ret; +} + +#endif /* JEMALLOC_INTERNAL_ARENA_INLINES_B_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/arena_stats.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/arena_stats.h new file mode 100644 index 000000000..3d512630c --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/arena_stats.h @@ -0,0 +1,117 @@ +#ifndef JEMALLOC_INTERNAL_ARENA_STATS_H +#define JEMALLOC_INTERNAL_ARENA_STATS_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/lockedint.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/mutex_prof.h" +#include "jemalloc/internal/pa.h" +#include "jemalloc/internal/sc.h" + +JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS + +typedef struct arena_stats_large_s arena_stats_large_t; +struct arena_stats_large_s { + /* + * Total number of allocation/deallocation requests served directly by + * the arena. + */ + locked_u64_t nmalloc; + locked_u64_t ndalloc; + + /* + * Number of allocation requests that correspond to this size class. + * This includes requests served by tcache, though tcache only + * periodically merges into this counter. + */ + locked_u64_t nrequests; /* Partially derived. */ + /* + * Number of tcache fills / flushes for large (similarly, periodically + * merged). Note that there is no large tcache batch-fill currently + * (i.e. only fill 1 at a time); however flush may be batched. + */ + locked_u64_t nfills; /* Partially derived. */ + locked_u64_t nflushes; /* Partially derived. */ + + /* Current number of allocations of this size class. */ + size_t curlextents; /* Derived. */ +}; + +/* + * Arena stats. Note that fields marked "derived" are not directly maintained + * within the arena code; rather their values are derived during stats merge + * requests. + */ +typedef struct arena_stats_s arena_stats_t; +struct arena_stats_s { + LOCKEDINT_MTX_DECLARE(mtx) + + /* + * resident includes the base stats -- that's why it lives here and not + * in pa_shard_stats_t. + */ + size_t base; /* Derived. */ + size_t metadata_edata; /* Derived. */ + size_t metadata_rtree; /* Derived. */ + size_t resident; /* Derived. */ + size_t metadata_thp; /* Derived. */ + size_t mapped; /* Derived. */ + + atomic_zu_t internal; + + size_t allocated_large; /* Derived. */ + uint64_t nmalloc_large; /* Derived. */ + uint64_t ndalloc_large; /* Derived. */ + uint64_t nfills_large; /* Derived. 
*/ + uint64_t nflushes_large; /* Derived. */ + uint64_t nrequests_large; /* Derived. */ + + /* + * The stats logically owned by the pa_shard in the same arena. This + * lives here only because it's convenient for the purposes of the ctl + * module -- it only knows about the single arena_stats. + */ + pa_shard_stats_t pa_shard_stats; + + /* Number of bytes cached in tcache associated with this arena. */ + size_t tcache_bytes; /* Derived. */ + size_t tcache_stashed_bytes; /* Derived. */ + + mutex_prof_data_t mutex_prof_data[mutex_prof_num_arena_mutexes]; + + /* One element for each large size class. */ + arena_stats_large_t lstats[SC_NSIZES - SC_NBINS]; + + /* Arena uptime. */ + nstime_t uptime; +}; + +static inline bool +arena_stats_init(tsdn_t *tsdn, arena_stats_t *arena_stats) { + if (config_debug) { + for (size_t i = 0; i < sizeof(arena_stats_t); i++) { + assert(((char *)arena_stats)[i] == 0); + } + } + if (LOCKEDINT_MTX_INIT(arena_stats->mtx, "arena_stats", + WITNESS_RANK_ARENA_STATS, malloc_mutex_rank_exclusive)) { + return true; + } + /* Memory is zeroed, so there is no need to clear stats. */ + return false; +} + +static inline void +arena_stats_large_flush_nrequests_add(tsdn_t *tsdn, arena_stats_t *arena_stats, + szind_t szind, uint64_t nrequests) { + LOCKEDINT_MTX_LOCK(tsdn, arena_stats->mtx); + arena_stats_large_t *lstats = &arena_stats->lstats[szind - SC_NBINS]; + locked_inc_u64(tsdn, LOCKEDINT_MTX(arena_stats->mtx), + &lstats->nrequests, nrequests); + locked_inc_u64(tsdn, LOCKEDINT_MTX(arena_stats->mtx), + &lstats->nflushes, 1); + LOCKEDINT_MTX_UNLOCK(tsdn, arena_stats->mtx); +} + +#endif /* JEMALLOC_INTERNAL_ARENA_STATS_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/arena_structs.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/arena_structs.h new file mode 100644 index 000000000..56e12f958 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/arena_structs.h @@ -0,0 +1,110 @@ +#ifndef JEMALLOC_INTERNAL_ARENA_STRUCTS_H +#define JEMALLOC_INTERNAL_ARENA_STRUCTS_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/arena_stats.h" +#include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/bin.h" +#include "jemalloc/internal/bitmap.h" +#include "jemalloc/internal/counter.h" +#include "jemalloc/internal/ecache.h" +#include "jemalloc/internal/edata_cache.h" +#include "jemalloc/internal/extent_dss.h" +#include "jemalloc/internal/jemalloc_internal_types.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/nstime.h" +#include "jemalloc/internal/pa.h" +#include "jemalloc/internal/ql.h" +#include "jemalloc/internal/sc.h" +#include "jemalloc/internal/ticker.h" + +struct arena_s { + /* + * Number of threads currently assigned to this arena. Each thread has + * two distinct assignments, one for application-serving allocation, and + * the other for internal metadata allocation. Internal metadata must + * not be allocated from arenas explicitly created via the arenas.create + * mallctl, because the arena.<i>.reset mallctl indiscriminately + * discards all allocations for the affected arena. + * + * 0: Application allocation. + * 1: Internal metadata allocation. + * + * Synchronization: atomic. + */ + atomic_u_t nthreads[2]; + + /* Next bin shard for binding new threads. Synchronization: atomic.
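+ * Incremented as threads bind, spreading them round-robin across the + * shards of each bin.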
*/ + atomic_u_t binshard_next; + + /* + * When percpu_arena is enabled, to amortize the cost of reading / + * updating the current CPU id, track the most recent thread accessing + * this arena, and only read CPU if there is a mismatch. + */ + tsdn_t *last_thd; + + /* Synchronization: internal. */ + arena_stats_t stats; + + /* + * Lists of tcaches and cache_bin_array_descriptors for extant threads + * associated with this arena. Stats from these are merged + * incrementally, and at exit if opt_stats_print is enabled. + * + * Synchronization: tcache_ql_mtx. + */ + ql_head(tcache_slow_t) tcache_ql; + ql_head(cache_bin_array_descriptor_t) cache_bin_array_descriptor_ql; + malloc_mutex_t tcache_ql_mtx; + + /* + * Represents a dss_prec_t, but atomically. + * + * Synchronization: atomic. + */ + atomic_u_t dss_prec; + + /* + * Extant large allocations. + * + * Synchronization: large_mtx. + */ + edata_list_active_t large; + /* Synchronizes all large allocation/update/deallocation. */ + malloc_mutex_t large_mtx; + + /* The page-level allocator shard this arena uses. */ + pa_shard_t pa_shard; + + /* + * A cached copy of base->ind. This can get accessed on hot paths; + * looking it up in base requires an extra pointer hop / cache miss. + */ + unsigned ind; + + /* + * Base allocator, from which arena metadata are allocated. + * + * Synchronization: internal. + */ + base_t *base; + /* Used to determine uptime. Read-only after initialization. */ + nstime_t create_time; + + /* The name of the arena. */ + char name[ARENA_NAME_LEN]; + + /* + * The arena is allocated alongside its bins; really this is a + * dynamically sized array determined by the binshard settings. + * Enforcing cacheline-alignment to minimize the number of cachelines + * touched on the hot paths. + */ + JEMALLOC_WARN_ON_USAGE("Do not use this field directly. " + "Use `arena_get_bin` instead.") + JEMALLOC_ALIGNED(CACHELINE) + bin_with_batch_t all_bins[0]; +}; + +#endif /* JEMALLOC_INTERNAL_ARENA_STRUCTS_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/arena_types.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/arena_types.h new file mode 100644 index 000000000..a1fc8926e --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/arena_types.h @@ -0,0 +1,61 @@ +#ifndef JEMALLOC_INTERNAL_ARENA_TYPES_H +#define JEMALLOC_INTERNAL_ARENA_TYPES_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/sc.h" + +/* Default decay times in milliseconds. */ +#define DIRTY_DECAY_MS_DEFAULT ZD(10 * 1000) +#define MUZZY_DECAY_MS_DEFAULT (0) +/* Number of event ticks between time checks. */ +#define ARENA_DECAY_NTICKS_PER_UPDATE 1000 +/* Maximum length of the arena name. */ +#define ARENA_NAME_LEN 32 + +typedef struct arena_decay_s arena_decay_t; +typedef struct arena_s arena_t; + +typedef enum { + percpu_arena_mode_names_base = 0, /* Used for options processing. */ + + /* + * *_uninit are used only during bootstrapping, and must correspond + * to initialized variant plus percpu_arena_mode_enabled_base. + */ + percpu_arena_uninit = 0, + per_phycpu_arena_uninit = 1, + + /* All non-disabled modes must come after percpu_arena_disabled. */ + percpu_arena_disabled = 2, + + percpu_arena_mode_names_limit = 3, /* Used for options processing. */ + percpu_arena_mode_enabled_base = 3, + + percpu_arena = 3, + per_phycpu_arena = 4 /* Hyper threads share arena. 
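+ * (arenas are assigned per physical core, so sibling hyperthreads of one + * core land on the same arena).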
*/ +} percpu_arena_mode_t; + +#define PERCPU_ARENA_ENABLED(m) ((m) >= percpu_arena_mode_enabled_base) +#define PERCPU_ARENA_DEFAULT percpu_arena_disabled + +/* + * When allocation_size >= oversize_threshold, use the dedicated huge arena + * (unless an arena index has been explicitly specified). 0 disables the + * feature. + */ +#define OVERSIZE_THRESHOLD_DEFAULT (8 << 20) + +struct arena_config_s { + /* extent hooks to be used for the arena */ + extent_hooks_t *extent_hooks; + + /* + * Use extent hooks for metadata (base) allocations when true. + */ + bool metadata_use_hooks; +}; + +typedef struct arena_config_s arena_config_t; + +extern const arena_config_t arena_config_default; + +#endif /* JEMALLOC_INTERNAL_ARENA_TYPES_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/assert.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/assert.h new file mode 100644 index 000000000..38eb2a2c0 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/assert.h @@ -0,0 +1,57 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/malloc_io.h" +#include "jemalloc/internal/util.h" + +/* + * Define a custom assert() in order to reduce the chances of deadlock during + * assertion failure. + */ +#ifndef assert +#define assert(e) do { \ + if (unlikely(config_debug && !(e))) { \ + malloc_printf( \ + "<jemalloc>: %s:%d: Failed assertion: \"%s\"\n", \ + __FILE__, __LINE__, #e); \ + abort(); \ + } \ +} while (0) +#endif + +#ifndef not_reached +#define not_reached() do { \ + if (config_debug) { \ + malloc_printf( \ + "<jemalloc>: %s:%d: Unreachable code reached\n", \ + __FILE__, __LINE__); \ + abort(); \ + } \ + unreachable(); \ +} while (0) +#endif + +#ifndef not_implemented +#define not_implemented() do { \ + if (config_debug) { \ + malloc_printf("<jemalloc>: %s:%d: Not implemented\n", \ + __FILE__, __LINE__); \ + abort(); \ + } \ +} while (0) +#endif + +#ifndef assert_not_implemented +#define assert_not_implemented(e) do { \ + if (unlikely(config_debug && !(e))) { \ + not_implemented(); \ + } \ +} while (0) +#endif + +/* Use to assert a particular configuration, e.g., cassert(config_debug). */ +#ifndef cassert +#define cassert(c) do { \ + if (unlikely(!(c))) { \ + not_reached(); \ + } \ +} while (0) +#endif diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/atomic.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/atomic.h new file mode 100644 index 000000000..6dd2a7c60 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/atomic.h @@ -0,0 +1,109 @@ +#ifndef JEMALLOC_INTERNAL_ATOMIC_H +#define JEMALLOC_INTERNAL_ATOMIC_H + +#include "jemalloc/internal/jemalloc_preamble.h" + +#define JEMALLOC_U8_ATOMICS +#if defined(JEMALLOC_GCC_ATOMIC_ATOMICS) +# include "jemalloc/internal/atomic_gcc_atomic.h" +# if !defined(JEMALLOC_GCC_U8_ATOMIC_ATOMICS) +# undef JEMALLOC_U8_ATOMICS +# endif +#elif defined(JEMALLOC_GCC_SYNC_ATOMICS) +# include "jemalloc/internal/atomic_gcc_sync.h" +# if !defined(JEMALLOC_GCC_U8_SYNC_ATOMICS) +# undef JEMALLOC_U8_ATOMICS +# endif +#elif defined(_MSC_VER) +# include "jemalloc/internal/atomic_msvc.h" +#elif defined(JEMALLOC_C11_ATOMICS) +# include "jemalloc/internal/atomic_c11.h" +#else +# error "Don't have atomics implemented on this platform." +#endif + +#define ATOMIC_INLINE JEMALLOC_ALWAYS_INLINE + +/* + * This header gives more or less a backport of C11 atomics.
The user can write + * JEMALLOC_GENERATE_ATOMICS(type, short_type, lg_sizeof_type); to generate + * counterparts of the C11 atomic functions for type, as so: + * JEMALLOC_GENERATE_ATOMICS(int *, pi, 3); + * and then write things like: + * int *some_ptr; + * atomic_pi_t atomic_ptr_to_int; + * atomic_store_pi(&atomic_ptr_to_int, some_ptr, ATOMIC_RELAXED); + * int *prev_value = atomic_exchange_pi(&atomic_ptr_to_int, NULL, ATOMIC_ACQ_REL); + * assert(some_ptr == prev_value); + * and expect things to work in the obvious way. + * + * Also included (with naming differences to avoid conflicts with the standard + * library): + * atomic_fence(atomic_memory_order_t) (mimics C11's atomic_thread_fence). + * ATOMIC_INIT (mimics C11's ATOMIC_VAR_INIT). + */ + +/* + * Pure convenience, so that we don't have to type "atomic_memory_order_" + * quite so often. + */ +#define ATOMIC_RELAXED atomic_memory_order_relaxed +#define ATOMIC_ACQUIRE atomic_memory_order_acquire +#define ATOMIC_RELEASE atomic_memory_order_release +#define ATOMIC_ACQ_REL atomic_memory_order_acq_rel +#define ATOMIC_SEQ_CST atomic_memory_order_seq_cst + +/* + * Another convenience -- simple atomic helper functions. + */ +#define JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(type, short_type, \ + lg_size) \ + JEMALLOC_GENERATE_INT_ATOMICS(type, short_type, lg_size) \ + ATOMIC_INLINE void \ + atomic_load_add_store_##short_type(atomic_##short_type##_t *a, \ + type inc) { \ + type oldval = atomic_load_##short_type(a, ATOMIC_RELAXED); \ + type newval = oldval + inc; \ + atomic_store_##short_type(a, newval, ATOMIC_RELAXED); \ + } \ + ATOMIC_INLINE void \ + atomic_load_sub_store_##short_type(atomic_##short_type##_t *a, \ + type inc) { \ + type oldval = atomic_load_##short_type(a, ATOMIC_RELAXED); \ + type newval = oldval - inc; \ + atomic_store_##short_type(a, newval, ATOMIC_RELAXED); \ + } + +/* + * Not all platforms have 64-bit atomics. If we do, this #define exposes that + * fact. + */ +#if (LG_SIZEOF_PTR == 3 || LG_SIZEOF_INT == 3) +# define JEMALLOC_ATOMIC_U64 +#endif + +JEMALLOC_GENERATE_ATOMICS(void *, p, LG_SIZEOF_PTR) + +/* + * There's no actual guarantee that sizeof(bool) == 1, but it's true on the only + * platform that actually needs to know the size, MSVC. + */ +JEMALLOC_GENERATE_ATOMICS(bool, b, 0) + +JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(unsigned, u, LG_SIZEOF_INT) + +JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(size_t, zu, LG_SIZEOF_PTR) + +JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(ssize_t, zd, LG_SIZEOF_PTR) + +JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(uint8_t, u8, 0) + +JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(uint32_t, u32, 2) + +#ifdef JEMALLOC_ATOMIC_U64 +JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(uint64_t, u64, 3) +#endif + +#undef ATOMIC_INLINE + +#endif /* JEMALLOC_INTERNAL_ATOMIC_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/atomic_c11.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/atomic_c11.h new file mode 100644 index 000000000..74173b031 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/atomic_c11.h @@ -0,0 +1,98 @@ +#ifndef JEMALLOC_INTERNAL_ATOMIC_C11_H +#define JEMALLOC_INTERNAL_ATOMIC_C11_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include <stdatomic.h> + +#define ATOMIC_INIT(...)
ATOMIC_VAR_INIT(__VA_ARGS__) + +#define atomic_memory_order_t memory_order +#define atomic_memory_order_relaxed memory_order_relaxed +#define atomic_memory_order_acquire memory_order_acquire +#define atomic_memory_order_release memory_order_release +#define atomic_memory_order_acq_rel memory_order_acq_rel +#define atomic_memory_order_seq_cst memory_order_seq_cst + +#define atomic_fence atomic_thread_fence + +#define JEMALLOC_GENERATE_ATOMICS(type, short_type, \ + /* unused */ lg_size) \ +typedef _Atomic(type) atomic_##short_type##_t; \ + \ +ATOMIC_INLINE type \ +atomic_load_##short_type(const atomic_##short_type##_t *a, \ + atomic_memory_order_t mo) { \ + /* \ + * A strict interpretation of the C standard prevents \ + * atomic_load from taking a const argument, but it's \ + * convenient for our purposes. This cast is a workaround. \ + */ \ + atomic_##short_type##_t* a_nonconst = \ + (atomic_##short_type##_t*)a; \ + return atomic_load_explicit(a_nonconst, mo); \ +} \ + \ +ATOMIC_INLINE void \ +atomic_store_##short_type(atomic_##short_type##_t *a, \ + type val, atomic_memory_order_t mo) { \ + atomic_store_explicit(a, val, mo); \ +} \ + \ +ATOMIC_INLINE type \ +atomic_exchange_##short_type(atomic_##short_type##_t *a, type val, \ + atomic_memory_order_t mo) { \ + return atomic_exchange_explicit(a, val, mo); \ +} \ + \ +ATOMIC_INLINE bool \ +atomic_compare_exchange_weak_##short_type(atomic_##short_type##_t *a, \ + type *expected, type desired, atomic_memory_order_t success_mo, \ + atomic_memory_order_t failure_mo) { \ + return atomic_compare_exchange_weak_explicit(a, expected, \ + desired, success_mo, failure_mo); \ +} \ + \ +ATOMIC_INLINE bool \ +atomic_compare_exchange_strong_##short_type(atomic_##short_type##_t *a, \ + type *expected, type desired, atomic_memory_order_t success_mo, \ + atomic_memory_order_t failure_mo) { \ + return atomic_compare_exchange_strong_explicit(a, expected, \ + desired, success_mo, failure_mo); \ +} + +/* + * Integral types have some special operations available that non-integral ones + * lack. 
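+ * Namely the fetch-and-modify family (add, sub, and, or, xor); pointer and + * bool atomics only get load, store, exchange, and compare-exchange.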
+ */ +#define JEMALLOC_GENERATE_INT_ATOMICS(type, short_type, \ + /* unused */ lg_size) \ +JEMALLOC_GENERATE_ATOMICS(type, short_type, /* unused */ lg_size) \ + \ +ATOMIC_INLINE type \ +atomic_fetch_add_##short_type(atomic_##short_type##_t *a, \ + type val, atomic_memory_order_t mo) { \ + return atomic_fetch_add_explicit(a, val, mo); \ +} \ + \ +ATOMIC_INLINE type \ +atomic_fetch_sub_##short_type(atomic_##short_type##_t *a, \ + type val, atomic_memory_order_t mo) { \ + return atomic_fetch_sub_explicit(a, val, mo); \ +} \ +ATOMIC_INLINE type \ +atomic_fetch_and_##short_type(atomic_##short_type##_t *a, \ + type val, atomic_memory_order_t mo) { \ + return atomic_fetch_and_explicit(a, val, mo); \ +} \ +ATOMIC_INLINE type \ +atomic_fetch_or_##short_type(atomic_##short_type##_t *a, \ + type val, atomic_memory_order_t mo) { \ + return atomic_fetch_or_explicit(a, val, mo); \ +} \ +ATOMIC_INLINE type \ +atomic_fetch_xor_##short_type(atomic_##short_type##_t *a, \ + type val, atomic_memory_order_t mo) { \ + return atomic_fetch_xor_explicit(a, val, mo); \ +} + +#endif /* JEMALLOC_INTERNAL_ATOMIC_C11_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/atomic_gcc_atomic.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/atomic_gcc_atomic.h new file mode 100644 index 000000000..0819fde17 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/atomic_gcc_atomic.h @@ -0,0 +1,134 @@ +#ifndef JEMALLOC_INTERNAL_ATOMIC_GCC_ATOMIC_H +#define JEMALLOC_INTERNAL_ATOMIC_GCC_ATOMIC_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/assert.h" + +#define ATOMIC_INLINE JEMALLOC_ALWAYS_INLINE + +#define ATOMIC_INIT(...) {__VA_ARGS__} + +typedef enum { + atomic_memory_order_relaxed, + atomic_memory_order_acquire, + atomic_memory_order_release, + atomic_memory_order_acq_rel, + atomic_memory_order_seq_cst +} atomic_memory_order_t; + +ATOMIC_INLINE int +atomic_enum_to_builtin(atomic_memory_order_t mo) { + switch (mo) { + case atomic_memory_order_relaxed: + return __ATOMIC_RELAXED; + case atomic_memory_order_acquire: + return __ATOMIC_ACQUIRE; + case atomic_memory_order_release: + return __ATOMIC_RELEASE; + case atomic_memory_order_acq_rel: + return __ATOMIC_ACQ_REL; + case atomic_memory_order_seq_cst: + return __ATOMIC_SEQ_CST; + } + /* Can't happen; the switch is exhaustive. 
*/ + not_reached(); +} + +ATOMIC_INLINE void +atomic_fence(atomic_memory_order_t mo) { + __atomic_thread_fence(atomic_enum_to_builtin(mo)); +} + +#define JEMALLOC_GENERATE_ATOMICS(type, short_type, \ + /* unused */ lg_size) \ +typedef struct { \ + type repr; \ +} atomic_##short_type##_t; \ + \ +ATOMIC_INLINE type \ +atomic_load_##short_type(const atomic_##short_type##_t *a, \ + atomic_memory_order_t mo) { \ + type result; \ + __atomic_load(&a->repr, &result, atomic_enum_to_builtin(mo)); \ + return result; \ +} \ + \ +ATOMIC_INLINE void \ +atomic_store_##short_type(atomic_##short_type##_t *a, type val, \ + atomic_memory_order_t mo) { \ + __atomic_store(&a->repr, &val, atomic_enum_to_builtin(mo)); \ +} \ + \ +ATOMIC_INLINE type \ +atomic_exchange_##short_type(atomic_##short_type##_t *a, type val, \ + atomic_memory_order_t mo) { \ + type result; \ + __atomic_exchange(&a->repr, &val, &result, \ + atomic_enum_to_builtin(mo)); \ + return result; \ +} \ + \ +ATOMIC_INLINE bool \ +atomic_compare_exchange_weak_##short_type(atomic_##short_type##_t *a, \ + UNUSED type *expected, type desired, \ + atomic_memory_order_t success_mo, \ + atomic_memory_order_t failure_mo) { \ + return __atomic_compare_exchange(&a->repr, expected, &desired, \ + true, atomic_enum_to_builtin(success_mo), \ + atomic_enum_to_builtin(failure_mo)); \ +} \ + \ +ATOMIC_INLINE bool \ +atomic_compare_exchange_strong_##short_type(atomic_##short_type##_t *a, \ + UNUSED type *expected, type desired, \ + atomic_memory_order_t success_mo, \ + atomic_memory_order_t failure_mo) { \ + return __atomic_compare_exchange(&a->repr, expected, &desired, \ + false, \ + atomic_enum_to_builtin(success_mo), \ + atomic_enum_to_builtin(failure_mo)); \ +} + + +#define JEMALLOC_GENERATE_INT_ATOMICS(type, short_type, \ + /* unused */ lg_size) \ +JEMALLOC_GENERATE_ATOMICS(type, short_type, /* unused */ lg_size) \ + \ +ATOMIC_INLINE type \ +atomic_fetch_add_##short_type(atomic_##short_type##_t *a, type val, \ + atomic_memory_order_t mo) { \ + return __atomic_fetch_add(&a->repr, val, \ + atomic_enum_to_builtin(mo)); \ +} \ + \ +ATOMIC_INLINE type \ +atomic_fetch_sub_##short_type(atomic_##short_type##_t *a, type val, \ + atomic_memory_order_t mo) { \ + return __atomic_fetch_sub(&a->repr, val, \ + atomic_enum_to_builtin(mo)); \ +} \ + \ +ATOMIC_INLINE type \ +atomic_fetch_and_##short_type(atomic_##short_type##_t *a, type val, \ + atomic_memory_order_t mo) { \ + return __atomic_fetch_and(&a->repr, val, \ + atomic_enum_to_builtin(mo)); \ +} \ + \ +ATOMIC_INLINE type \ +atomic_fetch_or_##short_type(atomic_##short_type##_t *a, type val, \ + atomic_memory_order_t mo) { \ + return __atomic_fetch_or(&a->repr, val, \ + atomic_enum_to_builtin(mo)); \ +} \ + \ +ATOMIC_INLINE type \ +atomic_fetch_xor_##short_type(atomic_##short_type##_t *a, type val, \ + atomic_memory_order_t mo) { \ + return __atomic_fetch_xor(&a->repr, val, \ + atomic_enum_to_builtin(mo)); \ +} + +#undef ATOMIC_INLINE + +#endif /* JEMALLOC_INTERNAL_ATOMIC_GCC_ATOMIC_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/atomic_gcc_sync.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/atomic_gcc_sync.h new file mode 100644 index 000000000..21136bd0d --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/atomic_gcc_sync.h @@ -0,0 +1,201 @@ +#ifndef JEMALLOC_INTERNAL_ATOMIC_GCC_SYNC_H +#define JEMALLOC_INTERNAL_ATOMIC_GCC_SYNC_H + +#include "jemalloc/internal/jemalloc_preamble.h" + +#define ATOMIC_INLINE JEMALLOC_ALWAYS_INLINE + 
+#define ATOMIC_INIT(...) {__VA_ARGS__} + +typedef enum { + atomic_memory_order_relaxed, + atomic_memory_order_acquire, + atomic_memory_order_release, + atomic_memory_order_acq_rel, + atomic_memory_order_seq_cst +} atomic_memory_order_t; + +ATOMIC_INLINE void +atomic_fence(atomic_memory_order_t mo) { + /* Easy cases first: no barrier, and full barrier. */ + if (mo == atomic_memory_order_relaxed) { + asm volatile("" ::: "memory"); + return; + } + if (mo == atomic_memory_order_seq_cst) { + asm volatile("" ::: "memory"); + __sync_synchronize(); + asm volatile("" ::: "memory"); + return; + } + asm volatile("" ::: "memory"); +# if defined(__i386__) || defined(__x86_64__) + /* This is implicit on x86. */ +# elif defined(__ppc64__) + asm volatile("lwsync"); +# elif defined(__ppc__) + asm volatile("sync"); +# elif defined(__sparc__) && defined(__arch64__) + if (mo == atomic_memory_order_acquire) { + asm volatile("membar #LoadLoad | #LoadStore"); + } else if (mo == atomic_memory_order_release) { + asm volatile("membar #LoadStore | #StoreStore"); + } else { + asm volatile("membar #LoadLoad | #LoadStore | #StoreStore"); + } +# else + __sync_synchronize(); +# endif + asm volatile("" ::: "memory"); +} + +/* + * A correct implementation of seq_cst loads and stores on weakly ordered + * architectures could do either of the following: + * 1. store() is weak-fence -> store -> strong fence, load() is load -> + * strong-fence. + * 2. store() is strong-fence -> store, load() is strong-fence -> load -> + * weak-fence. + * The tricky thing is, load() and store() above can be the load or store + * portions of a gcc __sync builtin, so we have to follow GCC's lead, which + * means going with strategy 2. + * On strongly ordered architectures, the natural strategy is to stick a strong + * fence after seq_cst stores, and have naked loads. So we want the strong + * fences in different places on different architectures. + * atomic_pre_sc_load_fence and atomic_post_sc_store_fence allow us to + * accomplish this. 
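+ * Concretely, on x86 and 64-bit SPARC the pre-load fence degrades to a + * compiler barrier and the post-store fence is the real one; on weakly + * ordered architectures the strong fence moves in front of the load instead.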
+ */ + +ATOMIC_INLINE void +atomic_pre_sc_load_fence() { +# if defined(__i386__) || defined(__x86_64__) || \ + (defined(__sparc__) && defined(__arch64__)) + atomic_fence(atomic_memory_order_relaxed); +# else + atomic_fence(atomic_memory_order_seq_cst); +# endif +} + +ATOMIC_INLINE void +atomic_post_sc_store_fence() { +# if defined(__i386__) || defined(__x86_64__) || \ + (defined(__sparc__) && defined(__arch64__)) + atomic_fence(atomic_memory_order_seq_cst); +# else + atomic_fence(atomic_memory_order_relaxed); +# endif + +} + +#define JEMALLOC_GENERATE_ATOMICS(type, short_type, \ + /* unused */ lg_size) \ +typedef struct { \ + type volatile repr; \ +} atomic_##short_type##_t; \ + \ +ATOMIC_INLINE type \ +atomic_load_##short_type(const atomic_##short_type##_t *a, \ + atomic_memory_order_t mo) { \ + if (mo == atomic_memory_order_seq_cst) { \ + atomic_pre_sc_load_fence(); \ + } \ + type result = a->repr; \ + if (mo != atomic_memory_order_relaxed) { \ + atomic_fence(atomic_memory_order_acquire); \ + } \ + return result; \ +} \ + \ +ATOMIC_INLINE void \ +atomic_store_##short_type(atomic_##short_type##_t *a, \ + type val, atomic_memory_order_t mo) { \ + if (mo != atomic_memory_order_relaxed) { \ + atomic_fence(atomic_memory_order_release); \ + } \ + a->repr = val; \ + if (mo == atomic_memory_order_seq_cst) { \ + atomic_post_sc_store_fence(); \ + } \ +} \ + \ +ATOMIC_INLINE type \ +atomic_exchange_##short_type(atomic_##short_type##_t *a, type val, \ + atomic_memory_order_t mo) { \ + /* \ + * Because of FreeBSD, we care about gcc 4.2, which doesn't have\ + * an atomic exchange builtin. We fake it with a CAS loop. \ + */ \ + while (true) { \ + type old = a->repr; \ + if (__sync_bool_compare_and_swap(&a->repr, old, val)) { \ + return old; \ + } \ + } \ +} \ + \ +ATOMIC_INLINE bool \ +atomic_compare_exchange_weak_##short_type(atomic_##short_type##_t *a, \ + type *expected, type desired, \ + atomic_memory_order_t success_mo, \ + atomic_memory_order_t failure_mo) { \ + type prev = __sync_val_compare_and_swap(&a->repr, *expected, \ + desired); \ + if (prev == *expected) { \ + return true; \ + } else { \ + *expected = prev; \ + return false; \ + } \ +} \ +ATOMIC_INLINE bool \ +atomic_compare_exchange_strong_##short_type(atomic_##short_type##_t *a, \ + type *expected, type desired, \ + atomic_memory_order_t success_mo, \ + atomic_memory_order_t failure_mo) { \ + type prev = __sync_val_compare_and_swap(&a->repr, *expected, \ + desired); \ + if (prev == *expected) { \ + return true; \ + } else { \ + *expected = prev; \ + return false; \ + } \ +} + +#define JEMALLOC_GENERATE_INT_ATOMICS(type, short_type, \ + /* unused */ lg_size) \ +JEMALLOC_GENERATE_ATOMICS(type, short_type, /* unused */ lg_size) \ + \ +ATOMIC_INLINE type \ +atomic_fetch_add_##short_type(atomic_##short_type##_t *a, type val, \ + atomic_memory_order_t mo) { \ + return __sync_fetch_and_add(&a->repr, val); \ +} \ + \ +ATOMIC_INLINE type \ +atomic_fetch_sub_##short_type(atomic_##short_type##_t *a, type val, \ + atomic_memory_order_t mo) { \ + return __sync_fetch_and_sub(&a->repr, val); \ +} \ + \ +ATOMIC_INLINE type \ +atomic_fetch_and_##short_type(atomic_##short_type##_t *a, type val, \ + atomic_memory_order_t mo) { \ + return __sync_fetch_and_and(&a->repr, val); \ +} \ + \ +ATOMIC_INLINE type \ +atomic_fetch_or_##short_type(atomic_##short_type##_t *a, type val, \ + atomic_memory_order_t mo) { \ + return __sync_fetch_and_or(&a->repr, val); \ +} \ + \ +ATOMIC_INLINE type \ +atomic_fetch_xor_##short_type(atomic_##short_type##_t *a, type val, \ + 
atomic_memory_order_t mo) { \ + return __sync_fetch_and_xor(&a->repr, val); \ +} + +#undef ATOMIC_INLINE + +#endif /* JEMALLOC_INTERNAL_ATOMIC_GCC_SYNC_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/atomic_msvc.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/atomic_msvc.h new file mode 100644 index 000000000..a429f1abd --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/atomic_msvc.h @@ -0,0 +1,164 @@ +#ifndef JEMALLOC_INTERNAL_ATOMIC_MSVC_H +#define JEMALLOC_INTERNAL_ATOMIC_MSVC_H + +#include "jemalloc/internal/jemalloc_preamble.h" + +#define ATOMIC_INLINE JEMALLOC_ALWAYS_INLINE + +#define ATOMIC_INIT(...) {__VA_ARGS__} + +typedef enum { + atomic_memory_order_relaxed, + atomic_memory_order_acquire, + atomic_memory_order_release, + atomic_memory_order_acq_rel, + atomic_memory_order_seq_cst +} atomic_memory_order_t; + +typedef char atomic_repr_0_t; +typedef short atomic_repr_1_t; +typedef long atomic_repr_2_t; +typedef __int64 atomic_repr_3_t; + +ATOMIC_INLINE void +atomic_fence(atomic_memory_order_t mo) { + _ReadWriteBarrier(); +# if defined(_M_ARM) || defined(_M_ARM64) + /* ARM needs a barrier for everything but relaxed. */ + if (mo != atomic_memory_order_relaxed) { + MemoryBarrier(); + } +# elif defined(_M_IX86) || defined (_M_X64) + /* x86 needs a barrier only for seq_cst. */ + if (mo == atomic_memory_order_seq_cst) { + MemoryBarrier(); + } +# else +# error "Don't know how to create atomics for this platform for MSVC." +# endif + _ReadWriteBarrier(); +} + +#define ATOMIC_INTERLOCKED_REPR(lg_size) atomic_repr_ ## lg_size ## _t + +#define ATOMIC_CONCAT(a, b) ATOMIC_RAW_CONCAT(a, b) +#define ATOMIC_RAW_CONCAT(a, b) a ## b + +#define ATOMIC_INTERLOCKED_NAME(base_name, lg_size) ATOMIC_CONCAT( \ + base_name, ATOMIC_INTERLOCKED_SUFFIX(lg_size)) + +#define ATOMIC_INTERLOCKED_SUFFIX(lg_size) \ + ATOMIC_CONCAT(ATOMIC_INTERLOCKED_SUFFIX_, lg_size) + +#define ATOMIC_INTERLOCKED_SUFFIX_0 8 +#define ATOMIC_INTERLOCKED_SUFFIX_1 16 +#define ATOMIC_INTERLOCKED_SUFFIX_2 +#define ATOMIC_INTERLOCKED_SUFFIX_3 64 + +#define JEMALLOC_GENERATE_ATOMICS(type, short_type, lg_size) \ +typedef struct { \ + ATOMIC_INTERLOCKED_REPR(lg_size) repr; \ +} atomic_##short_type##_t; \ + \ +ATOMIC_INLINE type \ +atomic_load_##short_type(const atomic_##short_type##_t *a, \ + atomic_memory_order_t mo) { \ + ATOMIC_INTERLOCKED_REPR(lg_size) ret = a->repr; \ + if (mo != atomic_memory_order_relaxed) { \ + atomic_fence(atomic_memory_order_acquire); \ + } \ + return (type) ret; \ +} \ + \ +ATOMIC_INLINE void \ +atomic_store_##short_type(atomic_##short_type##_t *a, \ + type val, atomic_memory_order_t mo) { \ + if (mo != atomic_memory_order_relaxed) { \ + atomic_fence(atomic_memory_order_release); \ + } \ + a->repr = (ATOMIC_INTERLOCKED_REPR(lg_size)) val; \ + if (mo == atomic_memory_order_seq_cst) { \ + atomic_fence(atomic_memory_order_seq_cst); \ + } \ +} \ + \ +ATOMIC_INLINE type \ +atomic_exchange_##short_type(atomic_##short_type##_t *a, type val, \ + atomic_memory_order_t mo) { \ + return (type)ATOMIC_INTERLOCKED_NAME(_InterlockedExchange, \ + lg_size)(&a->repr, (ATOMIC_INTERLOCKED_REPR(lg_size))val); \ +} \ + \ +ATOMIC_INLINE bool \ +atomic_compare_exchange_weak_##short_type(atomic_##short_type##_t *a, \ + type *expected, type desired, atomic_memory_order_t success_mo, \ + atomic_memory_order_t failure_mo) { \ + ATOMIC_INTERLOCKED_REPR(lg_size) e = \ + (ATOMIC_INTERLOCKED_REPR(lg_size))*expected; \ + ATOMIC_INTERLOCKED_REPR(lg_size) d = \ 
+ (ATOMIC_INTERLOCKED_REPR(lg_size))desired; \ + ATOMIC_INTERLOCKED_REPR(lg_size) old = \ + ATOMIC_INTERLOCKED_NAME(_InterlockedCompareExchange, \ + lg_size)(&a->repr, d, e); \ + if (old == e) { \ + return true; \ + } else { \ + *expected = (type)old; \ + return false; \ + } \ +} \ + \ +ATOMIC_INLINE bool \ +atomic_compare_exchange_strong_##short_type(atomic_##short_type##_t *a, \ + type *expected, type desired, atomic_memory_order_t success_mo, \ + atomic_memory_order_t failure_mo) { \ + /* We implement the weak version with strong semantics. */ \ + return atomic_compare_exchange_weak_##short_type(a, expected, \ + desired, success_mo, failure_mo); \ +} + + +#define JEMALLOC_GENERATE_INT_ATOMICS(type, short_type, lg_size) \ +JEMALLOC_GENERATE_ATOMICS(type, short_type, lg_size) \ + \ +ATOMIC_INLINE type \ +atomic_fetch_add_##short_type(atomic_##short_type##_t *a, \ + type val, atomic_memory_order_t mo) { \ + return (type)ATOMIC_INTERLOCKED_NAME(_InterlockedExchangeAdd, \ + lg_size)(&a->repr, (ATOMIC_INTERLOCKED_REPR(lg_size))val); \ +} \ + \ +ATOMIC_INLINE type \ +atomic_fetch_sub_##short_type(atomic_##short_type##_t *a, \ + type val, atomic_memory_order_t mo) { \ + /* \ + * MSVC warns on negation of unsigned operands, but for us it \ + * gives exactly the right semantics (MAX_TYPE + 1 - operand). \ + */ \ + __pragma(warning(push)) \ + __pragma(warning(disable: 4146)) \ + return atomic_fetch_add_##short_type(a, -val, mo); \ + __pragma(warning(pop)) \ +} \ +ATOMIC_INLINE type \ +atomic_fetch_and_##short_type(atomic_##short_type##_t *a, \ + type val, atomic_memory_order_t mo) { \ + return (type)ATOMIC_INTERLOCKED_NAME(_InterlockedAnd, lg_size)( \ + &a->repr, (ATOMIC_INTERLOCKED_REPR(lg_size))val); \ +} \ +ATOMIC_INLINE type \ +atomic_fetch_or_##short_type(atomic_##short_type##_t *a, \ + type val, atomic_memory_order_t mo) { \ + return (type)ATOMIC_INTERLOCKED_NAME(_InterlockedOr, lg_size)( \ + &a->repr, (ATOMIC_INTERLOCKED_REPR(lg_size))val); \ +} \ +ATOMIC_INLINE type \ +atomic_fetch_xor_##short_type(atomic_##short_type##_t *a, \ + type val, atomic_memory_order_t mo) { \ + return (type)ATOMIC_INTERLOCKED_NAME(_InterlockedXor, lg_size)( \ + &a->repr, (ATOMIC_INTERLOCKED_REPR(lg_size))val); \ +} + +#undef ATOMIC_INLINE + +#endif /* JEMALLOC_INTERNAL_ATOMIC_MSVC_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/background_thread_externs.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/background_thread_externs.h new file mode 100644 index 000000000..0d34ee550 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/background_thread_externs.h @@ -0,0 +1,38 @@ +#ifndef JEMALLOC_INTERNAL_BACKGROUND_THREAD_EXTERNS_H +#define JEMALLOC_INTERNAL_BACKGROUND_THREAD_EXTERNS_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/background_thread_structs.h" +#include "jemalloc/internal/base.h" +#include "jemalloc/internal/mutex.h" + +extern bool opt_background_thread; +extern size_t opt_max_background_threads; +extern malloc_mutex_t background_thread_lock; +extern atomic_b_t background_thread_enabled_state; +extern size_t n_background_threads; +extern size_t max_background_threads; +extern background_thread_info_t *background_thread_info; + +bool background_thread_create(tsd_t *tsd, unsigned arena_ind); +bool background_threads_enable(tsd_t *tsd); +bool background_threads_disable(tsd_t *tsd); +bool background_thread_is_started(background_thread_info_t* info); +void 
background_thread_wakeup_early(background_thread_info_t *info, + nstime_t *remaining_sleep); +void background_thread_prefork0(tsdn_t *tsdn); +void background_thread_prefork1(tsdn_t *tsdn); +void background_thread_postfork_parent(tsdn_t *tsdn); +void background_thread_postfork_child(tsdn_t *tsdn); +bool background_thread_stats_read(tsdn_t *tsdn, + background_thread_stats_t *stats); +void background_thread_ctl_init(tsdn_t *tsdn); + +#ifdef JEMALLOC_PTHREAD_CREATE_WRAPPER +extern int pthread_create_wrapper(pthread_t *__restrict, const pthread_attr_t *, + void *(*)(void *), void *__restrict); +#endif +bool background_thread_boot0(void); +bool background_thread_boot1(tsdn_t *tsdn, base_t *base); + +#endif /* JEMALLOC_INTERNAL_BACKGROUND_THREAD_EXTERNS_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/background_thread_inlines.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/background_thread_inlines.h new file mode 100644 index 000000000..4ed05d1b9 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/background_thread_inlines.h @@ -0,0 +1,53 @@ +#ifndef JEMALLOC_INTERNAL_BACKGROUND_THREAD_INLINES_H +#define JEMALLOC_INTERNAL_BACKGROUND_THREAD_INLINES_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/arena_inlines_a.h" +#include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/background_thread_externs.h" + +JEMALLOC_ALWAYS_INLINE bool +background_thread_enabled(void) { + return atomic_load_b(&background_thread_enabled_state, ATOMIC_RELAXED); +} + +JEMALLOC_ALWAYS_INLINE void +background_thread_enabled_set(tsdn_t *tsdn, bool state) { + malloc_mutex_assert_owner(tsdn, &background_thread_lock); + atomic_store_b(&background_thread_enabled_state, state, ATOMIC_RELAXED); +} + +JEMALLOC_ALWAYS_INLINE background_thread_info_t * +arena_background_thread_info_get(arena_t *arena) { + unsigned arena_ind = arena_ind_get(arena); + return &background_thread_info[arena_ind % max_background_threads]; +} + +JEMALLOC_ALWAYS_INLINE background_thread_info_t * +background_thread_info_get(size_t ind) { + return &background_thread_info[ind % max_background_threads]; +} + +JEMALLOC_ALWAYS_INLINE uint64_t +background_thread_wakeup_time_get(background_thread_info_t *info) { + uint64_t next_wakeup = nstime_ns(&info->next_wakeup); + assert(atomic_load_b(&info->indefinite_sleep, ATOMIC_ACQUIRE) == + (next_wakeup == BACKGROUND_THREAD_INDEFINITE_SLEEP)); + return next_wakeup; +} + +JEMALLOC_ALWAYS_INLINE void +background_thread_wakeup_time_set(tsdn_t *tsdn, background_thread_info_t *info, + uint64_t wakeup_time) { + malloc_mutex_assert_owner(tsdn, &info->mtx); + atomic_store_b(&info->indefinite_sleep, + wakeup_time == BACKGROUND_THREAD_INDEFINITE_SLEEP, ATOMIC_RELEASE); + nstime_init(&info->next_wakeup, wakeup_time); +} + +JEMALLOC_ALWAYS_INLINE bool +background_thread_indefinite_sleep(background_thread_info_t *info) { + return atomic_load_b(&info->indefinite_sleep, ATOMIC_ACQUIRE); +} + +#endif /* JEMALLOC_INTERNAL_BACKGROUND_THREAD_INLINES_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/background_thread_structs.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/background_thread_structs.h new file mode 100644 index 000000000..67b687974 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/background_thread_structs.h @@ -0,0 +1,69 @@ +#ifndef JEMALLOC_INTERNAL_BACKGROUND_THREAD_STRUCTS_H +#define 
JEMALLOC_INTERNAL_BACKGROUND_THREAD_STRUCTS_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/mutex.h" + +/* This file really combines "structs" and "types", but only transitionally. */ + +#if defined(JEMALLOC_BACKGROUND_THREAD) || defined(JEMALLOC_LAZY_LOCK) +# define JEMALLOC_PTHREAD_CREATE_WRAPPER +#endif + +#define BACKGROUND_THREAD_INDEFINITE_SLEEP UINT64_MAX +#define MAX_BACKGROUND_THREAD_LIMIT MALLOCX_ARENA_LIMIT +#define DEFAULT_NUM_BACKGROUND_THREAD 4 + +/* + * These exist only as a transitional state. Eventually, deferral should be + * part of the PAI, and each implementation can indicate wait times with more + * specificity. + */ +#define BACKGROUND_THREAD_HPA_INTERVAL_MAX_UNINITIALIZED (-2) +#define BACKGROUND_THREAD_HPA_INTERVAL_MAX_DEFAULT_WHEN_ENABLED 5000 + +#define BACKGROUND_THREAD_DEFERRED_MIN UINT64_C(0) +#define BACKGROUND_THREAD_DEFERRED_MAX UINT64_MAX + +typedef enum { + background_thread_stopped, + background_thread_started, + /* Thread waits on the global lock when paused (for arena_reset). */ + background_thread_paused, +} background_thread_state_t; + +struct background_thread_info_s { +#ifdef JEMALLOC_BACKGROUND_THREAD + /* Background thread is pthread specific. */ + pthread_t thread; + pthread_cond_t cond; +#endif + malloc_mutex_t mtx; + background_thread_state_t state; + /* When true, it means no wakeup scheduled. */ + atomic_b_t indefinite_sleep; + /* Next scheduled wakeup time (absolute time in ns). */ + nstime_t next_wakeup; + /* + * Since the last background thread run, newly added number of pages + * that need to be purged by the next wakeup. This is adjusted on + * epoch advance, and is used to determine whether we should signal the + * background thread to wake up earlier. + */ + size_t npages_to_purge_new; + /* Stats: total number of runs since started. */ + uint64_t tot_n_runs; + /* Stats: total sleep time since started. */ + nstime_t tot_sleep_time; +}; +typedef struct background_thread_info_s background_thread_info_t; + +struct background_thread_stats_s { + size_t num_threads; + uint64_t num_runs; + nstime_t run_interval; + mutex_prof_data_t max_counter_per_bg_thd; +}; +typedef struct background_thread_stats_s background_thread_stats_t; + +#endif /* JEMALLOC_INTERNAL_BACKGROUND_THREAD_STRUCTS_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/base.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/base.h new file mode 100644 index 000000000..86b0cf4a6 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/base.h @@ -0,0 +1,120 @@ +#ifndef JEMALLOC_INTERNAL_BASE_H +#define JEMALLOC_INTERNAL_BASE_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/edata.h" +#include "jemalloc/internal/ehooks.h" +#include "jemalloc/internal/mutex.h" + +enum metadata_thp_mode_e { + metadata_thp_disabled = 0, + /* + * Lazily enable hugepage for metadata. To avoid high RSS caused by THP + * + low usage arena (i.e. THP becomes a significant percentage), the + * "auto" option only starts using THP after a base allocator used up + * the first THP region. Starting from the second hugepage (in a single + * arena), "auto" behaves the same as "always", i.e. madvise hugepage + * right away. 
+ */ + metadata_thp_auto = 1, + metadata_thp_always = 2, + metadata_thp_mode_limit = 3 +}; +typedef enum metadata_thp_mode_e metadata_thp_mode_t; + +#define METADATA_THP_DEFAULT metadata_thp_disabled +extern metadata_thp_mode_t opt_metadata_thp; +extern const char *const metadata_thp_mode_names[]; + + +/* Embedded at the beginning of every block of base-managed virtual memory. */ +typedef struct base_block_s base_block_t; +struct base_block_s { + /* Total size of block's virtual memory mapping. */ + size_t size; + + /* Next block in list of base's blocks. */ + base_block_t *next; + + /* Tracks unused trailing space. */ + edata_t edata; +}; + +typedef struct base_s base_t; +struct base_s { + /* + * User-configurable extent hook functions. + */ + ehooks_t ehooks; + + /* + * User-configurable extent hook functions for metadata allocations. + */ + ehooks_t ehooks_base; + + /* Protects base_alloc() and base_stats_get() operations. */ + malloc_mutex_t mtx; + + /* Using THP when true (metadata_thp auto mode). */ + bool auto_thp_switched; + /* + * Most recent size class in the series of increasingly large base + * extents. Logarithmic spacing between subsequent allocations ensures + * that the total number of distinct mappings remains small. + */ + pszind_t pind_last; + + /* Serial number generation state. */ + size_t extent_sn_next; + + /* Chain of all blocks associated with base. */ + base_block_t *blocks; + + /* Heap of extents that track unused trailing space within blocks. */ + edata_heap_t avail[SC_NSIZES]; + + /* Contains reusable base edata (used by tcache_stacks currently). */ + edata_avail_t edata_avail; + + /* Stats, only maintained if config_stats. */ + size_t allocated; + size_t edata_allocated; + size_t rtree_allocated; + size_t resident; + size_t mapped; + /* Number of THP regions touched. 
*/ + size_t n_thp; +}; + +static inline unsigned +base_ind_get(const base_t *base) { + return ehooks_ind_get(&base->ehooks); +} + +static inline bool +metadata_thp_enabled(void) { + return (opt_metadata_thp != metadata_thp_disabled); +} + +base_t *b0get(void); +base_t *base_new(tsdn_t *tsdn, unsigned ind, + const extent_hooks_t *extent_hooks, bool metadata_use_hooks); +void base_delete(tsdn_t *tsdn, base_t *base); +ehooks_t *base_ehooks_get(base_t *base); +ehooks_t *base_ehooks_get_for_metadata(base_t *base); +extent_hooks_t *base_extent_hooks_set(base_t *base, + extent_hooks_t *extent_hooks); +void *base_alloc(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment); +edata_t *base_alloc_edata(tsdn_t *tsdn, base_t *base); +void *base_alloc_rtree(tsdn_t *tsdn, base_t *base, size_t size); +void *b0_alloc_tcache_stack(tsdn_t *tsdn, size_t size); +void b0_dalloc_tcache_stack(tsdn_t *tsdn, void *tcache_stack); +void base_stats_get(tsdn_t *tsdn, base_t *base, size_t *allocated, + size_t *edata_allocated, size_t *rtree_allocated, size_t *resident, + size_t *mapped, size_t *n_thp); +void base_prefork(tsdn_t *tsdn, base_t *base); +void base_postfork_parent(tsdn_t *tsdn, base_t *base); +void base_postfork_child(tsdn_t *tsdn, base_t *base); +bool base_boot(tsdn_t *tsdn); + +#endif /* JEMALLOC_INTERNAL_BASE_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/batcher.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/batcher.h new file mode 100644 index 000000000..40c8b35f7 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/batcher.h @@ -0,0 +1,46 @@ +#ifndef JEMALLOC_INTERNAL_BATCHER_H +#define JEMALLOC_INTERNAL_BATCHER_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/mutex.h" + +#define BATCHER_NO_IDX ((size_t)-1) + +typedef struct batcher_s batcher_t; +struct batcher_s { + /* + * Optimize for locality -- nelems_max and nelems are always touched + * together, along with the front of the mutex. The end of the mutex is + * only touched if there's contention. + */ + atomic_zu_t nelems; + size_t nelems_max; + size_t npushes; + malloc_mutex_t mtx; +}; + +void batcher_init(batcher_t *batcher, size_t nelems_max); + +/* + * Returns an index (into some user-owned array) to use for pushing, or + * BATCHER_NO_IDX if no index is free. If the former, the caller must call + * batcher_push_end once done. + */ +size_t batcher_push_begin(tsdn_t *tsdn, batcher_t *batcher, + size_t elems_to_push); +void batcher_push_end(tsdn_t *tsdn, batcher_t *batcher); + +/* + * Returns the number of items to pop, or BATCHER_NO_IDX if there are none. + * If the former, must be followed by a call to batcher_pop_end.
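+ * + * A minimal consumer sketch (elems is a caller-owned array of size + * nelems_max, and copy_out stands in for caller code -- compare + * arena_bin_flush_batch_impl in arena_inlines_b.h): + * + * size_t n = batcher_pop_begin(tsdn, &batcher); + * if (n != BATCHER_NO_IDX) { + * copy_out(elems, n); + * batcher_pop_end(tsdn, &batcher); + * }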
+ */ +size_t batcher_pop_begin(tsdn_t *tsdn, batcher_t *batcher); +size_t batcher_pop_get_pushes(tsdn_t *tsdn, batcher_t *batcher); +void batcher_pop_end(tsdn_t *tsdn, batcher_t *batcher); + +void batcher_prefork(tsdn_t *tsdn, batcher_t *batcher); +void batcher_postfork_parent(tsdn_t *tsdn, batcher_t *batcher); +void batcher_postfork_child(tsdn_t *tsdn, batcher_t *batcher); + +#endif /* JEMALLOC_INTERNAL_BATCHER_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/bin.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/bin.h new file mode 100644 index 000000000..c49afea6d --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/bin.h @@ -0,0 +1,149 @@ +#ifndef JEMALLOC_INTERNAL_BIN_H +#define JEMALLOC_INTERNAL_BIN_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/batcher.h" +#include "jemalloc/internal/bin_stats.h" +#include "jemalloc/internal/bin_types.h" +#include "jemalloc/internal/edata.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/sc.h" + +#define BIN_REMOTE_FREE_ELEMS_MAX 16 + +#ifdef JEMALLOC_JET +extern void (*bin_batching_test_after_push_hook)(size_t idx); +extern void (*bin_batching_test_mid_pop_hook)(size_t elems_to_pop); +extern void (*bin_batching_test_after_unlock_hook)(unsigned slab_dalloc_count, + bool list_empty); +#endif + +#ifdef JEMALLOC_JET +extern unsigned bin_batching_test_ndalloc_slabs_max; +#else +static const unsigned bin_batching_test_ndalloc_slabs_max = (unsigned)-1; +#endif + +JEMALLOC_ALWAYS_INLINE void +bin_batching_test_after_push(size_t idx) { + (void)idx; +#ifdef JEMALLOC_JET + if (bin_batching_test_after_push_hook != NULL) { + bin_batching_test_after_push_hook(idx); + } +#endif +} + +JEMALLOC_ALWAYS_INLINE void +bin_batching_test_mid_pop(size_t elems_to_pop) { + (void)elems_to_pop; +#ifdef JEMALLOC_JET + if (bin_batching_test_mid_pop_hook != NULL) { + bin_batching_test_mid_pop_hook(elems_to_pop); + } +#endif +} + +JEMALLOC_ALWAYS_INLINE void +bin_batching_test_after_unlock(unsigned slab_dalloc_count, bool list_empty) { + (void)slab_dalloc_count; + (void)list_empty; +#ifdef JEMALLOC_JET + if (bin_batching_test_after_unlock_hook != NULL) { + bin_batching_test_after_unlock_hook(slab_dalloc_count, + list_empty); + } +#endif +} + +/* + * A bin contains a set of extents that are currently being used for slab + * allocations. + */ +typedef struct bin_s bin_t; +struct bin_s { + /* All operations on bin_t fields require lock ownership. */ + malloc_mutex_t lock; + + /* + * Bin statistics. These get touched every time the lock is acquired, + * so put them close by in the hopes of getting some cache locality. + */ + bin_stats_t stats; + + /* + * Current slab being used to service allocations of this bin's size + * class. slabcur is independent of slabs_{nonfull,full}; whenever + * slabcur is reassigned, the previous slab must be deallocated or + * inserted into slabs_{nonfull,full}. + */ + edata_t *slabcur; + + /* + * Heap of non-full slabs. This heap is used to assure that new + * allocations come from the non-full slab that is oldest/lowest in + * memory. + */ + edata_heap_t slabs_nonfull; + + /* List used to track full slabs. 
*/ + edata_list_active_t slabs_full; +}; + +typedef struct bin_remote_free_data_s bin_remote_free_data_t; +struct bin_remote_free_data_s { + void *ptr; + edata_t *slab; +}; + +typedef struct bin_with_batch_s bin_with_batch_t; +struct bin_with_batch_s { + bin_t bin; + batcher_t remote_frees; + bin_remote_free_data_t remote_free_data[BIN_REMOTE_FREE_ELEMS_MAX]; +}; + +/* A set of sharded bins of the same size class. */ +typedef struct bins_s bins_t; +struct bins_s { + /* Sharded bins. Dynamically sized. */ + bin_t *bin_shards; +}; + +void bin_shard_sizes_boot(unsigned bin_shard_sizes[SC_NBINS]); +bool bin_update_shard_size(unsigned bin_shards[SC_NBINS], size_t start_size, + size_t end_size, size_t nshards); + +/* Initializes a bin to empty. Returns true on error. */ +bool bin_init(bin_t *bin, unsigned binind); + +/* Forking. */ +void bin_prefork(tsdn_t *tsdn, bin_t *bin, bool has_batch); +void bin_postfork_parent(tsdn_t *tsdn, bin_t *bin, bool has_batch); +void bin_postfork_child(tsdn_t *tsdn, bin_t *bin, bool has_batch); + +/* Stats. */ +static inline void +bin_stats_merge(tsdn_t *tsdn, bin_stats_data_t *dst_bin_stats, bin_t *bin) { + malloc_mutex_lock(tsdn, &bin->lock); + malloc_mutex_prof_accum(tsdn, &dst_bin_stats->mutex_data, &bin->lock); + bin_stats_t *stats = &dst_bin_stats->stats_data; + stats->nmalloc += bin->stats.nmalloc; + stats->ndalloc += bin->stats.ndalloc; + stats->nrequests += bin->stats.nrequests; + stats->curregs += bin->stats.curregs; + stats->nfills += bin->stats.nfills; + stats->nflushes += bin->stats.nflushes; + stats->nslabs += bin->stats.nslabs; + stats->reslabs += bin->stats.reslabs; + stats->curslabs += bin->stats.curslabs; + stats->nonfull_slabs += bin->stats.nonfull_slabs; + + stats->batch_failed_pushes += bin->stats.batch_failed_pushes; + stats->batch_pushes += bin->stats.batch_pushes; + stats->batch_pushed_elems += bin->stats.batch_pushed_elems; + + malloc_mutex_unlock(tsdn, &bin->lock); +} + +#endif /* JEMALLOC_INTERNAL_BIN_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/bin_info.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/bin_info.h new file mode 100644 index 000000000..88d58c916 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/bin_info.h @@ -0,0 +1,62 @@ +#ifndef JEMALLOC_INTERNAL_BIN_INFO_H +#define JEMALLOC_INTERNAL_BIN_INFO_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/bitmap.h" + +/* + * Read-only information associated with each element of arena_t's bins array + * is stored separately, partly to reduce memory usage (only one copy, rather + * than one per arena), but mainly to avoid false cacheline sharing. + * + * Each slab has the following layout: + * + * /--------------------\ + * | region 0 | + * |--------------------| + * | region 1 | + * |--------------------| + * | ... | + * | ... | + * | ... | + * |--------------------| + * | region nregs-1 | + * \--------------------/ + */ +typedef struct bin_info_s bin_info_t; +struct bin_info_s { + /* Size of regions in a slab for this bin's size class. */ + size_t reg_size; + + /* Total size of a slab for this bin's size class. */ + size_t slab_size; + + /* Total number of regions in a slab for this bin's size class. */ + uint32_t nregs; + + /* Number of sharded bins in each arena for this size class. */ + uint32_t n_shards; + + /* + * Metadata used to manipulate bitmaps for slabs associated with this + * bin. 
 + */ + bitmap_info_t bitmap_info; +}; + +/* The maximum size a size class can be and still get batching behavior. */ +extern size_t opt_bin_info_max_batched_size; +/* The number of batches per batched size class. */ +extern size_t opt_bin_info_remote_free_max_batch; +/* The max number of pending elems (across all batches). */ +extern size_t opt_bin_info_remote_free_max; + +extern szind_t bin_info_nbatched_sizes; +extern unsigned bin_info_nbatched_bins; +extern unsigned bin_info_nunbatched_bins; + +extern bin_info_t bin_infos[SC_NBINS]; + +void bin_info_boot(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS]); + +#endif /* JEMALLOC_INTERNAL_BIN_INFO_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/bin_stats.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/bin_stats.h new file mode 100644 index 000000000..334c166de --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/bin_stats.h @@ -0,0 +1,63 @@ +#ifndef JEMALLOC_INTERNAL_BIN_STATS_H +#define JEMALLOC_INTERNAL_BIN_STATS_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/mutex_prof.h" + +typedef struct bin_stats_s bin_stats_t; +struct bin_stats_s { + /* + * Total number of allocation/deallocation requests served directly by + * the bin. Note that tcache may allocate an object, then recycle it + * many times, resulting in many increments to nrequests, but only one + * each to nmalloc and ndalloc. + */ + uint64_t nmalloc; + uint64_t ndalloc; + + /* + * Number of allocation requests that correspond to the size of this + * bin. This includes requests served by tcache, though tcache only + * periodically merges into this counter. + */ + uint64_t nrequests; + + /* + * Current number of regions of this size class, including regions + * currently cached by tcache. + */ + size_t curregs; + + /* Number of tcache fills from this bin. */ + uint64_t nfills; + + /* Number of tcache flushes to this bin. */ + uint64_t nflushes; + + /* Total number of slabs created for this bin's size class. */ + uint64_t nslabs; + + /* + * Total number of slabs reused by extracting them from the slabs heap + * for this bin's size class. + */ + uint64_t reslabs; + + /* Current number of slabs in this bin. */ + size_t curslabs; + + /* Current size of nonfull slabs heap in this bin. */ + size_t nonfull_slabs; + + uint64_t batch_pops; + uint64_t batch_failed_pushes; + uint64_t batch_pushes; + uint64_t batch_pushed_elems; +}; + +typedef struct bin_stats_data_s bin_stats_data_t; +struct bin_stats_data_s { + bin_stats_t stats_data; + mutex_prof_data_t mutex_data; +}; +#endif /* JEMALLOC_INTERNAL_BIN_STATS_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/bin_types.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/bin_types.h new file mode 100644 index 000000000..5ec22dfdc --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/bin_types.h @@ -0,0 +1,18 @@ +#ifndef JEMALLOC_INTERNAL_BIN_TYPES_H +#define JEMALLOC_INTERNAL_BIN_TYPES_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/sc.h" + +#define BIN_SHARDS_MAX (1 << EDATA_BITS_BINSHARD_WIDTH) +#define N_BIN_SHARDS_DEFAULT 1 + +/* Used in TSD static initializer only. Real init in arena_bind(). 
 */ +#define TSD_BINSHARDS_ZERO_INITIALIZER {{UINT8_MAX}} + +typedef struct tsd_binshards_s tsd_binshards_t; +struct tsd_binshards_s { + uint8_t binshard[SC_NBINS]; +}; + +#endif /* JEMALLOC_INTERNAL_BIN_TYPES_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/bit_util.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/bit_util.h new file mode 100644 index 000000000..c413a75d0 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/bit_util.h @@ -0,0 +1,421 @@ +#ifndef JEMALLOC_INTERNAL_BIT_UTIL_H +#define JEMALLOC_INTERNAL_BIT_UTIL_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/assert.h" + +/* Sanity check. */ +#if !defined(JEMALLOC_INTERNAL_FFSLL) || !defined(JEMALLOC_INTERNAL_FFSL) \ + || !defined(JEMALLOC_INTERNAL_FFS) +# error JEMALLOC_INTERNAL_FFS{,L,LL} should have been defined by configure +#endif + +/* + * Unlike the builtins and posix ffs functions, our ffs requires a non-zero + * input, and returns the position of the lowest bit set (as opposed to the + * posix versions, which return 1 larger than that position and use a return + * value of zero as a sentinel). This tends to simplify logic in callers, and + * allows for consistency with the builtins we build fls on top of. + */ +static inline unsigned +ffs_llu(unsigned long long x) { + util_assume(x != 0); + return JEMALLOC_INTERNAL_FFSLL(x) - 1; +} + +static inline unsigned +ffs_lu(unsigned long x) { + util_assume(x != 0); + return JEMALLOC_INTERNAL_FFSL(x) - 1; +} + +static inline unsigned +ffs_u(unsigned x) { + util_assume(x != 0); + return JEMALLOC_INTERNAL_FFS(x) - 1; +} + +#define DO_FLS_SLOW(x, suffix) do { \ + util_assume(x != 0); \ + x |= (x >> 1); \ + x |= (x >> 2); \ + x |= (x >> 4); \ + x |= (x >> 8); \ + x |= (x >> 16); \ + if (sizeof(x) > 4) { \ + /* \ + * If sizeof(x) is 4, then the expression "x >> 32" \ + * will generate compiler warnings even if the code \ + * never executes. This circumvents the warning, and \ + * gets compiled out in optimized builds. \ + */ \ + int constant_32 = sizeof(x) * 4; \ + x |= (x >> constant_32); \ + } \ + x++; \ + if (x == 0) { \ + return 8 * sizeof(x) - 1; \ + } \ + return ffs_##suffix(x) - 1; \ +} while(0) + +static inline unsigned +fls_llu_slow(unsigned long long x) { + DO_FLS_SLOW(x, llu); +} + +static inline unsigned +fls_lu_slow(unsigned long x) { + DO_FLS_SLOW(x, lu); +} + +static inline unsigned +fls_u_slow(unsigned x) { + DO_FLS_SLOW(x, u); +} + +#undef DO_FLS_SLOW + +#ifdef JEMALLOC_HAVE_BUILTIN_CLZ +static inline unsigned +fls_llu(unsigned long long x) { + util_assume(x != 0); + /* + * Note that the xor here is more naturally written as subtraction; the + * last bit set is the number of bits in the type minus the number of + * leading zero bits. But GCC implements that as: + * bsr edi, edi + * mov eax, 31 + * xor edi, 31 + * sub eax, edi + * If we write it as xor instead, then we get + * bsr eax, edi + * as desired. + */ + return (8 * sizeof(x) - 1) ^ __builtin_clzll(x); +} + +static inline unsigned +fls_lu(unsigned long x) { + util_assume(x != 0); + return (8 * sizeof(x) - 1) ^ __builtin_clzl(x); +} + +static inline unsigned +fls_u(unsigned x) { + util_assume(x != 0); + return (8 * sizeof(x) - 1) ^ __builtin_clz(x); +} +#elif defined(_MSC_VER) + +#if LG_SIZEOF_PTR == 3 +#define DO_BSR64(bit, x) _BitScanReverse64(&bit, x) +#else +/* + * This never actually runs; we're just dodging a compiler error for the + * never-taken branch where sizeof(void *) == 8. 
 + */ +#define DO_BSR64(bit, x) bit = 0; unreachable() +#endif + +#define DO_FLS(x) do { \ + if (x == 0) { \ + return 8 * sizeof(x); \ + } \ + unsigned long bit; \ + if (sizeof(x) == 4) { \ + _BitScanReverse(&bit, (unsigned)x); \ + return (unsigned)bit; \ + } \ + if (sizeof(x) == 8 && sizeof(void *) == 8) { \ + DO_BSR64(bit, x); \ + return (unsigned)bit; \ + } \ + if (sizeof(x) == 8 && sizeof(void *) == 4) { \ + /* Dodge a compiler warning, as above. */ \ + int constant_32 = sizeof(x) * 4; \ + if (_BitScanReverse(&bit, \ + (unsigned)(x >> constant_32))) { \ + return 32 + (unsigned)bit; \ + } else { \ + _BitScanReverse(&bit, (unsigned)x); \ + return (unsigned)bit; \ + } \ + } \ + unreachable(); \ +} while (0) + +static inline unsigned +fls_llu(unsigned long long x) { + DO_FLS(x); +} + +static inline unsigned +fls_lu(unsigned long x) { + DO_FLS(x); +} + +static inline unsigned +fls_u(unsigned x) { + DO_FLS(x); +} + +#undef DO_FLS +#undef DO_BSR64 +#else + +static inline unsigned +fls_llu(unsigned long long x) { + return fls_llu_slow(x); +} + +static inline unsigned +fls_lu(unsigned long x) { + return fls_lu_slow(x); +} + +static inline unsigned +fls_u(unsigned x) { + return fls_u_slow(x); +} +#endif + +#if LG_SIZEOF_LONG_LONG > 3 +# error "Haven't implemented popcount for 16-byte ints." +#endif + +#define DO_POPCOUNT(x, type) do { \ + /* \ + * Algorithm from an old AMD optimization reference manual. \ + * We're putting a little bit more work than you might expect \ + * into the no-intrinsic case, since we only support the \ + * GCC intrinsics spelling of popcount (for now). Detecting \ + * whether or not the popcount builtin is actually usable in \ + * MSVC is nontrivial. \ + */ \ + \ + type bmul = (type)0x0101010101010101ULL; \ + \ + /* \ + * Replace each 2 bits with the sideways sum of the original \ + * values. 0x5 = 0b0101. \ + * \ + * You might expect this to be: \ + * x = (x & 0x55...) + ((x >> 1) & 0x55...). \ + * That costs an extra mask relative to this, though. \ + */ \ + x = x - ((x >> 1) & (0x55U * bmul)); \ + /* Replace each 4 bits with their sideways sum. 0x3 = 0b0011. */\ + x = (x & (bmul * 0x33U)) + ((x >> 2) & (bmul * 0x33U)); \ + /* \ + * Replace each 8 bits with their sideways sum. Note that we \ + * can't overflow within each 4-bit sum here, so we can skip \ + * the initial mask. \ + */ \ + x = (x + (x >> 4)) & (bmul * 0x0FU); \ + /* \ + * None of the partial sums in this multiplication (viewed in \ + * base-256) can overflow into the next digit. So the least \ + * significant byte of the product will be the least \ + * significant byte of the original value, the second least \ + * significant byte will be the sum of the two least \ + * significant bytes of the original value, and so on. \ + * Importantly, the high byte will be the byte-wise sum of all \ + * the bytes of the original value. 
\ + */ \ + x = x * bmul; \ + x >>= ((sizeof(x) - 1) * 8); \ + return (unsigned)x; \ +} while(0) + +static inline unsigned +popcount_u_slow(unsigned bitmap) { + DO_POPCOUNT(bitmap, unsigned); +} + +static inline unsigned +popcount_lu_slow(unsigned long bitmap) { + DO_POPCOUNT(bitmap, unsigned long); +} + +static inline unsigned +popcount_llu_slow(unsigned long long bitmap) { + DO_POPCOUNT(bitmap, unsigned long long); +} + +#undef DO_POPCOUNT + +static inline unsigned +popcount_u(unsigned bitmap) { +#ifdef JEMALLOC_INTERNAL_POPCOUNT + return JEMALLOC_INTERNAL_POPCOUNT(bitmap); +#else + return popcount_u_slow(bitmap); +#endif +} + +static inline unsigned +popcount_lu(unsigned long bitmap) { +#ifdef JEMALLOC_INTERNAL_POPCOUNTL + return JEMALLOC_INTERNAL_POPCOUNTL(bitmap); +#else + return popcount_lu_slow(bitmap); +#endif +} + +static inline unsigned +popcount_llu(unsigned long long bitmap) { +#ifdef JEMALLOC_INTERNAL_POPCOUNTLL + return JEMALLOC_INTERNAL_POPCOUNTLL(bitmap); +#else + return popcount_llu_slow(bitmap); +#endif +} + +/* + * Clears the lowest set bit in *bitmap, and returns the position of that + * bit. *bitmap *must not* be 0. + */ + +static inline size_t +cfs_lu(unsigned long* bitmap) { + util_assume(*bitmap != 0); + size_t bit = ffs_lu(*bitmap); + *bitmap ^= ZU(1) << bit; + return bit; +} + +static inline unsigned +ffs_zu(size_t x) { +#if LG_SIZEOF_PTR == LG_SIZEOF_INT + return ffs_u(x); +#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG + return ffs_lu(x); +#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG + return ffs_llu(x); +#else +#error No implementation for size_t ffs() +#endif +} + +static inline unsigned +fls_zu(size_t x) { +#if LG_SIZEOF_PTR == LG_SIZEOF_INT + return fls_u(x); +#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG + return fls_lu(x); +#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG + return fls_llu(x); +#else +#error No implementation for size_t fls() +#endif +} + + +static inline unsigned +ffs_u64(uint64_t x) { +#if LG_SIZEOF_LONG == 3 + return ffs_lu(x); +#elif LG_SIZEOF_LONG_LONG == 3 + return ffs_llu(x); +#else +#error No implementation for 64-bit ffs() +#endif +} + +static inline unsigned +fls_u64(uint64_t x) { +#if LG_SIZEOF_LONG == 3 + return fls_lu(x); +#elif LG_SIZEOF_LONG_LONG == 3 + return fls_llu(x); +#else +#error No implementation for 64-bit fls() +#endif +} + +static inline unsigned +ffs_u32(uint32_t x) { +#if LG_SIZEOF_INT == 2 + return ffs_u(x); +#else +#error No implementation for 32-bit ffs() +#endif +} + +static inline unsigned +fls_u32(uint32_t x) { +#if LG_SIZEOF_INT == 2 + return fls_u(x); +#else +#error No implementation for 32-bit fls() +#endif +} + +static inline uint64_t +pow2_ceil_u64(uint64_t x) { + if (unlikely(x <= 1)) { + return x; + } + size_t msb_on_index = fls_u64(x - 1); + /* + * Range-check; it's on the callers to ensure that the result of this + * call won't overflow. + */ + assert(msb_on_index < 63); + return 1ULL << (msb_on_index + 1); +} + +static inline uint32_t +pow2_ceil_u32(uint32_t x) { + if (unlikely(x <= 1)) { + return x; + } + size_t msb_on_index = fls_u32(x - 1); + /* As above. */ + assert(msb_on_index < 31); + return 1U << (msb_on_index + 1); +} + +/* Compute the smallest power of 2 that is >= x. 
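+ * For example (illustrative values, not part of the original comment):
+ * pow2_ceil_zu(1) == 1 and pow2_ceil_zu(13) == 16, since 13 - 1 == 12 has
+ * its most significant bit at index 3 and 1 << (3 + 1) == 16; an exact
+ * power such as 64 maps to itself.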
*/ +static inline size_t +pow2_ceil_zu(size_t x) { +#if (LG_SIZEOF_PTR == 3) + return pow2_ceil_u64(x); +#else + return pow2_ceil_u32(x); +#endif +} + +static inline unsigned +lg_floor(size_t x) { + util_assume(x != 0); +#if (LG_SIZEOF_PTR == 3) + return fls_u64(x); +#else + return fls_u32(x); +#endif +} + +static inline unsigned +lg_ceil(size_t x) { + return lg_floor(x) + ((x & (x - 1)) == 0 ? 0 : 1); +} + +/* A compile-time version of lg_floor and lg_ceil. */ +#define LG_FLOOR_1(x) 0 +#define LG_FLOOR_2(x) (x < (1ULL << 1) ? LG_FLOOR_1(x) : 1 + LG_FLOOR_1(x >> 1)) +#define LG_FLOOR_4(x) (x < (1ULL << 2) ? LG_FLOOR_2(x) : 2 + LG_FLOOR_2(x >> 2)) +#define LG_FLOOR_8(x) (x < (1ULL << 4) ? LG_FLOOR_4(x) : 4 + LG_FLOOR_4(x >> 4)) +#define LG_FLOOR_16(x) (x < (1ULL << 8) ? LG_FLOOR_8(x) : 8 + LG_FLOOR_8(x >> 8)) +#define LG_FLOOR_32(x) (x < (1ULL << 16) ? LG_FLOOR_16(x) : 16 + LG_FLOOR_16(x >> 16)) +#define LG_FLOOR_64(x) (x < (1ULL << 32) ? LG_FLOOR_32(x) : 32 + LG_FLOOR_32(x >> 32)) +#if LG_SIZEOF_PTR == 2 +# define LG_FLOOR(x) LG_FLOOR_32((x)) +#else +# define LG_FLOOR(x) LG_FLOOR_64((x)) +#endif + +#define LG_CEIL(x) (LG_FLOOR(x) + (((x) & ((x) - 1)) == 0 ? 0 : 1)) + +#endif /* JEMALLOC_INTERNAL_BIT_UTIL_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/bitmap.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/bitmap.h new file mode 100644 index 000000000..e501da475 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/bitmap.h @@ -0,0 +1,369 @@ +#ifndef JEMALLOC_INTERNAL_BITMAP_H +#define JEMALLOC_INTERNAL_BITMAP_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/bit_util.h" +#include "jemalloc/internal/sc.h" + +typedef unsigned long bitmap_t; +#define LG_SIZEOF_BITMAP LG_SIZEOF_LONG + +/* Maximum bitmap bit count is 2^LG_BITMAP_MAXBITS. */ +#if SC_LG_SLAB_MAXREGS > LG_CEIL(SC_NSIZES) +/* Maximum bitmap bit count is determined by maximum regions per slab. */ +# define LG_BITMAP_MAXBITS SC_LG_SLAB_MAXREGS +#else +/* Maximum bitmap bit count is determined by number of extent size classes. */ +# define LG_BITMAP_MAXBITS LG_CEIL(SC_NSIZES) +#endif +#define BITMAP_MAXBITS (ZU(1) << LG_BITMAP_MAXBITS) + +/* Number of bits per group. */ +#define LG_BITMAP_GROUP_NBITS (LG_SIZEOF_BITMAP + 3) +#define BITMAP_GROUP_NBITS (1U << LG_BITMAP_GROUP_NBITS) +#define BITMAP_GROUP_NBITS_MASK (BITMAP_GROUP_NBITS-1) + +/* + * Do some analysis on how big the bitmap is before we use a tree. For a brute + * force linear search, if we would have to call ffs_lu() more than 2^3 times, + * use a tree instead. + */ +#if LG_BITMAP_MAXBITS - LG_BITMAP_GROUP_NBITS > 3 +# define BITMAP_USE_TREE +#endif + +/* Number of groups required to store a given number of bits. */ +#define BITMAP_BITS2GROUPS(nbits) \ + (((nbits) + BITMAP_GROUP_NBITS_MASK) >> LG_BITMAP_GROUP_NBITS) + +/* + * Number of groups required at a particular level for a given number of bits. 
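+ * As an illustrative example (assuming 64-bit groups, i.e.
+ * LG_BITMAP_GROUP_NBITS == 6): for nbits == 512, BITMAP_GROUPS_L0(512) == 8
+ * (one group per 64 bits), and BITMAP_GROUPS_L1(512) == 1, since the eight
+ * level-0 summary bits round up to a single group.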
+ */ +#define BITMAP_GROUPS_L0(nbits) \ + BITMAP_BITS2GROUPS(nbits) +#define BITMAP_GROUPS_L1(nbits) \ + BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(nbits)) +#define BITMAP_GROUPS_L2(nbits) \ + BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS((nbits)))) +#define BITMAP_GROUPS_L3(nbits) \ + BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS( \ + BITMAP_BITS2GROUPS((nbits))))) +#define BITMAP_GROUPS_L4(nbits) \ + BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS( \ + BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS((nbits)))))) + +/* + * Assuming the number of levels, number of groups required for a given number + * of bits. + */ +#define BITMAP_GROUPS_1_LEVEL(nbits) \ + BITMAP_GROUPS_L0(nbits) +#define BITMAP_GROUPS_2_LEVEL(nbits) \ + (BITMAP_GROUPS_1_LEVEL(nbits) + BITMAP_GROUPS_L1(nbits)) +#define BITMAP_GROUPS_3_LEVEL(nbits) \ + (BITMAP_GROUPS_2_LEVEL(nbits) + BITMAP_GROUPS_L2(nbits)) +#define BITMAP_GROUPS_4_LEVEL(nbits) \ + (BITMAP_GROUPS_3_LEVEL(nbits) + BITMAP_GROUPS_L3(nbits)) +#define BITMAP_GROUPS_5_LEVEL(nbits) \ + (BITMAP_GROUPS_4_LEVEL(nbits) + BITMAP_GROUPS_L4(nbits)) + +/* + * Maximum number of groups required to support LG_BITMAP_MAXBITS. + */ +#ifdef BITMAP_USE_TREE + +#if LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS +# define BITMAP_GROUPS(nbits) BITMAP_GROUPS_1_LEVEL(nbits) +# define BITMAP_GROUPS_MAX BITMAP_GROUPS_1_LEVEL(BITMAP_MAXBITS) +#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 2 +# define BITMAP_GROUPS(nbits) BITMAP_GROUPS_2_LEVEL(nbits) +# define BITMAP_GROUPS_MAX BITMAP_GROUPS_2_LEVEL(BITMAP_MAXBITS) +#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 3 +# define BITMAP_GROUPS(nbits) BITMAP_GROUPS_3_LEVEL(nbits) +# define BITMAP_GROUPS_MAX BITMAP_GROUPS_3_LEVEL(BITMAP_MAXBITS) +#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 4 +# define BITMAP_GROUPS(nbits) BITMAP_GROUPS_4_LEVEL(nbits) +# define BITMAP_GROUPS_MAX BITMAP_GROUPS_4_LEVEL(BITMAP_MAXBITS) +#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 5 +# define BITMAP_GROUPS(nbits) BITMAP_GROUPS_5_LEVEL(nbits) +# define BITMAP_GROUPS_MAX BITMAP_GROUPS_5_LEVEL(BITMAP_MAXBITS) +#else +# error "Unsupported bitmap size" +#endif + +/* + * Maximum number of levels possible. This could be statically computed based + * on LG_BITMAP_MAXBITS: + * + * #define BITMAP_MAX_LEVELS \ + * (LG_BITMAP_MAXBITS / LG_SIZEOF_BITMAP) \ + * + !!(LG_BITMAP_MAXBITS % LG_SIZEOF_BITMAP) + * + * However, that would not allow the generic BITMAP_INFO_INITIALIZER() macro, so + * instead hardcode BITMAP_MAX_LEVELS to the largest number supported by the + * various cascading macros. The only additional cost this incurs is some + * unused trailing entries in bitmap_info_t structures; the bitmaps themselves + * are not impacted. + */ +#define BITMAP_MAX_LEVELS 5 + +#define BITMAP_INFO_INITIALIZER(nbits) { \ + /* nbits. */ \ + nbits, \ + /* nlevels. */ \ + (BITMAP_GROUPS_L0(nbits) > BITMAP_GROUPS_L1(nbits)) + \ + (BITMAP_GROUPS_L1(nbits) > BITMAP_GROUPS_L2(nbits)) + \ + (BITMAP_GROUPS_L2(nbits) > BITMAP_GROUPS_L3(nbits)) + \ + (BITMAP_GROUPS_L3(nbits) > BITMAP_GROUPS_L4(nbits)) + 1, \ + /* levels. 
*/ \ + { \ + {0}, \ + {BITMAP_GROUPS_L0(nbits)}, \ + {BITMAP_GROUPS_L1(nbits) + BITMAP_GROUPS_L0(nbits)}, \ + {BITMAP_GROUPS_L2(nbits) + BITMAP_GROUPS_L1(nbits) + \ + BITMAP_GROUPS_L0(nbits)}, \ + {BITMAP_GROUPS_L3(nbits) + BITMAP_GROUPS_L2(nbits) + \ + BITMAP_GROUPS_L1(nbits) + BITMAP_GROUPS_L0(nbits)}, \ + {BITMAP_GROUPS_L4(nbits) + BITMAP_GROUPS_L3(nbits) + \ + BITMAP_GROUPS_L2(nbits) + BITMAP_GROUPS_L1(nbits) \ + + BITMAP_GROUPS_L0(nbits)} \ + } \ +} + +#else /* BITMAP_USE_TREE */ + +#define BITMAP_GROUPS(nbits) BITMAP_BITS2GROUPS(nbits) +#define BITMAP_GROUPS_MAX BITMAP_BITS2GROUPS(BITMAP_MAXBITS) + +#define BITMAP_INFO_INITIALIZER(nbits) { \ + /* nbits. */ \ + nbits, \ + /* ngroups. */ \ + BITMAP_BITS2GROUPS(nbits) \ +} + +#endif /* BITMAP_USE_TREE */ + +typedef struct bitmap_level_s { + /* Offset of this level's groups within the array of groups. */ + size_t group_offset; +} bitmap_level_t; + +typedef struct bitmap_info_s { + /* Logical number of bits in bitmap (stored at bottom level). */ + size_t nbits; + +#ifdef BITMAP_USE_TREE + /* Number of levels necessary for nbits. */ + unsigned nlevels; + + /* + * Only the first (nlevels+1) elements are used, and levels are ordered + * bottom to top (e.g. the bottom level is stored in levels[0]). + */ + bitmap_level_t levels[BITMAP_MAX_LEVELS+1]; +#else /* BITMAP_USE_TREE */ + /* Number of groups necessary for nbits. */ + size_t ngroups; +#endif /* BITMAP_USE_TREE */ +} bitmap_info_t; + +void bitmap_info_init(bitmap_info_t *binfo, size_t nbits); +void bitmap_init(bitmap_t *bitmap, const bitmap_info_t *binfo, bool fill); +size_t bitmap_size(const bitmap_info_t *binfo); + +static inline bool +bitmap_full(bitmap_t *bitmap, const bitmap_info_t *binfo) { +#ifdef BITMAP_USE_TREE + size_t rgoff = binfo->levels[binfo->nlevels].group_offset - 1; + bitmap_t rg = bitmap[rgoff]; + /* The bitmap is full iff the root group is 0. */ + return (rg == 0); +#else + size_t i; + + for (i = 0; i < binfo->ngroups; i++) { + if (bitmap[i] != 0) { + return false; + } + } + return true; +#endif +} + +static inline bool +bitmap_get(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit) { + size_t goff; + bitmap_t g; + + assert(bit < binfo->nbits); + goff = bit >> LG_BITMAP_GROUP_NBITS; + g = bitmap[goff]; + return !(g & (ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK))); +} + +static inline void +bitmap_set(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit) { + size_t goff; + bitmap_t *gp; + bitmap_t g; + + assert(bit < binfo->nbits); + assert(!bitmap_get(bitmap, binfo, bit)); + goff = bit >> LG_BITMAP_GROUP_NBITS; + gp = &bitmap[goff]; + g = *gp; + assert(g & (ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK))); + g ^= ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK); + *gp = g; + assert(bitmap_get(bitmap, binfo, bit)); +#ifdef BITMAP_USE_TREE + /* Propagate group state transitions up the tree. */ + if (g == 0) { + unsigned i; + for (i = 1; i < binfo->nlevels; i++) { + bit = goff; + goff = bit >> LG_BITMAP_GROUP_NBITS; + gp = &bitmap[binfo->levels[i].group_offset + goff]; + g = *gp; + assert(g & (ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK))); + g ^= ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK); + *gp = g; + if (g != 0) { + break; + } + } + } +#endif +} + +/* ffu: find first unset >= bit. 
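+ *
+ * A hedged usage sketch (hypothetical caller; assumes bitmap and binfo were
+ * set up via bitmap_info_init() and bitmap_init()):
+ *
+ *   size_t bit = bitmap_ffu(bitmap, binfo, 0);
+ *   if (bit < binfo->nbits) {
+ *           ... bit is the lowest unset bit ...
+ *   } else {
+ *           ... no unset bit exists ...
+ *   }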
*/ +static inline size_t +bitmap_ffu(const bitmap_t *bitmap, const bitmap_info_t *binfo, size_t min_bit) { + assert(min_bit < binfo->nbits); + +#ifdef BITMAP_USE_TREE + size_t bit = 0; + for (unsigned level = binfo->nlevels; level--;) { + size_t lg_bits_per_group = (LG_BITMAP_GROUP_NBITS * (level + + 1)); + bitmap_t group = bitmap[binfo->levels[level].group_offset + (bit + >> lg_bits_per_group)]; + unsigned group_nmask = (unsigned)(((min_bit > bit) ? (min_bit - + bit) : 0) >> (lg_bits_per_group - LG_BITMAP_GROUP_NBITS)); + assert(group_nmask <= BITMAP_GROUP_NBITS); + bitmap_t group_mask = ~((1LU << group_nmask) - 1); + bitmap_t group_masked = group & group_mask; + if (group_masked == 0LU) { + if (group == 0LU) { + return binfo->nbits; + } + /* + * min_bit was preceded by one or more unset bits in + * this group, but there are no other unset bits in this + * group. Try again starting at the first bit of the + * next sibling. This will recurse at most once per + * non-root level. + */ + size_t sib_base = bit + (ZU(1) << lg_bits_per_group); + assert(sib_base > min_bit); + assert(sib_base > bit); + if (sib_base >= binfo->nbits) { + return binfo->nbits; + } + return bitmap_ffu(bitmap, binfo, sib_base); + } + bit += ((size_t)ffs_lu(group_masked)) << + (lg_bits_per_group - LG_BITMAP_GROUP_NBITS); + } + assert(bit >= min_bit); + assert(bit < binfo->nbits); + return bit; +#else + size_t i = min_bit >> LG_BITMAP_GROUP_NBITS; + bitmap_t g = bitmap[i] & ~((1LU << (min_bit & BITMAP_GROUP_NBITS_MASK)) + - 1); + size_t bit; + do { + if (g != 0) { + bit = ffs_lu(g); + return (i << LG_BITMAP_GROUP_NBITS) + bit; + } + i++; + g = bitmap[i]; + } while (i < binfo->ngroups); + return binfo->nbits; +#endif +} + +/* sfu: set first unset. */ +static inline size_t +bitmap_sfu(bitmap_t *bitmap, const bitmap_info_t *binfo) { + size_t bit; + bitmap_t g; + unsigned i; + + assert(!bitmap_full(bitmap, binfo)); + +#ifdef BITMAP_USE_TREE + i = binfo->nlevels - 1; + g = bitmap[binfo->levels[i].group_offset]; + bit = ffs_lu(g); + while (i > 0) { + i--; + g = bitmap[binfo->levels[i].group_offset + bit]; + bit = (bit << LG_BITMAP_GROUP_NBITS) + ffs_lu(g); + } +#else + i = 0; + g = bitmap[0]; + while (g == 0) { + i++; + g = bitmap[i]; + } + bit = (i << LG_BITMAP_GROUP_NBITS) + ffs_lu(g); +#endif + bitmap_set(bitmap, binfo, bit); + return bit; +} + +static inline void +bitmap_unset(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit) { + size_t goff; + bitmap_t *gp; + bitmap_t g; + UNUSED bool propagate; + + assert(bit < binfo->nbits); + assert(bitmap_get(bitmap, binfo, bit)); + goff = bit >> LG_BITMAP_GROUP_NBITS; + gp = &bitmap[goff]; + g = *gp; + propagate = (g == 0); + assert((g & (ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK))) == 0); + g ^= ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK); + *gp = g; + assert(!bitmap_get(bitmap, binfo, bit)); +#ifdef BITMAP_USE_TREE + /* Propagate group state transitions up the tree. 
*/ + if (propagate) { + unsigned i; + for (i = 1; i < binfo->nlevels; i++) { + bit = goff; + goff = bit >> LG_BITMAP_GROUP_NBITS; + gp = &bitmap[binfo->levels[i].group_offset + goff]; + g = *gp; + propagate = (g == 0); + assert((g & (ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK))) + == 0); + g ^= ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK); + *gp = g; + if (!propagate) { + break; + } + } + } +#endif /* BITMAP_USE_TREE */ +} + +#endif /* JEMALLOC_INTERNAL_BITMAP_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/buf_writer.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/buf_writer.h new file mode 100644 index 000000000..fa0ac99cf --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/buf_writer.h @@ -0,0 +1,36 @@ +#ifndef JEMALLOC_INTERNAL_BUF_WRITER_H +#define JEMALLOC_INTERNAL_BUF_WRITER_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_types.h" +#include "jemalloc/internal/tsd_types.h" + +/* + * Note: when using the buffered writer, cbopaque is passed to write_cb only + * when the buffer is flushed. It would make a difference if cbopaque points + * to something that's changing for each write_cb call, or something that + * affects write_cb in a way dependent on the content of the output string. + * However, the most typical usage case in practice is that cbopaque points to + * some "option like" content for the write_cb, so it doesn't matter. + */ + +typedef struct { + write_cb_t *write_cb; + void *cbopaque; + char *buf; + size_t buf_size; + size_t buf_end; + bool internal_buf; +} buf_writer_t; + +bool buf_writer_init(tsdn_t *tsdn, buf_writer_t *buf_writer, + write_cb_t *write_cb, void *cbopaque, char *buf, size_t buf_len); +void buf_writer_flush(buf_writer_t *buf_writer); +write_cb_t buf_writer_cb; +void buf_writer_terminate(tsdn_t *tsdn, buf_writer_t *buf_writer); + +typedef ssize_t (read_cb_t)(void *read_cbopaque, void *buf, size_t limit); +void buf_writer_pipe(buf_writer_t *buf_writer, read_cb_t *read_cb, + void *read_cbopaque); + +#endif /* JEMALLOC_INTERNAL_BUF_WRITER_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/cache_bin.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/cache_bin.h new file mode 100644 index 000000000..a7a5e40ee --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/cache_bin.h @@ -0,0 +1,746 @@ +#ifndef JEMALLOC_INTERNAL_CACHE_BIN_H +#define JEMALLOC_INTERNAL_CACHE_BIN_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_externs.h" +#include "jemalloc/internal/ql.h" +#include "jemalloc/internal/safety_check.h" +#include "jemalloc/internal/sz.h" + +/* + * The cache_bins are the mechanism that the tcache and the arena use to + * communicate. The tcache fills from and flushes to the arena by passing a + * cache_bin_t to fill/flush. When the arena needs to pull stats from the + * tcaches associated with it, it does so by iterating over its + * cache_bin_array_descriptor_t objects and reading out per-bin stats it + * contains. This makes it so that the arena need not know about the existence + * of the tcache at all. + */ + +/* + * The size in bytes of each cache bin stack. We also use this to indicate + * *counts* of individual objects. 
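+ * As an illustrative calculation (assuming 8-byte pointers): a 16-bit count
+ * can describe 1 << 16 bytes of stack, i.e. 8192 pointer slots, which is
+ * where the CACHE_BIN_NCACHED_MAX bound of 8191 items below comes from.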
 + */ +typedef uint16_t cache_bin_sz_t; + +#define JUNK_ADDR ((uintptr_t)0x7a7a7a7a7a7a7a7aULL) +/* + * Leave a noticeable mark pattern on the cache bin stack boundaries, in case a + * bug starts leaking those. Make it look like the junk pattern but be distinct + * from it. + */ +static const uintptr_t cache_bin_preceding_junk = JUNK_ADDR; +/* Note: JUNK_ADDR vs. JUNK_ADDR + 1 -- this tells you which pointer leaked. */ +static const uintptr_t cache_bin_trailing_junk = JUNK_ADDR + 1; +/* + * A pointer used to initialize a fake stack_head for disabled small bins + * so that the enabled/disabled assessment does not rely on ncached_max. + */ +extern const uintptr_t disabled_bin; + +/* + * That implies the following value, for the maximum number of items in any + * individual bin. The cache bins track their bounds looking just at the low + * bits of a pointer, compared against a cache_bin_sz_t. So that's + * 1 << (sizeof(cache_bin_sz_t) * 8) + * bytes spread across pointer sized objects to get the maximum. + */ +#define CACHE_BIN_NCACHED_MAX (((size_t)1 << sizeof(cache_bin_sz_t) * 8) \ + / sizeof(void *) - 1) + +/* + * This lives inside the cache_bin (for locality reasons), and is initialized + * alongside it, but is otherwise not modified by any cache bin operations. + * It's logically public and maintained by its callers. + */ +typedef struct cache_bin_stats_s cache_bin_stats_t; +struct cache_bin_stats_s { + /* + * Number of allocation requests that corresponded to the size of this + * bin. + */ + uint64_t nrequests; +}; + +/* + * Read-only information associated with each element of tcache_t's tbins array + * is stored separately, mainly to reduce memory usage. + */ +typedef struct cache_bin_info_s cache_bin_info_t; +struct cache_bin_info_s { + cache_bin_sz_t ncached_max; +}; + +/* + * Responsible for caching allocations associated with a single size. + * + * Several pointers are used to track the stack. To save on metadata bytes, + * only the stack_head is a full sized pointer (which is dereferenced on the + * fastpath), while the others store only the low 16 bits -- this is correct + * because a single stack never takes more space than 2^16 bytes, and at the + * same time only equality checks are performed on the low bits. + * + * (low addr) (high addr) + * |------stashed------|------available------|------cached-----| + * ^ ^ ^ ^ + * low_bound(derived) low_bits_full stack_head low_bits_empty + */ +typedef struct cache_bin_s cache_bin_t; +struct cache_bin_s { + /* + * The stack grows down. Whenever the bin is nonempty, the head points + * to an array entry containing a valid allocation. When it is empty, + * the head points to one element past the owned array. + */ + void **stack_head; + /* + * stack_head and tstats are both modified frequently. Let's keep them + * close so that they have a higher chance of being on the same + * cacheline, thus less write-backs. + */ + cache_bin_stats_t tstats; + + /* + * The low bits of the address of the first item in the stack that + * hasn't been used since the last GC, to track the low water mark (min + * # of cached items). + * + * Since the stack grows down, this is a higher address than + * low_bits_full. + */ + uint16_t low_bits_low_water; + + /* + * The low bits of the value that stack_head will take on when the array + * is full (of cached & stashed items). But remember that stack_head + * always points to a valid item when the array is nonempty -- this is + * in the array. 
+ * + * Recall that since the stack grows down, this is the lowest available + * address in the array for caching. Only adjusted when stashing items. + */ + uint16_t low_bits_full; + + /* + * The low bits of the value that stack_head will take on when the array + * is empty. + * + * The stack grows down -- this is one past the highest address in the + * array. Immutable after initialization. + */ + uint16_t low_bits_empty; + + /* The maximum number of cached items in the bin. */ + cache_bin_info_t bin_info; +}; + +/* + * The cache_bins live inside the tcache, but the arena (by design) isn't + * supposed to know much about tcache internals. To let the arena iterate over + * associated bins, we keep (with the tcache) a linked list of + * cache_bin_array_descriptor_ts that tell the arena how to find the bins. + */ +typedef struct cache_bin_array_descriptor_s cache_bin_array_descriptor_t; +struct cache_bin_array_descriptor_s { + /* + * The arena keeps a list of the cache bins associated with it, for + * stats collection. + */ + ql_elm(cache_bin_array_descriptor_t) link; + /* Pointers to the tcache bins. */ + cache_bin_t *bins; +}; + +static inline void +cache_bin_array_descriptor_init(cache_bin_array_descriptor_t *descriptor, + cache_bin_t *bins) { + ql_elm_new(descriptor, link); + descriptor->bins = bins; +} + +JEMALLOC_ALWAYS_INLINE bool +cache_bin_nonfast_aligned(const void *ptr) { + if (!config_uaf_detection) { + return false; + } + /* + * Currently we use alignment to decide which pointer to junk & stash on + * dealloc (for catching use-after-free). In some common cases a + * page-aligned check is needed already (sdalloc w/ config_prof), so we + * are getting it more or less for free -- no added instructions on + * free_fastpath. + * + * Another way of deciding which pointer to sample, is adding another + * thread_event to pick one every N bytes. That also adds no cost on + * the fastpath, however it will tend to pick large allocations which is + * not the desired behavior. + */ + return ((uintptr_t)ptr & san_cache_bin_nonfast_mask) == 0; +} + +static inline const void * +cache_bin_disabled_bin_stack(void) { + return &disabled_bin; +} + +/* + * If a cache bin was zero initialized (either because it lives in static or + * thread-local storage, or was memset to 0), this function indicates whether or + * not cache_bin_init was called on it. + */ +static inline bool +cache_bin_still_zero_initialized(cache_bin_t *bin) { + return bin->stack_head == NULL; +} + +static inline bool +cache_bin_disabled(cache_bin_t *bin) { + bool disabled = (bin->stack_head == cache_bin_disabled_bin_stack()); + if (disabled) { + assert((uintptr_t)(*bin->stack_head) == JUNK_ADDR); + } + return disabled; +} + +/* Gets ncached_max without asserting that the bin is enabled. */ +static inline cache_bin_sz_t +cache_bin_ncached_max_get_unsafe(cache_bin_t *bin) { + return bin->bin_info.ncached_max; +} + +/* Returns ncached_max: Upper limit on ncached. */ +static inline cache_bin_sz_t +cache_bin_ncached_max_get(cache_bin_t *bin) { + assert(!cache_bin_disabled(bin)); + return cache_bin_ncached_max_get_unsafe(bin); +} + +/* + * Internal. + * + * Asserts that the pointer associated with earlier is <= the one associated + * with later. + */ +static inline void +cache_bin_assert_earlier(cache_bin_t *bin, uint16_t earlier, uint16_t later) { + if (earlier > later) { + assert(bin->low_bits_full > bin->low_bits_empty); + } +} + +/* + * Internal. + * + * Does difference calculations that handle wraparound correctly. 
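+ * (Illustrative calculation, not from the original text: with uint16_t
+ * offsets, earlier == 0xfff0 and later == 0x0010 yield
+ * (uint16_t)(later - earlier) == 0x0020, i.e. 32 bytes, despite the
+ * wraparound.)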
Earlier must + * be associated with the position earlier in memory. + */ +static inline cache_bin_sz_t +cache_bin_diff(cache_bin_t *bin, uint16_t earlier, uint16_t later) { + cache_bin_assert_earlier(bin, earlier, later); + return later - earlier; +} + +/* + * Number of items currently cached in the bin, without checking ncached_max. + */ +static inline cache_bin_sz_t +cache_bin_ncached_get_internal(cache_bin_t *bin) { + cache_bin_sz_t diff = cache_bin_diff(bin, + (uint16_t)(uintptr_t)bin->stack_head, bin->low_bits_empty); + cache_bin_sz_t n = diff / sizeof(void *); + /* + * We have undefined behavior here; if this function is called from the + * arena stats updating code, then stack_head could change from the + * first line to the next one. Morally, these loads should be atomic, + * but compilers won't currently generate comparisons with in-memory + * operands against atomics, and these variables get accessed on the + * fast paths. This should still be "safe" in the sense of generating + * the correct assembly for the foreseeable future, though. + */ + assert(n == 0 || *(bin->stack_head) != NULL); + return n; +} + +/* + * Number of items currently cached in the bin, with checking ncached_max. The + * caller must know that no concurrent modification of the cache_bin is + * possible. + */ +static inline cache_bin_sz_t +cache_bin_ncached_get_local(cache_bin_t *bin) { + cache_bin_sz_t n = cache_bin_ncached_get_internal(bin); + assert(n <= cache_bin_ncached_max_get(bin)); + return n; +} + +/* + * Internal. + * + * A pointer to the position one past the end of the backing array. + * + * Do not call if racy, because both 'bin->stack_head' and 'bin->low_bits_full' + * are subject to concurrent modifications. + */ +static inline void ** +cache_bin_empty_position_get(cache_bin_t *bin) { + cache_bin_sz_t diff = cache_bin_diff(bin, + (uint16_t)(uintptr_t)bin->stack_head, bin->low_bits_empty); + byte_t *empty_bits = (byte_t *)bin->stack_head + diff; + void **ret = (void **)empty_bits; + + assert(ret >= bin->stack_head); + + return ret; +} + +/* + * Internal. + * + * Calculates low bits of the lower bound of the usable cache bin's range (see + * cache_bin_t visual representation above). + * + * No values are concurrently modified, so should be safe to read in a + * multithreaded environment. Currently concurrent access happens only during + * arena statistics collection. + */ +static inline uint16_t +cache_bin_low_bits_low_bound_get(cache_bin_t *bin) { + return (uint16_t)bin->low_bits_empty - + cache_bin_ncached_max_get(bin) * sizeof(void *); +} + +/* + * Internal. + * + * A pointer to the position with the lowest address of the backing array. + */ +static inline void ** +cache_bin_low_bound_get(cache_bin_t *bin) { + cache_bin_sz_t ncached_max = cache_bin_ncached_max_get(bin); + void **ret = cache_bin_empty_position_get(bin) - ncached_max; + assert(ret <= bin->stack_head); + + return ret; +} + +/* + * As the name implies. This is important since it's not correct to try to + * batch fill a nonempty cache bin. + */ +static inline void +cache_bin_assert_empty(cache_bin_t *bin) { + assert(cache_bin_ncached_get_local(bin) == 0); + assert(cache_bin_empty_position_get(bin) == bin->stack_head); +} + +/* + * Get low water, but without any of the correctness checking we do for the + * caller-usable version, if we are temporarily breaking invariants (like + * ncached >= low_water during flush). 
+ */ +static inline cache_bin_sz_t +cache_bin_low_water_get_internal(cache_bin_t *bin) { + return cache_bin_diff(bin, bin->low_bits_low_water, + bin->low_bits_empty) / sizeof(void *); +} + +/* Returns the numeric value of low water in [0, ncached]. */ +static inline cache_bin_sz_t +cache_bin_low_water_get(cache_bin_t *bin) { + cache_bin_sz_t low_water = cache_bin_low_water_get_internal(bin); + assert(low_water <= cache_bin_ncached_max_get(bin)); + assert(low_water <= cache_bin_ncached_get_local(bin)); + + cache_bin_assert_earlier(bin, (uint16_t)(uintptr_t)bin->stack_head, + bin->low_bits_low_water); + + return low_water; +} + +/* + * Indicates that the current cache bin position should be the low water mark + * going forward. + */ +static inline void +cache_bin_low_water_set(cache_bin_t *bin) { + assert(!cache_bin_disabled(bin)); + bin->low_bits_low_water = (uint16_t)(uintptr_t)bin->stack_head; +} + +static inline void +cache_bin_low_water_adjust(cache_bin_t *bin) { + assert(!cache_bin_disabled(bin)); + if (cache_bin_ncached_get_internal(bin) + < cache_bin_low_water_get_internal(bin)) { + cache_bin_low_water_set(bin); + } +} + +JEMALLOC_ALWAYS_INLINE void * +cache_bin_alloc_impl(cache_bin_t *bin, bool *success, bool adjust_low_water) { + /* + * success (instead of ret) should be checked upon the return of this + * function. We avoid checking (ret == NULL) because there is never a + * null stored on the avail stack (which is unknown to the compiler), + * and eagerly checking ret would cause pipeline stall (waiting for the + * cacheline). + */ + + /* + * This may read from the empty position; however the loaded value won't + * be used. It's safe because the stack has one more slot reserved. + */ + void *ret = *bin->stack_head; + uint16_t low_bits = (uint16_t)(uintptr_t)bin->stack_head; + void **new_head = bin->stack_head + 1; + + /* + * Note that the low water mark is at most empty; if we pass this check, + * we know we're non-empty. + */ + if (likely(low_bits != bin->low_bits_low_water)) { + bin->stack_head = new_head; + *success = true; + return ret; + } + if (!adjust_low_water) { + *success = false; + return NULL; + } + /* + * In the fast-path case where we call alloc_easy and then alloc, the + * previous checking and computation is optimized away -- we didn't + * actually commit any of our operations. + */ + if (likely(low_bits != bin->low_bits_empty)) { + bin->stack_head = new_head; + bin->low_bits_low_water = (uint16_t)(uintptr_t)new_head; + *success = true; + return ret; + } + *success = false; + return NULL; +} + +/* + * Allocate an item out of the bin, failing if we're at the low-water mark. + */ +JEMALLOC_ALWAYS_INLINE void * +cache_bin_alloc_easy(cache_bin_t *bin, bool *success) { + /* We don't look at info if we're not adjusting low-water. */ + return cache_bin_alloc_impl(bin, success, false); +} + +/* + * Allocate an item out of the bin, even if we're currently at the low-water + * mark (and failing only if the bin is empty). 
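+ *
+ * A minimal usage sketch (hypothetical caller code, not part of this
+ * header), following the alloc_easy-then-alloc pattern mentioned above:
+ *
+ *   bool success;
+ *   void *ret = cache_bin_alloc_easy(bin, &success);
+ *   if (!success) {
+ *           ret = cache_bin_alloc(bin, &success);
+ *   }
+ *   if (!success) {
+ *           ... bin is empty; refill from the arena ...
+ *   }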
+ */ +JEMALLOC_ALWAYS_INLINE void * +cache_bin_alloc(cache_bin_t *bin, bool *success) { + return cache_bin_alloc_impl(bin, success, true); +} + +JEMALLOC_ALWAYS_INLINE cache_bin_sz_t +cache_bin_alloc_batch(cache_bin_t *bin, size_t num, void **out) { + cache_bin_sz_t n = cache_bin_ncached_get_internal(bin); + if (n > num) { + n = (cache_bin_sz_t)num; + } + memcpy(out, bin->stack_head, n * sizeof(void *)); + bin->stack_head += n; + cache_bin_low_water_adjust(bin); + + return n; +} + +JEMALLOC_ALWAYS_INLINE bool +cache_bin_full(cache_bin_t *bin) { + return ((uint16_t)(uintptr_t)bin->stack_head == bin->low_bits_full); +} + +/* + * Scans the allocated area of the cache_bin for the given pointer up to limit. + * Fires safety_check_fail if the ptr is found and returns true. + */ +JEMALLOC_ALWAYS_INLINE bool +cache_bin_dalloc_safety_checks(cache_bin_t *bin, void *ptr) { + if (!config_debug || opt_debug_double_free_max_scan == 0) { + return false; + } + + cache_bin_sz_t ncached = cache_bin_ncached_get_internal(bin); + unsigned max_scan = opt_debug_double_free_max_scan < ncached + ? opt_debug_double_free_max_scan + : ncached; + + void **cur = bin->stack_head; + void **limit = cur + max_scan; + for (; cur < limit; cur++) { + if (*cur == ptr) { + safety_check_fail( + "Invalid deallocation detected: double free of " + "pointer %p\n", + ptr); + return true; + } + } + return false; +} + +/* + * Free an object into the given bin. Fails only if the bin is full. + */ +JEMALLOC_ALWAYS_INLINE bool +cache_bin_dalloc_easy(cache_bin_t *bin, void *ptr) { + if (unlikely(cache_bin_full(bin))) { + return false; + } + + if (unlikely(cache_bin_dalloc_safety_checks(bin, ptr))) { + return true; + } + + bin->stack_head--; + *bin->stack_head = ptr; + cache_bin_assert_earlier(bin, bin->low_bits_full, + (uint16_t)(uintptr_t)bin->stack_head); + + return true; +} + +/* Returns false if failed to stash (i.e. bin is full). */ +JEMALLOC_ALWAYS_INLINE bool +cache_bin_stash(cache_bin_t *bin, void *ptr) { + if (cache_bin_full(bin)) { + return false; + } + + /* Stash at the full position, in the [full, head) range. */ + uint16_t low_bits_head = (uint16_t)(uintptr_t)bin->stack_head; + /* Wraparound handled as well. */ + uint16_t diff = cache_bin_diff(bin, bin->low_bits_full, low_bits_head); + *(void **)((byte_t *)bin->stack_head - diff) = ptr; + + assert(!cache_bin_full(bin)); + bin->low_bits_full += sizeof(void *); + cache_bin_assert_earlier(bin, bin->low_bits_full, low_bits_head); + + return true; +} + +/* Get the number of stashed pointers. */ +JEMALLOC_ALWAYS_INLINE cache_bin_sz_t +cache_bin_nstashed_get_internal(cache_bin_t *bin) { + cache_bin_sz_t ncached_max = cache_bin_ncached_max_get(bin); + uint16_t low_bits_low_bound = cache_bin_low_bits_low_bound_get(bin); + + cache_bin_sz_t n = cache_bin_diff(bin, low_bits_low_bound, + bin->low_bits_full) / sizeof(void *); + assert(n <= ncached_max); + if (config_debug && n != 0) { + /* Below are for assertions only. */ + void **low_bound = cache_bin_low_bound_get(bin); + + assert((uint16_t)(uintptr_t)low_bound == low_bits_low_bound); + void *stashed = *(low_bound + n - 1); + bool aligned = cache_bin_nonfast_aligned(stashed); +#ifdef JEMALLOC_JET + /* Allow arbitrary pointers to be stashed in tests. 
 */ + aligned = true; +#endif + assert(stashed != NULL && aligned); + } + + return n; +} + +JEMALLOC_ALWAYS_INLINE cache_bin_sz_t +cache_bin_nstashed_get_local(cache_bin_t *bin) { + cache_bin_sz_t n = cache_bin_nstashed_get_internal(bin); + assert(n <= cache_bin_ncached_max_get(bin)); + return n; +} + +/* + * Obtain a racy view of the number of items currently in the cache bin, in the + * presence of possible concurrent modifications. + * + * Note that this is the only racy function in this header. Any other functions + * are assumed to be non-racy. The "racy" term here means accessed from another + * thread (that is not the owner of the specific cache bin). This only happens + * when gathering stats (read-only). The only change because of the racy + * condition is that assertions based on mutable fields are omitted. + * + * It's important to keep in mind that 'bin->stack_head' and + * 'bin->low_bits_full' can be modified concurrently and almost no assertions + * about their values can be made. + * + * This function should not call other utility functions because the racy + * condition may cause unexpected / undefined behaviors in unverified utility + * functions. Currently, this function calls two utility functions + * cache_bin_ncached_max_get and cache_bin_low_bits_low_bound_get because + * they help access values that will not be concurrently modified. + */ +static inline void +cache_bin_nitems_get_remote(cache_bin_t *bin, cache_bin_sz_t *ncached, + cache_bin_sz_t *nstashed) { + /* Racy version of cache_bin_ncached_get_internal. */ + cache_bin_sz_t diff = bin->low_bits_empty - + (uint16_t)(uintptr_t)bin->stack_head; + cache_bin_sz_t n = diff / sizeof(void *); + *ncached = n; + + /* Racy version of cache_bin_nstashed_get_internal. */ + uint16_t low_bits_low_bound = cache_bin_low_bits_low_bound_get(bin); + n = (bin->low_bits_full - low_bits_low_bound) / sizeof(void *); + *nstashed = n; + /* + * Note that we cannot assert anything regarding ncached_max because + * it can be configured on the fly and is thus racy. + */ +} + +/* + * Limit how many items can be flushed in a batch (which is the upper bound + * for the nflush parameter in tcache_bin_flush_impl()). + * This is to avoid stack overflow when we do batch edata lookup, which + * reserves an nflush * sizeof(emap_batch_lookup_result_t) stack variable. + */ +#define CACHE_BIN_NFLUSH_BATCH_MAX (VARIABLE_ARRAY_SIZE_MAX >> LG_SIZEOF_PTR) + +/* + * Filling and flushing are done in batch, on arrays of void *s. For filling, + * the arrays go forward, and can be accessed with ordinary array arithmetic. + * For flushing, we work from the end backwards, and so need to use special + * accessors that invert the usual ordering. + * + * This is important for maintaining first-fit; the arena code fills with + * earliest objects first, and so those are the ones we should return first for + * cache_bin_alloc calls. When flushing, we should flush the objects that we + * wish to return later; those at the end of the array. This is better for the + * first-fit heuristic as well as for cache locality; the most recently freed + * objects are the ones most likely to still be in cache. + * + * This all sounds very hand-wavey and theoretical, but reverting the ordering + * on one or the other pathway leads to measurable slowdowns. + */ + +typedef struct cache_bin_ptr_array_s cache_bin_ptr_array_t; +struct cache_bin_ptr_array_s { + cache_bin_sz_t n; + void **ptr; +}; + +/* + * Declare a cache_bin_ptr_array_t sufficient for nval items. 
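+ *
+ * A hedged usage sketch of the fill protocol (hypothetical caller; nfill
+ * and nfilled are assumed values):
+ *
+ *   CACHE_BIN_PTR_ARRAY_DECLARE(arr, nfill);
+ *   cache_bin_init_ptr_array_for_fill(bin, &arr, nfill);
+ *   ... the arena writes up to nfill pointers into arr.ptr ...
+ *   cache_bin_finish_fill(bin, &arr, nfilled);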
 + * + * In the current implementation, this could be just part of a + * cache_bin_ptr_array_init_... call, since we reuse the cache bin stack memory. + * Indirecting behind a macro, though, means experimenting with linked-list + * representations is easy (since they'll require an alloca in the calling + * frame). + */ +#define CACHE_BIN_PTR_ARRAY_DECLARE(name, nval) \ + cache_bin_ptr_array_t name; \ + name.n = (nval) + +/* + * Start a fill. The bin must be empty, and this must be followed by a + * finish_fill call before doing any alloc/dalloc operations on the bin. + */ +static inline void +cache_bin_init_ptr_array_for_fill(cache_bin_t *bin, cache_bin_ptr_array_t *arr, + cache_bin_sz_t nfill) { + cache_bin_assert_empty(bin); + arr->ptr = cache_bin_empty_position_get(bin) - nfill; +} + +/* + * While nfill in cache_bin_init_ptr_array_for_fill is the number we *intend* to + * fill, nfilled here is the number we actually filled (which may be less, in + * case of OOM). + */ +static inline void +cache_bin_finish_fill(cache_bin_t *bin, cache_bin_ptr_array_t *arr, + cache_bin_sz_t nfilled) { + cache_bin_assert_empty(bin); + void **empty_position = cache_bin_empty_position_get(bin); + if (nfilled < arr->n) { + memmove(empty_position - nfilled, empty_position - arr->n, + nfilled * sizeof(void *)); + } + bin->stack_head = empty_position - nfilled; +} + +/* + * Same deal, but with flush. Unlike fill (which can fail), the user must flush + * everything we give them. + */ +static inline void +cache_bin_init_ptr_array_for_flush(cache_bin_t *bin, + cache_bin_ptr_array_t *arr, cache_bin_sz_t nflush) { + arr->ptr = cache_bin_empty_position_get(bin) - nflush; + assert(cache_bin_ncached_get_local(bin) == 0 + || *arr->ptr != NULL); +} + +static inline void +cache_bin_finish_flush(cache_bin_t *bin, cache_bin_ptr_array_t *arr, + cache_bin_sz_t nflushed) { + unsigned rem = cache_bin_ncached_get_local(bin) - nflushed; + memmove(bin->stack_head + nflushed, bin->stack_head, + rem * sizeof(void *)); + bin->stack_head += nflushed; + cache_bin_low_water_adjust(bin); +} + +static inline void +cache_bin_init_ptr_array_for_stashed(cache_bin_t *bin, szind_t binind, + cache_bin_ptr_array_t *arr, cache_bin_sz_t nstashed) { + assert(nstashed > 0); + assert(cache_bin_nstashed_get_local(bin) == nstashed); + + void **low_bound = cache_bin_low_bound_get(bin); + arr->ptr = low_bound; + assert(*arr->ptr != NULL); +} + +static inline void +cache_bin_finish_flush_stashed(cache_bin_t *bin) { + void **low_bound = cache_bin_low_bound_get(bin); + + /* Reset the bin local full position. */ + bin->low_bits_full = (uint16_t)(uintptr_t)low_bound; + assert(cache_bin_nstashed_get_local(bin) == 0); +} + +/* + * Initialize a cache_bin_info to represent up to the given number of items in + * the cache_bins it is associated with. + */ +void cache_bin_info_init(cache_bin_info_t *bin_info, + cache_bin_sz_t ncached_max); +/* + * Given an array of initialized cache_bin_info_ts, determine how big an + * allocation is required to initialize a full set of cache_bin_ts. + */ +void cache_bin_info_compute_alloc(const cache_bin_info_t *infos, + szind_t ninfos, size_t *size, size_t *alignment); + +/* + * Actually initialize some cache bins. Callers should allocate the backing + * memory indicated by a call to cache_bin_info_compute_alloc. They should then + * preincrement, call init once for each bin and info, and then call + * cache_bin_postincrement. *alloc_cur will then point immediately past the end + * of the allocation. 
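+ *
+ * A minimal sketch of that sequence (hypothetical caller code; the backing
+ * allocation is elided and error handling omitted):
+ *
+ *   size_t size, alignment, cur_offset = 0;
+ *   cache_bin_info_compute_alloc(infos, ninfos, &size, &alignment);
+ *   void *alloc = ... allocate size bytes at alignment ...;
+ *   cache_bin_preincrement(infos, ninfos, alloc, &cur_offset);
+ *   for (szind_t i = 0; i < ninfos; i++) {
+ *           cache_bin_init(&bins[i], &infos[i], alloc, &cur_offset);
+ *   }
+ *   cache_bin_postincrement(alloc, &cur_offset);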
+ */ +void cache_bin_preincrement(const cache_bin_info_t *infos, szind_t ninfos, + void *alloc, size_t *cur_offset); +void cache_bin_postincrement(void *alloc, size_t *cur_offset); +void cache_bin_init(cache_bin_t *bin, const cache_bin_info_t *info, + void *alloc, size_t *cur_offset); +void cache_bin_init_disabled(cache_bin_t *bin, cache_bin_sz_t ncached_max); + +bool cache_bin_stack_use_thp(void); + +#endif /* JEMALLOC_INTERNAL_CACHE_BIN_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/ckh.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/ckh.h new file mode 100644 index 000000000..8e9d7fedd --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/ckh.h @@ -0,0 +1,102 @@ +#ifndef JEMALLOC_INTERNAL_CKH_H +#define JEMALLOC_INTERNAL_CKH_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/tsd.h" + +/* Cuckoo hashing implementation. Skip to the end for the interface. */ + +/******************************************************************************/ +/* INTERNAL DEFINITIONS -- IGNORE */ +/******************************************************************************/ + +/* Maintain counters used to get an idea of performance. */ +/* #define CKH_COUNT */ +/* Print counter values in ckh_delete() (requires CKH_COUNT). */ +/* #define CKH_VERBOSE */ + +/* + * There are 2^LG_CKH_BUCKET_CELLS cells in each hash table bucket. Try to fit + * one bucket per L1 cache line. + */ +#define LG_CKH_BUCKET_CELLS (LG_CACHELINE - LG_SIZEOF_PTR - 1) + +/* Typedefs to allow easy function pointer passing. */ +typedef void ckh_hash_t (const void *, size_t[2]); +typedef bool ckh_keycomp_t (const void *, const void *); + +/* Hash table cell. */ +typedef struct { + const void *key; + const void *data; +} ckhc_t; + +/* The hash table itself. */ +typedef struct { +#ifdef CKH_COUNT + /* Counters used to get an idea of performance. */ + uint64_t ngrows; + uint64_t nshrinks; + uint64_t nshrinkfails; + uint64_t ninserts; + uint64_t nrelocs; +#endif + + /* Used for pseudo-random number generation. */ + uint64_t prng_state; + + /* Total number of items. */ + size_t count; + + /* + * Minimum and current number of hash table buckets. There are + * 2^LG_CKH_BUCKET_CELLS cells per bucket. + */ + unsigned lg_minbuckets; + unsigned lg_curbuckets; + + /* Hash and comparison functions. */ + ckh_hash_t *hash; + ckh_keycomp_t *keycomp; + + /* Hash table with 2^lg_curbuckets buckets. */ + ckhc_t *tab; +} ckh_t; + +/******************************************************************************/ +/* BEGIN PUBLIC API */ +/******************************************************************************/ + +/* Lifetime management. Minitems is the initial capacity. */ +bool ckh_new(tsd_t *tsd, ckh_t *ckh, size_t minitems, ckh_hash_t *hash, + ckh_keycomp_t *keycomp); +void ckh_delete(tsd_t *tsd, ckh_t *ckh); + +/* Get the number of elements in the set. */ +size_t ckh_count(ckh_t *ckh); + +/* + * To iterate over the elements in the table, initialize *tabind to 0 and call + * this function until it returns true. Each call that returns false will + * update *key and *data to the next element in the table, assuming the pointers + * are non-NULL. + */ +bool ckh_iter(ckh_t *ckh, size_t *tabind, void **key, void **data); + +/* + * Basic hash table operations -- insert, removal, lookup. For ckh_remove and + * ckh_search, key or data can be NULL. 
The hash-table only stores pointers to + * the key and value, and doesn't do any lifetime management. + */ +bool ckh_insert(tsd_t *tsd, ckh_t *ckh, const void *key, const void *data); +bool ckh_remove(tsd_t *tsd, ckh_t *ckh, const void *searchkey, void **key, + void **data); +bool ckh_search(ckh_t *ckh, const void *searchkey, void **key, void **data); + +/* Some useful hash and comparison functions for strings and pointers. */ +void ckh_string_hash(const void *key, size_t r_hash[2]); +bool ckh_string_keycomp(const void *k1, const void *k2); +void ckh_pointer_hash(const void *key, size_t r_hash[2]); +bool ckh_pointer_keycomp(const void *k1, const void *k2); + +#endif /* JEMALLOC_INTERNAL_CKH_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/counter.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/counter.h new file mode 100644 index 000000000..74e307013 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/counter.h @@ -0,0 +1,36 @@ +#ifndef JEMALLOC_INTERNAL_COUNTER_H +#define JEMALLOC_INTERNAL_COUNTER_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/lockedint.h" +#include "jemalloc/internal/mutex.h" + +typedef struct counter_accum_s { + LOCKEDINT_MTX_DECLARE(mtx) + locked_u64_t accumbytes; + uint64_t interval; +} counter_accum_t; + +JEMALLOC_ALWAYS_INLINE bool +counter_accum(tsdn_t *tsdn, counter_accum_t *counter, uint64_t bytes) { + uint64_t interval = counter->interval; + assert(interval > 0); + LOCKEDINT_MTX_LOCK(tsdn, counter->mtx); + /* + * If the event moves fast enough (and/or if the event handling is slow + * enough), extreme overflow can cause counter trigger coalescing. + * This is an intentional mechanism that avoids rate-limiting + * allocation. + */ + bool overflow = locked_inc_mod_u64(tsdn, LOCKEDINT_MTX(counter->mtx), + &counter->accumbytes, bytes, interval); + LOCKEDINT_MTX_UNLOCK(tsdn, counter->mtx); + return overflow; +} + +bool counter_accum_init(counter_accum_t *counter, uint64_t interval); +void counter_prefork(tsdn_t *tsdn, counter_accum_t *counter); +void counter_postfork_parent(tsdn_t *tsdn, counter_accum_t *counter); +void counter_postfork_child(tsdn_t *tsdn, counter_accum_t *counter); + +#endif /* JEMALLOC_INTERNAL_COUNTER_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/ctl.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/ctl.h new file mode 100644 index 000000000..1f124bfcf --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/ctl.h @@ -0,0 +1,166 @@ +#ifndef JEMALLOC_INTERNAL_CTL_H +#define JEMALLOC_INTERNAL_CTL_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/arena_stats.h" +#include "jemalloc/internal/background_thread_structs.h" +#include "jemalloc/internal/bin_stats.h" +#include "jemalloc/internal/jemalloc_internal_types.h" +#include "jemalloc/internal/malloc_io.h" +#include "jemalloc/internal/mutex_prof.h" +#include "jemalloc/internal/ql.h" +#include "jemalloc/internal/sc.h" +#include "jemalloc/internal/stats.h" + +/* Maximum ctl tree depth. */ +#define CTL_MAX_DEPTH 7 +#define CTL_MULTI_SETTING_MAX_LEN 1000 + +typedef struct ctl_node_s { + bool named; +} ctl_node_t; + +typedef struct ctl_named_node_s { + ctl_node_t node; + const char *name; + /* If (nchildren == 0), this is a terminal node. 
*/ + size_t nchildren; + const ctl_node_t *children; + int (*ctl)(tsd_t *, const size_t *, size_t, void *, size_t *, void *, + size_t); +} ctl_named_node_t; + +typedef struct ctl_indexed_node_s { + struct ctl_node_s node; + const ctl_named_node_t *(*index)(tsdn_t *, const size_t *, size_t, + size_t); +} ctl_indexed_node_t; + +typedef struct ctl_arena_stats_s { + arena_stats_t astats; + + /* Aggregate stats for small size classes, based on bin stats. */ + size_t allocated_small; + uint64_t nmalloc_small; + uint64_t ndalloc_small; + uint64_t nrequests_small; + uint64_t nfills_small; + uint64_t nflushes_small; + + bin_stats_data_t bstats[SC_NBINS]; + arena_stats_large_t lstats[SC_NSIZES - SC_NBINS]; + pac_estats_t estats[SC_NPSIZES]; + hpa_shard_stats_t hpastats; + sec_stats_t secstats; +} ctl_arena_stats_t; + +typedef struct ctl_stats_s { + size_t allocated; + size_t active; + size_t metadata; + size_t metadata_edata; + size_t metadata_rtree; + size_t metadata_thp; + size_t resident; + size_t mapped; + size_t retained; + + background_thread_stats_t background_thread; + mutex_prof_data_t mutex_prof_data[mutex_prof_num_global_mutexes]; +} ctl_stats_t; + +typedef struct ctl_arena_s ctl_arena_t; +struct ctl_arena_s { + unsigned arena_ind; + bool initialized; + ql_elm(ctl_arena_t) destroyed_link; + + /* Basic stats, supported even if !config_stats. */ + unsigned nthreads; + const char *dss; + ssize_t dirty_decay_ms; + ssize_t muzzy_decay_ms; + size_t pactive; + size_t pdirty; + size_t pmuzzy; + + /* NULL if !config_stats. */ + ctl_arena_stats_t *astats; +}; + +typedef struct ctl_arenas_s { + uint64_t epoch; + unsigned narenas; + ql_head(ctl_arena_t) destroyed; + + /* + * Element 0 corresponds to merged stats for extant arenas (accessed via + * MALLCTL_ARENAS_ALL), element 1 corresponds to merged stats for + * destroyed arenas (accessed via MALLCTL_ARENAS_DESTROYED), and the + * remaining MALLOCX_ARENA_LIMIT elements correspond to arenas. 
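+	 *
+	 * For example, arenas[0] aggregates all extant arenas
+	 * (MALLCTL_ARENAS_ALL), arenas[1] aggregates destroyed arenas
+	 * (MALLCTL_ARENAS_DESTROYED), and the stats for arena i live at
+	 * arenas[2 + i].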
+	 */
+	ctl_arena_t *arenas[2 + MALLOCX_ARENA_LIMIT];
+} ctl_arenas_t;
+
+int ctl_byname(tsd_t *tsd, const char *name, void *oldp, size_t *oldlenp,
+    void *newp, size_t newlen);
+int ctl_nametomib(tsd_t *tsd, const char *name, size_t *mibp, size_t *miblenp);
+int ctl_bymib(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
+    size_t *oldlenp, void *newp, size_t newlen);
+int ctl_mibnametomib(tsd_t *tsd, size_t *mib, size_t miblen, const char *name,
+    size_t *miblenp);
+int ctl_bymibname(tsd_t *tsd, size_t *mib, size_t miblen, const char *name,
+    size_t *miblenp, void *oldp, size_t *oldlenp, void *newp, size_t newlen);
+bool ctl_boot(void);
+void ctl_prefork(tsdn_t *tsdn);
+void ctl_postfork_parent(tsdn_t *tsdn);
+void ctl_postfork_child(tsdn_t *tsdn);
+void ctl_mtx_assert_held(tsdn_t *tsdn);
+
+#define xmallctl(name, oldp, oldlenp, newp, newlen) do {	\
+	if (je_mallctl(name, oldp, oldlenp, newp, newlen)	\
+	    != 0) {						\
+		malloc_printf(					\
+		    "<jemalloc>: Failure in xmallctl(\"%s\", ...)\n",	\
+		    name);					\
+		abort();					\
+	}							\
+} while (0)
+
+#define xmallctlnametomib(name, mibp, miblenp) do {		\
+	if (je_mallctlnametomib(name, mibp, miblenp) != 0) {	\
+		malloc_printf("<jemalloc>: Failure in "		\
+		    "xmallctlnametomib(\"%s\", ...)\n", name);	\
+		abort();					\
+	}							\
+} while (0)
+
+#define xmallctlbymib(mib, miblen, oldp, oldlenp, newp, newlen) do {	\
+	if (je_mallctlbymib(mib, miblen, oldp, oldlenp, newp,	\
+	    newlen) != 0) {					\
+		malloc_write(					\
+		    "<jemalloc>: Failure in xmallctlbymib()\n");	\
+		abort();					\
+	}							\
+} while (0)
+
+#define xmallctlmibnametomib(mib, miblen, name, miblenp) do {	\
+	if (ctl_mibnametomib(tsd_fetch(), mib, miblen, name, miblenp)	\
+	    != 0) {						\
+		malloc_write(					\
+		    "<jemalloc>: Failure in ctl_mibnametomib()\n");	\
+		abort();					\
+	}							\
+} while (0)
+
+#define xmallctlbymibname(mib, miblen, name, miblenp, oldp, oldlenp,	\
+    newp, newlen) do {						\
+	if (ctl_bymibname(tsd_fetch(), mib, miblen, name, miblenp,	\
+	    oldp, oldlenp, newp, newlen) != 0) {		\
+		malloc_write(					\
+		    "<jemalloc>: Failure in ctl_bymibname()\n");	\
+		abort();					\
+	}							\
+} while (0)
+
+#endif /* JEMALLOC_INTERNAL_CTL_H */
diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/decay.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/decay.h
new file mode 100644
index 000000000..74be55dae
--- /dev/null
+++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/decay.h
@@ -0,0 +1,188 @@
+#ifndef JEMALLOC_INTERNAL_DECAY_H
+#define JEMALLOC_INTERNAL_DECAY_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/mutex.h"
+#include "jemalloc/internal/smoothstep.h"
+
+#define DECAY_UNBOUNDED_TIME_TO_PURGE ((uint64_t)-1)
+
+/*
+ * The decay_t computes the number of pages we should purge at any given time.
+ * Page allocators inform a decay object when pages enter a decay-able state
+ * (i.e. dirty or muzzy), and query it to determine how many pages should be
+ * purged at any given time.
+ *
+ * This is mostly a single-threaded data structure and doesn't care about
+ * synchronization at all; it's the caller's responsibility to manage their
+ * synchronization on their own. There are two exceptions:
+ * 1) It's OK to racily call decay_ms_read (i.e. just the simplest state
+ *    query).
+ * 2) The mtx and purging fields live (and are initialized) here, but are
+ *    logically owned by the page allocator. This is just a convenience (since
+ *    those fields would be duplicated for both the dirty and muzzy states
+ *    otherwise).
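+ *
+ * A rough sketch of the intended use (an editor's illustration; locking and
+ * the page-allocator state are elided, and 'decay', 'now', 'decay_ms', and
+ * 'npages' are assumed to come from the caller):
+ *
+ *	decay_init(decay, &now, decay_ms);
+ *
+ * and then, on each tick with an updated 'now' and current page count
+ * 'npages':
+ *
+ *	if (decay_maybe_advance_epoch(decay, &now, npages)) {
+ *		(purge down toward decay_npages_limit_get(decay))
+ *	}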
+ */ +typedef struct decay_s decay_t; +struct decay_s { + /* Synchronizes all non-atomic fields. */ + malloc_mutex_t mtx; + /* + * True if a thread is currently purging the extents associated with + * this decay structure. + */ + bool purging; + /* + * Approximate time in milliseconds from the creation of a set of unused + * dirty pages until an equivalent set of unused dirty pages is purged + * and/or reused. + */ + atomic_zd_t time_ms; + /* time / SMOOTHSTEP_NSTEPS. */ + nstime_t interval; + /* + * Time at which the current decay interval logically started. We do + * not actually advance to a new epoch until sometime after it starts + * because of scheduling and computation delays, and it is even possible + * to completely skip epochs. In all cases, during epoch advancement we + * merge all relevant activity into the most recently recorded epoch. + */ + nstime_t epoch; + /* Deadline randomness generator. */ + uint64_t jitter_state; + /* + * Deadline for current epoch. This is the sum of interval and per + * epoch jitter which is a uniform random variable in [0..interval). + * Epochs always advance by precise multiples of interval, but we + * randomize the deadline to reduce the likelihood of arenas purging in + * lockstep. + */ + nstime_t deadline; + /* + * The number of pages we cap ourselves at in the current epoch, per + * decay policies. Updated on an epoch change. After an epoch change, + * the caller should take steps to try to purge down to this amount. + */ + size_t npages_limit; + /* + * Number of unpurged pages at beginning of current epoch. During epoch + * advancement we use the delta between arena->decay_*.nunpurged and + * ecache_npages_get(&arena->ecache_*) to determine how many dirty pages, + * if any, were generated. + */ + size_t nunpurged; + /* + * Trailing log of how many unused dirty pages were generated during + * each of the past SMOOTHSTEP_NSTEPS decay epochs, where the last + * element is the most recent epoch. Corresponding epoch times are + * relative to epoch. + * + * Updated only on epoch advance, triggered by + * decay_maybe_advance_epoch, below. + */ + size_t backlog[SMOOTHSTEP_NSTEPS]; + + /* Peak number of pages in associated extents. Used for debug only. */ + uint64_t ceil_npages; +}; + +/* + * The current decay time setting. This is the only public access to a decay_t + * that's allowed without holding mtx. + */ +static inline ssize_t +decay_ms_read(const decay_t *decay) { + return atomic_load_zd(&decay->time_ms, ATOMIC_RELAXED); +} + +/* + * See the comment on the struct field -- the limit on pages we should allow in + * this decay state this epoch. + */ +static inline size_t +decay_npages_limit_get(const decay_t *decay) { + return decay->npages_limit; +} + +/* How many unused dirty pages were generated during the last epoch. */ +static inline size_t +decay_epoch_npages_delta(const decay_t *decay) { + return decay->backlog[SMOOTHSTEP_NSTEPS - 1]; +} + +/* + * Current epoch duration, in nanoseconds. Given that new epochs are started + * somewhat haphazardly, this is not necessarily exactly the time between any + * two calls to decay_maybe_advance_epoch; see the comments on fields in the + * decay_t. 
+ */ +static inline uint64_t +decay_epoch_duration_ns(const decay_t *decay) { + return nstime_ns(&decay->interval); +} + +static inline bool +decay_immediately(const decay_t *decay) { + ssize_t decay_ms = decay_ms_read(decay); + return decay_ms == 0; +} + +static inline bool +decay_disabled(const decay_t *decay) { + ssize_t decay_ms = decay_ms_read(decay); + return decay_ms < 0; +} + +/* Returns true if decay is enabled and done gradually. */ +static inline bool +decay_gradually(const decay_t *decay) { + ssize_t decay_ms = decay_ms_read(decay); + return decay_ms > 0; +} + +/* + * Returns true if the passed in decay time setting is valid. + * < -1 : invalid + * -1 : never decay + * 0 : decay immediately + * > 0 : some positive decay time, up to a maximum allowed value of + * NSTIME_SEC_MAX * 1000, which corresponds to decaying somewhere in the early + * 27th century. By that time, we expect to have implemented alternate purging + * strategies. + */ +bool decay_ms_valid(ssize_t decay_ms); + +/* + * As a precondition, the decay_t must be zeroed out (as if with memset). + * + * Returns true on error. + */ +bool decay_init(decay_t *decay, nstime_t *cur_time, ssize_t decay_ms); + +/* + * Given an already-initialized decay_t, reinitialize it with the given decay + * time. The decay_t must have previously been initialized (and should not then + * be zeroed). + */ +void decay_reinit(decay_t *decay, nstime_t *cur_time, ssize_t decay_ms); + +/* + * Compute how many of 'npages_new' pages we would need to purge in 'time'. + */ +uint64_t decay_npages_purge_in(decay_t *decay, nstime_t *time, + size_t npages_new); + +/* Returns true if the epoch advanced and there are pages to purge. */ +bool decay_maybe_advance_epoch(decay_t *decay, nstime_t *new_time, + size_t current_npages); + +/* + * Calculates wait time until a number of pages in the interval + * [0.5 * npages_threshold .. 1.5 * npages_threshold] should be purged. + * + * Returns number of nanoseconds or DECAY_UNBOUNDED_TIME_TO_PURGE in case of + * indefinite wait. + */ +uint64_t decay_ns_until_purge(decay_t *decay, size_t npages_current, + uint64_t npages_threshold); + +#endif /* JEMALLOC_INTERNAL_DECAY_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/div.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/div.h new file mode 100644 index 000000000..56d5f463f --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/div.h @@ -0,0 +1,42 @@ +#ifndef JEMALLOC_INTERNAL_DIV_H +#define JEMALLOC_INTERNAL_DIV_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/assert.h" + +/* + * This module does the division that computes the index of a region in a slab, + * given its offset relative to the base. + * That is, given a divisor d, an n = i * d (all integers), we'll return i. + * We do some pre-computation to do this more quickly than a CPU division + * instruction. + * We bound n < 2^32, and don't support dividing by one. + */ + +typedef struct div_info_s div_info_t; +struct div_info_s { + uint32_t magic; +#ifdef JEMALLOC_DEBUG + size_t d; +#endif +}; + +void div_init(div_info_t *div_info, size_t divisor); + +static inline size_t +div_compute(div_info_t *div_info, size_t n) { + assert(n <= (uint32_t)-1); + /* + * This generates, e.g. mov; imul; shr on x86-64. On a 32-bit machine, + * the compilers I tried were all smart enough to turn this into the + * appropriate "get the high 32 bits of the result of a multiply" (e.g. 
+ * mul; mov edx eax; on x86, umull on arm, etc.). + */ + size_t i = ((uint64_t)n * (uint64_t)div_info->magic) >> 32; +#ifdef JEMALLOC_DEBUG + assert(i * div_info->d == n); +#endif + return i; +} + +#endif /* JEMALLOC_INTERNAL_DIV_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/ecache.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/ecache.h new file mode 100644 index 000000000..2bd74fdef --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/ecache.h @@ -0,0 +1,56 @@ +#ifndef JEMALLOC_INTERNAL_ECACHE_H +#define JEMALLOC_INTERNAL_ECACHE_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/eset.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/san.h" + +typedef struct ecache_s ecache_t; +struct ecache_s { + malloc_mutex_t mtx; + eset_t eset; + eset_t guarded_eset; + /* All stored extents must be in the same state. */ + extent_state_t state; + /* The index of the ehooks the ecache is associated with. */ + unsigned ind; + /* + * If true, delay coalescing until eviction; otherwise coalesce during + * deallocation. + */ + bool delay_coalesce; +}; + +static inline size_t +ecache_npages_get(ecache_t *ecache) { + return eset_npages_get(&ecache->eset) + + eset_npages_get(&ecache->guarded_eset); +} + +/* Get the number of extents in the given page size index. */ +static inline size_t +ecache_nextents_get(ecache_t *ecache, pszind_t ind) { + return eset_nextents_get(&ecache->eset, ind) + + eset_nextents_get(&ecache->guarded_eset, ind); +} + +/* Get the sum total bytes of the extents in the given page size index. */ +static inline size_t +ecache_nbytes_get(ecache_t *ecache, pszind_t ind) { + return eset_nbytes_get(&ecache->eset, ind) + + eset_nbytes_get(&ecache->guarded_eset, ind); +} + +static inline unsigned +ecache_ind_get(ecache_t *ecache) { + return ecache->ind; +} + +bool ecache_init(tsdn_t *tsdn, ecache_t *ecache, extent_state_t state, + unsigned ind, bool delay_coalesce); +void ecache_prefork(tsdn_t *tsdn, ecache_t *ecache); +void ecache_postfork_parent(tsdn_t *tsdn, ecache_t *ecache); +void ecache_postfork_child(tsdn_t *tsdn, ecache_t *ecache); + +#endif /* JEMALLOC_INTERNAL_ECACHE_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/edata.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/edata.h new file mode 100644 index 000000000..17befd92c --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/edata.h @@ -0,0 +1,707 @@ +#ifndef JEMALLOC_INTERNAL_EDATA_H +#define JEMALLOC_INTERNAL_EDATA_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/bin_info.h" +#include "jemalloc/internal/bit_util.h" +#include "jemalloc/internal/hpdata.h" +#include "jemalloc/internal/nstime.h" +#include "jemalloc/internal/ph.h" +#include "jemalloc/internal/prof_types.h" +#include "jemalloc/internal/ql.h" +#include "jemalloc/internal/sc.h" +#include "jemalloc/internal/slab_data.h" +#include "jemalloc/internal/sz.h" +#include "jemalloc/internal/typed_list.h" + +/* + * sizeof(edata_t) is 128 bytes on 64-bit architectures. Ensure the alignment + * to free up the low bits in the rtree leaf. + */ +#define EDATA_ALIGNMENT 128 + +enum extent_state_e { + extent_state_active = 0, + extent_state_dirty = 1, + extent_state_muzzy = 2, + extent_state_retained = 3, + extent_state_transition = 4, /* States below are intermediate. 
*/
+	extent_state_merging = 5,
+	extent_state_max = 5 /* Sanity checking only. */
+};
+typedef enum extent_state_e extent_state_t;
+
+enum extent_head_state_e {
+	EXTENT_NOT_HEAD,
+	EXTENT_IS_HEAD /* See comments in ehooks_default_merge_impl(). */
+};
+typedef enum extent_head_state_e extent_head_state_t;
+
+/*
+ * Which implementation of the page allocator interface (PAI, defined in
+ * pai.h) owns the given extent?
+ */
+enum extent_pai_e {
+	EXTENT_PAI_PAC = 0,
+	EXTENT_PAI_HPA = 1
+};
+typedef enum extent_pai_e extent_pai_t;
+
+struct e_prof_info_s {
+	/* Time when this was allocated. */
+	nstime_t e_prof_alloc_time;
+	/* Allocation request size. */
+	size_t e_prof_alloc_size;
+	/* Points to a prof_tctx_t. */
+	atomic_p_t e_prof_tctx;
+	/*
+	 * Points to a prof_recent_t for the allocation; NULL
+	 * means the recent allocation record no longer exists.
+	 * Protected by prof_recent_alloc_mtx.
+	 */
+	atomic_p_t e_prof_recent_alloc;
+};
+typedef struct e_prof_info_s e_prof_info_t;
+
+/*
+ * The information about a particular edata that lives in an emap. Space is
+ * more precious there (the information, plus the edata pointer, has to live in
+ * a 64-bit word if we want to enable a packed representation).
+ *
+ * There are two things that are special about the information here:
+ * - It's quicker to access. You have one fewer pointer hop, since finding the
+ *   edata_t associated with an item always requires accessing the rtree leaf
+ *   in which this data is stored.
+ * - It can be read unsynchronized, and without worrying about lifetime issues.
+ */
+typedef struct edata_map_info_s edata_map_info_t;
+struct edata_map_info_s {
+	bool slab;
+	szind_t szind;
+};
+
+typedef struct edata_cmp_summary_s edata_cmp_summary_t;
+struct edata_cmp_summary_s {
+	uint64_t sn;
+	uintptr_t addr;
+};
+
+/* Extent (span of pages). Use accessor functions for e_* fields. */
+typedef struct edata_s edata_t;
+ph_structs(edata_avail, edata_t);
+ph_structs(edata_heap, edata_t);
+struct edata_s {
+	/*
+	 * Bitfield containing several fields:
+	 *
+	 * a: arena_ind
+	 * b: slab
+	 * c: committed
+	 * p: pai
+	 * z: zeroed
+	 * g: guarded
+	 * t: state
+	 * i: szind
+	 * f: nfree
+	 * s: bin_shard
+	 *
+	 * 00000000 ... 0000ssss ssffffff ffffiiii iiiitttg zpcbaaaa aaaaaaaa
+	 *
+	 * arena_ind: Arena from which this extent came, or all 1 bits if
+	 * unassociated.
+	 *
+	 * slab: The slab flag indicates whether the extent is used for a slab
+	 * of small regions. This helps differentiate small size classes,
+	 * and it indicates whether interior pointers can be looked up via
+	 * iealloc().
+	 *
+	 * committed: The committed flag indicates whether physical memory is
+	 * committed to the extent, whether explicitly or implicitly
+	 * as on a system that overcommits and satisfies physical
+	 * memory needs on demand via soft page faults.
+	 *
+	 * pai: The pai flag is an extent_pai_t.
+	 *
+	 * zeroed: The zeroed flag is used by extent recycling code to track
+	 * whether memory is zero-filled.
+	 *
+	 * guarded: The guarded flag is used by the sanitizer to track whether
+	 * the extent has page guards around it.
+	 *
+	 * state: The state flag is an extent_state_t.
+	 *
+	 * szind: The szind flag indicates usable size class index for
+	 * allocations residing in this extent, regardless of whether the
+	 * extent is a slab. Extent size and usable size often differ
+	 * even for non-slabs, either due to sz_large_pad or promotion of
+	 * sampled small regions.
+	 *
+	 * nfree: Number of free regions in slab.
+	 *
+	 * bin_shard: The shard of the bin from which this extent came.
+	 */
+	uint64_t e_bits;
+#define MASK(CURRENT_FIELD_WIDTH, CURRENT_FIELD_SHIFT) ((((((uint64_t)0x1U) << (CURRENT_FIELD_WIDTH)) - 1)) << (CURRENT_FIELD_SHIFT))
+
+#define EDATA_BITS_ARENA_WIDTH MALLOCX_ARENA_BITS
+#define EDATA_BITS_ARENA_SHIFT 0
+#define EDATA_BITS_ARENA_MASK MASK(EDATA_BITS_ARENA_WIDTH, EDATA_BITS_ARENA_SHIFT)
+
+#define EDATA_BITS_SLAB_WIDTH 1
+#define EDATA_BITS_SLAB_SHIFT (EDATA_BITS_ARENA_WIDTH + EDATA_BITS_ARENA_SHIFT)
+#define EDATA_BITS_SLAB_MASK MASK(EDATA_BITS_SLAB_WIDTH, EDATA_BITS_SLAB_SHIFT)
+
+#define EDATA_BITS_COMMITTED_WIDTH 1
+#define EDATA_BITS_COMMITTED_SHIFT (EDATA_BITS_SLAB_WIDTH + EDATA_BITS_SLAB_SHIFT)
+#define EDATA_BITS_COMMITTED_MASK MASK(EDATA_BITS_COMMITTED_WIDTH, EDATA_BITS_COMMITTED_SHIFT)
+
+#define EDATA_BITS_PAI_WIDTH 1
+#define EDATA_BITS_PAI_SHIFT (EDATA_BITS_COMMITTED_WIDTH + EDATA_BITS_COMMITTED_SHIFT)
+#define EDATA_BITS_PAI_MASK MASK(EDATA_BITS_PAI_WIDTH, EDATA_BITS_PAI_SHIFT)
+
+#define EDATA_BITS_ZEROED_WIDTH 1
+#define EDATA_BITS_ZEROED_SHIFT (EDATA_BITS_PAI_WIDTH + EDATA_BITS_PAI_SHIFT)
+#define EDATA_BITS_ZEROED_MASK MASK(EDATA_BITS_ZEROED_WIDTH, EDATA_BITS_ZEROED_SHIFT)
+
+#define EDATA_BITS_GUARDED_WIDTH 1
+#define EDATA_BITS_GUARDED_SHIFT (EDATA_BITS_ZEROED_WIDTH + EDATA_BITS_ZEROED_SHIFT)
+#define EDATA_BITS_GUARDED_MASK MASK(EDATA_BITS_GUARDED_WIDTH, EDATA_BITS_GUARDED_SHIFT)
+
+#define EDATA_BITS_STATE_WIDTH 3
+#define EDATA_BITS_STATE_SHIFT (EDATA_BITS_GUARDED_WIDTH + EDATA_BITS_GUARDED_SHIFT)
+#define EDATA_BITS_STATE_MASK MASK(EDATA_BITS_STATE_WIDTH, EDATA_BITS_STATE_SHIFT)
+
+#define EDATA_BITS_SZIND_WIDTH LG_CEIL(SC_NSIZES)
+#define EDATA_BITS_SZIND_SHIFT (EDATA_BITS_STATE_WIDTH + EDATA_BITS_STATE_SHIFT)
+#define EDATA_BITS_SZIND_MASK MASK(EDATA_BITS_SZIND_WIDTH, EDATA_BITS_SZIND_SHIFT)
+
+#define EDATA_BITS_NFREE_WIDTH (SC_LG_SLAB_MAXREGS + 1)
+#define EDATA_BITS_NFREE_SHIFT (EDATA_BITS_SZIND_WIDTH + EDATA_BITS_SZIND_SHIFT)
+#define EDATA_BITS_NFREE_MASK MASK(EDATA_BITS_NFREE_WIDTH, EDATA_BITS_NFREE_SHIFT)
+
+#define EDATA_BITS_BINSHARD_WIDTH 6
+#define EDATA_BITS_BINSHARD_SHIFT (EDATA_BITS_NFREE_WIDTH + EDATA_BITS_NFREE_SHIFT)
+#define EDATA_BITS_BINSHARD_MASK MASK(EDATA_BITS_BINSHARD_WIDTH, EDATA_BITS_BINSHARD_SHIFT)
+
+#define EDATA_BITS_IS_HEAD_WIDTH 1
+#define EDATA_BITS_IS_HEAD_SHIFT (EDATA_BITS_BINSHARD_WIDTH + EDATA_BITS_BINSHARD_SHIFT)
+#define EDATA_BITS_IS_HEAD_MASK MASK(EDATA_BITS_IS_HEAD_WIDTH, EDATA_BITS_IS_HEAD_SHIFT)
+
+	/* Pointer to the extent that this structure is responsible for. */
+	void *e_addr;
+
+	union {
+		/*
+		 * Extent size and serial number associated with the extent
+		 * structure (different than the serial number for the extent at
+		 * e_addr).
+		 *
+		 * ssssssss [...] ssssssss ssssnnnn nnnnnnnn
+		 */
+		size_t e_size_esn;
+	#define EDATA_SIZE_MASK ((size_t)~(PAGE-1))
+	#define EDATA_ESN_MASK ((size_t)PAGE-1)
+		/* Base extent size, which may not be a multiple of PAGE. */
+		size_t e_bsize;
+	};
+
+	/*
+	 * If this edata is a user allocation from an HPA, it comes out of some
+	 * pageslab (we don't yet support hugepage allocations that don't fit
+	 * into pageslabs). This tracks it.
+	 */
+	hpdata_t *e_ps;
+
+	/*
+	 * Serial number. These are not necessarily unique; splitting an extent
+	 * results in two extents with the same serial number.
+	 */
+	uint64_t e_sn;
+
+	union {
+		/*
+		 * List linkage used when the edata_t is active; either in
+		 * arena's large allocations or bin_t's slabs_full.
+ */ + ql_elm(edata_t) ql_link_active; + /* + * Pairing heap linkage. Used whenever the extent is inactive + * (in the page allocators), or when it is active and in + * slabs_nonfull, or when the edata_t is unassociated with an + * extent and sitting in an edata_cache. + */ + union { + edata_heap_link_t heap_link; + edata_avail_link_t avail_link; + }; + }; + + union { + /* + * List linkage used when the extent is inactive: + * - Stashed dirty extents + * - Ecache LRU functionality. + */ + ql_elm(edata_t) ql_link_inactive; + /* Small region slab metadata. */ + slab_data_t e_slab_data; + + /* Profiling data, used for large objects. */ + e_prof_info_t e_prof_info; + }; +}; + +TYPED_LIST(edata_list_active, edata_t, ql_link_active) +TYPED_LIST(edata_list_inactive, edata_t, ql_link_inactive) + +static inline unsigned +edata_arena_ind_get(const edata_t *edata) { + unsigned arena_ind = (unsigned)((edata->e_bits & + EDATA_BITS_ARENA_MASK) >> EDATA_BITS_ARENA_SHIFT); + assert(arena_ind < MALLOCX_ARENA_LIMIT); + + return arena_ind; +} + +static inline szind_t +edata_szind_get_maybe_invalid(const edata_t *edata) { + szind_t szind = (szind_t)((edata->e_bits & EDATA_BITS_SZIND_MASK) >> + EDATA_BITS_SZIND_SHIFT); + assert(szind <= SC_NSIZES); + return szind; +} + +static inline szind_t +edata_szind_get(const edata_t *edata) { + szind_t szind = edata_szind_get_maybe_invalid(edata); + assert(szind < SC_NSIZES); /* Never call when "invalid". */ + return szind; +} + +static inline size_t +edata_usize_get(const edata_t *edata) { + return sz_index2size(edata_szind_get(edata)); +} + +static inline unsigned +edata_binshard_get(const edata_t *edata) { + unsigned binshard = (unsigned)((edata->e_bits & + EDATA_BITS_BINSHARD_MASK) >> EDATA_BITS_BINSHARD_SHIFT); + assert(binshard < bin_infos[edata_szind_get(edata)].n_shards); + return binshard; +} + +static inline uint64_t +edata_sn_get(const edata_t *edata) { + return edata->e_sn; +} + +static inline extent_state_t +edata_state_get(const edata_t *edata) { + return (extent_state_t)((edata->e_bits & EDATA_BITS_STATE_MASK) >> + EDATA_BITS_STATE_SHIFT); +} + +static inline bool +edata_guarded_get(const edata_t *edata) { + return (bool)((edata->e_bits & EDATA_BITS_GUARDED_MASK) >> + EDATA_BITS_GUARDED_SHIFT); +} + +static inline bool +edata_zeroed_get(const edata_t *edata) { + return (bool)((edata->e_bits & EDATA_BITS_ZEROED_MASK) >> + EDATA_BITS_ZEROED_SHIFT); +} + +static inline bool +edata_committed_get(const edata_t *edata) { + return (bool)((edata->e_bits & EDATA_BITS_COMMITTED_MASK) >> + EDATA_BITS_COMMITTED_SHIFT); +} + +static inline extent_pai_t +edata_pai_get(const edata_t *edata) { + return (extent_pai_t)((edata->e_bits & EDATA_BITS_PAI_MASK) >> + EDATA_BITS_PAI_SHIFT); +} + +static inline bool +edata_slab_get(const edata_t *edata) { + return (bool)((edata->e_bits & EDATA_BITS_SLAB_MASK) >> + EDATA_BITS_SLAB_SHIFT); +} + +static inline unsigned +edata_nfree_get(const edata_t *edata) { + assert(edata_slab_get(edata)); + return (unsigned)((edata->e_bits & EDATA_BITS_NFREE_MASK) >> + EDATA_BITS_NFREE_SHIFT); +} + +static inline void * +edata_base_get(const edata_t *edata) { + assert(edata->e_addr == PAGE_ADDR2BASE(edata->e_addr) || + !edata_slab_get(edata)); + return PAGE_ADDR2BASE(edata->e_addr); +} + +static inline void * +edata_addr_get(const edata_t *edata) { + assert(edata->e_addr == PAGE_ADDR2BASE(edata->e_addr) || + !edata_slab_get(edata)); + return edata->e_addr; +} + +static inline size_t +edata_size_get(const edata_t *edata) { + return 
(edata->e_size_esn & EDATA_SIZE_MASK); +} + +static inline size_t +edata_esn_get(const edata_t *edata) { + return (edata->e_size_esn & EDATA_ESN_MASK); +} + +static inline size_t +edata_bsize_get(const edata_t *edata) { + return edata->e_bsize; +} + +static inline hpdata_t * +edata_ps_get(const edata_t *edata) { + assert(edata_pai_get(edata) == EXTENT_PAI_HPA); + return edata->e_ps; +} + +static inline void * +edata_before_get(const edata_t *edata) { + return (void *)((byte_t *)edata_base_get(edata) - PAGE); +} + +static inline void * +edata_last_get(const edata_t *edata) { + return (void *)((byte_t *)edata_base_get(edata) + + edata_size_get(edata) - PAGE); +} + +static inline void * +edata_past_get(const edata_t *edata) { + return (void *)((byte_t *)edata_base_get(edata) + + edata_size_get(edata)); +} + +static inline slab_data_t * +edata_slab_data_get(edata_t *edata) { + assert(edata_slab_get(edata)); + return &edata->e_slab_data; +} + +static inline const slab_data_t * +edata_slab_data_get_const(const edata_t *edata) { + assert(edata_slab_get(edata)); + return &edata->e_slab_data; +} + +static inline prof_tctx_t * +edata_prof_tctx_get(const edata_t *edata) { + return (prof_tctx_t *)atomic_load_p(&edata->e_prof_info.e_prof_tctx, + ATOMIC_ACQUIRE); +} + +static inline const nstime_t * +edata_prof_alloc_time_get(const edata_t *edata) { + return &edata->e_prof_info.e_prof_alloc_time; +} + +static inline size_t +edata_prof_alloc_size_get(const edata_t *edata) { + return edata->e_prof_info.e_prof_alloc_size; +} + +static inline prof_recent_t * +edata_prof_recent_alloc_get_dont_call_directly(const edata_t *edata) { + return (prof_recent_t *)atomic_load_p( + &edata->e_prof_info.e_prof_recent_alloc, ATOMIC_RELAXED); +} + +static inline void +edata_arena_ind_set(edata_t *edata, unsigned arena_ind) { + edata->e_bits = (edata->e_bits & ~EDATA_BITS_ARENA_MASK) | + ((uint64_t)arena_ind << EDATA_BITS_ARENA_SHIFT); +} + +static inline void +edata_binshard_set(edata_t *edata, unsigned binshard) { + /* The assertion assumes szind is set already. */ + assert(binshard < bin_infos[edata_szind_get(edata)].n_shards); + edata->e_bits = (edata->e_bits & ~EDATA_BITS_BINSHARD_MASK) | + ((uint64_t)binshard << EDATA_BITS_BINSHARD_SHIFT); +} + +static inline void +edata_addr_set(edata_t *edata, void *addr) { + edata->e_addr = addr; +} + +static inline void +edata_size_set(edata_t *edata, size_t size) { + assert((size & ~EDATA_SIZE_MASK) == 0); + edata->e_size_esn = size | (edata->e_size_esn & ~EDATA_SIZE_MASK); +} + +static inline void +edata_esn_set(edata_t *edata, size_t esn) { + edata->e_size_esn = (edata->e_size_esn & ~EDATA_ESN_MASK) | (esn & + EDATA_ESN_MASK); +} + +static inline void +edata_bsize_set(edata_t *edata, size_t bsize) { + edata->e_bsize = bsize; +} + +static inline void +edata_ps_set(edata_t *edata, hpdata_t *ps) { + assert(edata_pai_get(edata) == EXTENT_PAI_HPA); + edata->e_ps = ps; +} + +static inline void +edata_szind_set(edata_t *edata, szind_t szind) { + assert(szind <= SC_NSIZES); /* SC_NSIZES means "invalid". */ + edata->e_bits = (edata->e_bits & ~EDATA_BITS_SZIND_MASK) | + ((uint64_t)szind << EDATA_BITS_SZIND_SHIFT); +} + +static inline void +edata_nfree_set(edata_t *edata, unsigned nfree) { + assert(edata_slab_get(edata)); + edata->e_bits = (edata->e_bits & ~EDATA_BITS_NFREE_MASK) | + ((uint64_t)nfree << EDATA_BITS_NFREE_SHIFT); +} + +static inline void +edata_nfree_binshard_set(edata_t *edata, unsigned nfree, unsigned binshard) { + /* The assertion assumes szind is set already. 
*/ + assert(binshard < bin_infos[edata_szind_get(edata)].n_shards); + edata->e_bits = (edata->e_bits & + (~EDATA_BITS_NFREE_MASK & ~EDATA_BITS_BINSHARD_MASK)) | + ((uint64_t)binshard << EDATA_BITS_BINSHARD_SHIFT) | + ((uint64_t)nfree << EDATA_BITS_NFREE_SHIFT); +} + +static inline void +edata_nfree_inc(edata_t *edata) { + assert(edata_slab_get(edata)); + edata->e_bits += ((uint64_t)1U << EDATA_BITS_NFREE_SHIFT); +} + +static inline void +edata_nfree_dec(edata_t *edata) { + assert(edata_slab_get(edata)); + edata->e_bits -= ((uint64_t)1U << EDATA_BITS_NFREE_SHIFT); +} + +static inline void +edata_nfree_sub(edata_t *edata, uint64_t n) { + assert(edata_slab_get(edata)); + edata->e_bits -= (n << EDATA_BITS_NFREE_SHIFT); +} + +static inline void +edata_sn_set(edata_t *edata, uint64_t sn) { + edata->e_sn = sn; +} + +static inline void +edata_state_set(edata_t *edata, extent_state_t state) { + edata->e_bits = (edata->e_bits & ~EDATA_BITS_STATE_MASK) | + ((uint64_t)state << EDATA_BITS_STATE_SHIFT); +} + +static inline void +edata_guarded_set(edata_t *edata, bool guarded) { + edata->e_bits = (edata->e_bits & ~EDATA_BITS_GUARDED_MASK) | + ((uint64_t)guarded << EDATA_BITS_GUARDED_SHIFT); +} + +static inline void +edata_zeroed_set(edata_t *edata, bool zeroed) { + edata->e_bits = (edata->e_bits & ~EDATA_BITS_ZEROED_MASK) | + ((uint64_t)zeroed << EDATA_BITS_ZEROED_SHIFT); +} + +static inline void +edata_committed_set(edata_t *edata, bool committed) { + edata->e_bits = (edata->e_bits & ~EDATA_BITS_COMMITTED_MASK) | + ((uint64_t)committed << EDATA_BITS_COMMITTED_SHIFT); +} + +static inline void +edata_pai_set(edata_t *edata, extent_pai_t pai) { + edata->e_bits = (edata->e_bits & ~EDATA_BITS_PAI_MASK) | + ((uint64_t)pai << EDATA_BITS_PAI_SHIFT); +} + +static inline void +edata_slab_set(edata_t *edata, bool slab) { + edata->e_bits = (edata->e_bits & ~EDATA_BITS_SLAB_MASK) | + ((uint64_t)slab << EDATA_BITS_SLAB_SHIFT); +} + +static inline void +edata_prof_tctx_set(edata_t *edata, prof_tctx_t *tctx) { + atomic_store_p(&edata->e_prof_info.e_prof_tctx, tctx, ATOMIC_RELEASE); +} + +static inline void +edata_prof_alloc_time_set(edata_t *edata, nstime_t *t) { + nstime_copy(&edata->e_prof_info.e_prof_alloc_time, t); +} + +static inline void +edata_prof_alloc_size_set(edata_t *edata, size_t size) { + edata->e_prof_info.e_prof_alloc_size = size; +} + +static inline void +edata_prof_recent_alloc_set_dont_call_directly(edata_t *edata, + prof_recent_t *recent_alloc) { + atomic_store_p(&edata->e_prof_info.e_prof_recent_alloc, recent_alloc, + ATOMIC_RELAXED); +} + +static inline bool +edata_is_head_get(edata_t *edata) { + return (bool)((edata->e_bits & EDATA_BITS_IS_HEAD_MASK) >> + EDATA_BITS_IS_HEAD_SHIFT); +} + +static inline void +edata_is_head_set(edata_t *edata, bool is_head) { + edata->e_bits = (edata->e_bits & ~EDATA_BITS_IS_HEAD_MASK) | + ((uint64_t)is_head << EDATA_BITS_IS_HEAD_SHIFT); +} + +static inline bool +edata_state_in_transition(extent_state_t state) { + return state >= extent_state_transition; +} + +/* + * Because this function is implemented as a sequence of bitfield modifications, + * even though each individual bit is properly initialized, we technically read + * uninitialized data within it. This is mostly fine, since most callers get + * their edatas from zeroing sources, but callers who make stack edata_ts need + * to manually zero them. 
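+ *
+ * For example, a stack-allocated edata_t might be set up like this (an
+ * editor's sketch; all arguments are assumed to come from the caller):
+ *
+ *	edata_t e;
+ *	memset(&e, 0, sizeof(e));
+ *	edata_init(&e, arena_ind, addr, size, false, szind, sn,
+ *	    extent_state_active, false, true, EXTENT_PAI_PAC,
+ *	    EXTENT_NOT_HEAD);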
+ */
+static inline void
+edata_init(edata_t *edata, unsigned arena_ind, void *addr, size_t size,
+    bool slab, szind_t szind, uint64_t sn, extent_state_t state, bool zeroed,
+    bool committed, extent_pai_t pai, extent_head_state_t is_head) {
+	assert(addr == PAGE_ADDR2BASE(addr) || !slab);
+
+	edata_arena_ind_set(edata, arena_ind);
+	edata_addr_set(edata, addr);
+	edata_size_set(edata, size);
+	edata_slab_set(edata, slab);
+	edata_szind_set(edata, szind);
+	edata_sn_set(edata, sn);
+	edata_state_set(edata, state);
+	edata_guarded_set(edata, false);
+	edata_zeroed_set(edata, zeroed);
+	edata_committed_set(edata, committed);
+	edata_pai_set(edata, pai);
+	edata_is_head_set(edata, is_head == EXTENT_IS_HEAD);
+	if (config_prof) {
+		edata_prof_tctx_set(edata, NULL);
+	}
+}
+
+static inline void
+edata_binit(edata_t *edata, void *addr, size_t bsize, uint64_t sn,
+    bool reused) {
+	edata_arena_ind_set(edata, (1U << MALLOCX_ARENA_BITS) - 1);
+	edata_addr_set(edata, addr);
+	edata_bsize_set(edata, bsize);
+	edata_slab_set(edata, false);
+	edata_szind_set(edata, SC_NSIZES);
+	edata_sn_set(edata, sn);
+	edata_state_set(edata, extent_state_active);
+	/* See comments in base_edata_is_reused. */
+	edata_guarded_set(edata, reused);
+	edata_zeroed_set(edata, true);
+	edata_committed_set(edata, true);
+	/*
+	 * This isn't strictly true, but base allocated extents never get
+	 * deallocated and can't be looked up in the emap, so there's no sense
+	 * in wasting a state bit to encode this fact.
+	 */
+	edata_pai_set(edata, EXTENT_PAI_PAC);
+}
+
+static inline int
+edata_esn_comp(const edata_t *a, const edata_t *b) {
+	size_t a_esn = edata_esn_get(a);
+	size_t b_esn = edata_esn_get(b);
+
+	return (a_esn > b_esn) - (a_esn < b_esn);
+}
+
+static inline int
+edata_ead_comp(const edata_t *a, const edata_t *b) {
+	uintptr_t a_eaddr = (uintptr_t)a;
+	uintptr_t b_eaddr = (uintptr_t)b;
+
+	return (a_eaddr > b_eaddr) - (a_eaddr < b_eaddr);
+}
+
+static inline edata_cmp_summary_t
+edata_cmp_summary_get(const edata_t *edata) {
+	edata_cmp_summary_t result;
+	result.sn = edata_sn_get(edata);
+	result.addr = (uintptr_t)edata_addr_get(edata);
+	return result;
+}
+
+static inline int
+edata_cmp_summary_comp(edata_cmp_summary_t a, edata_cmp_summary_t b) {
+	/*
+	 * Logically, what we're doing here is comparing based on `.sn`, and
+	 * falling back to comparing on `.addr` in the case that `a.sn == b.sn`.
+	 * We accomplish this by multiplying the result of the `.sn` comparison
+	 * by 2, so that so long as it is not 0, it will dominate the `.addr`
+	 * comparison in determining the sign of the returned result value.
+	 * The justification for doing things this way is that this is
+	 * branchless - all of the branches that would be present in a
+	 * straightforward implementation go both ways in common cases, and
+	 * thus the branch prediction accuracy is not great. As a result, this
+	 * implementation is measurably faster (by around 30%).
+	 */
+	return (2 * ((a.sn > b.sn) - (a.sn < b.sn)))
+	    + ((a.addr > b.addr) - (a.addr < b.addr));
+}
+
+static inline int
+edata_snad_comp(const edata_t *a, const edata_t *b) {
+	edata_cmp_summary_t a_cmp = edata_cmp_summary_get(a);
+	edata_cmp_summary_t b_cmp = edata_cmp_summary_get(b);
+
+	return edata_cmp_summary_comp(a_cmp, b_cmp);
+}
+
+static inline int
+edata_esnead_comp(const edata_t *a, const edata_t *b) {
+	/*
+	 * Similar to `edata_cmp_summary_comp`, we've opted for a
+	 * branchless implementation for the sake of performance.
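+	 *
+	 * For instance, when edata_esn_comp returns a nonzero value, the
+	 * doubled term is +2 or -2 and outweighs edata_ead_comp's result in
+	 * {-1, 0, +1}; the address comparison decides the sign only on an
+	 * esn tie.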
+	 */
+	return (2 * edata_esn_comp(a, b)) + edata_ead_comp(a, b);
+}
+
+ph_proto(, edata_avail, edata_t)
+ph_proto(, edata_heap, edata_t)
+
+#endif /* JEMALLOC_INTERNAL_EDATA_H */
diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/edata_cache.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/edata_cache.h
new file mode 100644
index 000000000..b2c7b4f1d
--- /dev/null
+++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/edata_cache.h
@@ -0,0 +1,50 @@
+#ifndef JEMALLOC_INTERNAL_EDATA_CACHE_H
+#define JEMALLOC_INTERNAL_EDATA_CACHE_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/base.h"
+
+/* For tests only. */
+#define EDATA_CACHE_FAST_FILL 4
+
+/*
+ * A cache of edata_t structures allocated via base_alloc_edata (as opposed to
+ * the underlying extents they describe). The contents of returned edata_t
+ * objects are garbage and cannot be relied upon.
+ */
+
+typedef struct edata_cache_s edata_cache_t;
+struct edata_cache_s {
+	edata_avail_t avail;
+	atomic_zu_t count;
+	malloc_mutex_t mtx;
+	base_t *base;
+};
+
+bool edata_cache_init(edata_cache_t *edata_cache, base_t *base);
+edata_t *edata_cache_get(tsdn_t *tsdn, edata_cache_t *edata_cache);
+void edata_cache_put(tsdn_t *tsdn, edata_cache_t *edata_cache, edata_t *edata);
+
+void edata_cache_prefork(tsdn_t *tsdn, edata_cache_t *edata_cache);
+void edata_cache_postfork_parent(tsdn_t *tsdn, edata_cache_t *edata_cache);
+void edata_cache_postfork_child(tsdn_t *tsdn, edata_cache_t *edata_cache);
+
+/*
+ * An edata_cache_fast_t is like an edata_cache, but it relies on external
+ * synchronization and avoids first-fit strategies.
+ */
+
+typedef struct edata_cache_fast_s edata_cache_fast_t;
+struct edata_cache_fast_s {
+	edata_list_inactive_t list;
+	edata_cache_t *fallback;
+	bool disabled;
+};
+
+void edata_cache_fast_init(edata_cache_fast_t *ecs, edata_cache_t *fallback);
+edata_t *edata_cache_fast_get(tsdn_t *tsdn, edata_cache_fast_t *ecs);
+void edata_cache_fast_put(tsdn_t *tsdn, edata_cache_fast_t *ecs,
+    edata_t *edata);
+void edata_cache_fast_disable(tsdn_t *tsdn, edata_cache_fast_t *ecs);
+
+#endif /* JEMALLOC_INTERNAL_EDATA_CACHE_H */
diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/ehooks.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/ehooks.h
new file mode 100644
index 000000000..947e056c4
--- /dev/null
+++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/ehooks.h
@@ -0,0 +1,415 @@
+#ifndef JEMALLOC_INTERNAL_EHOOKS_H
+#define JEMALLOC_INTERNAL_EHOOKS_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/atomic.h"
+#include "jemalloc/internal/extent_mmap.h"
+#include "jemalloc/internal/tsd.h"
+#include "jemalloc/internal/tsd_types.h"
+
+/*
+ * This module is the internal interface to the extent hooks (both
+ * user-specified and external). Eventually, this will give us the flexibility
+ * to use multiple different versions of user-visible extent-hook APIs under a
+ * single user interface.
+ *
+ * Current API expansions (not available to anyone but the default hooks yet):
+ * - Head state tracking. Hooks can decide whether or not to merge two
+ *   extents based on whether or not one of them is the head (i.e. was
+ *   allocated on its own). The later extent loses its "head" status.
+ */
+
+extern const extent_hooks_t ehooks_default_extent_hooks;
+
+typedef struct ehooks_s ehooks_t;
+struct ehooks_s {
+	/*
+	 * The user-visible id that goes with the ehooks (i.e. that of the base
+	 * they're a part of, the associated arena's index within the arenas
+	 * array).
+	 */
+	unsigned ind;
+	/* Logically an extent_hooks_t *. */
+	atomic_p_t ptr;
+};
+
+/*
+ * These are not really part of the public API. Each hook has a fast-path for
+ * the default-hooks case that can avoid various small inefficiencies:
+ * - Forgetting tsd and then calling tsd_get within the hook.
+ * - Getting more state than necessary out of the extent_t.
+ * - Doing arena_ind -> arena -> arena_ind lookups.
+ * By making the calls to these functions visible to the compiler, it can move
+ * those extra bits of computation down below the fast-paths where they get
+ * ignored.
+ */
+void *ehooks_default_alloc_impl(tsdn_t *tsdn, void *new_addr, size_t size,
+    size_t alignment, bool *zero, bool *commit, unsigned arena_ind);
+bool ehooks_default_dalloc_impl(void *addr, size_t size);
+void ehooks_default_destroy_impl(void *addr, size_t size);
+bool ehooks_default_commit_impl(void *addr, size_t offset, size_t length);
+bool ehooks_default_decommit_impl(void *addr, size_t offset, size_t length);
+#ifdef PAGES_CAN_PURGE_LAZY
+bool ehooks_default_purge_lazy_impl(void *addr, size_t offset, size_t length);
+#endif
+#ifdef PAGES_CAN_PURGE_FORCED
+bool ehooks_default_purge_forced_impl(void *addr, size_t offset, size_t length);
+#endif
+bool ehooks_default_split_impl(void);
+/*
+ * Merge is the only default extent hook we declare -- see the comment in
+ * ehooks_merge.
+ */
+bool ehooks_default_merge(extent_hooks_t *extent_hooks, void *addr_a,
+    size_t size_a, void *addr_b, size_t size_b, bool committed,
+    unsigned arena_ind);
+bool ehooks_default_merge_impl(tsdn_t *tsdn, void *addr_a, void *addr_b);
+void ehooks_default_zero_impl(void *addr, size_t size);
+void ehooks_default_guard_impl(void *guard1, void *guard2);
+void ehooks_default_unguard_impl(void *guard1, void *guard2);
+
+/*
+ * We don't officially support reentrancy from within the extent hooks. But
+ * various people who sit within throwing distance of the jemalloc team want
+ * that functionality in certain limited cases. The default reentrancy guards
+ * assert that we're not reentrant from a0 (since it's the bootstrap arena,
+ * where reentrant allocations would be redirected), which we would incorrectly
+ * trigger in cases where a0 has extent hooks (those hooks themselves can't be
+ * reentrant, then, but there are reasonable uses for such functionality, like
+ * putting internal metadata on hugepages). Therefore, we use the raw
+ * reentrancy guards.
+ *
+ * Eventually, we need to think more carefully about whether and where we
+ * support allocating from within extent hooks (and what that means for things
+ * like profiling, stats collection, etc.), and document what the guarantee is.
+ */
+static inline void
+ehooks_pre_reentrancy(tsdn_t *tsdn) {
+	tsd_t *tsd = tsdn_null(tsdn) ? tsd_fetch() : tsdn_tsd(tsdn);
+	tsd_pre_reentrancy_raw(tsd);
+}
+
+static inline void
+ehooks_post_reentrancy(tsdn_t *tsdn) {
+	tsd_t *tsd = tsdn_null(tsdn) ? tsd_fetch() : tsdn_tsd(tsdn);
+	tsd_post_reentrancy_raw(tsd);
+}
+
+/* Beginning of the public API.
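+ *
+ * For instance, arena setup might wire up the defaults like this (an editor's
+ * sketch; the caller owns the ehooks_t storage, and 'ind' is the associated
+ * arena index):
+ *
+ *	ehooks_init(&ehooks, (extent_hooks_t *)&ehooks_default_extent_hooks,
+ *	    ind);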
*/ +void ehooks_init(ehooks_t *ehooks, extent_hooks_t *extent_hooks, unsigned ind); + +static inline unsigned +ehooks_ind_get(const ehooks_t *ehooks) { + return ehooks->ind; +} + +static inline void +ehooks_set_extent_hooks_ptr(ehooks_t *ehooks, extent_hooks_t *extent_hooks) { + atomic_store_p(&ehooks->ptr, extent_hooks, ATOMIC_RELEASE); +} + +static inline extent_hooks_t * +ehooks_get_extent_hooks_ptr(ehooks_t *ehooks) { + return (extent_hooks_t *)atomic_load_p(&ehooks->ptr, ATOMIC_ACQUIRE); +} + +static inline bool +ehooks_are_default(ehooks_t *ehooks) { + return ehooks_get_extent_hooks_ptr(ehooks) == + &ehooks_default_extent_hooks; +} + +/* + * In some cases, a caller needs to allocate resources before attempting to call + * a hook. If that hook is doomed to fail, this is wasteful. We therefore + * include some checks for such cases. + */ +static inline bool +ehooks_dalloc_will_fail(ehooks_t *ehooks) { + if (ehooks_are_default(ehooks)) { + return opt_retain; + } else { + return ehooks_get_extent_hooks_ptr(ehooks)->dalloc == NULL; + } +} + +static inline bool +ehooks_split_will_fail(ehooks_t *ehooks) { + return ehooks_get_extent_hooks_ptr(ehooks)->split == NULL; +} + +static inline bool +ehooks_merge_will_fail(ehooks_t *ehooks) { + return ehooks_get_extent_hooks_ptr(ehooks)->merge == NULL; +} + +static inline bool +ehooks_guard_will_fail(ehooks_t *ehooks) { + /* + * Before the guard hooks are officially introduced, limit the use to + * the default hooks only. + */ + return !ehooks_are_default(ehooks); +} + +/* + * Some hooks are required to return zeroed memory in certain situations. In + * debug mode, we do some heuristic checks that they did what they were supposed + * to. + * + * This isn't really ehooks-specific (i.e. anyone can check for zeroed memory). + * But incorrect zero information indicates an ehook bug. + */ +static inline void +ehooks_debug_zero_check(void *addr, size_t size) { + assert(((uintptr_t)addr & PAGE_MASK) == 0); + assert((size & PAGE_MASK) == 0); + assert(size > 0); + if (config_debug) { + /* Check the whole first page. */ + size_t *p = (size_t *)addr; + for (size_t i = 0; i < PAGE / sizeof(size_t); i++) { + assert(p[i] == 0); + } + /* + * And 4 spots within. There's a tradeoff here; the larger + * this number, the more likely it is that we'll catch a bug + * where ehooks return a sparsely non-zero range. But + * increasing the number of checks also increases the number of + * page faults in debug mode. FreeBSD does much of their + * day-to-day development work in debug mode, so we don't want + * even the debug builds to be too slow. 
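+		 *
+		 * For example, with nchecks == 4 the probed indices are
+		 * i * (size / sizeof(size_t) / 4), i.e. the four probes are
+		 * spaced size / 4 bytes apart across the extent.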
+ */ + const size_t nchecks = 4; + assert(PAGE >= sizeof(size_t) * nchecks); + for (size_t i = 0; i < nchecks; ++i) { + assert(p[i * (size / sizeof(size_t) / nchecks)] == 0); + } + } +} + + +static inline void * +ehooks_alloc(tsdn_t *tsdn, ehooks_t *ehooks, void *new_addr, size_t size, + size_t alignment, bool *zero, bool *commit) { + bool orig_zero = *zero; + void *ret; + extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks); + if (extent_hooks == &ehooks_default_extent_hooks) { + ret = ehooks_default_alloc_impl(tsdn, new_addr, size, + alignment, zero, commit, ehooks_ind_get(ehooks)); + } else { + ehooks_pre_reentrancy(tsdn); + ret = extent_hooks->alloc(extent_hooks, new_addr, size, + alignment, zero, commit, ehooks_ind_get(ehooks)); + ehooks_post_reentrancy(tsdn); + } + assert(new_addr == NULL || ret == NULL || new_addr == ret); + assert(!orig_zero || *zero); + if (*zero && ret != NULL) { + ehooks_debug_zero_check(ret, size); + } + return ret; +} + +static inline bool +ehooks_dalloc(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size, + bool committed) { + extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks); + if (extent_hooks == &ehooks_default_extent_hooks) { + return ehooks_default_dalloc_impl(addr, size); + } else if (extent_hooks->dalloc == NULL) { + return true; + } else { + ehooks_pre_reentrancy(tsdn); + bool err = extent_hooks->dalloc(extent_hooks, addr, size, + committed, ehooks_ind_get(ehooks)); + ehooks_post_reentrancy(tsdn); + return err; + } +} + +static inline void +ehooks_destroy(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size, + bool committed) { + extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks); + if (extent_hooks == &ehooks_default_extent_hooks) { + ehooks_default_destroy_impl(addr, size); + } else if (extent_hooks->destroy == NULL) { + /* Do nothing. 
*/ + } else { + ehooks_pre_reentrancy(tsdn); + extent_hooks->destroy(extent_hooks, addr, size, committed, + ehooks_ind_get(ehooks)); + ehooks_post_reentrancy(tsdn); + } +} + +static inline bool +ehooks_commit(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size, + size_t offset, size_t length) { + extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks); + bool err; + if (extent_hooks == &ehooks_default_extent_hooks) { + err = ehooks_default_commit_impl(addr, offset, length); + } else if (extent_hooks->commit == NULL) { + err = true; + } else { + ehooks_pre_reentrancy(tsdn); + err = extent_hooks->commit(extent_hooks, addr, size, + offset, length, ehooks_ind_get(ehooks)); + ehooks_post_reentrancy(tsdn); + } + if (!err) { + ehooks_debug_zero_check(addr, size); + } + return err; +} + +static inline bool +ehooks_decommit(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size, + size_t offset, size_t length) { + extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks); + if (extent_hooks == &ehooks_default_extent_hooks) { + return ehooks_default_decommit_impl(addr, offset, length); + } else if (extent_hooks->decommit == NULL) { + return true; + } else { + ehooks_pre_reentrancy(tsdn); + bool err = extent_hooks->decommit(extent_hooks, addr, size, + offset, length, ehooks_ind_get(ehooks)); + ehooks_post_reentrancy(tsdn); + return err; + } +} + +static inline bool +ehooks_purge_lazy(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size, + size_t offset, size_t length) { + extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks); +#ifdef PAGES_CAN_PURGE_LAZY + if (extent_hooks == &ehooks_default_extent_hooks) { + return ehooks_default_purge_lazy_impl(addr, offset, length); + } +#endif + if (extent_hooks->purge_lazy == NULL) { + return true; + } else { + ehooks_pre_reentrancy(tsdn); + bool err = extent_hooks->purge_lazy(extent_hooks, addr, size, + offset, length, ehooks_ind_get(ehooks)); + ehooks_post_reentrancy(tsdn); + return err; + } +} + +static inline bool +ehooks_purge_forced(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size, + size_t offset, size_t length) { + extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks); + /* + * It would be correct to have a ehooks_debug_zero_check call at the end + * of this function; purge_forced is required to zero. But checking + * would touch the page in question, which may have performance + * consequences (imagine the hooks are using hugepages, with a global + * zero page off). Even in debug mode, it's usually a good idea to + * avoid cases that can dramatically increase memory consumption. 
+	 */
+#ifdef PAGES_CAN_PURGE_FORCED
+	if (extent_hooks == &ehooks_default_extent_hooks) {
+		return ehooks_default_purge_forced_impl(addr, offset, length);
+	}
+#endif
+	if (extent_hooks->purge_forced == NULL) {
+		return true;
+	} else {
+		ehooks_pre_reentrancy(tsdn);
+		bool err = extent_hooks->purge_forced(extent_hooks, addr, size,
+		    offset, length, ehooks_ind_get(ehooks));
+		ehooks_post_reentrancy(tsdn);
+		return err;
+	}
+}
+
+static inline bool
+ehooks_split(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size,
+    size_t size_a, size_t size_b, bool committed) {
+	extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks);
+	if (ehooks_are_default(ehooks)) {
+		return ehooks_default_split_impl();
+	} else if (extent_hooks->split == NULL) {
+		return true;
+	} else {
+		ehooks_pre_reentrancy(tsdn);
+		bool err = extent_hooks->split(extent_hooks, addr, size, size_a,
+		    size_b, committed, ehooks_ind_get(ehooks));
+		ehooks_post_reentrancy(tsdn);
+		return err;
+	}
+}
+
+static inline bool
+ehooks_merge(tsdn_t *tsdn, ehooks_t *ehooks, void *addr_a, size_t size_a,
+    void *addr_b, size_t size_b, bool committed) {
+	extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks);
+	if (extent_hooks == &ehooks_default_extent_hooks) {
+		return ehooks_default_merge_impl(tsdn, addr_a, addr_b);
+	} else if (extent_hooks->merge == NULL) {
+		return true;
+	} else {
+		ehooks_pre_reentrancy(tsdn);
+		bool err = extent_hooks->merge(extent_hooks, addr_a, size_a,
+		    addr_b, size_b, committed, ehooks_ind_get(ehooks));
+		ehooks_post_reentrancy(tsdn);
+		return err;
+	}
+}
+
+static inline void
+ehooks_zero(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size) {
+	extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks);
+	if (extent_hooks == &ehooks_default_extent_hooks) {
+		ehooks_default_zero_impl(addr, size);
+	} else {
+		/*
+		 * It would be correct to try using the user-provided purge
+		 * hooks (since they are required to have zeroed the extent if
+		 * they indicate success), but we don't necessarily know their
+		 * cost. We'll be conservative and use memset.
+		 */
+		memset(addr, 0, size);
+	}
+}
+
+static inline bool
+ehooks_guard(tsdn_t *tsdn, ehooks_t *ehooks, void *guard1, void *guard2) {
+	bool err;
+	extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks);
+
+	if (extent_hooks == &ehooks_default_extent_hooks) {
+		ehooks_default_guard_impl(guard1, guard2);
+		err = false;
+	} else {
+		err = true;
+	}
+
+	return err;
+}
+
+static inline bool
+ehooks_unguard(tsdn_t *tsdn, ehooks_t *ehooks, void *guard1, void *guard2) {
+	bool err;
+	extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks);
+
+	if (extent_hooks == &ehooks_default_extent_hooks) {
+		ehooks_default_unguard_impl(guard1, guard2);
+		err = false;
+	} else {
+		err = true;
+	}
+
+	return err;
+}
+
+#endif /* JEMALLOC_INTERNAL_EHOOKS_H */
diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/emap.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/emap.h
new file mode 100644
index 000000000..7ac0ae95a
--- /dev/null
+++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/emap.h
@@ -0,0 +1,358 @@
+#ifndef JEMALLOC_INTERNAL_EMAP_H
+#define JEMALLOC_INTERNAL_EMAP_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/base.h"
+#include "jemalloc/internal/rtree.h"
+
+/*
+ * Note: Ends without a semicolon, so that
+ *	EMAP_DECLARE_RTREE_CTX;
+ * in uses will avoid empty-statement warnings.
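+ *
+ * An illustrative (hypothetical) use, with the trailing semicolon supplied
+ * by the caller:
+ *
+ *	void f(tsdn_t *tsdn, emap_t *emap) {
+ *		EMAP_DECLARE_RTREE_CTX;
+ *		...
+ *	}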
+ */ +#define EMAP_DECLARE_RTREE_CTX \ + rtree_ctx_t rtree_ctx_fallback; \ + rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback) + +typedef struct emap_s emap_t; +struct emap_s { + rtree_t rtree; +}; + +/* Used to pass rtree lookup context down the path. */ +typedef struct emap_alloc_ctx_t emap_alloc_ctx_t; +struct emap_alloc_ctx_t { + szind_t szind; + bool slab; +}; + +typedef struct emap_full_alloc_ctx_s emap_full_alloc_ctx_t; +struct emap_full_alloc_ctx_s { + szind_t szind; + bool slab; + edata_t *edata; +}; + +bool emap_init(emap_t *emap, base_t *base, bool zeroed); + +void emap_remap(tsdn_t *tsdn, emap_t *emap, edata_t *edata, szind_t szind, + bool slab); + +void emap_update_edata_state(tsdn_t *tsdn, emap_t *emap, edata_t *edata, + extent_state_t state); + +/* + * The two acquire functions below allow accessing neighbor edatas, if it's safe + * and valid to do so (i.e. from the same arena, of the same state, etc.). This + * is necessary because the ecache locks are state based, and only protect + * edatas with the same state. Therefore the neighbor edata's state needs to be + * verified first, before chasing the edata pointer. The returned edata will be + * in an acquired state, meaning other threads will be prevented from accessing + * it, even if technically the edata can still be discovered from the rtree. + * + * This means, at any moment when holding pointers to edata, either one of the + * state based locks is held (and the edatas are all of the protected state), or + * the edatas are in an acquired state (e.g. in active or merging state). The + * acquire operation itself (changing the edata to an acquired state) is done + * under the state locks. + */ +edata_t *emap_try_acquire_edata_neighbor(tsdn_t *tsdn, emap_t *emap, + edata_t *edata, extent_pai_t pai, extent_state_t expected_state, + bool forward); +edata_t *emap_try_acquire_edata_neighbor_expand(tsdn_t *tsdn, emap_t *emap, + edata_t *edata, extent_pai_t pai, extent_state_t expected_state); +void emap_release_edata(tsdn_t *tsdn, emap_t *emap, edata_t *edata, + extent_state_t new_state); + +/* + * Associate the given edata with its beginning and end address, setting the + * szind and slab info appropriately. + * Returns true on error (i.e. resource exhaustion). + */ +bool emap_register_boundary(tsdn_t *tsdn, emap_t *emap, edata_t *edata, + szind_t szind, bool slab); + +/* + * Does the same thing, but with the interior of the range, for slab + * allocations. + * + * You might wonder why we don't just have a single emap_register function that + * does both depending on the value of 'slab'. The answer is twofold: + * - As a practical matter, in places like the extract->split->commit pathway, + * we defer the interior operation until we're sure that the commit won't fail + * (but we have to register the split boundaries there). + * - In general, we're trying to move to a world where the page-specific + * allocator doesn't know as much about how the pages it allocates will be + * used, and passing a 'slab' parameter everywhere makes that more + * complicated. + * + * Unlike the boundary version, this function can't fail; this is because slabs + * can't get big enough to touch a new page that neither of the boundaries + * touched, so no allocation is necessary to fill the interior once the boundary + * has been touched. 
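+ *
+ * A sketch of the intended ordering for a new slab, inferred from the notes
+ * above (names illustrative, error handling elided):
+ *
+ *	if (emap_register_boundary(tsdn, emap, edata, szind, true)) {
+ *		... resource exhaustion; unwind ...
+ *	}
+ *	... commit the pages; only once that can no longer fail ...
+ *	emap_register_interior(tsdn, emap, edata, szind);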
+ */
+void emap_register_interior(tsdn_t *tsdn, emap_t *emap, edata_t *edata,
+    szind_t szind);
+
+void emap_deregister_boundary(tsdn_t *tsdn, emap_t *emap, edata_t *edata);
+void emap_deregister_interior(tsdn_t *tsdn, emap_t *emap, edata_t *edata);
+
+typedef struct emap_prepare_s emap_prepare_t;
+struct emap_prepare_s {
+	rtree_leaf_elm_t *lead_elm_a;
+	rtree_leaf_elm_t *lead_elm_b;
+	rtree_leaf_elm_t *trail_elm_a;
+	rtree_leaf_elm_t *trail_elm_b;
+};
+
+/**
+ * These functions handle the emap metadata management for merging, splitting,
+ * and reusing extents. In particular, they set the boundary mappings from
+ * addresses to edatas. If the result is going to be used as a slab, you
+ * still need to call emap_register_interior on it, though.
+ *
+ * Remap simply changes the szind and slab status of an extent's boundary
+ * mappings. If the extent is not a slab, it doesn't bother with updating the
+ * end mapping (since lookups only occur in the interior of an extent for
+ * slabs). Since the szind and slab status only make sense for active extents,
+ * this should only be called while activating or deactivating an extent.
+ *
+ * Split and merge have a "prepare" and a "commit" portion. The prepare portion
+ * does the operations that can be done without exclusive access to the extent
+ * in question, while the commit variant requires exclusive access to maintain
+ * the emap invariants. The only function that can fail is emap_split_prepare,
+ * and it returns true on failure (at which point the caller shouldn't commit).
+ *
+ * In all cases, "lead" refers to the lower-addressed extent, and "trail" to
+ * the higher-addressed one. It's the caller's responsibility to set the edata
+ * state appropriately.
+ */
+bool emap_split_prepare(tsdn_t *tsdn, emap_t *emap, emap_prepare_t *prepare,
+    edata_t *edata, size_t size_a, edata_t *trail, size_t size_b);
+void emap_split_commit(tsdn_t *tsdn, emap_t *emap, emap_prepare_t *prepare,
+    edata_t *lead, size_t size_a, edata_t *trail, size_t size_b);
+void emap_merge_prepare(tsdn_t *tsdn, emap_t *emap, emap_prepare_t *prepare,
+    edata_t *lead, edata_t *trail);
+void emap_merge_commit(tsdn_t *tsdn, emap_t *emap, emap_prepare_t *prepare,
+    edata_t *lead, edata_t *trail);
+
+/* Assert that the emap's view of the given edata matches the edata's view. */
+void emap_do_assert_mapped(tsdn_t *tsdn, emap_t *emap, edata_t *edata);
+static inline void
+emap_assert_mapped(tsdn_t *tsdn, emap_t *emap, edata_t *edata) {
+	if (config_debug) {
+		emap_do_assert_mapped(tsdn, emap, edata);
+	}
+}
+
+/* Assert that the given edata isn't in the map. */
+void emap_do_assert_not_mapped(tsdn_t *tsdn, emap_t *emap, edata_t *edata);
+static inline void
+emap_assert_not_mapped(tsdn_t *tsdn, emap_t *emap, edata_t *edata) {
+	if (config_debug) {
+		emap_do_assert_not_mapped(tsdn, emap, edata);
+	}
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+emap_edata_in_transition(tsdn_t *tsdn, emap_t *emap, edata_t *edata) {
+	assert(config_debug);
+	emap_assert_mapped(tsdn, emap, edata);
+
+	EMAP_DECLARE_RTREE_CTX;
+	rtree_contents_t contents = rtree_read(tsdn, &emap->rtree, rtree_ctx,
+	    (uintptr_t)edata_base_get(edata));
+
+	return edata_state_in_transition(contents.metadata.state);
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+emap_edata_is_acquired(tsdn_t *tsdn, emap_t *emap, edata_t *edata) {
+	if (!config_debug) {
+		/* For assertions only. */
+		return false;
+	}
+
+	/*
+	 * The edata is considered acquired if no other threads will attempt to
+	 * read / write any fields from it.
This includes a few cases: + * + * 1) edata not hooked into emap yet -- This implies the edata just got + * allocated or initialized. + * + * 2) in an active or transition state -- In both cases, the edata can + * be discovered from the emap, however the state tracked in the rtree + * will prevent other threads from accessing the actual edata. + */ + EMAP_DECLARE_RTREE_CTX; + rtree_leaf_elm_t *elm = rtree_leaf_elm_lookup(tsdn, &emap->rtree, + rtree_ctx, (uintptr_t)edata_base_get(edata), /* dependent */ false, + /* init_missing */ false); + if (elm == NULL) { + return true; + } + rtree_contents_t contents = rtree_leaf_elm_read(tsdn, &emap->rtree, elm, + /* dependent */ false); + if (contents.edata == NULL || + contents.metadata.state == extent_state_active || + edata_state_in_transition(contents.metadata.state)) { + return true; + } + + return false; +} + +JEMALLOC_ALWAYS_INLINE void +extent_assert_can_coalesce(const edata_t *inner, const edata_t *outer) { + assert(edata_arena_ind_get(inner) == edata_arena_ind_get(outer)); + assert(edata_pai_get(inner) == edata_pai_get(outer)); + assert(edata_committed_get(inner) == edata_committed_get(outer)); + assert(edata_state_get(inner) == extent_state_active); + assert(edata_state_get(outer) == extent_state_merging); + assert(!edata_guarded_get(inner) && !edata_guarded_get(outer)); + assert(edata_base_get(inner) == edata_past_get(outer) || + edata_base_get(outer) == edata_past_get(inner)); +} + +JEMALLOC_ALWAYS_INLINE void +extent_assert_can_expand(const edata_t *original, const edata_t *expand) { + assert(edata_arena_ind_get(original) == edata_arena_ind_get(expand)); + assert(edata_pai_get(original) == edata_pai_get(expand)); + assert(edata_state_get(original) == extent_state_active); + assert(edata_state_get(expand) == extent_state_merging); + assert(edata_past_get(original) == edata_base_get(expand)); +} + +JEMALLOC_ALWAYS_INLINE edata_t * +emap_edata_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr) { + EMAP_DECLARE_RTREE_CTX; + + return rtree_read(tsdn, &emap->rtree, rtree_ctx, (uintptr_t)ptr).edata; +} + +/* Fills in alloc_ctx with the info in the map. */ +JEMALLOC_ALWAYS_INLINE void +emap_alloc_ctx_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr, + emap_alloc_ctx_t *alloc_ctx) { + EMAP_DECLARE_RTREE_CTX; + + rtree_metadata_t metadata = rtree_metadata_read(tsdn, &emap->rtree, + rtree_ctx, (uintptr_t)ptr); + alloc_ctx->szind = metadata.szind; + alloc_ctx->slab = metadata.slab; +} + +/* The pointer must be mapped. */ +JEMALLOC_ALWAYS_INLINE void +emap_full_alloc_ctx_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr, + emap_full_alloc_ctx_t *full_alloc_ctx) { + EMAP_DECLARE_RTREE_CTX; + + rtree_contents_t contents = rtree_read(tsdn, &emap->rtree, rtree_ctx, + (uintptr_t)ptr); + full_alloc_ctx->edata = contents.edata; + full_alloc_ctx->szind = contents.metadata.szind; + full_alloc_ctx->slab = contents.metadata.slab; +} + +/* + * The pointer is allowed to not be mapped. + * + * Returns true when the pointer is not present. 
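+ *
+ * Illustrative caller-side sketch (not part of the API contract):
+ *
+ *	emap_full_alloc_ctx_t ctx;
+ *	if (emap_full_alloc_ctx_try_lookup(tsdn, emap, ptr, &ctx)) {
+ *		... ptr is not an allocation tracked in this emap ...
+ *	}
+ *	... otherwise ctx.edata, ctx.szind and ctx.slab are filled in ...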
+ */
+JEMALLOC_ALWAYS_INLINE bool
+emap_full_alloc_ctx_try_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr,
+    emap_full_alloc_ctx_t *full_alloc_ctx) {
+	EMAP_DECLARE_RTREE_CTX;
+
+	rtree_contents_t contents;
+	bool err = rtree_read_independent(tsdn, &emap->rtree, rtree_ctx,
+	    (uintptr_t)ptr, &contents);
+	if (err) {
+		return true;
+	}
+	full_alloc_ctx->edata = contents.edata;
+	full_alloc_ctx->szind = contents.metadata.szind;
+	full_alloc_ctx->slab = contents.metadata.slab;
+	return false;
+}
+
+/*
+ * Only used on the fastpath of free. Returns true when the lookup cannot be
+ * fulfilled by the fast path, e.g. when the metadata key is not cached.
+ */
+JEMALLOC_ALWAYS_INLINE bool
+emap_alloc_ctx_try_lookup_fast(tsd_t *tsd, emap_t *emap, const void *ptr,
+    emap_alloc_ctx_t *alloc_ctx) {
+	/* Use the unsafe getter since this may get called during exit. */
+	rtree_ctx_t *rtree_ctx = tsd_rtree_ctxp_get_unsafe(tsd);
+
+	rtree_metadata_t metadata;
+	bool err = rtree_metadata_try_read_fast(tsd_tsdn(tsd), &emap->rtree,
+	    rtree_ctx, (uintptr_t)ptr, &metadata);
+	if (err) {
+		return true;
+	}
+	alloc_ctx->szind = metadata.szind;
+	alloc_ctx->slab = metadata.slab;
+	return false;
+}
+
+/*
+ * We want to do batch lookups out of the cache bins, which use
+ * cache_bin_ptr_array_get to access the i'th element of the bin (since they
+ * invert the usual ordering in deciding what to flush). This lets the emap
+ * avoid caring about its caller's ordering.
+ */
+typedef const void *(*emap_ptr_getter)(void *ctx, size_t ind);
+/*
+ * This allows size-checking assertions, which we can only do while we're in
+ * the process of edata lookups.
+ */
+typedef void (*emap_metadata_visitor)(void *ctx,
+    emap_full_alloc_ctx_t *alloc_ctx);
+
+typedef union emap_batch_lookup_result_u emap_batch_lookup_result_t;
+union emap_batch_lookup_result_u {
+	edata_t *edata;
+	rtree_leaf_elm_t *rtree_leaf;
+};
+
+JEMALLOC_ALWAYS_INLINE void
+emap_edata_lookup_batch(tsd_t *tsd, emap_t *emap, size_t nptrs,
+    emap_ptr_getter ptr_getter, void *ptr_getter_ctx,
+    emap_metadata_visitor metadata_visitor, void *metadata_visitor_ctx,
+    emap_batch_lookup_result_t *result) {
+	/* Avoids null-checking tsdn in the loop below. */
+	util_assume(tsd != NULL);
+	rtree_ctx_t *rtree_ctx = tsd_rtree_ctxp_get(tsd);
+
+	for (size_t i = 0; i < nptrs; i++) {
+		const void *ptr = ptr_getter(ptr_getter_ctx, i);
+		/*
+		 * Reuse the edatas array as a temp buffer, lying a little about
+		 * the types.
+		 */
+		result[i].rtree_leaf = rtree_leaf_elm_lookup(tsd_tsdn(tsd),
+		    &emap->rtree, rtree_ctx, (uintptr_t)ptr,
+		    /* dependent */ true, /* init_missing */ false);
+	}
+
+	for (size_t i = 0; i < nptrs; i++) {
+		rtree_leaf_elm_t *elm = result[i].rtree_leaf;
+		rtree_contents_t contents = rtree_leaf_elm_read(tsd_tsdn(tsd),
+		    &emap->rtree, elm, /* dependent */ true);
+		result[i].edata = contents.edata;
+		emap_full_alloc_ctx_t alloc_ctx;
+		/*
+		 * Not all these fields are read in practice by the metadata
+		 * visitor. But the compiler can easily optimize away the ones
+		 * that aren't, so no sense in being incomplete.
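+		 *
+		 * As a concrete illustration of the callback contract: a
+		 * caller might pass an emap_ptr_getter that simply indexes a
+		 * plain array of pointers held in ptr_getter_ctx; the metadata
+		 * visitor is then invoked once per pointer, in the same order.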
+ */ + alloc_ctx.szind = contents.metadata.szind; + alloc_ctx.slab = contents.metadata.slab; + alloc_ctx.edata = contents.edata; + metadata_visitor(metadata_visitor_ctx, &alloc_ctx); + } +} + +#endif /* JEMALLOC_INTERNAL_EMAP_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/emitter.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/emitter.h new file mode 100644 index 000000000..11153254b --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/emitter.h @@ -0,0 +1,534 @@ +#ifndef JEMALLOC_INTERNAL_EMITTER_H +#define JEMALLOC_INTERNAL_EMITTER_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/jemalloc_internal_types.h" +#include "jemalloc/internal/malloc_io.h" +#include "jemalloc/internal/ql.h" + +typedef enum emitter_output_e emitter_output_t; +enum emitter_output_e { + emitter_output_json, + emitter_output_json_compact, + emitter_output_table +}; + +typedef enum emitter_justify_e emitter_justify_t; +enum emitter_justify_e { + emitter_justify_left, + emitter_justify_right, + /* Not for users; just to pass to internal functions. */ + emitter_justify_none +}; + +typedef enum emitter_type_e emitter_type_t; +enum emitter_type_e { + emitter_type_bool, + emitter_type_int, + emitter_type_int64, + emitter_type_unsigned, + emitter_type_uint32, + emitter_type_uint64, + emitter_type_size, + emitter_type_ssize, + emitter_type_string, + /* + * A title is a column title in a table; it's just a string, but it's + * not quoted. + */ + emitter_type_title, +}; + +typedef struct emitter_col_s emitter_col_t; +struct emitter_col_s { + /* Filled in by the user. */ + emitter_justify_t justify; + int width; + emitter_type_t type; + union { + bool bool_val; + int int_val; + unsigned unsigned_val; + uint32_t uint32_val; + uint32_t uint32_t_val; + uint64_t uint64_val; + uint64_t uint64_t_val; + size_t size_val; + ssize_t ssize_val; + const char *str_val; + }; + + /* Filled in by initialization. */ + ql_elm(emitter_col_t) link; +}; + +typedef struct emitter_row_s emitter_row_t; +struct emitter_row_s { + ql_head(emitter_col_t) cols; +}; + +typedef struct emitter_s emitter_t; +struct emitter_s { + emitter_output_t output; + /* The output information. */ + write_cb_t *write_cb; + void *cbopaque; + int nesting_depth; + /* True if we've already emitted a value at the given depth. */ + bool item_at_depth; + /* True if we emitted a key and will emit corresponding value next. */ + bool emitted_key; +}; + +static inline bool +emitter_outputs_json(emitter_t *emitter) { + return emitter->output == emitter_output_json || + emitter->output == emitter_output_json_compact; +} + +/* Internal convenience function. Write to the emitter the given string. */ +JEMALLOC_FORMAT_PRINTF(2, 3) +static inline void +emitter_printf(emitter_t *emitter, const char *format, ...) 
{
+	va_list ap;
+
+	va_start(ap, format);
+	malloc_vcprintf(emitter->write_cb, emitter->cbopaque, format, ap);
+	va_end(ap);
+}
+
+static inline const char * JEMALLOC_FORMAT_ARG(3)
+emitter_gen_fmt(char *out_fmt, size_t out_size, const char *fmt_specifier,
+    emitter_justify_t justify, int width) {
+	size_t written;
+	fmt_specifier++;
+	if (justify == emitter_justify_none) {
+		written = malloc_snprintf(out_fmt, out_size,
+		    "%%%s", fmt_specifier);
+	} else if (justify == emitter_justify_left) {
+		written = malloc_snprintf(out_fmt, out_size,
+		    "%%-%d%s", width, fmt_specifier);
+	} else {
+		written = malloc_snprintf(out_fmt, out_size,
+		    "%%%d%s", width, fmt_specifier);
+	}
+	/* Only happens in case of bad format string, which *we* choose. */
+	assert(written < out_size);
+	return out_fmt;
+}
+
+static inline void
+emitter_emit_str(emitter_t *emitter, emitter_justify_t justify, int width,
+    char *fmt, size_t fmt_size, const char *str) {
+#define BUF_SIZE 256
+	char buf[BUF_SIZE];
+	size_t str_written = malloc_snprintf(buf, BUF_SIZE, "\"%s\"", str);
+	emitter_printf(emitter,
+	    emitter_gen_fmt(fmt, fmt_size, "%s", justify, width), buf);
+	if (str_written < BUF_SIZE) {
+		return;
+	}
+	/*
+	 * There is no support for justifying long strings at the moment: we
+	 * output them in pieces with multiple malloc_snprintf calls, and
+	 * justification works correctly only within one call. Fortunately
+	 * this is not a big concern, as we don't use justification with long
+	 * strings right now.
+	 *
+	 * We emitted the leading quotation mark and the trailing '\0', hence
+	 * we need to exclude those extra characters from the str shift.
+	 */
+	str += BUF_SIZE - 2;
+	do {
+		str_written = malloc_snprintf(buf, BUF_SIZE, "%s\"", str);
+		str += str_written >= BUF_SIZE ? BUF_SIZE - 1 : str_written;
+		emitter_printf(emitter,
+		    emitter_gen_fmt(fmt, fmt_size, "%s", justify, width), buf);
+	} while (str_written >= BUF_SIZE);
+#undef BUF_SIZE
}
+
+/*
+ * Internal. Emit the given value type in the relevant encoding (so that the
+ * bool true gets mapped to json "true", but the string "true" gets mapped to
+ * json "\"true\"", for instance).
+ *
+ * Width is ignored if justify is emitter_justify_none.
+ */
+static inline void
+emitter_print_value(emitter_t *emitter, emitter_justify_t justify, int width,
+    emitter_type_t value_type, const void *value) {
+#define FMT_SIZE 10
+	/*
+	 * We dynamically generate a format string to emit, to let us use the
+	 * snprintf machinery. This is kinda hacky, but gets the job done
+	 * quickly without having to think about the various snprintf edge
+	 * cases.
+	 */
+	char fmt[FMT_SIZE];
+
+#define EMIT_SIMPLE(type, format) \
+	emitter_printf(emitter, \
+	    emitter_gen_fmt(fmt, FMT_SIZE, format, justify, width), \
+	    *(const type *)value);
+
+	switch (value_type) {
+	case emitter_type_bool:
+		emitter_printf(emitter,
+		    emitter_gen_fmt(fmt, FMT_SIZE, "%s", justify, width),
+		    *(const bool *)value ?
"true" : "false"); + break; + case emitter_type_int: + EMIT_SIMPLE(int, "%d") + break; + case emitter_type_int64: + EMIT_SIMPLE(int64_t, "%" FMTd64) + break; + case emitter_type_unsigned: + EMIT_SIMPLE(unsigned, "%u") + break; + case emitter_type_ssize: + EMIT_SIMPLE(ssize_t, "%zd") + break; + case emitter_type_size: + EMIT_SIMPLE(size_t, "%zu") + break; + case emitter_type_string: + emitter_emit_str(emitter, justify, width, fmt, FMT_SIZE, + *(const char *const *)value); + break; + case emitter_type_uint32: + EMIT_SIMPLE(uint32_t, "%" FMTu32) + break; + case emitter_type_uint64: + EMIT_SIMPLE(uint64_t, "%" FMTu64) + break; + case emitter_type_title: + EMIT_SIMPLE(char *const, "%s"); + break; + default: + unreachable(); + } +#undef FMT_SIZE +} + + +/* Internal functions. In json mode, tracks nesting state. */ +static inline void +emitter_nest_inc(emitter_t *emitter) { + emitter->nesting_depth++; + emitter->item_at_depth = false; +} + +static inline void +emitter_nest_dec(emitter_t *emitter) { + emitter->nesting_depth--; + emitter->item_at_depth = true; +} + +static inline void +emitter_indent(emitter_t *emitter) { + int amount = emitter->nesting_depth; + const char *indent_str; + assert(emitter->output != emitter_output_json_compact); + if (emitter->output == emitter_output_json) { + indent_str = "\t"; + } else { + amount *= 2; + indent_str = " "; + } + for (int i = 0; i < amount; i++) { + emitter_printf(emitter, "%s", indent_str); + } +} + +static inline void +emitter_json_key_prefix(emitter_t *emitter) { + assert(emitter_outputs_json(emitter)); + if (emitter->emitted_key) { + emitter->emitted_key = false; + return; + } + if (emitter->item_at_depth) { + emitter_printf(emitter, ","); + } + if (emitter->output != emitter_output_json_compact) { + emitter_printf(emitter, "\n"); + emitter_indent(emitter); + } +} + +/******************************************************************************/ +/* Public functions for emitter_t. */ + +static inline void +emitter_init(emitter_t *emitter, emitter_output_t emitter_output, + write_cb_t *write_cb, void *cbopaque) { + emitter->output = emitter_output; + emitter->write_cb = write_cb; + emitter->cbopaque = cbopaque; + emitter->item_at_depth = false; + emitter->emitted_key = false; + emitter->nesting_depth = 0; +} + +/******************************************************************************/ +/* JSON public API. */ + +/* + * Emits a key (e.g. as appears in an object). The next json entity emitted will + * be the corresponding value. + */ +static inline void +emitter_json_key(emitter_t *emitter, const char *json_key) { + if (emitter_outputs_json(emitter)) { + emitter_json_key_prefix(emitter); + emitter_printf(emitter, "\"%s\":%s", json_key, + emitter->output == emitter_output_json_compact ? "" : " "); + emitter->emitted_key = true; + } +} + +static inline void +emitter_json_value(emitter_t *emitter, emitter_type_t value_type, + const void *value) { + if (emitter_outputs_json(emitter)) { + emitter_json_key_prefix(emitter); + emitter_print_value(emitter, emitter_justify_none, -1, + value_type, value); + emitter->item_at_depth = true; + } +} + +/* Shorthand for calling emitter_json_key and then emitter_json_value. 
*/ +static inline void +emitter_json_kv(emitter_t *emitter, const char *json_key, + emitter_type_t value_type, const void *value) { + emitter_json_key(emitter, json_key); + emitter_json_value(emitter, value_type, value); +} + +static inline void +emitter_json_array_begin(emitter_t *emitter) { + if (emitter_outputs_json(emitter)) { + emitter_json_key_prefix(emitter); + emitter_printf(emitter, "["); + emitter_nest_inc(emitter); + } +} + +/* Shorthand for calling emitter_json_key and then emitter_json_array_begin. */ +static inline void +emitter_json_array_kv_begin(emitter_t *emitter, const char *json_key) { + emitter_json_key(emitter, json_key); + emitter_json_array_begin(emitter); +} + +static inline void +emitter_json_array_end(emitter_t *emitter) { + if (emitter_outputs_json(emitter)) { + assert(emitter->nesting_depth > 0); + emitter_nest_dec(emitter); + if (emitter->output != emitter_output_json_compact) { + emitter_printf(emitter, "\n"); + emitter_indent(emitter); + } + emitter_printf(emitter, "]"); + } +} + +static inline void +emitter_json_object_begin(emitter_t *emitter) { + if (emitter_outputs_json(emitter)) { + emitter_json_key_prefix(emitter); + emitter_printf(emitter, "{"); + emitter_nest_inc(emitter); + } +} + +/* Shorthand for calling emitter_json_key and then emitter_json_object_begin. */ +static inline void +emitter_json_object_kv_begin(emitter_t *emitter, const char *json_key) { + emitter_json_key(emitter, json_key); + emitter_json_object_begin(emitter); +} + +static inline void +emitter_json_object_end(emitter_t *emitter) { + if (emitter_outputs_json(emitter)) { + assert(emitter->nesting_depth > 0); + emitter_nest_dec(emitter); + if (emitter->output != emitter_output_json_compact) { + emitter_printf(emitter, "\n"); + emitter_indent(emitter); + } + emitter_printf(emitter, "}"); + } +} + + +/******************************************************************************/ +/* Table public API. */ + +static inline void +emitter_table_dict_begin(emitter_t *emitter, const char *table_key) { + if (emitter->output == emitter_output_table) { + emitter_indent(emitter); + emitter_printf(emitter, "%s\n", table_key); + emitter_nest_inc(emitter); + } +} + +static inline void +emitter_table_dict_end(emitter_t *emitter) { + if (emitter->output == emitter_output_table) { + emitter_nest_dec(emitter); + } +} + +static inline void +emitter_table_kv_note(emitter_t *emitter, const char *table_key, + emitter_type_t value_type, const void *value, + const char *table_note_key, emitter_type_t table_note_value_type, + const void *table_note_value) { + if (emitter->output == emitter_output_table) { + emitter_indent(emitter); + emitter_printf(emitter, "%s: ", table_key); + emitter_print_value(emitter, emitter_justify_none, -1, + value_type, value); + if (table_note_key != NULL) { + emitter_printf(emitter, " (%s: ", table_note_key); + emitter_print_value(emitter, emitter_justify_none, -1, + table_note_value_type, table_note_value); + emitter_printf(emitter, ")"); + } + emitter_printf(emitter, "\n"); + } + emitter->item_at_depth = true; +} + +static inline void +emitter_table_kv(emitter_t *emitter, const char *table_key, + emitter_type_t value_type, const void *value) { + emitter_table_kv_note(emitter, table_key, value_type, value, NULL, + emitter_type_bool, NULL); +} + + +/* Write to the emitter the given string, but only in table mode. */ +JEMALLOC_FORMAT_PRINTF(2, 3) +static inline void +emitter_table_printf(emitter_t *emitter, const char *format, ...) 
{ + if (emitter->output == emitter_output_table) { + va_list ap; + va_start(ap, format); + malloc_vcprintf(emitter->write_cb, emitter->cbopaque, format, ap); + va_end(ap); + } +} + +static inline void +emitter_table_row(emitter_t *emitter, emitter_row_t *row) { + if (emitter->output != emitter_output_table) { + return; + } + emitter_col_t *col; + ql_foreach(col, &row->cols, link) { + emitter_print_value(emitter, col->justify, col->width, + col->type, (const void *)&col->bool_val); + } + emitter_table_printf(emitter, "\n"); +} + +static inline void +emitter_row_init(emitter_row_t *row) { + ql_new(&row->cols); +} + +static inline void +emitter_col_init(emitter_col_t *col, emitter_row_t *row) { + ql_elm_new(col, link); + ql_tail_insert(&row->cols, col, link); +} + + +/******************************************************************************/ +/* + * Generalized public API. Emits using either JSON or table, according to + * settings in the emitter_t. */ + +/* + * Note emits a different kv pair as well, but only in table mode. Omits the + * note if table_note_key is NULL. + */ +static inline void +emitter_kv_note(emitter_t *emitter, const char *json_key, const char *table_key, + emitter_type_t value_type, const void *value, + const char *table_note_key, emitter_type_t table_note_value_type, + const void *table_note_value) { + if (emitter_outputs_json(emitter)) { + emitter_json_key(emitter, json_key); + emitter_json_value(emitter, value_type, value); + } else { + emitter_table_kv_note(emitter, table_key, value_type, value, + table_note_key, table_note_value_type, table_note_value); + } + emitter->item_at_depth = true; +} + +static inline void +emitter_kv(emitter_t *emitter, const char *json_key, const char *table_key, + emitter_type_t value_type, const void *value) { + emitter_kv_note(emitter, json_key, table_key, value_type, value, NULL, + emitter_type_bool, NULL); +} + +static inline void +emitter_dict_begin(emitter_t *emitter, const char *json_key, + const char *table_header) { + if (emitter_outputs_json(emitter)) { + emitter_json_key(emitter, json_key); + emitter_json_object_begin(emitter); + } else { + emitter_table_dict_begin(emitter, table_header); + } +} + +static inline void +emitter_dict_end(emitter_t *emitter) { + if (emitter_outputs_json(emitter)) { + emitter_json_object_end(emitter); + } else { + emitter_table_dict_end(emitter); + } +} + +static inline void +emitter_begin(emitter_t *emitter) { + if (emitter_outputs_json(emitter)) { + assert(emitter->nesting_depth == 0); + emitter_printf(emitter, "{"); + emitter_nest_inc(emitter); + } else { + /* + * This guarantees that we always call write_cb at least once. + * This is useful if some invariant is established by each call + * to write_cb, but doesn't hold initially: e.g., some buffer + * holds a null-terminated string. + */ + emitter_printf(emitter, "%s", ""); + } +} + +static inline void +emitter_end(emitter_t *emitter) { + if (emitter_outputs_json(emitter)) { + assert(emitter->nesting_depth == 1); + emitter_nest_dec(emitter); + emitter_printf(emitter, "%s", emitter->output == + emitter_output_json_compact ? 
"}" : "\n}\n"); + } +} + +#endif /* JEMALLOC_INTERNAL_EMITTER_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/eset.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/eset.h new file mode 100644 index 000000000..9b7c4a89e --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/eset.h @@ -0,0 +1,78 @@ +#ifndef JEMALLOC_INTERNAL_ESET_H +#define JEMALLOC_INTERNAL_ESET_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/edata.h" +#include "jemalloc/internal/fb.h" +#include "jemalloc/internal/mutex.h" + +/* + * An eset ("extent set") is a quantized collection of extents, with built-in + * LRU queue. + * + * This class is not thread-safe; synchronization must be done externally if + * there are mutating operations. One exception is the stats counters, which + * may be read without any locking. + */ + +typedef struct eset_bin_s eset_bin_t; +struct eset_bin_s { + edata_heap_t heap; + /* + * We do first-fit across multiple size classes. If we compared against + * the min element in each heap directly, we'd take a cache miss per + * extent we looked at. If we co-locate the edata summaries, we only + * take a miss on the edata we're actually going to return (which is + * inevitable anyways). + */ + edata_cmp_summary_t heap_min; +}; + +typedef struct eset_bin_stats_s eset_bin_stats_t; +struct eset_bin_stats_s { + atomic_zu_t nextents; + atomic_zu_t nbytes; +}; + +typedef struct eset_s eset_t; +struct eset_s { + /* Bitmap for which set bits correspond to non-empty heaps. */ + fb_group_t bitmap[FB_NGROUPS(SC_NPSIZES + 1)]; + + /* Quantized per size class heaps of extents. */ + eset_bin_t bins[SC_NPSIZES + 1]; + + eset_bin_stats_t bin_stats[SC_NPSIZES + 1]; + + /* LRU of all extents in heaps. */ + edata_list_inactive_t lru; + + /* Page sum for all extents in heaps. */ + atomic_zu_t npages; + + /* + * A duplication of the data in the containing ecache. We use this only + * for assertions on the states of the passed-in extents. + */ + extent_state_t state; +}; + +void eset_init(eset_t *eset, extent_state_t state); + +size_t eset_npages_get(eset_t *eset); +/* Get the number of extents in the given page size index. */ +size_t eset_nextents_get(eset_t *eset, pszind_t ind); +/* Get the sum total bytes of the extents in the given page size index. */ +size_t eset_nbytes_get(eset_t *eset, pszind_t ind); + +void eset_insert(eset_t *eset, edata_t *edata); +void eset_remove(eset_t *eset, edata_t *edata); +/* + * Select an extent from this eset of the given size and alignment. Returns + * null if no such item could be found. + */ +edata_t *eset_fit(eset_t *eset, size_t esize, size_t alignment, bool exact_only, + unsigned lg_max_fit); + +#endif /* JEMALLOC_INTERNAL_ESET_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/exp_grow.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/exp_grow.h new file mode 100644 index 000000000..40a1add03 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/exp_grow.h @@ -0,0 +1,52 @@ +#ifndef JEMALLOC_INTERNAL_EXP_GROW_H +#define JEMALLOC_INTERNAL_EXP_GROW_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/sz.h" +typedef struct exp_grow_s exp_grow_t; +struct exp_grow_s { + /* + * Next extent size class in a growing series to use when satisfying a + * request via the extent hooks (only if opt_retain). 
This limits the
+	 * number of disjoint virtual memory ranges so that extent merging can
+	 * be effective even if multiple arenas' extent allocation requests are
+	 * highly interleaved.
+	 *
+	 * retain_grow_limit is the max allowed size index to expand to (unless
+	 * the required size is greater). Default is no limit, and controlled
+	 * through mallctl only.
+	 */
+	pszind_t next;
+	pszind_t limit;
+};
+
+static inline bool
+exp_grow_size_prepare(exp_grow_t *exp_grow, size_t alloc_size_min,
+    size_t *r_alloc_size, pszind_t *r_skip) {
+	*r_skip = 0;
+	*r_alloc_size = sz_pind2sz(exp_grow->next + *r_skip);
+	while (*r_alloc_size < alloc_size_min) {
+		(*r_skip)++;
+		if (exp_grow->next + *r_skip >=
+		    sz_psz2ind(SC_LARGE_MAXCLASS)) {
+			/* Outside legal range. */
+			return true;
+		}
+		*r_alloc_size = sz_pind2sz(exp_grow->next + *r_skip);
+	}
+	return false;
+}
+
+static inline void
+exp_grow_size_commit(exp_grow_t *exp_grow, pszind_t skip) {
+	if (exp_grow->next + skip + 1 <= exp_grow->limit) {
+		exp_grow->next += skip + 1;
+	} else {
+		exp_grow->next = exp_grow->limit;
+	}
+}
+
+void exp_grow_init(exp_grow_t *exp_grow);
+
+#endif /* JEMALLOC_INTERNAL_EXP_GROW_H */
diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/extent.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/extent.h
new file mode 100644
index 000000000..17feb703e
--- /dev/null
+++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/extent.h
@@ -0,0 +1,137 @@
+#ifndef JEMALLOC_INTERNAL_EXTENT_H
+#define JEMALLOC_INTERNAL_EXTENT_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/ecache.h"
+#include "jemalloc/internal/ehooks.h"
+#include "jemalloc/internal/pac.h"
+#include "jemalloc/internal/ph.h"
+#include "jemalloc/internal/rtree.h"
+
+/*
+ * This module contains the page-level allocator. It chooses the addresses that
+ * allocations requested by other modules will inhabit, and updates the global
+ * metadata to reflect allocation/deallocation/purging decisions.
+ */
+
+/*
+ * When reusing (and splitting) an active extent,
+ * (1U << opt_lg_extent_max_active_fit) is the max ratio between the size of
+ * the active extent and the new extent.
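+ * E.g. with the default value of 6 below, a 4 KiB request may be satisfied
+ * by splitting an active extent of at most (1U << 6) * 4 KiB == 256 KiB.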
+ */
+#define LG_EXTENT_MAX_ACTIVE_FIT_DEFAULT 6
+extern size_t opt_lg_extent_max_active_fit;
+
+edata_t *ecache_alloc(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
+    ecache_t *ecache, edata_t *expand_edata, size_t size, size_t alignment,
+    bool zero, bool guarded);
+edata_t *ecache_alloc_grow(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
+    ecache_t *ecache, edata_t *expand_edata, size_t size, size_t alignment,
+    bool zero, bool guarded);
+void ecache_dalloc(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
+    ecache_t *ecache, edata_t *edata);
+edata_t *ecache_evict(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
+    ecache_t *ecache, size_t npages_min);
+
+void extent_gdump_add(tsdn_t *tsdn, const edata_t *edata);
+void extent_record(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache,
+    edata_t *edata);
+void extent_dalloc_gap(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
+    edata_t *edata);
+edata_t *extent_alloc_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
+    void *new_addr, size_t size, size_t alignment, bool zero, bool *commit,
+    bool growing_retained);
+void extent_dalloc_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
+    edata_t *edata);
+void extent_destroy_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
+    edata_t *edata);
+bool extent_commit_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata,
+    size_t offset, size_t length);
+bool extent_purge_lazy_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata,
+    size_t offset, size_t length);
+bool extent_purge_forced_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata,
+    size_t offset, size_t length);
+edata_t *extent_split_wrapper(tsdn_t *tsdn, pac_t *pac,
+    ehooks_t *ehooks, edata_t *edata, size_t size_a, size_t size_b,
+    bool holding_core_locks);
+bool extent_merge_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
+    edata_t *a, edata_t *b);
+bool extent_commit_zero(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata,
+    bool commit, bool zero, bool growing_retained);
+size_t extent_sn_next(pac_t *pac);
+bool extent_boot(void);
+
+JEMALLOC_ALWAYS_INLINE bool
+extent_neighbor_head_state_mergeable(bool edata_is_head,
+    bool neighbor_is_head, bool forward) {
+	/*
+	 * Head state checking: disallow merging if the higher-addressed extent
+	 * is a head extent. This helps preserve first-fit, and more
+	 * importantly makes sure no merging happens across arenas.
+	 */
+	if (forward) {
+		if (neighbor_is_head) {
+			return false;
+		}
+	} else {
+		if (edata_is_head) {
+			return false;
+		}
+	}
+	return true;
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+extent_can_acquire_neighbor(edata_t *edata, rtree_contents_t contents,
+    extent_pai_t pai, extent_state_t expected_state, bool forward,
+    bool expanding) {
+	edata_t *neighbor = contents.edata;
+	if (neighbor == NULL) {
+		return false;
+	}
+	/* It's not safe to access *neighbor yet; must verify states first. */
+	bool neighbor_is_head = contents.metadata.is_head;
+	if (!extent_neighbor_head_state_mergeable(edata_is_head_get(edata),
+	    neighbor_is_head, forward)) {
+		return false;
+	}
+	extent_state_t neighbor_state = contents.metadata.state;
+	if (pai == EXTENT_PAI_PAC) {
+		if (neighbor_state != expected_state) {
+			return false;
+		}
+		/* From this point, it's safe to access *neighbor. */
+		if (!expanding && (edata_committed_get(edata) !=
+		    edata_committed_get(neighbor))) {
+			/*
+			 * Some platforms (e.g. Windows) require an explicit
+			 * commit step (and writing to uncommitted memory is
+			 * not allowed).
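+			 * (On Windows the explicit step is VirtualAlloc with
+			 * MEM_COMMIT; merging extents whose committed states
+			 * differ would lose track of which half needs it.)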
+ */ + return false; + } + } else { + if (neighbor_state == extent_state_active) { + return false; + } + /* From this point, it's safe to access *neighbor. */ + } + + assert(edata_pai_get(edata) == pai); + if (edata_pai_get(neighbor) != pai) { + return false; + } + if (opt_retain) { + assert(edata_arena_ind_get(edata) == + edata_arena_ind_get(neighbor)); + } else { + if (edata_arena_ind_get(edata) != + edata_arena_ind_get(neighbor)) { + return false; + } + } + assert(!edata_guarded_get(edata) && !edata_guarded_get(neighbor)); + + return true; +} + +#endif /* JEMALLOC_INTERNAL_EXTENT_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/extent_dss.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/extent_dss.h new file mode 100644 index 000000000..c8e71e82b --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/extent_dss.h @@ -0,0 +1,30 @@ +#ifndef JEMALLOC_INTERNAL_EXTENT_DSS_H +#define JEMALLOC_INTERNAL_EXTENT_DSS_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/arena_types.h" +#include "jemalloc/internal/tsd_types.h" + +typedef enum { + dss_prec_disabled = 0, + dss_prec_primary = 1, + dss_prec_secondary = 2, + + dss_prec_limit = 3 +} dss_prec_t; +#define DSS_PREC_DEFAULT dss_prec_secondary +#define DSS_DEFAULT "secondary" + +extern const char *const dss_prec_names[]; + +extern const char *opt_dss; + +dss_prec_t extent_dss_prec_get(void); +bool extent_dss_prec_set(dss_prec_t dss_prec); +void *extent_alloc_dss(tsdn_t *tsdn, arena_t *arena, void *new_addr, + size_t size, size_t alignment, bool *zero, bool *commit); +bool extent_in_dss(void *addr); +bool extent_dss_mergeable(void *addr_a, void *addr_b); +void extent_dss_boot(void); + +#endif /* JEMALLOC_INTERNAL_EXTENT_DSS_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/extent_mmap.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/extent_mmap.h new file mode 100644 index 000000000..e6a4649e7 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/extent_mmap.h @@ -0,0 +1,12 @@ +#ifndef JEMALLOC_INTERNAL_EXTENT_MMAP_EXTERNS_H +#define JEMALLOC_INTERNAL_EXTENT_MMAP_EXTERNS_H + +#include "jemalloc/internal/jemalloc_preamble.h" + +extern bool opt_retain; + +void *extent_alloc_mmap(void *new_addr, size_t size, size_t alignment, + bool *zero, bool *commit); +bool extent_dalloc_mmap(void *addr, size_t size); + +#endif /* JEMALLOC_INTERNAL_EXTENT_MMAP_EXTERNS_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/fb.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/fb.h new file mode 100644 index 000000000..e38095aff --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/fb.h @@ -0,0 +1,377 @@ +#ifndef JEMALLOC_INTERNAL_FB_H +#define JEMALLOC_INTERNAL_FB_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/bit_util.h" + +/* + * The flat bitmap module. This has a larger API relative to the bitmap module + * (supporting things like backwards searches, and searching for both set and + * unset bits), at the cost of slower operations for very large bitmaps. + * + * Initialized flat bitmaps start at all-zeros (all bits unset). + */ + +typedef unsigned long fb_group_t; +#define FB_GROUP_BITS (ZU(1) << (LG_SIZEOF_LONG + 3)) +#define FB_NGROUPS(nbits) ((nbits) / FB_GROUP_BITS \ + + ((nbits) % FB_GROUP_BITS == 0 ? 
0 : 1)) + +static inline void +fb_init(fb_group_t *fb, size_t nbits) { + size_t ngroups = FB_NGROUPS(nbits); + memset(fb, 0, ngroups * sizeof(fb_group_t)); +} + +static inline bool +fb_empty(fb_group_t *fb, size_t nbits) { + size_t ngroups = FB_NGROUPS(nbits); + for (size_t i = 0; i < ngroups; i++) { + if (fb[i] != 0) { + return false; + } + } + return true; +} + +static inline bool +fb_full(fb_group_t *fb, size_t nbits) { + size_t ngroups = FB_NGROUPS(nbits); + size_t trailing_bits = nbits % FB_GROUP_BITS; + size_t limit = (trailing_bits == 0 ? ngroups : ngroups - 1); + for (size_t i = 0; i < limit; i++) { + if (fb[i] != ~(fb_group_t)0) { + return false; + } + } + if (trailing_bits == 0) { + return true; + } + return fb[ngroups - 1] == ((fb_group_t)1 << trailing_bits) - 1; +} + +static inline bool +fb_get(fb_group_t *fb, size_t nbits, size_t bit) { + assert(bit < nbits); + size_t group_ind = bit / FB_GROUP_BITS; + size_t bit_ind = bit % FB_GROUP_BITS; + return (bool)(fb[group_ind] & ((fb_group_t)1 << bit_ind)); +} + +static inline void +fb_set(fb_group_t *fb, size_t nbits, size_t bit) { + assert(bit < nbits); + size_t group_ind = bit / FB_GROUP_BITS; + size_t bit_ind = bit % FB_GROUP_BITS; + fb[group_ind] |= ((fb_group_t)1 << bit_ind); +} + +static inline void +fb_unset(fb_group_t *fb, size_t nbits, size_t bit) { + assert(bit < nbits); + size_t group_ind = bit / FB_GROUP_BITS; + size_t bit_ind = bit % FB_GROUP_BITS; + fb[group_ind] &= ~((fb_group_t)1 << bit_ind); +} + + +/* + * Some implementation details. This visitation function lets us apply a group + * visitor to each group in the bitmap (potentially modifying it). The mask + * indicates which bits are logically part of the visitation. + */ +typedef void (*fb_group_visitor_t)(void *ctx, fb_group_t *fb, fb_group_t mask); +JEMALLOC_ALWAYS_INLINE void +fb_visit_impl(fb_group_t *fb, size_t nbits, fb_group_visitor_t visit, void *ctx, + size_t start, size_t cnt) { + assert(cnt > 0); + assert(start + cnt <= nbits); + size_t group_ind = start / FB_GROUP_BITS; + size_t start_bit_ind = start % FB_GROUP_BITS; + /* + * The first group is special; it's the only one we don't start writing + * to from bit 0. + */ + size_t first_group_cnt = (start_bit_ind + cnt > FB_GROUP_BITS + ? FB_GROUP_BITS - start_bit_ind : cnt); + /* + * We can basically split affected words into: + * - The first group, where we touch only the high bits + * - The last group, where we touch only the low bits + * - The middle, where we set all the bits to the same thing. + * We treat each case individually. The last two could be merged, but + * this can lead to bad codegen for those middle words. + */ + /* First group */ + fb_group_t mask = ((~(fb_group_t)0) + >> (FB_GROUP_BITS - first_group_cnt)) + << start_bit_ind; + visit(ctx, &fb[group_ind], mask); + + cnt -= first_group_cnt; + group_ind++; + /* Middle groups */ + while (cnt > FB_GROUP_BITS) { + visit(ctx, &fb[group_ind], ~(fb_group_t)0); + cnt -= FB_GROUP_BITS; + group_ind++; + } + /* Last group */ + if (cnt != 0) { + mask = (~(fb_group_t)0) >> (FB_GROUP_BITS - cnt); + visit(ctx, &fb[group_ind], mask); + } +} + +JEMALLOC_ALWAYS_INLINE void +fb_assign_visitor(void *ctx, fb_group_t *fb, fb_group_t mask) { + bool val = *(bool *)ctx; + if (val) { + *fb |= mask; + } else { + *fb &= ~mask; + } +} + +/* Sets the cnt bits starting at position start. Must not have a 0 count. 
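+ * E.g. fb_set_range(fb, nbits, 3, 5) sets bits 3 through 7 inclusive,
+ * assuming nbits >= 8.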
 */
+static inline void
+fb_set_range(fb_group_t *fb, size_t nbits, size_t start, size_t cnt) {
+	bool val = true;
+	fb_visit_impl(fb, nbits, &fb_assign_visitor, &val, start, cnt);
+}
+
+/* Unsets the cnt bits starting at position start. Must not have a 0 count. */
+static inline void
+fb_unset_range(fb_group_t *fb, size_t nbits, size_t start, size_t cnt) {
+	bool val = false;
+	fb_visit_impl(fb, nbits, &fb_assign_visitor, &val, start, cnt);
+}
+
+JEMALLOC_ALWAYS_INLINE void
+fb_scount_visitor(void *ctx, fb_group_t *fb, fb_group_t mask) {
+	size_t *scount = (size_t *)ctx;
+	*scount += popcount_lu(*fb & mask);
+}
+
+/* Finds the number of set bits in the range of length cnt starting at start. */
+JEMALLOC_ALWAYS_INLINE size_t
+fb_scount(fb_group_t *fb, size_t nbits, size_t start, size_t cnt) {
+	size_t scount = 0;
+	fb_visit_impl(fb, nbits, &fb_scount_visitor, &scount, start, cnt);
+	return scount;
+}
+
+/* Finds the number of unset bits in the range of length cnt starting at start. */
+JEMALLOC_ALWAYS_INLINE size_t
+fb_ucount(fb_group_t *fb, size_t nbits, size_t start, size_t cnt) {
+	size_t scount = fb_scount(fb, nbits, start, cnt);
+	return cnt - scount;
+}
+
+/*
+ * An implementation detail; find the first bit at position >= min_bit with the
+ * value val.
+ *
+ * Returns the number of bits in the bitmap if no such bit exists.
+ */
+JEMALLOC_ALWAYS_INLINE ssize_t
+fb_find_impl(fb_group_t *fb, size_t nbits, size_t start, bool val,
+    bool forward) {
+	assert(start < nbits);
+	size_t ngroups = FB_NGROUPS(nbits);
+	ssize_t group_ind = start / FB_GROUP_BITS;
+	size_t bit_ind = start % FB_GROUP_BITS;
+
+	fb_group_t maybe_invert = (val ? 0 : (fb_group_t)-1);
+
+	fb_group_t group = fb[group_ind];
+	group ^= maybe_invert;
+	if (forward) {
+		/* Only keep ones in bits bit_ind and above. */
+		group &= ~((1LU << bit_ind) - 1);
+	} else {
+		/*
+		 * Only keep ones in bits bit_ind and below. You might more
+		 * naturally express this as (1 << (bit_ind + 1)) - 1, but
+		 * that shifts by an invalid amount if bit_ind is one less than
+		 * FB_GROUP_BITS.
+		 */
+		group &= ((2LU << bit_ind) - 1);
+	}
+	ssize_t group_ind_bound = forward ? (ssize_t)ngroups : -1;
+	while (group == 0) {
+		group_ind += forward ? 1 : -1;
+		if (group_ind == group_ind_bound) {
+			return forward ? (ssize_t)nbits : (ssize_t)-1;
+		}
+		group = fb[group_ind];
+		group ^= maybe_invert;
+	}
+	assert(group != 0);
+	size_t bit = forward ? ffs_lu(group) : fls_lu(group);
+	size_t pos = group_ind * FB_GROUP_BITS + bit;
+	/*
+	 * The high bits of a partially filled last group are zeros, so if
+	 * we're looking for zeros we don't want to report an invalid result.
+	 */
+	if (forward && !val && pos > nbits) {
+		return nbits;
+	}
+	return pos;
+}
+
+/*
+ * Find the first unset bit in the bitmap with an index >= min_bit. Returns the
+ * number of bits in the bitmap if no such bit exists.
+ */
+static inline size_t
+fb_ffu(fb_group_t *fb, size_t nbits, size_t min_bit) {
+	return (size_t)fb_find_impl(fb, nbits, min_bit, /* val */ false,
+	    /* forward */ true);
+}
+
+/* The same, but looks for a set bit. */
+static inline size_t
+fb_ffs(fb_group_t *fb, size_t nbits, size_t min_bit) {
+	return (size_t)fb_find_impl(fb, nbits, min_bit, /* val */ true,
+	    /* forward */ true);
+}
+
+/*
+ * Find the last unset bit in the bitmap with an index <= max_bit. Returns -1
+ * if no such bit exists.
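+ * (fb_fls below is the analogous search for a set bit.)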
+ */ +static inline ssize_t +fb_flu(fb_group_t *fb, size_t nbits, size_t max_bit) { + return fb_find_impl(fb, nbits, max_bit, /* val */ false, + /* forward */ false); +} + +static inline ssize_t +fb_fls(fb_group_t *fb, size_t nbits, size_t max_bit) { + return fb_find_impl(fb, nbits, max_bit, /* val */ true, + /* forward */ false); +} + +/* Returns whether or not we found a range. */ +JEMALLOC_ALWAYS_INLINE bool +fb_iter_range_impl(fb_group_t *fb, size_t nbits, size_t start, size_t *r_begin, + size_t *r_len, bool val, bool forward) { + assert(start < nbits); + ssize_t next_range_begin = fb_find_impl(fb, nbits, start, val, forward); + if ((forward && next_range_begin == (ssize_t)nbits) + || (!forward && next_range_begin == (ssize_t)-1)) { + return false; + } + /* Half open range; the set bits are [begin, end). */ + ssize_t next_range_end = fb_find_impl(fb, nbits, next_range_begin, !val, + forward); + if (forward) { + *r_begin = next_range_begin; + *r_len = next_range_end - next_range_begin; + } else { + *r_begin = next_range_end + 1; + *r_len = next_range_begin - next_range_end; + } + return true; +} + +/* + * Used to iterate through ranges of set bits. + * + * Tries to find the next contiguous sequence of set bits with a first index >= + * start. If one exists, puts the earliest bit of the range in *r_begin, its + * length in *r_len, and returns true. Otherwise, returns false (without + * touching *r_begin or *r_end). + */ +static inline bool +fb_srange_iter(fb_group_t *fb, size_t nbits, size_t start, size_t *r_begin, + size_t *r_len) { + return fb_iter_range_impl(fb, nbits, start, r_begin, r_len, + /* val */ true, /* forward */ true); +} + +/* + * The same as fb_srange_iter, but searches backwards from start rather than + * forwards. (The position returned is still the earliest bit in the range). + */ +static inline bool +fb_srange_riter(fb_group_t *fb, size_t nbits, size_t start, size_t *r_begin, + size_t *r_len) { + return fb_iter_range_impl(fb, nbits, start, r_begin, r_len, + /* val */ true, /* forward */ false); +} + +/* Similar to fb_srange_iter, but searches for unset bits. */ +static inline bool +fb_urange_iter(fb_group_t *fb, size_t nbits, size_t start, size_t *r_begin, + size_t *r_len) { + return fb_iter_range_impl(fb, nbits, start, r_begin, r_len, + /* val */ false, /* forward */ true); +} + +/* Similar to fb_srange_riter, but searches for unset bits. */ +static inline bool +fb_urange_riter(fb_group_t *fb, size_t nbits, size_t start, size_t *r_begin, + size_t *r_len) { + return fb_iter_range_impl(fb, nbits, start, r_begin, r_len, + /* val */ false, /* forward */ false); +} + +JEMALLOC_ALWAYS_INLINE size_t +fb_range_longest_impl(fb_group_t *fb, size_t nbits, bool val) { + size_t begin = 0; + size_t longest_len = 0; + size_t len = 0; + while (begin < nbits && fb_iter_range_impl(fb, nbits, begin, &begin, + &len, val, /* forward */ true)) { + if (len > longest_len) { + longest_len = len; + } + begin += len; + } + return longest_len; +} + +static inline size_t +fb_srange_longest(fb_group_t *fb, size_t nbits) { + return fb_range_longest_impl(fb, nbits, /* val */ true); +} + +static inline size_t +fb_urange_longest(fb_group_t *fb, size_t nbits) { + return fb_range_longest_impl(fb, nbits, /* val */ false); +} + +/* + * Initializes each bit of dst with the bitwise-AND of the corresponding bits of + * src1 and src2. All bitmaps must be the same size. 
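+ * E.g. if the lone group of src1 holds 0x0C (bits 2 and 3) and that of src2
+ * holds 0x0A (bits 1 and 3), dst's group ends up 0x08: only bit 3 is set in
+ * both inputs.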
+ */
+static inline void
+fb_bit_and(fb_group_t *dst, fb_group_t *src1, fb_group_t *src2, size_t nbits) {
+	size_t ngroups = FB_NGROUPS(nbits);
+	for (size_t i = 0; i < ngroups; i++) {
+		dst[i] = src1[i] & src2[i];
+	}
+}
+
+/* Like fb_bit_and, but with bitwise-OR. */
+static inline void
+fb_bit_or(fb_group_t *dst, fb_group_t *src1, fb_group_t *src2, size_t nbits) {
+	size_t ngroups = FB_NGROUPS(nbits);
+	for (size_t i = 0; i < ngroups; i++) {
+		dst[i] = src1[i] | src2[i];
+	}
+}
+
+/* Initializes dst bit i to the negation of source bit i. */
+static inline void
+fb_bit_not(fb_group_t *dst, fb_group_t *src, size_t nbits) {
+	size_t ngroups = FB_NGROUPS(nbits);
+	for (size_t i = 0; i < ngroups; i++) {
+		dst[i] = ~src[i];
+	}
+}
+
+#endif /* JEMALLOC_INTERNAL_FB_H */
diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/fxp.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/fxp.h
new file mode 100644
index 000000000..e42425f91
--- /dev/null
+++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/fxp.h
@@ -0,0 +1,129 @@
+#ifndef JEMALLOC_INTERNAL_FXP_H
+#define JEMALLOC_INTERNAL_FXP_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/assert.h"
+
+/*
+ * A simple fixed-point math implementation, supporting only unsigned values
+ * (with overflow being an error).
+ *
+ * It's not in general safe to use floating point in core code, because various
+ * libc implementations we get linked against can assume that malloc won't
+ * touch floating point state and call it with an unusual calling convention.
+ */
+
+/*
+ * High 16 bits are the integer part, low 16 are the fractional part. Or
+ * equivalently, repr == 2**16 * val, where we use "val" to refer to the
+ * (imaginary) fractional representation of the true value.
+ *
+ * We pick a uint32_t here since it's convenient in some places to
+ * double the representation size (i.e. multiplication and division use
+ * 64-bit integer types), and a uint64_t is the largest type we're
+ * certain is available.
+ */
+typedef uint32_t fxp_t;
+#define FXP_INIT_INT(x) ((x) << 16)
+#define FXP_INIT_PERCENT(pct) (((pct) << 16) / 100)
+
+/*
+ * Amount of precision used in parsing and printing numbers. The integer bound
+ * is simply because the integer part of the number gets 16 bits, and so is
+ * bounded by 65536.
+ *
+ * We use a lot of precision for the fractional part, even though most of it
+ * gets rounded off; this lets us get exact values for the important special
+ * case where the denominator is a small power of 2 (for instance,
+ * 1/512 == 0.001953125 is exactly representable even with only 16 bits of
+ * fractional precision). We need to left-shift by 16 before dividing by
+ * 10**precision, so we pick precision to be floor(log10(2**48)) = 14.
+ */
+#define FXP_INTEGER_PART_DIGITS 5
+#define FXP_FRACTIONAL_PART_DIGITS 14
+
+/*
+ * In addition to the integer and fractional parts of the number, we need to
+ * include a null character and (possibly) a decimal point.
+ */
+#define FXP_BUF_SIZE (FXP_INTEGER_PART_DIGITS + FXP_FRACTIONAL_PART_DIGITS + 2)
+
+static inline fxp_t
+fxp_add(fxp_t a, fxp_t b) {
+	return a + b;
+}
+
+static inline fxp_t
+fxp_sub(fxp_t a, fxp_t b) {
+	assert(a >= b);
+	return a - b;
+}
+
+static inline fxp_t
+fxp_mul(fxp_t a, fxp_t b) {
+	uint64_t unshifted = (uint64_t)a * (uint64_t)b;
+	/*
+	 * Unshifted is (a.val * 2**16) * (b.val * 2**16)
+	 * == (a.val * b.val) * 2**32, but we want
+	 * (a.val * b.val) * 2**16.
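+	 *
+	 * Worked example: 1.5 is represented as 1.5 * 2**16 == 98304 and 2.0
+	 * as 131072; their 64-bit product is 12884901888 == 3.0 * 2**32, and
+	 * the shift below yields 196608 == 3.0 * 2**16, i.e. fxp 3.0.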
+ */ + return (uint32_t)(unshifted >> 16); } + +static inline fxp_t +fxp_div(fxp_t a, fxp_t b) { + assert(b != 0); + uint64_t unshifted = ((uint64_t)a << 32) / (uint64_t)b; + /* + * Unshifted is (a.val * 2**16) * (2**32) / (b.val * 2**16) + * == (a.val / b.val) * (2 ** 32), which again corresponds to a right + * shift of 16. + */ + return (uint32_t)(unshifted >> 16); +} + +static inline uint32_t +fxp_round_down(fxp_t a) { + return a >> 16; +} + +static inline uint32_t +fxp_round_nearest(fxp_t a) { + uint32_t fractional_part = (a & ((1U << 16) - 1)); + uint32_t increment = (uint32_t)(fractional_part >= (1U << 15)); + return (a >> 16) + increment; +} + +/* + * Approximately computes x * frac, without the size limitations that would be + * imposed by converting x_orig to an fxp_t. + */ +static inline size_t +fxp_mul_frac(size_t x_orig, fxp_t frac) { + assert(frac <= (1U << 16)); + /* + * Work around an over-enthusiastic warning about type limits below (on + * 32-bit platforms, a size_t is always less than 1ULL << 48). + */ + uint64_t x = (uint64_t)x_orig; + /* + * If we can guarantee no overflow, multiply first before shifting, to + * preserve some precision. Otherwise, shift first and then multiply. + * In the latter case, we only lose the low 16 bits of a 48-bit number, + * so we're still accurate to within 1/2**32. + */ + if (x < (1ULL << 48)) { + return (size_t)((x * frac) >> 16); + } else { + return (size_t)((x >> 16) * (uint64_t)frac); + } +} + +/* + * Returns true on error. Otherwise, returns false and updates *end to point to + * the first character not parsed (because it wasn't a digit). + */ +bool fxp_parse(fxp_t *a, const char *ptr, char **end); +void fxp_print(fxp_t a, char buf[FXP_BUF_SIZE]); + +#endif /* JEMALLOC_INTERNAL_FXP_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/hash.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/hash.h new file mode 100644 index 000000000..15162b947 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/hash.h @@ -0,0 +1,321 @@ +#ifndef JEMALLOC_INTERNAL_HASH_H +#define JEMALLOC_INTERNAL_HASH_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/assert.h" + +/* + * The following hash function is based on MurmurHash3, placed into the public + * domain by Austin Appleby. See https://github.com/aappleby/smhasher for + * details. + */ + +/******************************************************************************/ +/* Internal implementation. */ +static inline uint32_t +hash_rotl_32(uint32_t x, int8_t r) { + return ((x << r) | (x >> (32 - r))); +} + +static inline uint64_t +hash_rotl_64(uint64_t x, int8_t r) { + return ((x << r) | (x >> (64 - r))); +} + +static inline uint32_t +hash_get_block_32(const uint32_t *p, int i) { + /* Handle unaligned read. */ + if (unlikely((uintptr_t)p & (sizeof(uint32_t)-1)) != 0) { + uint32_t ret; + + memcpy(&ret, (uint8_t *)(p + i), sizeof(uint32_t)); + return ret; + } + + return p[i]; +} + +static inline uint64_t +hash_get_block_64(const uint64_t *p, int i) { + /* Handle unaligned read. 
*/ + if (unlikely((uintptr_t)p & (sizeof(uint64_t)-1)) != 0) { + uint64_t ret; + + memcpy(&ret, (uint8_t *)(p + i), sizeof(uint64_t)); + return ret; + } + + return p[i]; +} + +static inline uint32_t +hash_fmix_32(uint32_t h) { + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +static inline uint64_t +hash_fmix_64(uint64_t k) { + k ^= k >> 33; + k *= KQU(0xff51afd7ed558ccd); + k ^= k >> 33; + k *= KQU(0xc4ceb9fe1a85ec53); + k ^= k >> 33; + + return k; +} + +static inline uint32_t +hash_x86_32(const void *key, int len, uint32_t seed) { + const uint8_t *data = (const uint8_t *) key; + const int nblocks = len / 4; + + uint32_t h1 = seed; + + const uint32_t c1 = 0xcc9e2d51; + const uint32_t c2 = 0x1b873593; + + /* body */ + { + const uint32_t *blocks = (const uint32_t *) (data + nblocks*4); + int i; + + for (i = -nblocks; i; i++) { + uint32_t k1 = hash_get_block_32(blocks, i); + + k1 *= c1; + k1 = hash_rotl_32(k1, 15); + k1 *= c2; + + h1 ^= k1; + h1 = hash_rotl_32(h1, 13); + h1 = h1*5 + 0xe6546b64; + } + } + + /* tail */ + { + const uint8_t *tail = (const uint8_t *) (data + nblocks*4); + + uint32_t k1 = 0; + + switch (len & 3) { + case 3: k1 ^= tail[2] << 16; JEMALLOC_FALLTHROUGH; + case 2: k1 ^= tail[1] << 8; JEMALLOC_FALLTHROUGH; + case 1: k1 ^= tail[0]; k1 *= c1; k1 = hash_rotl_32(k1, 15); + k1 *= c2; h1 ^= k1; + } + } + + /* finalization */ + h1 ^= len; + + h1 = hash_fmix_32(h1); + + return h1; +} + +static inline void +hash_x86_128(const void *key, const int len, uint32_t seed, + uint64_t r_out[2]) { + const uint8_t * data = (const uint8_t *) key; + const int nblocks = len / 16; + + uint32_t h1 = seed; + uint32_t h2 = seed; + uint32_t h3 = seed; + uint32_t h4 = seed; + + const uint32_t c1 = 0x239b961b; + const uint32_t c2 = 0xab0e9789; + const uint32_t c3 = 0x38b34ae5; + const uint32_t c4 = 0xa1e38b93; + + /* body */ + { + const uint32_t *blocks = (const uint32_t *) (data + nblocks*16); + int i; + + for (i = -nblocks; i; i++) { + uint32_t k1 = hash_get_block_32(blocks, i*4 + 0); + uint32_t k2 = hash_get_block_32(blocks, i*4 + 1); + uint32_t k3 = hash_get_block_32(blocks, i*4 + 2); + uint32_t k4 = hash_get_block_32(blocks, i*4 + 3); + + k1 *= c1; k1 = hash_rotl_32(k1, 15); k1 *= c2; h1 ^= k1; + + h1 = hash_rotl_32(h1, 19); h1 += h2; + h1 = h1*5 + 0x561ccd1b; + + k2 *= c2; k2 = hash_rotl_32(k2, 16); k2 *= c3; h2 ^= k2; + + h2 = hash_rotl_32(h2, 17); h2 += h3; + h2 = h2*5 + 0x0bcaa747; + + k3 *= c3; k3 = hash_rotl_32(k3, 17); k3 *= c4; h3 ^= k3; + + h3 = hash_rotl_32(h3, 15); h3 += h4; + h3 = h3*5 + 0x96cd1c35; + + k4 *= c4; k4 = hash_rotl_32(k4, 18); k4 *= c1; h4 ^= k4; + + h4 = hash_rotl_32(h4, 13); h4 += h1; + h4 = h4*5 + 0x32ac3b17; + } + } + + /* tail */ + { + const uint8_t *tail = (const uint8_t *) (data + nblocks*16); + uint32_t k1 = 0; + uint32_t k2 = 0; + uint32_t k3 = 0; + uint32_t k4 = 0; + + switch (len & 15) { + case 15: k4 ^= tail[14] << 16; JEMALLOC_FALLTHROUGH; + case 14: k4 ^= tail[13] << 8; JEMALLOC_FALLTHROUGH; + case 13: k4 ^= tail[12] << 0; + k4 *= c4; k4 = hash_rotl_32(k4, 18); k4 *= c1; h4 ^= k4; + JEMALLOC_FALLTHROUGH; + case 12: k3 ^= (uint32_t) tail[11] << 24; JEMALLOC_FALLTHROUGH; + case 11: k3 ^= tail[10] << 16; JEMALLOC_FALLTHROUGH; + case 10: k3 ^= tail[ 9] << 8; JEMALLOC_FALLTHROUGH; + case 9: k3 ^= tail[ 8] << 0; + k3 *= c3; k3 = hash_rotl_32(k3, 17); k3 *= c4; h3 ^= k3; + JEMALLOC_FALLTHROUGH; + case 8: k2 ^= (uint32_t) tail[ 7] << 24; JEMALLOC_FALLTHROUGH; + case 7: k2 ^= tail[ 6] << 16; 
JEMALLOC_FALLTHROUGH; + case 6: k2 ^= tail[ 5] << 8; JEMALLOC_FALLTHROUGH; + case 5: k2 ^= tail[ 4] << 0; + k2 *= c2; k2 = hash_rotl_32(k2, 16); k2 *= c3; h2 ^= k2; + JEMALLOC_FALLTHROUGH; + case 4: k1 ^= (uint32_t) tail[ 3] << 24; JEMALLOC_FALLTHROUGH; + case 3: k1 ^= tail[ 2] << 16; JEMALLOC_FALLTHROUGH; + case 2: k1 ^= tail[ 1] << 8; JEMALLOC_FALLTHROUGH; + case 1: k1 ^= tail[ 0] << 0; + k1 *= c1; k1 = hash_rotl_32(k1, 15); k1 *= c2; h1 ^= k1; + break; + } + } + + /* finalization */ + h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + h1 = hash_fmix_32(h1); + h2 = hash_fmix_32(h2); + h3 = hash_fmix_32(h3); + h4 = hash_fmix_32(h4); + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + r_out[0] = (((uint64_t) h2) << 32) | h1; + r_out[1] = (((uint64_t) h4) << 32) | h3; +} + +static inline void +hash_x64_128(const void *key, const int len, const uint32_t seed, + uint64_t r_out[2]) { + const uint8_t *data = (const uint8_t *) key; + const int nblocks = len / 16; + + uint64_t h1 = seed; + uint64_t h2 = seed; + + const uint64_t c1 = KQU(0x87c37b91114253d5); + const uint64_t c2 = KQU(0x4cf5ad432745937f); + + /* body */ + { + const uint64_t *blocks = (const uint64_t *) (data); + int i; + + for (i = 0; i < nblocks; i++) { + uint64_t k1 = hash_get_block_64(blocks, i*2 + 0); + uint64_t k2 = hash_get_block_64(blocks, i*2 + 1); + + k1 *= c1; k1 = hash_rotl_64(k1, 31); k1 *= c2; h1 ^= k1; + + h1 = hash_rotl_64(h1, 27); h1 += h2; + h1 = h1*5 + 0x52dce729; + + k2 *= c2; k2 = hash_rotl_64(k2, 33); k2 *= c1; h2 ^= k2; + + h2 = hash_rotl_64(h2, 31); h2 += h1; + h2 = h2*5 + 0x38495ab5; + } + } + + /* tail */ + { + const uint8_t *tail = (const uint8_t*)(data + nblocks*16); + uint64_t k1 = 0; + uint64_t k2 = 0; + + switch (len & 15) { + case 15: k2 ^= ((uint64_t)(tail[14])) << 48; JEMALLOC_FALLTHROUGH; + case 14: k2 ^= ((uint64_t)(tail[13])) << 40; JEMALLOC_FALLTHROUGH; + case 13: k2 ^= ((uint64_t)(tail[12])) << 32; JEMALLOC_FALLTHROUGH; + case 12: k2 ^= ((uint64_t)(tail[11])) << 24; JEMALLOC_FALLTHROUGH; + case 11: k2 ^= ((uint64_t)(tail[10])) << 16; JEMALLOC_FALLTHROUGH; + case 10: k2 ^= ((uint64_t)(tail[ 9])) << 8; JEMALLOC_FALLTHROUGH; + case 9: k2 ^= ((uint64_t)(tail[ 8])) << 0; + k2 *= c2; k2 = hash_rotl_64(k2, 33); k2 *= c1; h2 ^= k2; + JEMALLOC_FALLTHROUGH; + case 8: k1 ^= ((uint64_t)(tail[ 7])) << 56; JEMALLOC_FALLTHROUGH; + case 7: k1 ^= ((uint64_t)(tail[ 6])) << 48; JEMALLOC_FALLTHROUGH; + case 6: k1 ^= ((uint64_t)(tail[ 5])) << 40; JEMALLOC_FALLTHROUGH; + case 5: k1 ^= ((uint64_t)(tail[ 4])) << 32; JEMALLOC_FALLTHROUGH; + case 4: k1 ^= ((uint64_t)(tail[ 3])) << 24; JEMALLOC_FALLTHROUGH; + case 3: k1 ^= ((uint64_t)(tail[ 2])) << 16; JEMALLOC_FALLTHROUGH; + case 2: k1 ^= ((uint64_t)(tail[ 1])) << 8; JEMALLOC_FALLTHROUGH; + case 1: k1 ^= ((uint64_t)(tail[ 0])) << 0; + k1 *= c1; k1 = hash_rotl_64(k1, 31); k1 *= c2; h1 ^= k1; + break; + } + } + + /* finalization */ + h1 ^= len; h2 ^= len; + + h1 += h2; + h2 += h1; + + h1 = hash_fmix_64(h1); + h2 = hash_fmix_64(h2); + + h1 += h2; + h2 += h1; + + r_out[0] = h1; + r_out[1] = h2; +} + +/******************************************************************************/ +/* API. */ +static inline void +hash(const void *key, size_t len, const uint32_t seed, size_t r_hash[2]) { + assert(len <= INT_MAX); /* Unfortunate implementation limitation. 
*/ + +#if (LG_SIZEOF_PTR == 3 && !defined(JEMALLOC_BIG_ENDIAN)) + hash_x64_128(key, (int)len, seed, (uint64_t *)r_hash); +#else + { + uint64_t hashes[2]; + hash_x86_128(key, (int)len, seed, hashes); + r_hash[0] = (size_t)hashes[0]; + r_hash[1] = (size_t)hashes[1]; + } +#endif +} + +#endif /* JEMALLOC_INTERNAL_HASH_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/hook.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/hook.h new file mode 100644 index 000000000..76b9130d2 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/hook.h @@ -0,0 +1,165 @@ +#ifndef JEMALLOC_INTERNAL_HOOK_H +#define JEMALLOC_INTERNAL_HOOK_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/tsd.h" + +/* + * This API is *extremely* experimental, and may get ripped out, changed in API- + * and ABI-incompatible ways, be insufficiently or incorrectly documented, etc. + * + * It allows hooking the stateful parts of the API to see changes as they + * happen. + * + * Allocation hooks are called after the allocation is done, free hooks are + * called before the free is done, and expand hooks are called after the + * allocation is expanded. + * + * For realloc and rallocx, if the expansion happens in place, the expansion + * hook is called. If it is moved, then the alloc hook is called on the new + * location, and then the free hook is called on the old location (i.e. both + * hooks are invoked in between the alloc and the dalloc). + * + * If we return NULL from OOM, then usize might not be trustworthy. Calling + * realloc(NULL, size) only calls the alloc hook, and calling realloc(ptr, 0) + * only calls the free hook. (Calling realloc(NULL, 0) is treated as malloc(0), + * and only calls the alloc hook). + * + * Reentrancy: + * Reentrancy is guarded against from within the hook implementation. If you + * call allocator functions from within a hook, the hooks will not be invoked + * again. + * Threading: + * The installation of a hook synchronizes with all its uses. If you can + * prove the installation of a hook happens-before a jemalloc entry point, + * then the hook will get invoked (unless there's a racing removal). + * + * Hook insertion appears to be atomic at a per-thread level (i.e. if a thread + * allocates and has the alloc hook invoked, then a subsequent free on the + * same thread will also have the free hook invoked). + * + * The *removal* of a hook does *not* block until all threads are done with + * the hook. Hook authors have to be resilient to this, and need some + * out-of-band mechanism for cleaning up any dynamically allocated memory + * associated with their hook. + * Ordering: + * Order of hook execution is unspecified, and may be different than insertion + * order. + */ + +#define HOOK_MAX 4 + +enum hook_alloc_e { + hook_alloc_malloc, + hook_alloc_posix_memalign, + hook_alloc_aligned_alloc, + hook_alloc_calloc, + hook_alloc_memalign, + hook_alloc_valloc, + hook_alloc_pvalloc, + hook_alloc_mallocx, + + /* The reallocating functions have both alloc and dalloc variants */ + hook_alloc_realloc, + hook_alloc_rallocx, +}; +/* + * We put the enum typedef after the enum, since this file may get included by + * jemalloc_cpp.cpp, and C++ disallows enum forward declarations. + */ +typedef enum hook_alloc_e hook_alloc_t; + +enum hook_dalloc_e { + hook_dalloc_free, + hook_dalloc_dallocx, + hook_dalloc_sdallocx, + + /* + * The dalloc halves of reallocation (not called if in-place expansion + * happens). 
+ */ + hook_dalloc_realloc, + hook_dalloc_rallocx, +}; +typedef enum hook_dalloc_e hook_dalloc_t; + + +enum hook_expand_e { + hook_expand_realloc, + hook_expand_rallocx, + hook_expand_xallocx, +}; +typedef enum hook_expand_e hook_expand_t; + +typedef void (*hook_alloc)( + void *extra, hook_alloc_t type, void *result, uintptr_t result_raw, + uintptr_t args_raw[3]); + +typedef void (*hook_dalloc)( + void *extra, hook_dalloc_t type, void *address, uintptr_t args_raw[3]); + +typedef void (*hook_expand)( + void *extra, hook_expand_t type, void *address, size_t old_usize, + size_t new_usize, uintptr_t result_raw, uintptr_t args_raw[4]); + +typedef struct hooks_s hooks_t; +struct hooks_s { + hook_alloc alloc_hook; + hook_dalloc dalloc_hook; + hook_expand expand_hook; + void *extra; +}; + +/* + * Begin implementation details; everything above this point might one day live + * in a public API. Everything below this point never will. + */ + +/* + * The realloc pathways haven't gotten any refactoring love in a while, and it's + * fairly difficult to pass information from the entry point to the hooks. We + * put the information the hooks will need into a struct to encapsulate + * everything. + * + * Many of these pathways are force-inlined, so that the compiler can avoid + * materializing this struct until we hit an extern arena function. For fairly + * goofy reasons, *many* of the realloc paths hit an extern arena function. + * These paths are cold enough that it doesn't matter; eventually, we should + * rewrite the realloc code to make the expand-in-place and the + * free-then-realloc paths more orthogonal, at which point we don't need to + * spread the hook logic all over the place. + */ +typedef struct hook_ralloc_args_s hook_ralloc_args_t; +struct hook_ralloc_args_s { + /* I.e. as opposed to rallocx. */ + bool is_realloc; + /* + * The expand hook takes 4 arguments, even if only 3 are actually used; + * we add an extra one in case the user decides to memcpy without + * looking too closely at the hooked function. + */ + uintptr_t args[4]; +}; + +bool hook_boot(void); + +/* + * Returns an opaque handle to be used when removing the hook. NULL means that + * we couldn't install the hook. + */ +void *hook_install(tsdn_t *tsdn, hooks_t *to_install);
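A hypothetical sketch of wiring these pieces together (not from the source: the counting hook, its install helper, and the unsynchronized counter are invented for illustration; only the types and entry points come from this header):

static size_t alloc_count; /* Illustration only; not thread-safe. */

static void
count_alloc_hook(void *extra, hook_alloc_t type, void *result,
    uintptr_t result_raw, uintptr_t args_raw[3]) {
	(void)extra; (void)type; (void)result; (void)result_raw; (void)args_raw;
	alloc_count++;
}

static void *
install_counting_hook(tsdn_t *tsdn) {
	/* Field order: alloc_hook, dalloc_hook, expand_hook, extra. */
	static hooks_t hooks = {count_alloc_hook, NULL, NULL, NULL};
	/* A NULL result means all HOOK_MAX slots are already taken. */
	return hook_install(tsdn, &hooks);
}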
+/* Uninstalls the hook with the handle previously returned from hook_install. */ +void hook_remove(tsdn_t *tsdn, void *opaque); + +/* Hooks */ + +void hook_invoke_alloc(hook_alloc_t type, void *result, uintptr_t result_raw, + uintptr_t args_raw[3]); + +void hook_invoke_dalloc(hook_dalloc_t type, void *address, + uintptr_t args_raw[3]); + +void hook_invoke_expand(hook_expand_t type, void *address, size_t old_usize, + size_t new_usize, uintptr_t result_raw, uintptr_t args_raw[4]); + +#endif /* JEMALLOC_INTERNAL_HOOK_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/hpa.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/hpa.h new file mode 100644 index 000000000..4805efaf8 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/hpa.h @@ -0,0 +1,182 @@ +#ifndef JEMALLOC_INTERNAL_HPA_H +#define JEMALLOC_INTERNAL_HPA_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/base.h" +#include "jemalloc/internal/edata_cache.h" +#include "jemalloc/internal/emap.h" +#include "jemalloc/internal/exp_grow.h" +#include "jemalloc/internal/hpa_hooks.h" +#include "jemalloc/internal/hpa_opts.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/pai.h" +#include "jemalloc/internal/psset.h" + +typedef struct hpa_central_s hpa_central_t; +struct hpa_central_s { + /* + * Guards expansion of eden. We separate this from the regular mutex so + * that cheaper operations can still continue while we're doing the OS + * call. + */ + malloc_mutex_t grow_mtx; + /* + * Either NULL (if empty), or some integer multiple of a + * hugepage-aligned number of hugepages. We carve them off one at a + * time to satisfy new pageslab requests. + * + * Guarded by grow_mtx. + */ + void *eden; + size_t eden_len; + /* Source for metadata. */ + base_t *base; + /* Number of grow operations done on this hpa_central_t. */ + uint64_t age_counter; + + /* The HPA hooks. */ + hpa_hooks_t hooks; +}; + +typedef struct hpa_shard_nonderived_stats_s hpa_shard_nonderived_stats_t; +struct hpa_shard_nonderived_stats_s { + /* + * The number of times we've purged within a hugepage. + * + * Guarded by mtx. + */ + uint64_t npurge_passes; + /* + * The number of individual purge calls we perform (which should always + * be bigger than npurge_passes, since each pass purges at least one + * extent within a hugepage). + * + * Guarded by mtx. + */ + uint64_t npurges; + + /* + * The number of times we've hugified a pageslab. + * + * Guarded by mtx. + */ + uint64_t nhugifies; + /* + * The number of times we've dehugified a pageslab. + * + * Guarded by mtx. + */ + uint64_t ndehugifies; +}; + +/* Completely derived; only used by CTL. */ +typedef struct hpa_shard_stats_s hpa_shard_stats_t; +struct hpa_shard_stats_s { + psset_stats_t psset_stats; + hpa_shard_nonderived_stats_t nonderived_stats; +}; + +typedef struct hpa_shard_s hpa_shard_t; +struct hpa_shard_s { + /* + * pai must be the first member; we cast from a pointer to it to a + * pointer to the hpa_shard_t. + */ + pai_t pai; + + /* The central allocator we get our hugepages from. */ + hpa_central_t *central; + /* Protects most of this shard's state. */ + malloc_mutex_t mtx; + /* + * Guards the shard's access to the central allocator (preventing + * multiple threads operating on this shard from accessing the central + * allocator). + */ + malloc_mutex_t grow_mtx; + /* The base metadata allocator. */ + base_t *base; + + /* + * This edata cache is the one we use when allocating a small extent + * from a pageslab. 
The pageslab itself comes from the centralized + * allocator, and so will use its edata_cache. + */ + edata_cache_fast_t ecf; + + psset_t psset; + + /* + * How many grow operations have occurred. + * + * Guarded by grow_mtx. + */ + uint64_t age_counter; + + /* The arena ind we're associated with. */ + unsigned ind; + + /* + * Our emap. This is just a cache of the emap pointer in the associated + * hpa_central. + */ + emap_t *emap; + + /* The configuration choices for this hpa shard. */ + hpa_shard_opts_t opts; + + /* + * How many pages have we started but not yet finished purging in this + * hpa shard. + */ + size_t npending_purge; + + /* + * Those stats which are copied directly into the CTL-centric hpa shard + * stats. + */ + hpa_shard_nonderived_stats_t stats; + + /* + * Last time we performed purge on this shard. + */ + nstime_t last_purge; +}; + +/* + * Whether or not the HPA can be used given the current configuration. This is + * not necessarily a guarantee that it backs its allocations by hugepages, + * just that it can function properly given the system it's running on. + */ +bool hpa_supported(void); +bool hpa_central_init(hpa_central_t *central, base_t *base, const hpa_hooks_t *hooks); +bool hpa_shard_init(hpa_shard_t *shard, hpa_central_t *central, emap_t *emap, + base_t *base, edata_cache_t *edata_cache, unsigned ind, + const hpa_shard_opts_t *opts); + +void hpa_shard_stats_accum(hpa_shard_stats_t *dst, hpa_shard_stats_t *src); +void hpa_shard_stats_merge(tsdn_t *tsdn, hpa_shard_t *shard, + hpa_shard_stats_t *dst); + +/* + * Notify the shard that we won't use it for allocations much longer. Due to + * the possibility of races, we don't actually prevent allocations; just flush + * and disable the embedded edata_cache_fast. + */ +void hpa_shard_disable(tsdn_t *tsdn, hpa_shard_t *shard); +void hpa_shard_destroy(tsdn_t *tsdn, hpa_shard_t *shard); + +void hpa_shard_set_deferral_allowed(tsdn_t *tsdn, hpa_shard_t *shard, + bool deferral_allowed); +void hpa_shard_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard);
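As the comment at the top of hpa_shard_s notes, pai must remain the first member precisely so that the generic page-allocator-interface pointer can be converted back to its owning shard. A sketch of that cast (the helper name is invented):

static inline hpa_shard_t *
hpa_shard_from_pai(pai_t *self) {
	/* Valid only because pai is the first member of hpa_shard_t. */
	return (hpa_shard_t *)self;
}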
+ +/* + * We share the fork ordering with the PA and arena prefork handling; that's why + * these are 3 and 4 rather than 0 and 1. + */ +void hpa_shard_prefork3(tsdn_t *tsdn, hpa_shard_t *shard); +void hpa_shard_prefork4(tsdn_t *tsdn, hpa_shard_t *shard); +void hpa_shard_postfork_parent(tsdn_t *tsdn, hpa_shard_t *shard); +void hpa_shard_postfork_child(tsdn_t *tsdn, hpa_shard_t *shard); + +#endif /* JEMALLOC_INTERNAL_HPA_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/hpa_hooks.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/hpa_hooks.h new file mode 100644 index 000000000..72f3a43c2 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/hpa_hooks.h @@ -0,0 +1,20 @@ +#ifndef JEMALLOC_INTERNAL_HPA_HOOKS_H +#define JEMALLOC_INTERNAL_HPA_HOOKS_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/nstime.h" + +typedef struct hpa_hooks_s hpa_hooks_t; +struct hpa_hooks_s { + void *(*map)(size_t size); + void (*unmap)(void *ptr, size_t size); + void (*purge)(void *ptr, size_t size); + void (*hugify)(void *ptr, size_t size); + void (*dehugify)(void *ptr, size_t size); + void (*curtime)(nstime_t *r_time, bool first_reading); + uint64_t (*ms_since)(nstime_t *r_time); +}; + +extern const hpa_hooks_t hpa_hooks_default; + +#endif /* JEMALLOC_INTERNAL_HPA_HOOKS_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/hpa_opts.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/hpa_opts.h new file mode 100644 index 000000000..93add641d --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/hpa_opts.h @@ -0,0 +1,85 @@ +#ifndef JEMALLOC_INTERNAL_HPA_OPTS_H +#define JEMALLOC_INTERNAL_HPA_OPTS_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/fxp.h" + +/* + * This file is morally part of hpa.h, but is split out for header-ordering + * reasons. + */ + +typedef struct hpa_shard_opts_s hpa_shard_opts_t; +struct hpa_shard_opts_s { + /* + * The largest size we'll allocate out of the shard. For those + * allocations refused, the caller (in practice, the PA module) will + * fall back to the more general (for now) PAC, which can always handle + * any allocation request. + */ + size_t slab_max_alloc; + + /* + * When the number of active bytes in a hugepage is >= + * hugification_threshold, we force hugify it. + */ + size_t hugification_threshold; + + /* + * The HPA purges whenever the number of pages exceeds dirty_mult * + * active_pages. This may be set to (fxp_t)-1 to disable purging. + */ + fxp_t dirty_mult; + + /* + * Whether or not the PAI methods are allowed to defer work to a + * subsequent hpa_shard_do_deferred_work() call. Practically, this + * corresponds to background threads being enabled. We track this + * ourselves for encapsulation purposes. + */ + bool deferral_allowed; + + /* + * How long a hugepage has to be a hugification candidate before it will + * actually get hugified. + */ + uint64_t hugify_delay_ms; + + /* + * Minimum amount of time between purges. + */ + uint64_t min_purge_interval_ms; + + /* + * Strictly respect the minimum amount of time between purges. + * + * This option exists for backward compatibility during the staged + * rollout of the purging logic fix. 
+ */ + bool strict_min_purge_interval; +}; + +#define HPA_SHARD_OPTS_DEFAULT { \ + /* slab_max_alloc */ \ + 64 * 1024, \ + /* hugification_threshold */ \ + HUGEPAGE * 95 / 100, \ + /* dirty_mult */ \ + FXP_INIT_PERCENT(25), \ + /* \ + * deferral_allowed \ + * \ + * Really, this is always set by the arena during creation \ + * or by an hpa_shard_set_deferral_allowed call, so the value \ + * we put here doesn't matter. \ + */ \ + false, \ + /* hugify_delay_ms */ \ + 10 * 1000, \ + /* min_purge_interval_ms */ \ + 5 * 1000, \ + /* strict_min_purge_interval */ \ + false \ +} + +#endif /* JEMALLOC_INTERNAL_HPA_OPTS_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/hpdata.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/hpdata.h new file mode 100644 index 000000000..7ba92112f --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/hpdata.h @@ -0,0 +1,416 @@ +#ifndef JEMALLOC_INTERNAL_HPDATA_H +#define JEMALLOC_INTERNAL_HPDATA_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/fb.h" +#include "jemalloc/internal/nstime.h" +#include "jemalloc/internal/pages.h" +#include "jemalloc/internal/ph.h" +#include "jemalloc/internal/ql.h" +#include "jemalloc/internal/typed_list.h" + +/* + * The metadata representation we use for extents in hugepages. While the PAC + * uses the edata_t to represent both active and inactive extents, the HP only + * uses the edata_t for active ones; instead, inactive extent state is tracked + * within hpdata associated with the enclosing hugepage-sized, hugepage-aligned + * region of virtual address space. + * + * An hpdata need not be "truly" backed by a hugepage (which is not necessarily + * an observable property of any given region of address space). It's just + * hugepage-sized and hugepage-aligned; it's *potentially* huge. + */ +typedef struct hpdata_s hpdata_t; +ph_structs(hpdata_age_heap, hpdata_t); +struct hpdata_s { + /* + * We likewise follow the edata convention of mangling names and forcing + * the use of accessors -- this lets us add some consistency checks on + * access. + */ + + /* + * The address of the hugepage in question. This can't be named h_addr, + * since that conflicts with a macro defined in Windows headers. + */ + void *h_address; + /* Its age (measured in psset operations). */ + uint64_t h_age; + /* Whether or not we think the hugepage is mapped that way by the OS. */ + bool h_huge; + + /* + * For some properties, we keep parallel sets of bools; h_foo_allowed + * and h_in_psset_foo_container. This is a decoupling mechanism to + * avoid bothering the hpa (which manages policies) from the psset + * (which is the mechanism used to enforce those policies). This allows + * all the container management logic to live in one place, without the + * HPA needing to know or care how that happens. + */ + + /* + * Whether or not the hpdata is allowed to be used to serve allocations, + * and whether or not the psset is currently tracking it as such. + */ + bool h_alloc_allowed; + bool h_in_psset_alloc_container; + + /* + * The same, but with purging. There's no corresponding + * h_in_psset_purge_container, because the psset (currently) always + * removes hpdatas from their containers during updates (to implement + * LRU for purging). + */ + bool h_purge_allowed; + + /* And with hugifying. */ + bool h_hugify_allowed; + /* When we became a hugification candidate. 
*/ + nstime_t h_time_hugify_allowed; + bool h_in_psset_hugify_container; + + /* Whether or not a purge or hugify is currently happening. */ + bool h_mid_purge; + bool h_mid_hugify; + + /* + * Whether or not the hpdata is being updated in the psset (i.e. if + * there has been a psset_update_begin call issued without a matching + * psset_update_end call). Eventually this will expand to other types + * of updates. + */ + bool h_updating; + + /* Whether or not the hpdata is in a psset. */ + bool h_in_psset; + + union { + /* When nonempty (and also nonfull), used by the psset bins. */ + hpdata_age_heap_link_t age_link; + /* + * When empty (or not corresponding to any hugepage), list + * linkage. + */ + ql_elm(hpdata_t) ql_link_empty; + }; + + /* + * Linkage for the psset to track candidates for purging and hugifying. + */ + ql_elm(hpdata_t) ql_link_purge; + ql_elm(hpdata_t) ql_link_hugify; + + /* The length of the largest contiguous sequence of inactive pages. */ + size_t h_longest_free_range; + + /* Number of active pages. */ + size_t h_nactive; + + /* A bitmap with bits set in the active pages. */ + fb_group_t active_pages[FB_NGROUPS(HUGEPAGE_PAGES)]; + + /* + * Number of dirty or active pages, and a bitmap tracking them. One + * way to think of this is as which pages are dirty from the OS's + * perspective. + */ + size_t h_ntouched; + + /* The touched pages (using the same definition as above). */ + fb_group_t touched_pages[FB_NGROUPS(HUGEPAGE_PAGES)]; +}; + +TYPED_LIST(hpdata_empty_list, hpdata_t, ql_link_empty) +TYPED_LIST(hpdata_purge_list, hpdata_t, ql_link_purge) +TYPED_LIST(hpdata_hugify_list, hpdata_t, ql_link_hugify) + +ph_proto(, hpdata_age_heap, hpdata_t); + +static inline void * +hpdata_addr_get(const hpdata_t *hpdata) { + return hpdata->h_address; +} + +static inline void +hpdata_addr_set(hpdata_t *hpdata, void *addr) { + assert(HUGEPAGE_ADDR2BASE(addr) == addr); + hpdata->h_address = addr; +} + +static inline uint64_t +hpdata_age_get(const hpdata_t *hpdata) { + return hpdata->h_age; +} + +static inline void +hpdata_age_set(hpdata_t *hpdata, uint64_t age) { + hpdata->h_age = age; +} + +static inline bool +hpdata_huge_get(const hpdata_t *hpdata) { + return hpdata->h_huge; +} + +static inline bool +hpdata_alloc_allowed_get(const hpdata_t *hpdata) { + return hpdata->h_alloc_allowed; +} + +static inline void +hpdata_alloc_allowed_set(hpdata_t *hpdata, bool alloc_allowed) { + hpdata->h_alloc_allowed = alloc_allowed; +} + +static inline bool +hpdata_in_psset_alloc_container_get(const hpdata_t *hpdata) { + return hpdata->h_in_psset_alloc_container; +} + +static inline void +hpdata_in_psset_alloc_container_set(hpdata_t *hpdata, bool in_container) { + assert(in_container != hpdata->h_in_psset_alloc_container); + hpdata->h_in_psset_alloc_container = in_container; +} + +static inline bool +hpdata_purge_allowed_get(const hpdata_t *hpdata) { + return hpdata->h_purge_allowed; +} + +static inline void +hpdata_purge_allowed_set(hpdata_t *hpdata, bool purge_allowed) { + assert(purge_allowed == false || !hpdata->h_mid_purge); + hpdata->h_purge_allowed = purge_allowed; +} + +static inline bool +hpdata_hugify_allowed_get(const hpdata_t *hpdata) { + return hpdata->h_hugify_allowed; +} + +static inline void +hpdata_allow_hugify(hpdata_t *hpdata, nstime_t now) { + assert(!hpdata->h_mid_hugify); + hpdata->h_hugify_allowed = true; + hpdata->h_time_hugify_allowed = now; +} + +static inline nstime_t +hpdata_time_hugify_allowed(hpdata_t *hpdata) { + return hpdata->h_time_hugify_allowed; +} + +static inline 
void +hpdata_disallow_hugify(hpdata_t *hpdata) { + hpdata->h_hugify_allowed = false; +} + +static inline bool +hpdata_in_psset_hugify_container_get(const hpdata_t *hpdata) { + return hpdata->h_in_psset_hugify_container; +} + +static inline void +hpdata_in_psset_hugify_container_set(hpdata_t *hpdata, bool in_container) { + assert(in_container != hpdata->h_in_psset_hugify_container); + hpdata->h_in_psset_hugify_container = in_container; +} + +static inline bool +hpdata_mid_purge_get(const hpdata_t *hpdata) { + return hpdata->h_mid_purge; +} + +static inline void +hpdata_mid_purge_set(hpdata_t *hpdata, bool mid_purge) { + assert(mid_purge != hpdata->h_mid_purge); + hpdata->h_mid_purge = mid_purge; +} + +static inline bool +hpdata_mid_hugify_get(const hpdata_t *hpdata) { + return hpdata->h_mid_hugify; +} + +static inline void +hpdata_mid_hugify_set(hpdata_t *hpdata, bool mid_hugify) { + assert(mid_hugify != hpdata->h_mid_hugify); + hpdata->h_mid_hugify = mid_hugify; +} + +static inline bool +hpdata_changing_state_get(const hpdata_t *hpdata) { + return hpdata->h_mid_purge || hpdata->h_mid_hugify; +} + +static inline bool +hpdata_updating_get(const hpdata_t *hpdata) { + return hpdata->h_updating; +} + +static inline void +hpdata_updating_set(hpdata_t *hpdata, bool updating) { + assert(updating != hpdata->h_updating); + hpdata->h_updating = updating; +} + +static inline bool +hpdata_in_psset_get(const hpdata_t *hpdata) { + return hpdata->h_in_psset; +} + +static inline void +hpdata_in_psset_set(hpdata_t *hpdata, bool in_psset) { + assert(in_psset != hpdata->h_in_psset); + hpdata->h_in_psset = in_psset; +} + +static inline size_t +hpdata_longest_free_range_get(const hpdata_t *hpdata) { + return hpdata->h_longest_free_range; +} + +static inline void +hpdata_longest_free_range_set(hpdata_t *hpdata, size_t longest_free_range) { + assert(longest_free_range <= HUGEPAGE_PAGES); + hpdata->h_longest_free_range = longest_free_range; +} + +static inline size_t +hpdata_nactive_get(hpdata_t *hpdata) { + return hpdata->h_nactive; +} + +static inline size_t +hpdata_ntouched_get(hpdata_t *hpdata) { + return hpdata->h_ntouched; +} + +static inline size_t +hpdata_ndirty_get(hpdata_t *hpdata) { + return hpdata->h_ntouched - hpdata->h_nactive; +} + +static inline size_t +hpdata_nretained_get(hpdata_t *hpdata) { + return HUGEPAGE_PAGES - hpdata->h_ntouched; +} + +static inline void +hpdata_assert_empty(hpdata_t *hpdata) { + assert(fb_empty(hpdata->active_pages, HUGEPAGE_PAGES)); + assert(hpdata->h_nactive == 0); +}
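The boolean setters above assert that every call actually flips the state, so callers are expected to toggle them in strict pairs. A minimal sketch of that discipline (the wrapper is invented for illustration):

static inline void
hpdata_update_scope_example(hpdata_t *hpdata) {
	hpdata_updating_set(hpdata, true);  /* Asserts it was false. */
	/* ... mutate the hpdata's bitmaps and counters ... */
	hpdata_updating_set(hpdata, false); /* Asserts it was true. */
}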
+ +/* + * Only used in tests, and in hpdata_assert_consistent, below. Verifies some + * consistency properties of the hpdata (e.g. that cached counts of page stats + * match computed ones). + */ +static inline bool +hpdata_consistent(hpdata_t *hpdata) { + if (fb_urange_longest(hpdata->active_pages, HUGEPAGE_PAGES) + != hpdata_longest_free_range_get(hpdata)) { + return false; + } + if (fb_scount(hpdata->active_pages, HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES) + != hpdata->h_nactive) { + return false; + } + if (fb_scount(hpdata->touched_pages, HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES) + != hpdata->h_ntouched) { + return false; + } + if (hpdata->h_ntouched < hpdata->h_nactive) { + return false; + } + if (hpdata->h_huge && hpdata->h_ntouched != HUGEPAGE_PAGES) { + return false; + } + if (hpdata_changing_state_get(hpdata) + && ((hpdata->h_purge_allowed) || hpdata->h_hugify_allowed)) { + return false; + } + if (hpdata_hugify_allowed_get(hpdata) + != hpdata_in_psset_hugify_container_get(hpdata)) { + return false; + } + return true; +} + +static inline void +hpdata_assert_consistent(hpdata_t *hpdata) { + assert(hpdata_consistent(hpdata)); +} + +static inline bool +hpdata_empty(const hpdata_t *hpdata) { + return hpdata->h_nactive == 0; +} + +static inline bool +hpdata_full(const hpdata_t *hpdata) { + return hpdata->h_nactive == HUGEPAGE_PAGES; +} + +void hpdata_init(hpdata_t *hpdata, void *addr, uint64_t age); + +/* + * Given an hpdata which can serve an allocation request, pick and reserve an + * offset within that allocation. + */ +void *hpdata_reserve_alloc(hpdata_t *hpdata, size_t sz); +void hpdata_unreserve(hpdata_t *hpdata, void *addr, size_t sz); + +/* + * The hpdata_purge_state_t allows grabbing the metadata required to purge + * subranges of a hugepage while holding a lock, dropping the lock during the + * actual purging, and then reacquiring it to update the metadata again. + */ +typedef struct hpdata_purge_state_s hpdata_purge_state_t; +struct hpdata_purge_state_s { + size_t npurged; + size_t ndirty_to_purge; + fb_group_t to_purge[FB_NGROUPS(HUGEPAGE_PAGES)]; + size_t next_purge_search_begin; +}; + +/* + * Initializes purge state. The access to hpdata must be externally + * synchronized with other hpdata_* calls. + * + * You can tell whether or not a thread is purging or hugifying a given hpdata + * via hpdata_changing_state_get(hpdata). Racing hugification or purging + * operations aren't allowed. + * + * Once you begin purging, you have to follow through and call hpdata_purge_next + * until you're done, and then end. Allocating out of an hpdata undergoing + * purging is not allowed. + * + * Returns the number of dirty pages that will be purged. + */ +size_t hpdata_purge_begin(hpdata_t *hpdata, hpdata_purge_state_t *purge_state); + +/* + * If there are more extents to purge, sets *r_purge_addr and *r_purge_size to + * the address and size of the next range to purge, and returns true. + * Otherwise, returns false to indicate that we're done. + * + * This requires exclusive access to the purge state, but *not* to the hpdata. + * In particular, unreserve calls are allowed while purging (i.e. you can dalloc + * into one part of the hpdata while purging a different part). + */ +bool hpdata_purge_next(hpdata_t *hpdata, hpdata_purge_state_t *purge_state, + void **r_purge_addr, size_t *r_purge_size);
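Taken together, begin/next/end form a small protocol. A sketch of a caller, with the locking elided and the OS purge primitive abstracted behind an invented function pointer:

static void
purge_whole_hpdata(hpdata_t *hpdata,
    void (*os_purge)(void *addr, size_t size)) {
	hpdata_purge_state_t purge_state;
	/* Returns the number of dirty pages to be purged; unused here. */
	hpdata_purge_begin(hpdata, &purge_state);
	void *addr;
	size_t size;
	while (hpdata_purge_next(hpdata, &purge_state, &addr, &size)) {
		/* A real caller may drop its mutex around this call. */
		os_purge(addr, size);
	}
	hpdata_purge_end(hpdata, &purge_state);
}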
+/* + * Updates the hpdata metadata after all purging is done. Needs external + * synchronization. + */ +void hpdata_purge_end(hpdata_t *hpdata, hpdata_purge_state_t *purge_state); + +void hpdata_hugify(hpdata_t *hpdata); +void hpdata_dehugify(hpdata_t *hpdata); + +#endif /* JEMALLOC_INTERNAL_HPDATA_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/inspect.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/inspect.h new file mode 100644 index 000000000..0da920ca1 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/inspect.h @@ -0,0 +1,43 @@ +#ifndef JEMALLOC_INTERNAL_INSPECT_H +#define JEMALLOC_INTERNAL_INSPECT_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/tsd_types.h" + +/* + * This module contains the heap introspection capabilities. For now they are + * exposed purely through mallctl APIs in the experimental namespace, but this + * may change over time. + */ + +/* + * The following two structs are for experimental purposes. See + * experimental_utilization_query_ctl and + * experimental_utilization_batch_query_ctl in src/ctl.c. + */ +typedef struct inspect_extent_util_stats_s inspect_extent_util_stats_t; +struct inspect_extent_util_stats_s { + size_t nfree; + size_t nregs; + size_t size; +}; + +typedef struct inspect_extent_util_stats_verbose_s + inspect_extent_util_stats_verbose_t; + +struct inspect_extent_util_stats_verbose_s { + void *slabcur_addr; + size_t nfree; + size_t nregs; + size_t size; + size_t bin_nfree; + size_t bin_nregs; +}; + +void inspect_extent_util_stats_get(tsdn_t *tsdn, const void *ptr, + size_t *nfree, size_t *nregs, size_t *size); +void inspect_extent_util_stats_verbose_get(tsdn_t *tsdn, const void *ptr, + size_t *nfree, size_t *nregs, size_t *size, + size_t *bin_nfree, size_t *bin_nregs, void **slabcur_addr); + +#endif /* JEMALLOC_INTERNAL_INSPECT_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/jemalloc_internal_decls.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/jemalloc_internal_decls.h new file mode 100644 index 000000000..0bca91331 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/jemalloc_internal_decls.h @@ -0,0 +1,125 @@ +#ifndef JEMALLOC_INTERNAL_DECLS_H +#define JEMALLOC_INTERNAL_DECLS_H + +#include <math.h> +#ifdef _WIN32 +# include <windows.h> +# include "msvc_compat/windows_extra.h" +# include "msvc_compat/strings.h" +# ifdef _WIN64 +# if LG_VADDR <= 32 +# error Generate the headers using x64 vcargs +# endif +# else +# if LG_VADDR > 32 +# undef LG_VADDR +# define LG_VADDR 32 +# endif +# endif +#else +# include <sys/param.h> +# include <sys/mman.h> +# if !defined(__pnacl__) && !defined(__native_client__) +# include <sys/syscall.h> +# if !defined(SYS_write) && defined(__NR_write) +# define SYS_write __NR_write +# endif +# if defined(SYS_open) && defined(__aarch64__) + /* Android headers may define SYS_open to __NR_open even though + * __NR_open may not exist on AArch64 (superseded by __NR_openat). 
*/ +# undef SYS_open +# endif +# include <sys/uio.h> +# endif +# include <pthread.h> +# if defined(__FreeBSD__) || defined(__DragonFly__) || defined(__OpenBSD__) +# include <pthread_np.h> +# include <sched.h> +# if defined(__FreeBSD__) +# define cpu_set_t cpuset_t +# endif +# endif +# include <signal.h> +# ifdef JEMALLOC_OS_UNFAIR_LOCK +# include <os/lock.h> +# endif +# ifdef JEMALLOC_GLIBC_MALLOC_HOOK +# include <sched.h> +# endif +# include <errno.h> +# include <sys/time.h> +# include <time.h> +# ifdef JEMALLOC_HAVE_MACH_ABSOLUTE_TIME +# include <mach/mach_time.h> +# endif +#endif +#include <sys/types.h> + +#include <limits.h> +#ifndef SIZE_T_MAX +# define SIZE_T_MAX SIZE_MAX +#endif +#ifndef SSIZE_MAX +# define SSIZE_MAX ((ssize_t)(SIZE_T_MAX >> 1)) +#endif +#include <stdarg.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <stddef.h> +#ifndef offsetof +# define offsetof(type, member) ((size_t)&(((type *)NULL)->member)) +#endif +#include <string.h> +#include <strings.h> +#include <ctype.h> +#ifdef _MSC_VER +# include <io.h> +typedef intptr_t ssize_t; +# define PATH_MAX 1024 +# define STDERR_FILENO 2 +# define __func__ __FUNCTION__ +# ifdef JEMALLOC_HAS_RESTRICT +# define restrict __restrict +# endif +/* Disable warnings about deprecated system functions. */ +# pragma warning(disable: 4996) +#if _MSC_VER < 1800 +static int +isblank(int c) { + return (c == '\t' || c == ' '); +} +#endif +#else +# include <unistd.h> +#endif +#include <fcntl.h> + +/* + * The Win32 midl compiler has #define small char; we don't use midl, but + * "small" is a nice identifier to have available when talking about size + * classes. + */ +#ifdef small +# undef small +#endif + +/* + * Oftentimes we'd like to perform some kind of arithmetic to obtain + * a pointer from another pointer but with some offset or mask applied. + * Naively you would accomplish this by casting the source pointer to + * `uintptr_t`, performing all of the relevant arithmetic, and then casting + * the result to the desired pointer type. However, this has the unfortunate + * side-effect of concealing pointer provenance, hiding useful information for + * optimization from the compiler (see here for details: + * https://clang.llvm.org/extra/clang-tidy/checks/performance/no-int-to-ptr.html + * ) + * Instead what one should do is cast the source pointer to `char *` and perform + * the equivalent arithmetic (since `char` of course represents one byte). But + * because `char *` has the semantic meaning of "string", we define this typedef + * simply to make it clearer where we are performing such pointer arithmetic. + */ +typedef char byte_t; + +#endif /* JEMALLOC_INTERNAL_DECLS_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/jemalloc_internal_defs.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/jemalloc_internal_defs.h new file mode 100644 index 000000000..eab618c82 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/jemalloc_internal_defs.h @@ -0,0 +1,586 @@ +/* include/jemalloc/internal/jemalloc_internal_defs.h. Generated from jemalloc_internal_defs.h.in by configure. */ +#ifndef JEMALLOC_INTERNAL_DEFS_H_ +#define JEMALLOC_INTERNAL_DEFS_H_ + +#include <limits.h> + +/* + * If JEMALLOC_PREFIX is defined via --with-jemalloc-prefix, it will cause all + * public APIs to be prefixed. This makes it possible, with some care, to use + * multiple allocators simultaneously. + */ +#define JEMALLOC_PREFIX "duckdb_je_" +#define JEMALLOC_CPREFIX "DUCKDB_JE_"
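With the prefix configured as above, every public entry point is emitted under the duckdb_je_ name, which is what lets this jemalloc coexist with the system allocator. A hypothetical caller (the declarations are written out inline here for self-containment; they normally come from the generated public header, and whether they are visible to other translation units depends on the build):

#include <stddef.h>

void *duckdb_je_malloc(size_t size);
void duckdb_je_free(void *ptr);

static void
prefixed_alloc_example(void) {
	void *p = duckdb_je_malloc(64); /* Not the libc malloc. */
	duckdb_je_free(p);
}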
+ +/* + * Define overrides for non-standard allocator-related functions if they are + * present on the system. + */ +/* #undef JEMALLOC_OVERRIDE___LIBC_CALLOC */ +/* #undef JEMALLOC_OVERRIDE___LIBC_FREE */ +/* #undef JEMALLOC_OVERRIDE___LIBC_MALLOC */ +/* #undef JEMALLOC_OVERRIDE___LIBC_MEMALIGN */ +/* #undef JEMALLOC_OVERRIDE___LIBC_REALLOC */ +/* #undef JEMALLOC_OVERRIDE___LIBC_VALLOC */ +/* #undef JEMALLOC_OVERRIDE___POSIX_MEMALIGN */ + +/* + * JEMALLOC_PRIVATE_NAMESPACE is used as a prefix for all library-private APIs. + * For shared libraries, symbol visibility mechanisms prevent these symbols + * from being exported, but for static libraries, naming collisions are a real + * possibility. + */ +#define JEMALLOC_PRIVATE_NAMESPACE duckdb_je_ + +/* + * Hyper-threaded CPUs may need a special instruction inside spin loops in + * order to yield to another virtual CPU. + */ +#if defined(__aarch64__) || defined(__ARM_ARCH) +#define CPU_SPINWAIT __asm__ volatile("isb") +#else +#define CPU_SPINWAIT __asm__ volatile("pause") +#endif +/* 1 if CPU_SPINWAIT is defined, 0 otherwise. */ +#define HAVE_CPU_SPINWAIT 1 + +/* + * Number of significant bits in virtual addresses. This may be less than the + * total number of bits in a pointer, e.g. on x64, for which the uppermost 16 + * bits are the same as bit 47. + */ +#if INTPTR_MAX == INT64_MAX +#define LG_VADDR 48 +#else +#define LG_VADDR 32 +#endif + +/* Defined if C11 atomics are available. */ +#define JEMALLOC_C11_ATOMICS + +/* Defined if GCC __atomic atomics are available. */ +#ifndef _MSC_VER +#define JEMALLOC_GCC_ATOMIC_ATOMICS +#endif +/* and the 8-bit variant support. */ +#define JEMALLOC_GCC_U8_ATOMIC_ATOMICS + +/* Defined if GCC __sync atomics are available. */ +#ifndef _MSC_VER +#define JEMALLOC_GCC_SYNC_ATOMICS +#endif +/* and the 8-bit variant support. */ +#ifndef _MSC_VER +#define JEMALLOC_GCC_U8_SYNC_ATOMICS +#endif + +/* + * Defined if __builtin_clz() and __builtin_clzl() are available. + */ +#ifdef __GNUC__ +#define JEMALLOC_HAVE_BUILTIN_CLZ +#endif + +/* + * Defined if os_unfair_lock_*() functions are available, as provided by Darwin. + */ +#if defined(__APPLE__) +#define JEMALLOC_OS_UNFAIR_LOCK +#endif + +/* Defined if syscall(2) is usable. */ +#ifdef __GLIBC__ +#define JEMALLOC_USE_SYSCALL +#endif + +/* + * Defined if secure_getenv(3) is available. + */ +#ifdef __GLIBC__ +#undef JEMALLOC_HAVE_SECURE_GETENV +#endif + +/* + * Defined if issetugid(2) is available. + */ +#ifdef __APPLE__ +#define JEMALLOC_HAVE_ISSETUGID +#endif + +/* Defined if pthread_atfork(3) is available. */ +#ifndef _MSC_VER +#define JEMALLOC_HAVE_PTHREAD_ATFORK +#endif + +/* Defined if pthread_setname_np(3) is available. */ +// #define JEMALLOC_HAVE_PTHREAD_SETNAME_NP + +/* Defined if pthread_getname_np(3) is available. */ +#ifdef __APPLE__ +#define JEMALLOC_HAVE_PTHREAD_GETNAME_NP +#endif + +/* Defined if pthread_set_name_np(3) is available. */ +/* #undef JEMALLOC_HAVE_PTHREAD_SET_NAME_NP */ + +/* Defined if pthread_get_name_np(3) is available. */ +/* #undef JEMALLOC_HAVE_PTHREAD_GET_NAME_NP */ + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC_COARSE, ...) is available. + */ +#ifdef __GLIBC__ +#define JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE +#endif + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC, ...) is available. + */ +#ifdef __GLIBC__ +#define JEMALLOC_HAVE_CLOCK_MONOTONIC +#endif + +/* + * Defined if mach_absolute_time() is available. + */ +#ifdef __APPLE__ +#define JEMALLOC_HAVE_MACH_ABSOLUTE_TIME +#endif + +/* + * Defined if clock_gettime(CLOCK_REALTIME, ...) is available. + */ +#define JEMALLOC_HAVE_CLOCK_REALTIME + +/* + * Defined if _malloc_thread_cleanup() exists. 
At least in the case of + * FreeBSD, pthread_key_create() allocates, which if used during malloc + * bootstrapping will cause recursion into the pthreads library. Therefore, if + * _malloc_thread_cleanup() exists, use it as the basis for thread cleanup in + * malloc_tsd. + */ +#if defined(__FreeBSD__) +#define JEMALLOC_MALLOC_THREAD_CLEANUP +#endif + +/* + * Defined if threaded initialization is known to be safe on this platform. + * Among other things, it must be possible to initialize a mutex without + * triggering allocation in order for threaded allocation to be safe. + */ +#ifdef __GLIBC__ +#define JEMALLOC_THREADED_INIT +#endif + +/* + * Defined if the pthreads implementation defines + * _pthread_mutex_init_calloc_cb(), in which case the function is used in order + * to avoid recursive allocation during mutex initialization. + */ +/* #undef JEMALLOC_MUTEX_INIT_CB */ + +/* Non-empty if the tls_model attribute is supported. */ +#define JEMALLOC_TLS_MODEL __attribute__((tls_model("global-dynamic"))) + +/* + * JEMALLOC_DEBUG enables assertions and other sanity checks, and disables + * inline functions. + */ +/* #undef JEMALLOC_DEBUG */ + +/* JEMALLOC_STATS enables statistics calculation. */ +#define JEMALLOC_STATS + +/* JEMALLOC_EXPERIMENTAL_SMALLOCX_API enables experimental smallocx API. */ +/* #undef JEMALLOC_EXPERIMENTAL_SMALLOCX_API */ + +/* JEMALLOC_PROF enables allocation profiling. */ +/* #undef JEMALLOC_PROF */ + +/* Use libunwind for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBUNWIND */ + +/* Use libgcc for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBGCC */ + +/* Use gcc intrinsics for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_GCC */ + +/* JEMALLOC_PAGEID enabled page id */ +/* #undef JEMALLOC_PAGEID */ + +/* JEMALLOC_HAVE_PRCTL checks prctl */ +#ifdef __GLIBC__ +#define JEMALLOC_HAVE_PRCTL +#endif + +/* + * JEMALLOC_DSS enables use of sbrk(2) to allocate extents from the data storage + * segment (DSS). + */ +#ifdef __GLIBC__ +#define JEMALLOC_DSS +#endif + +/* Support memory filling (junk/zero). */ +#define JEMALLOC_FILL + +/* Support utrace(2)-based tracing. */ +/* #undef JEMALLOC_UTRACE */ + +/* Support utrace(2)-based tracing (label based signature). */ +/* #undef JEMALLOC_UTRACE_LABEL */ + +/* Support optional abort() on OOM. */ +/* #undef JEMALLOC_XMALLOC */ + +/* Support lazy locking (avoid locking unless a second thread is launched). */ +// #define JEMALLOC_LAZY_LOCK + +/* + * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size + * classes). + */ +/* #undef LG_QUANTUM */ + +/* One page is 2^LG_PAGE bytes. 
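For concreteness, the byte page size elsewhere in jemalloc is derived from this log (the PAGE/PAGE_MASK macros live in pages.h, not here), so LG_PAGE 12 yields 4096-byte pages and LG_PAGE 16 yields 65536-byte pages:

/* Sketch of the derivation done in pages.h. */
#define PAGE ((size_t)(1U << LG_PAGE))
#define PAGE_MASK ((size_t)(PAGE - 1))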
*/ +// ----- DuckDB comment ----- +// The page size for jemalloc can always be bigger than the actual system page size +#if INTPTR_MAX != INT64_MAX +#define LG_PAGE 12 // 32-bit systems typically have a 4KB page size +#elif defined(__i386__) || defined(__x86_64__) || defined(__amd64__) || \ + defined(COMPILER_MSVC) && (defined(_M_IX86) || defined(_M_X64)) +#define LG_PAGE 12 // x86 and x86_64 typically have a 4KB page size +#elif defined(__powerpc__) || defined(__ppc__) +#define LG_PAGE 16 // PowerPC architectures often use 64KB page size +#elif defined(__sparc__) +#define LG_PAGE 13 // SPARC architectures usually have an 8KB page size +#elif defined(__aarch64__) || defined(__ARM_ARCH) + +// ARM architectures are less well-defined +#if defined(__APPLE__) +#define LG_PAGE 14 // Apple Silicon uses a 16KB page size +#else +#define LG_PAGE 16 // Use max known page size for ARM +#endif + +#else +#define LG_PAGE 12 // Default to the most common page size of 4KB +#endif + +/* Maximum number of regions in a slab. */ +/* #undef CONFIG_LG_SLAB_MAXREGS */ + +/* + * One huge page is 2^LG_HUGEPAGE bytes. Note that this is defined even if the + * system does not explicitly support huge pages; system calls that require + * explicit huge page support are separately configured. + */ +#define LG_HUGEPAGE 21 + +/* + * If defined, adjacent virtual memory mappings with identical attributes + * automatically coalesce, and they fragment when changes are made to subranges. + * This is the normal order of things for mmap()/munmap(), but on Windows + * VirtualAlloc()/VirtualFree() operations must be precisely matched, i.e. + * mappings do *not* coalesce/fragment. + */ +#ifndef _MSC_VER +#define JEMALLOC_MAPS_COALESCE +#endif + +/* + * If defined, retain memory for later reuse by default rather than using e.g. + * munmap() to unmap freed extents. This is enabled on 64-bit Linux because + * common sequences of mmap()/munmap() calls will cause virtual memory map + * holes. + */ +// ----- DuckDB comment ----- +// This makes it feasible to run the larger page size (https://github.com/duckdb/duckdb/discussions/11455), +// but it causes DuckDB to retain RSS even after closing the connection, so we have to disable it +#if INTPTR_MAX == INT64_MAX && !defined(__APPLE__) +#define JEMALLOC_RETAIN +#endif + +/* TLS is used to map arenas and magazine caches to threads. */ +#ifndef __APPLE__ +#define JEMALLOC_TLS +#endif + +/* + * Used to mark unreachable code to quiet "end of non-void" compiler warnings. + * Don't use this directly; instead use unreachable() from util.h + */ +#ifdef _MSC_VER +[[noreturn]] __forceinline void msvc_unreachable() { + __assume(false); +} +#define JEMALLOC_INTERNAL_UNREACHABLE msvc_unreachable +#else +#define JEMALLOC_INTERNAL_UNREACHABLE __builtin_unreachable +#endif + +/* + * ffs*() functions to use for bitmapping. Don't use these directly; instead, + * use ffs_*() from util.h. + */ +#ifdef _MSC_VER +#include "msvc_compat/strings.h" +#define JEMALLOC_INTERNAL_FFSLL ffsll +#define JEMALLOC_INTERNAL_FFSL ffsl +#define JEMALLOC_INTERNAL_FFS ffs +#else +#define JEMALLOC_INTERNAL_FFSLL __builtin_ffsll +#define JEMALLOC_INTERNAL_FFSL __builtin_ffsl +#define JEMALLOC_INTERNAL_FFS __builtin_ffs +#endif + +/* + * popcount*() functions to use for bitmapping. 
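The ffs family returns the 1-based index of the lowest set bit, with 0 reserved for a zero argument; for example, __builtin_ffsll(0x18) == 4. A tiny sketch of the kind of wrapper util.h builds on top of these (the wrapper name is invented):

static inline unsigned
lowest_set_bit_example(unsigned long long x) {
	/* 1-based index of the lowest set bit; 0 when x == 0. */
	return (unsigned)JEMALLOC_INTERNAL_FFSLL((long long)x);
}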
+ */ +#ifdef __GNUC__ +#define JEMALLOC_INTERNAL_POPCOUNTL __builtin_popcountl +#define JEMALLOC_INTERNAL_POPCOUNT __builtin_popcount +#endif + +/* + * If defined, explicitly attempt to more uniformly distribute large allocation + * pointer alignments across all cache indices. + */ +#if (LG_PAGE == 12) +#define JEMALLOC_CACHE_OBLIVIOUS +#endif + +/* + * If defined, enable logging facilities. We make this a configure option to + * avoid taking extra branches everywhere. + */ +/* #undef JEMALLOC_LOG */ + +/* + * If defined, use readlinkat() (instead of readlink()) to follow + * /etc/malloc_conf. + */ +/* #undef JEMALLOC_READLINKAT */ + +/* + * If defined, use getenv() (instead of secure_getenv() or + * alternatives) to access MALLOC_CONF. + */ +/* #undef JEMALLOC_FORCE_GETENV */ + +/* + * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings. + */ +#if defined(__APPLE__) +#define JEMALLOC_ZONE +#endif + +/* + * Methods for determining whether the OS overcommits. + * JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY: Linux's + * /proc/sys/vm.overcommit_memory file. + * JEMALLOC_SYSCTL_VM_OVERCOMMIT: FreeBSD's vm.overcommit sysctl. + */ +#if defined(__linux__) +#define JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY +#elif defined(__FreeBSD__) +#define JEMALLOC_SYSCTL_VM_OVERCOMMIT +#endif + +/* Defined if madvise(2) is available. */ +#define JEMALLOC_HAVE_MADVISE + +/* + * Defined if transparent huge pages are supported via the MADV_[NO]HUGEPAGE + * arguments to madvise(2). + */ +// #ifdef __GLIBC__ +// #define JEMALLOC_HAVE_MADVISE_HUGE +// #endif + +/* + * Methods for purging unused pages differ between operating systems. + * + * madvise(..., MADV_FREE) : This marks pages as being unused, such that they + * will be discarded rather than swapped out. + * madvise(..., MADV_DONTNEED) : If JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS is + * defined, this immediately discards pages, + * such that new pages will be demand-zeroed if + * the address region is later touched; + * otherwise this behaves similarly to + * MADV_FREE, though typically with higher + * system overhead. + */ +#define JEMALLOC_PURGE_MADVISE_FREE +#define JEMALLOC_PURGE_MADVISE_DONTNEED +#ifdef __GLIBC__ +#define JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS +#endif + +/* Defined if madvise(2) is available but MADV_FREE is not (x86 Linux only). */ +#ifdef __GLIBC__ +#define JEMALLOC_DEFINE_MADVISE_FREE +#endif + +/* + * Defined if MADV_DO[NT]DUMP is supported as an argument to madvise. + */ +#ifdef __GLIBC__ +#define JEMALLOC_MADVISE_DONTDUMP +#endif + +/* + * Defined if MADV_[NO]CORE is supported as an argument to madvise. + */ +/* #undef JEMALLOC_MADVISE_NOCORE */ + +/* Defined if mprotect(2) is available. */ +#define JEMALLOC_HAVE_MPROTECT + +/* + * Defined if transparent huge pages (THPs) are supported via the + * MADV_[NO]HUGEPAGE arguments to madvise(2), and THP support is enabled. + */ +/* #undef JEMALLOC_THP */ + +/* Defined if posix_madvise is available. */ +/* #undef JEMALLOC_HAVE_POSIX_MADVISE */ + +/* + * Method for purging unused pages using posix_madvise. + * + * posix_madvise(..., POSIX_MADV_DONTNEED) + */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED_ZEROS */ + +/* + * Defined if memcntl page admin call is supported + */ +/* #undef JEMALLOC_HAVE_MEMCNTL */ + +/* + * Defined if malloc_size is supported + */ +#ifdef __APPLE__ +#define JEMALLOC_HAVE_MALLOC_SIZE +#endif + +/* Define if operating system has alloca.h header. 
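The two purge flavors described above map onto madvise(2) roughly as follows (a sketch assuming a Linux or BSD target where MADV_FREE is available; error handling omitted):

#include <sys/mman.h>

static void
purge_lazy(void *addr, size_t size) {
	/* Reclaimable; the kernel takes the pages back under pressure. */
	madvise(addr, size, MADV_FREE);
}

static void
purge_forced(void *addr, size_t size) {
	/* Discard immediately; the next touch demand-zeroes. */
	madvise(addr, size, MADV_DONTNEED);
}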
*/ +#ifdef __GLIBC__ +#define JEMALLOC_HAS_ALLOCA_H +#endif + +/* C99 restrict keyword supported. */ +#define JEMALLOC_HAS_RESTRICT + +/* For use by hash code. */ +/* #undef JEMALLOC_BIG_ENDIAN */ + +/* sizeof(int) == 2^LG_SIZEOF_INT. */ +#define LG_SIZEOF_INT 2 + +/* sizeof(long) == 2^LG_SIZEOF_LONG. */ +#if ULONG_MAX > UINT_MAX +#define LG_SIZEOF_LONG 3 +#else +#define LG_SIZEOF_LONG 2 +#endif + +/* sizeof(long long) == 2^LG_SIZEOF_LONG_LONG. */ +#define LG_SIZEOF_LONG_LONG 3 + +/* sizeof(intmax_t) == 2^LG_SIZEOF_INTMAX_T. */ +#define LG_SIZEOF_INTMAX_T 3 + +/* glibc malloc hooks (__malloc_hook, __realloc_hook, __free_hook). */ +#ifdef __GLIBC__ +#define JEMALLOC_GLIBC_MALLOC_HOOK +#endif + +/* glibc memalign hook. */ +#ifdef __GLIBC__ +#define JEMALLOC_GLIBC_MEMALIGN_HOOK +#endif + +/* pthread support */ +#define JEMALLOC_HAVE_PTHREAD + +/* dlsym() support */ +#if defined(__APPLE__) || defined(_GNU_SOURCE) +#define JEMALLOC_HAVE_DLSYM +#endif + +/* Adaptive mutex support in pthreads. */ +/* #undef JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP */ + +/* GNU specific sched_getcpu support */ +// #define JEMALLOC_HAVE_SCHED_GETCPU + +/* GNU specific sched_setaffinity support */ +// #define JEMALLOC_HAVE_SCHED_SETAFFINITY + +/* pthread_setaffinity_np support */ +/* #undef JEMALLOC_HAVE_PTHREAD_SETAFFINITY_NP */ + +/* + * If defined, all the features necessary for background threads are present. + */ +#define JEMALLOC_BACKGROUND_THREAD + +/* + * If defined, jemalloc symbols are not exported (doesn't work when + * JEMALLOC_PREFIX is not defined). + */ +#define JEMALLOC_EXPORT + +/* config.malloc_conf options string. */ +#define JEMALLOC_CONFIG_MALLOC_CONF "" + +/* If defined, jemalloc takes the malloc/free/etc. symbol names. */ +/* #undef JEMALLOC_IS_MALLOC */ + +/* + * Defined if strerror_r returns char * if _GNU_SOURCE is defined. + */ +#ifdef __GLIBC__ +#define JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE +#endif + +/* Performs additional safety checks when defined. */ +/* #undef JEMALLOC_OPT_SAFETY_CHECKS */ + +/* Is C++ support being built? */ +#define JEMALLOC_ENABLE_CXX + +/* Performs additional size checks when defined. */ +/* #undef JEMALLOC_OPT_SIZE_CHECKS */ + +/* Allows sampled junk and stash for checking use-after-free when defined. */ +/* #undef JEMALLOC_UAF_DETECTION */ + +/* Darwin VM_MAKE_TAG support */ +#if defined(__APPLE__) +#define JEMALLOC_HAVE_VM_MAKE_TAG +#endif + +/* If defined, realloc(ptr, 0) defaults to "free" instead of "alloc". */ +#ifdef __GLIBC__ +#undef JEMALLOC_ZERO_REALLOC_DEFAULT_FREE +#endif + +/* If defined, use volatile asm during benchmarks. */ +#ifdef __APPLE__ +#define JEMALLOC_HAVE_ASM_VOLATILE +#endif + +/* + * If defined, support the use of rdtscp to get the time stamp counter + * and the processor ID. 
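+ * When available, malloc_getcpu() in jemalloc_internal_inlines_a.h uses it to
+ * read the current processor ID out of ECX:
+ *
+ *   unsigned int ecx;
+ *   asm volatile("rdtscp" : "=c" (ecx) :: "eax", "edx");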
+ */ +/* #undef JEMALLOC_HAVE_RDTSCP */ + +#include "jemalloc/internal/jemalloc_internal_overrides.h" + +#endif /* JEMALLOC_INTERNAL_DEFS_H_ */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/jemalloc_internal_externs.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/jemalloc_internal_externs.h new file mode 100644 index 000000000..41c0f366f --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/jemalloc_internal_externs.h @@ -0,0 +1,83 @@ +#ifndef JEMALLOC_INTERNAL_EXTERNS_H +#define JEMALLOC_INTERNAL_EXTERNS_H + +#include "jemalloc/internal/arena_types.h" +#include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/hpa_opts.h" +#include "jemalloc/internal/nstime.h" +#include "jemalloc/internal/sec_opts.h" +#include "jemalloc/internal/tsd_types.h" + +/* TSD checks this to set thread local slow state accordingly. */ +extern bool malloc_slow; + +/* Run-time options. */ +extern bool opt_abort; +extern bool opt_abort_conf; +extern bool opt_trust_madvise; +extern bool opt_confirm_conf; +extern bool opt_hpa; +extern hpa_shard_opts_t opt_hpa_opts; +extern sec_opts_t opt_hpa_sec_opts; + +extern const char *opt_junk; +extern bool opt_junk_alloc; +extern bool opt_junk_free; +extern void (*JET_MUTABLE junk_free_callback)(void *ptr, size_t size); +extern void (*JET_MUTABLE junk_alloc_callback)(void *ptr, size_t size); +extern void (*JET_MUTABLE invalid_conf_abort)(void); +extern bool opt_utrace; +extern bool opt_xmalloc; +extern bool opt_experimental_infallible_new; +extern bool opt_zero; +extern unsigned opt_narenas; +extern zero_realloc_action_t opt_zero_realloc_action; +extern malloc_init_t malloc_init_state; +extern const char *const zero_realloc_mode_names[]; +extern atomic_zu_t zero_realloc_count; +extern bool opt_cache_oblivious; +extern unsigned opt_debug_double_free_max_scan; +extern size_t opt_calloc_madvise_threshold; + +extern const char *opt_malloc_conf_symlink; +extern const char *opt_malloc_conf_env_var; + +/* Escape free-fastpath when ptr & mask == 0 (for sanitization purpose). */ +extern uintptr_t san_cache_bin_nonfast_mask; + +/* Number of CPUs. */ +extern unsigned ncpus; + +/* Number of arenas used for automatic multiplexing of threads and arenas. */ +extern unsigned narenas_auto; + +/* Base index for manual arenas. */ +extern unsigned manual_arena_base; + +/* + * Arenas that are used to service external requests. Not all elements of the + * arenas array are necessarily used; arenas are created lazily as needed. 
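+ * Because slots are populated concurrently, readers load them with acquire
+ * semantics and may trigger lazy initialization on a NULL slot; condensed
+ * from arena_get() in jemalloc_internal_inlines_a.h:
+ *
+ *   ret = (arena_t *)atomic_load_p(&arenas[ind], ATOMIC_ACQUIRE);
+ *   if (unlikely(ret == NULL) && init_if_missing) {
+ *       ret = arena_init(tsdn, ind, &arena_config_default);
+ *   }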
+ */ +extern atomic_p_t arenas[]; + +void *a0malloc(size_t size); +void a0dalloc(void *ptr); +void *bootstrap_malloc(size_t size); +void *bootstrap_calloc(size_t num, size_t size); +void bootstrap_free(void *ptr); +void arena_set(unsigned ind, arena_t *arena); +unsigned narenas_total_get(void); +arena_t *arena_init(tsdn_t *tsdn, unsigned ind, const arena_config_t *config); +arena_t *arena_choose_hard(tsd_t *tsd, bool internal); +void arena_migrate(tsd_t *tsd, arena_t *oldarena, arena_t *newarena); +void iarena_cleanup(tsd_t *tsd); +void arena_cleanup(tsd_t *tsd); +size_t batch_alloc(void **ptrs, size_t num, size_t size, int flags); +void jemalloc_prefork(void); +void jemalloc_postfork_parent(void); +void jemalloc_postfork_child(void); +void sdallocx_default(void *ptr, size_t size, int flags); +void free_default(void *ptr); +void *malloc_default(size_t size); + +#endif /* JEMALLOC_INTERNAL_EXTERNS_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/jemalloc_internal_includes.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/jemalloc_internal_includes.h new file mode 100644 index 000000000..751c112ff --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/jemalloc_internal_includes.h @@ -0,0 +1,84 @@ +#ifndef JEMALLOC_INTERNAL_INCLUDES_H +#define JEMALLOC_INTERNAL_INCLUDES_H + +/* + * jemalloc can conceptually be broken into components (arena, tcache, etc.), + * but there are circular dependencies that cannot be broken without + * substantial performance degradation. + * + * Historically, we dealt with this by splitting each header into four sections + * (types, structs, externs, and inlines), and included each header file + * multiple times in this file, picking out the portion we want on each pass + * using the following #defines: + * JEMALLOC_H_TYPES : Preprocessor-defined constants and pseudo-opaque data + * types. + * JEMALLOC_H_STRUCTS : Data structures. + * JEMALLOC_H_EXTERNS : Extern data declarations and function prototypes. + * JEMALLOC_H_INLINES : Inline functions. + * + * We're moving toward a world in which the dependencies are explicit; each file + * will #include the headers it depends on (rather than relying on them being + * implicitly available via this file including every header file in the + * project). + * + * We're now in an intermediate state: we've broken up the header files to avoid + * having to include each one multiple times, but have not yet moved the + * dependency information into the header files (i.e. we still rely on the + * ordering in this file to ensure all a header's dependencies are available in + * its translation unit). Each component is now broken up into multiple header + * files, corresponding to the sections above (e.g. instead of "foo.h", we now + * have "foo_types.h", "foo_structs.h", "foo_externs.h", "foo_inlines.h"). + * + * Those files which have been converted to explicitly include their + * inter-component dependencies are now in the initial HERMETIC HEADERS + * section. All headers may still rely on jemalloc_preamble.h (which, by fiat, + * must be included first in every translation unit) for system headers and + * global jemalloc definitions, however.
+ */ + +/******************************************************************************/ +/* TYPES */ +/******************************************************************************/ + +#include "jemalloc/internal/arena_types.h" +#include "jemalloc/internal/tcache_types.h" +#include "jemalloc/internal/prof_types.h" + +/******************************************************************************/ +/* STRUCTS */ +/******************************************************************************/ + +#include "jemalloc/internal/prof_structs.h" +#include "jemalloc/internal/arena_structs.h" +#include "jemalloc/internal/tcache_structs.h" +#include "jemalloc/internal/background_thread_structs.h" + +/******************************************************************************/ +/* EXTERNS */ +/******************************************************************************/ + +#include "jemalloc/internal/jemalloc_internal_externs.h" +#include "jemalloc/internal/arena_externs.h" +#include "jemalloc/internal/large_externs.h" +#include "jemalloc/internal/tcache_externs.h" +#include "jemalloc/internal/prof_externs.h" +#include "jemalloc/internal/background_thread_externs.h" + +/******************************************************************************/ +/* INLINES */ +/******************************************************************************/ + +#include "jemalloc/internal/jemalloc_internal_inlines_a.h" +/* + * Include portions of arena code interleaved with tcache code in order to + * resolve circular dependencies. + */ +#include "jemalloc/internal/arena_inlines_a.h" +#include "jemalloc/internal/jemalloc_internal_inlines_b.h" +#include "jemalloc/internal/tcache_inlines.h" +#include "jemalloc/internal/arena_inlines_b.h" +#include "jemalloc/internal/jemalloc_internal_inlines_c.h" +#include "jemalloc/internal/prof_inlines.h" +#include "jemalloc/internal/background_thread_inlines.h" + +#endif /* JEMALLOC_INTERNAL_INCLUDES_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_a.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_a.h new file mode 100644 index 000000000..111cda429 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_a.h @@ -0,0 +1,135 @@ +#ifndef JEMALLOC_INTERNAL_INLINES_A_H +#define JEMALLOC_INTERNAL_INLINES_A_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/arena_externs.h" +#include "jemalloc/internal/arena_types.h" +#include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/bit_util.h" +#include "jemalloc/internal/jemalloc_internal_types.h" +#include "jemalloc/internal/sc.h" +#include "jemalloc/internal/tcache_externs.h" +#include "jemalloc/internal/ticker.h" + +JEMALLOC_ALWAYS_INLINE malloc_cpuid_t +malloc_getcpu(void) { + assert(have_percpu_arena); +#if defined(_WIN32) + return GetCurrentProcessorNumber(); +#elif defined(JEMALLOC_HAVE_SCHED_GETCPU) + return (malloc_cpuid_t)sched_getcpu(); +#elif defined(JEMALLOC_HAVE_RDTSCP) + unsigned int ecx; + asm volatile("rdtscp" : "=c" (ecx) :: "eax", "edx"); + return (malloc_cpuid_t)(ecx & 0xfff); +#elif defined(__aarch64__) && defined(__APPLE__) + /* Other OSes most likely use tpidr_el0 instead */ + uintptr_t c; + asm volatile("mrs %x0, tpidrro_el0" : "=r"(c) :: "memory"); + return (malloc_cpuid_t)(c & ((1 << 3) - 1)); +#else + not_reached(); + return -1; +#endif +} + +/* Return the chosen arena index based on current cpu.
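+ * Worked example: with ncpus == 8 in per_phycpu_arena mode, hyperthread
+ * siblings are assumed to be numbered cpuid and cpuid + ncpus/2, so both
+ * cpuid 1 and cpuid 5 map to arena index 1.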
*/ +JEMALLOC_ALWAYS_INLINE unsigned +percpu_arena_choose(void) { + assert(have_percpu_arena && PERCPU_ARENA_ENABLED(opt_percpu_arena)); + + malloc_cpuid_t cpuid = malloc_getcpu(); + assert(cpuid >= 0); + + unsigned arena_ind; + if ((opt_percpu_arena == percpu_arena) || ((unsigned)cpuid < ncpus / + 2)) { + arena_ind = cpuid; + } else { + assert(opt_percpu_arena == per_phycpu_arena); + /* Hyper threads on the same physical CPU share arena. */ + arena_ind = cpuid - ncpus / 2; + } + + return arena_ind; +} + +/* Return the limit of percpu auto arena range, i.e. arenas[0...ind_limit). */ +JEMALLOC_ALWAYS_INLINE unsigned +percpu_arena_ind_limit(percpu_arena_mode_t mode) { + assert(have_percpu_arena && PERCPU_ARENA_ENABLED(mode)); + if (mode == per_phycpu_arena && ncpus > 1) { + if (ncpus % 2) { + /* This likely means a misconfig. */ + return ncpus / 2 + 1; + } + return ncpus / 2; + } else { + return ncpus; + } +} + +static inline arena_t * +arena_get(tsdn_t *tsdn, unsigned ind, bool init_if_missing) { + arena_t *ret; + + assert(ind < MALLOCX_ARENA_LIMIT); + + ret = (arena_t *)atomic_load_p(&arenas[ind], ATOMIC_ACQUIRE); + if (unlikely(ret == NULL)) { + if (init_if_missing) { + ret = arena_init(tsdn, ind, &arena_config_default); + } + } + return ret; +} + +JEMALLOC_ALWAYS_INLINE bool +tcache_available(tsd_t *tsd) { + /* + * Thread specific auto tcache might be unavailable if: 1) during tcache + * initialization, or 2) disabled through thread.tcache.enabled mallctl + * or config options. This check covers all cases. + */ + if (likely(tsd_tcache_enabled_get(tsd))) { + /* Associated arena == NULL implies tcache init in progress. */ + if (config_debug && tsd_tcache_slowp_get(tsd)->arena != NULL) { + tcache_assert_initialized(tsd_tcachep_get(tsd)); + } + return true; + } + + return false; +} + +JEMALLOC_ALWAYS_INLINE tcache_t * +tcache_get(tsd_t *tsd) { + if (!tcache_available(tsd)) { + return NULL; + } + + return tsd_tcachep_get(tsd); +} + +JEMALLOC_ALWAYS_INLINE tcache_slow_t * +tcache_slow_get(tsd_t *tsd) { + if (!tcache_available(tsd)) { + return NULL; + } + + return tsd_tcache_slowp_get(tsd); +} + +static inline void +pre_reentrancy(tsd_t *tsd, arena_t *arena) { + /* arena is the current context. Reentry from a0 is not allowed. */ + assert(arena != arena_get(tsd_tsdn(tsd), 0, false)); + tsd_pre_reentrancy_raw(tsd); +} + +static inline void +post_reentrancy(tsd_t *tsd) { + tsd_post_reentrancy_raw(tsd); +} + +#endif /* JEMALLOC_INTERNAL_INLINES_A_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_b.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_b.h new file mode 100644 index 000000000..2ddb4a894 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_b.h @@ -0,0 +1,107 @@ +#ifndef JEMALLOC_INTERNAL_INLINES_B_H +#define JEMALLOC_INTERNAL_INLINES_B_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/arena_inlines_a.h" +#include "jemalloc/internal/extent.h" +#include "jemalloc/internal/jemalloc_internal_inlines_a.h" + +static inline void +percpu_arena_update(tsd_t *tsd, unsigned cpu) { + assert(have_percpu_arena); + arena_t *oldarena = tsd_arena_get(tsd); + assert(oldarena != NULL); + unsigned oldind = arena_ind_get(oldarena); + + if (oldind != cpu) { + unsigned newind = cpu; + arena_t *newarena = arena_get(tsd_tsdn(tsd), newind, true); + assert(newarena != NULL); + + /* Set new arena/tcache associations. 
*/ + arena_migrate(tsd, oldarena, newarena); + tcache_t *tcache = tcache_get(tsd); + if (tcache != NULL) { + tcache_slow_t *tcache_slow = tsd_tcache_slowp_get(tsd); + assert(tcache_slow->arena != NULL); + tcache_arena_reassociate(tsd_tsdn(tsd), tcache_slow, + tcache, newarena); + } + } +} + + +/* Choose an arena based on a per-thread value. */ +static inline arena_t * +arena_choose_impl(tsd_t *tsd, arena_t *arena, bool internal) { + arena_t *ret; + + if (arena != NULL) { + return arena; + } + + /* During reentrancy, arena 0 is the safest bet. */ + if (unlikely(tsd_reentrancy_level_get(tsd) > 0)) { + return arena_get(tsd_tsdn(tsd), 0, true); + } + + ret = internal ? tsd_iarena_get(tsd) : tsd_arena_get(tsd); + if (unlikely(ret == NULL)) { + ret = arena_choose_hard(tsd, internal); + assert(ret); + if (tcache_available(tsd)) { + tcache_slow_t *tcache_slow = tsd_tcache_slowp_get(tsd); + tcache_t *tcache = tsd_tcachep_get(tsd); + if (tcache_slow->arena != NULL) { + /* See comments in tsd_tcache_data_init().*/ + assert(tcache_slow->arena == + arena_get(tsd_tsdn(tsd), 0, false)); + if (tcache_slow->arena != ret) { + tcache_arena_reassociate(tsd_tsdn(tsd), + tcache_slow, tcache, ret); + } + } else { + tcache_arena_associate(tsd_tsdn(tsd), + tcache_slow, tcache, ret); + } + } + } + + /* + * Note that for percpu arena, if the current arena is outside of the + * auto percpu arena range, (i.e. thread is assigned to a manually + * managed arena), then percpu arena is skipped. + */ + if (have_percpu_arena && PERCPU_ARENA_ENABLED(opt_percpu_arena) && + !internal && (arena_ind_get(ret) < + percpu_arena_ind_limit(opt_percpu_arena)) && (ret->last_thd != + tsd_tsdn(tsd))) { + unsigned ind = percpu_arena_choose(); + if (arena_ind_get(ret) != ind) { + percpu_arena_update(tsd, ind); + ret = tsd_arena_get(tsd); + } + ret->last_thd = tsd_tsdn(tsd); + } + + return ret; +} + +static inline arena_t * +arena_choose(tsd_t *tsd, arena_t *arena) { + return arena_choose_impl(tsd, arena, false); +} + +static inline arena_t * +arena_ichoose(tsd_t *tsd, arena_t *arena) { + return arena_choose_impl(tsd, arena, true); +} + +static inline bool +arena_is_auto(arena_t *arena) { + assert(narenas_auto > 0); + + return (arena_ind_get(arena) < manual_arena_base); +} + +#endif /* JEMALLOC_INTERNAL_INLINES_B_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h new file mode 100644 index 000000000..6dcffac95 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h @@ -0,0 +1,597 @@ +#ifndef JEMALLOC_INTERNAL_INLINES_C_H +#define JEMALLOC_INTERNAL_INLINES_C_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/arena_externs.h" +#include "jemalloc/internal/arena_inlines_b.h" +#include "jemalloc/internal/emap.h" +#include "jemalloc/internal/hook.h" +#include "jemalloc/internal/jemalloc_internal_types.h" +#include "jemalloc/internal/log.h" +#include "jemalloc/internal/sz.h" +#include "jemalloc/internal/thread_event.h" +#include "jemalloc/internal/witness.h" + +/* + * These correspond to the macros in jemalloc/jemalloc_macros.h. Broadly, we + * should have one constant here per magic value there. Note however that the + * representations need not be related. 
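+ * For example, the public MALLOCX_TCACHE(tc) macro encodes tc + 2 into the
+ * flags word, so MALLOCX_TCACHE_NONE (tc == -1) arrives here as
+ * MALLOCX_TCACHE_GET(flags) == (1 - 2) == (unsigned)-1 == TCACHE_IND_NONE,
+ * while flags with no tcache bits decode to (unsigned)-2 ==
+ * TCACHE_IND_AUTOMATIC.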
+ */ +#define TCACHE_IND_NONE ((unsigned)-1) +#define TCACHE_IND_AUTOMATIC ((unsigned)-2) +#define ARENA_IND_AUTOMATIC ((unsigned)-1) + +/* + * Translating the names of the 'i' functions: + * Abbreviations used in the first part of the function name (before + * alloc/dalloc) describe what that function accomplishes: + * a: arena (query) + * s: size (query, or sized deallocation) + * e: extent (query) + * p: aligned (allocates) + * vs: size (query, without knowing that the pointer is into the heap) + * r: rallocx implementation + * x: xallocx implementation + * Abbreviations used in the second part of the function name (after + * alloc/dalloc) describe the arguments it takes + * z: whether to return zeroed memory + * t: accepts a tcache_t * parameter + * m: accepts an arena_t * parameter + */ + +JEMALLOC_ALWAYS_INLINE arena_t * +iaalloc(tsdn_t *tsdn, const void *ptr) { + assert(ptr != NULL); + + return arena_aalloc(tsdn, ptr); +} + +JEMALLOC_ALWAYS_INLINE size_t +isalloc(tsdn_t *tsdn, const void *ptr) { + assert(ptr != NULL); + + return arena_salloc(tsdn, ptr); +} + +JEMALLOC_ALWAYS_INLINE void * +iallocztm_explicit_slab(tsdn_t *tsdn, size_t size, szind_t ind, bool zero, + bool slab, tcache_t *tcache, bool is_internal, arena_t *arena, + bool slow_path) { + void *ret; + + assert(!slab || sz_can_use_slab(size)); /* slab && large is illegal */ + assert(!is_internal || tcache == NULL); + assert(!is_internal || arena == NULL || arena_is_auto(arena)); + if (!tsdn_null(tsdn) && tsd_reentrancy_level_get(tsdn_tsd(tsdn)) == 0) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + } + + ret = arena_malloc(tsdn, arena, size, ind, zero, slab, tcache, slow_path); + if (config_stats && is_internal && likely(ret != NULL)) { + arena_internal_add(iaalloc(tsdn, ret), isalloc(tsdn, ret)); + } + return ret; +} + +JEMALLOC_ALWAYS_INLINE void * +iallocztm(tsdn_t *tsdn, size_t size, szind_t ind, bool zero, tcache_t *tcache, + bool is_internal, arena_t *arena, bool slow_path) { + bool slab = sz_can_use_slab(size); + return iallocztm_explicit_slab(tsdn, size, ind, zero, slab, tcache, + is_internal, arena, slow_path); +} + +JEMALLOC_ALWAYS_INLINE void * +ialloc(tsd_t *tsd, size_t size, szind_t ind, bool zero, bool slow_path) { + return iallocztm(tsd_tsdn(tsd), size, ind, zero, tcache_get(tsd), false, + NULL, slow_path); +} + +JEMALLOC_ALWAYS_INLINE void * +ipallocztm_explicit_slab(tsdn_t *tsdn, size_t usize, size_t alignment, bool zero, + bool slab, tcache_t *tcache, bool is_internal, arena_t *arena) { + void *ret; + + assert(!slab || sz_can_use_slab(usize)); /* slab && large is illegal */ + assert(usize != 0); + assert(usize == sz_sa2u(usize, alignment)); + assert(!is_internal || tcache == NULL); + assert(!is_internal || arena == NULL || arena_is_auto(arena)); + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + + ret = arena_palloc(tsdn, arena, usize, alignment, zero, slab, tcache); + assert(ALIGNMENT_ADDR2BASE(ret, alignment) == ret); + if (config_stats && is_internal && likely(ret != NULL)) { + arena_internal_add(iaalloc(tsdn, ret), isalloc(tsdn, ret)); + } + return ret; +} + +JEMALLOC_ALWAYS_INLINE void * +ipallocztm(tsdn_t *tsdn, size_t usize, size_t alignment, bool zero, + tcache_t *tcache, bool is_internal, arena_t *arena) { + return ipallocztm_explicit_slab(tsdn, usize, alignment, zero, + sz_can_use_slab(usize), tcache, is_internal, arena); +} + +JEMALLOC_ALWAYS_INLINE void * +ipalloct(tsdn_t *tsdn, size_t usize, size_t alignment, bool 
zero, + tcache_t *tcache, arena_t *arena) { + return ipallocztm(tsdn, usize, alignment, zero, tcache, false, arena); +} + +JEMALLOC_ALWAYS_INLINE void * +ipalloct_explicit_slab(tsdn_t *tsdn, size_t usize, size_t alignment, + bool zero, bool slab, tcache_t *tcache, arena_t *arena) { + return ipallocztm_explicit_slab(tsdn, usize, alignment, zero, slab, + tcache, false, arena); +} + +JEMALLOC_ALWAYS_INLINE void * +ipalloc(tsd_t *tsd, size_t usize, size_t alignment, bool zero) { + return ipallocztm(tsd_tsdn(tsd), usize, alignment, zero, + tcache_get(tsd), false, NULL); +} + +JEMALLOC_ALWAYS_INLINE size_t +ivsalloc(tsdn_t *tsdn, const void *ptr) { + return arena_vsalloc(tsdn, ptr); +} + +JEMALLOC_ALWAYS_INLINE void +idalloctm(tsdn_t *tsdn, void *ptr, tcache_t *tcache, + emap_alloc_ctx_t *alloc_ctx, bool is_internal, bool slow_path) { + assert(ptr != NULL); + assert(!is_internal || tcache == NULL); + assert(!is_internal || arena_is_auto(iaalloc(tsdn, ptr))); + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + if (config_stats && is_internal) { + arena_internal_sub(iaalloc(tsdn, ptr), isalloc(tsdn, ptr)); + } + if (!is_internal && !tsdn_null(tsdn) && + tsd_reentrancy_level_get(tsdn_tsd(tsdn)) != 0) { + assert(tcache == NULL); + } + arena_dalloc(tsdn, ptr, tcache, alloc_ctx, slow_path); +} + +JEMALLOC_ALWAYS_INLINE void +idalloc(tsd_t *tsd, void *ptr) { + idalloctm(tsd_tsdn(tsd), ptr, tcache_get(tsd), NULL, false, true); +} + +JEMALLOC_ALWAYS_INLINE void +isdalloct(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache, + emap_alloc_ctx_t *alloc_ctx, bool slow_path) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + arena_sdalloc(tsdn, ptr, size, tcache, alloc_ctx, slow_path); +} + +JEMALLOC_ALWAYS_INLINE void * +iralloct_realign(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, + size_t alignment, bool zero, bool slab, tcache_t *tcache, arena_t *arena, + hook_ralloc_args_t *hook_args) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + void *p; + size_t usize, copysize; + + usize = sz_sa2u(size, alignment); + if (unlikely(usize == 0 || usize > SC_LARGE_MAXCLASS)) { + return NULL; + } + p = ipalloct_explicit_slab(tsdn, usize, alignment, zero, slab, + tcache, arena); + if (p == NULL) { + return NULL; + } + /* + * Copy at most size bytes (not size+extra), since the caller has no + * expectation that the extra bytes will be reliably preserved. + */ + copysize = (size < oldsize) ? size : oldsize; + memcpy(p, ptr, copysize); + hook_invoke_alloc(hook_args->is_realloc + ? hook_alloc_realloc : hook_alloc_rallocx, p, (uintptr_t)p, + hook_args->args); + hook_invoke_dalloc(hook_args->is_realloc + ? hook_dalloc_realloc : hook_dalloc_rallocx, ptr, hook_args->args); + isdalloct(tsdn, ptr, oldsize, tcache, NULL, true); + return p; +} + +/* + * is_realloc threads through the knowledge of whether or not this call comes + * from je_realloc (as opposed to je_rallocx); this ensures that we pass the + * correct entry point into any hooks. + * Note that these functions are all force-inlined, so no actual bool gets + * passed-around anywhere. 
+ */ +JEMALLOC_ALWAYS_INLINE void * +iralloct_explicit_slab(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, + size_t alignment, bool zero, bool slab, tcache_t *tcache, arena_t *arena, + hook_ralloc_args_t *hook_args) +{ + assert(ptr != NULL); + assert(size != 0); + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + + if (alignment != 0 && ((uintptr_t)ptr & ((uintptr_t)alignment-1)) + != 0) { + /* + * Existing object alignment is inadequate; allocate new space + * and copy. + */ + return iralloct_realign(tsdn, ptr, oldsize, size, alignment, + zero, slab, tcache, arena, hook_args); + } + + return arena_ralloc(tsdn, arena, ptr, oldsize, size, alignment, zero, + slab, tcache, hook_args); +} + +JEMALLOC_ALWAYS_INLINE void * +iralloct(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t alignment, + size_t usize, bool zero, tcache_t *tcache, arena_t *arena, + hook_ralloc_args_t *hook_args) +{ + bool slab = sz_can_use_slab(usize); + return iralloct_explicit_slab(tsdn, ptr, oldsize, size, alignment, zero, + slab, tcache, arena, hook_args); +} + +JEMALLOC_ALWAYS_INLINE void * +iralloc(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, size_t alignment, + size_t usize, bool zero, hook_ralloc_args_t *hook_args) { + return iralloct(tsd_tsdn(tsd), ptr, oldsize, size, alignment, usize, + zero, tcache_get(tsd), NULL, hook_args); +} + +JEMALLOC_ALWAYS_INLINE bool +ixalloc(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t extra, + size_t alignment, bool zero, size_t *newsize) { + assert(ptr != NULL); + assert(size != 0); + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + + if (alignment != 0 && ((uintptr_t)ptr & ((uintptr_t)alignment-1)) + != 0) { + /* Existing object alignment is inadequate. */ + *newsize = oldsize; + return true; + } + + return arena_ralloc_no_move(tsdn, ptr, oldsize, size, extra, zero, + newsize); +} + +JEMALLOC_ALWAYS_INLINE void +fastpath_success_finish(tsd_t *tsd, uint64_t allocated_after, + cache_bin_t *bin, void *ret) { + thread_allocated_set(tsd, allocated_after); + if (config_stats) { + bin->tstats.nrequests++; + } +} + +JEMALLOC_ALWAYS_INLINE bool +malloc_initialized(void) { + return (malloc_init_state == malloc_init_initialized); +} + +/* + * malloc() fastpath. Included here so that we can inline it into operator new; + * function call overhead there is non-negligible as a fraction of total CPU in + * allocation-heavy C++ programs. We take the fallback alloc to allow malloc + * (which can return NULL) to differ in its behavior from operator new (which + * can't). It matches the signature of malloc / operator new so that we can + * tail-call the fallback allocator, allowing us to avoid setting up the call + * frame in the common case. + * + * Fastpath assumes size <= SC_LOOKUP_MAXCLASS, and that we hit + * tcache. If either of these is false, we tail-call to the slowpath, + * malloc_default(). Tail-calling is used to avoid any caller-saved + * registers. + * + * fastpath supports ticker and profiling, both of which will also + * tail-call to the slowpath if they fire. 
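+ *
+ * A public entry point is then a thin wrapper that tail-calls into the slow
+ * path on failure, along the lines of (sketch, not the literal definition):
+ *
+ *   void *je_malloc(size_t size) {
+ *       return imalloc_fastpath(size, malloc_default);
+ *   }
+ *
+ * with malloc_default() being the slow-path allocator declared in
+ * jemalloc_internal_externs.h.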
+ */ +JEMALLOC_ALWAYS_INLINE void * +imalloc_fastpath(size_t size, void *(fallback_alloc)(size_t)) { + if (tsd_get_allocates() && unlikely(!malloc_initialized())) { + return fallback_alloc(size); + } + + tsd_t *tsd = tsd_get(false); + if (unlikely((size > SC_LOOKUP_MAXCLASS) || tsd == NULL)) { + return fallback_alloc(size); + } + /* + * The code below, until the branch checking the next_event threshold, may + * execute before malloc_init(), in which case the threshold is 0 to + * trigger slow path and initialization. + * + * Note that when uninitialized, only the fast-path variants of the sz / + * tsd facilities may be called. + */ + szind_t ind; + /* + * The thread_allocated counter in tsd serves as a general purpose + * accumulator for bytes of allocation to trigger different types of + * events. usize is always needed to advance thread_allocated, though + * it's not always needed in the core allocation logic. + */ + size_t usize; + sz_size2index_usize_fastpath(size, &ind, &usize); + /* Fast path relies on size being a bin. */ + assert(ind < SC_NBINS); + assert((SC_LOOKUP_MAXCLASS < SC_SMALL_MAXCLASS) && + (size <= SC_SMALL_MAXCLASS)); + + uint64_t allocated, threshold; + te_malloc_fastpath_ctx(tsd, &allocated, &threshold); + uint64_t allocated_after = allocated + usize; + /* + * The ind and usize might be uninitialized (or only partially + * initialized) before malloc_init(). The assertions check for: 1) full + * correctness (usize & ind) when initialized; and 2) guaranteed + * slow-path (threshold == 0) when !initialized. + */ + if (!malloc_initialized()) { + assert(threshold == 0); + } else { + assert(ind == sz_size2index(size)); + assert(usize > 0 && usize == sz_index2size(ind)); + } + /* + * Check for events and tsd non-nominal (fast_threshold will be set to + * 0) in a single branch. + */ + if (unlikely(allocated_after >= threshold)) { + return fallback_alloc(size); + } + assert(tsd_fast(tsd)); + + tcache_t *tcache = tsd_tcachep_get(tsd); + assert(tcache == tcache_get(tsd)); + cache_bin_t *bin = &tcache->bins[ind]; + /* Suppress spurious warning from static analysis */ + assert(bin != NULL); + bool tcache_success; + void *ret; + + /* + * We split up the code this way so that redundant low-water + * computation doesn't happen on the (more common) case in which we + * don't touch the low water mark. The compiler won't do this + * duplication on its own. + */ + ret = cache_bin_alloc_easy(bin, &tcache_success); + if (tcache_success) { + fastpath_success_finish(tsd, allocated_after, bin, ret); + return ret; + } + ret = cache_bin_alloc(bin, &tcache_success); + if (tcache_success) { + fastpath_success_finish(tsd, allocated_after, bin, ret); + return ret; + } + + return fallback_alloc(size); +} + +JEMALLOC_ALWAYS_INLINE tcache_t * +tcache_get_from_ind(tsd_t *tsd, unsigned tcache_ind, bool slow, bool is_alloc) { + tcache_t *tcache; + if (tcache_ind == TCACHE_IND_AUTOMATIC) { + if (likely(!slow)) { + /* Getting tcache ptr unconditionally. */ + tcache = tsd_tcachep_get(tsd); + assert(tcache == tcache_get(tsd)); + } else if (is_alloc || + likely(tsd_reentrancy_level_get(tsd) == 0)) { + tcache = tcache_get(tsd); + } else { + tcache = NULL; + } + } else { + /* + * Should not specify tcache on deallocation path when being + * reentrant.
+ */ + assert(is_alloc || tsd_reentrancy_level_get(tsd) == 0 || + tsd_state_nocleanup(tsd)); + if (tcache_ind == TCACHE_IND_NONE) { + tcache = NULL; + } else { + tcache = tcaches_get(tsd, tcache_ind); + } + } + return tcache; +} + +JEMALLOC_ALWAYS_INLINE bool +maybe_check_alloc_ctx(tsd_t *tsd, void *ptr, emap_alloc_ctx_t *alloc_ctx) { + if (config_opt_size_checks) { + emap_alloc_ctx_t dbg_ctx; + emap_alloc_ctx_lookup(tsd_tsdn(tsd), &arena_emap_global, ptr, + &dbg_ctx); + if (alloc_ctx->szind != dbg_ctx.szind) { + safety_check_fail_sized_dealloc( + /* current_dealloc */ true, ptr, + /* true_size */ sz_index2size(dbg_ctx.szind), + /* input_size */ sz_index2size(alloc_ctx->szind)); + return true; + } + if (alloc_ctx->slab != dbg_ctx.slab) { + safety_check_fail( + "Internal heap corruption detected: " + "mismatch in slab bit"); + return true; + } + } + return false; +} + +JEMALLOC_ALWAYS_INLINE bool +prof_sample_aligned(const void *ptr) { + return ((uintptr_t)ptr & PROF_SAMPLE_ALIGNMENT_MASK) == 0; +} + +JEMALLOC_ALWAYS_INLINE bool +free_fastpath_nonfast_aligned(void *ptr, bool check_prof) { + /* + * free_fastpath does not handle two uncommon cases: 1) sampled profiled + * objects and 2) sampled junk & stash for use-after-free detection. + * Both have special alignments which are used to escape the fastpath. + * + * prof_sample is page-aligned, which covers the UAF check when both + * are enabled (the assertion below). Avoiding redundant checks since + * this is on the fastpath -- at most one runtime branch from this. + */ + if (config_debug && cache_bin_nonfast_aligned(ptr)) { + assert(prof_sample_aligned(ptr)); + } + + if (config_prof && check_prof) { + /* When prof is enabled, the prof_sample alignment is enough. */ + if (prof_sample_aligned(ptr)) { + return true; + } else { + return false; + } + } + + if (config_uaf_detection) { + if (cache_bin_nonfast_aligned(ptr)) { + return true; + } else { + return false; + } + } + + return false; +} + +/* Returns whether or not the free attempt was successful. */ +JEMALLOC_ALWAYS_INLINE +bool free_fastpath(void *ptr, size_t size, bool size_hint) { + tsd_t *tsd = tsd_get(false); + /* The branch gets optimized away unless tsd_get_allocates(). */ + if (unlikely(tsd == NULL)) { + return false; + } + /* + * The tsd_fast() / initialized checks are folded into the branch + * testing (deallocated_after >= threshold) later in this function. + * The threshold will be set to 0 when !tsd_fast. + */ + assert(tsd_fast(tsd) || + *tsd_thread_deallocated_next_event_fastp_get_unsafe(tsd) == 0); + + emap_alloc_ctx_t alloc_ctx; + if (!size_hint) { + bool err = emap_alloc_ctx_try_lookup_fast(tsd, + &arena_emap_global, ptr, &alloc_ctx); + + /* Note: profiled objects will have alloc_ctx.slab set */ + if (unlikely(err || !alloc_ctx.slab || + free_fastpath_nonfast_aligned(ptr, + /* check_prof */ false))) { + return false; + } + assert(alloc_ctx.szind != SC_NSIZES); + } else { + /* + * Check for both sizes that are too large, and for sampled / + * special aligned objects. The alignment check will also check + * for null ptr. + */ + if (unlikely(size > SC_LOOKUP_MAXCLASS || + free_fastpath_nonfast_aligned(ptr, + /* check_prof */ true))) { + return false; + } + alloc_ctx.szind = sz_size2index_lookup(size); + /* Max lookup class must be small. */ + assert(alloc_ctx.szind < SC_NBINS); + /* This is a dead store, except when opt size checking is on. */ + alloc_ctx.slab = true; + } + /* + * Currently the fastpath only handles small sizes.
The branch on + * SC_LOOKUP_MAXCLASS makes sure of it. This lets us avoid checking + * tcache szind upper limit (i.e. tcache_max) as well. + */ + assert(alloc_ctx.slab); + + uint64_t deallocated, threshold; + te_free_fastpath_ctx(tsd, &deallocated, &threshold); + + size_t usize = sz_index2size(alloc_ctx.szind); + uint64_t deallocated_after = deallocated + usize; + /* + * Check for events and tsd non-nominal (fast_threshold will be set to + * 0) in a single branch. Note that this handles the uninitialized case + * as well (TSD init will be triggered on the non-fastpath). Therefore + * anything that depends on a functional TSD (e.g. the alloc_ctx sanity + * check below) needs to be after this branch. + */ + if (unlikely(deallocated_after >= threshold)) { + return false; + } + assert(tsd_fast(tsd)); + bool fail = maybe_check_alloc_ctx(tsd, ptr, &alloc_ctx); + if (fail) { + /* See the comment in isfree. */ + return true; + } + + tcache_t *tcache = tcache_get_from_ind(tsd, TCACHE_IND_AUTOMATIC, + /* slow */ false, /* is_alloc */ false); + cache_bin_t *bin = &tcache->bins[alloc_ctx.szind]; + + /* + * If junking were enabled, this is where we would do it. It's not + * though, since we ensured above that we're on the fast path. Assert + * that to double-check. + */ + assert(!opt_junk_free); + + if (!cache_bin_dalloc_easy(bin, ptr)) { + return false; + } + + *tsd_thread_deallocatedp_get(tsd) = deallocated_after; + + return true; +} + +JEMALLOC_ALWAYS_INLINE void JEMALLOC_NOTHROW +je_sdallocx_noflags(void *ptr, size_t size) { + if (!free_fastpath(ptr, size, true)) { + sdallocx_default(ptr, size, 0); + } +} + +JEMALLOC_ALWAYS_INLINE void JEMALLOC_NOTHROW +je_sdallocx_impl(void *ptr, size_t size, int flags) { + if (flags != 0 || !free_fastpath(ptr, size, true)) { + sdallocx_default(ptr, size, flags); + } +} + +JEMALLOC_ALWAYS_INLINE void JEMALLOC_NOTHROW +je_free_impl(void *ptr) { + if (!free_fastpath(ptr, 0, false)) { + free_default(ptr); + } +} + +#endif /* JEMALLOC_INTERNAL_INLINES_C_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/jemalloc_internal_macros.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/jemalloc_internal_macros.h new file mode 100644 index 000000000..407e868ae --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/jemalloc_internal_macros.h @@ -0,0 +1,143 @@ +#ifndef JEMALLOC_INTERNAL_MACROS_H +#define JEMALLOC_INTERNAL_MACROS_H + +#ifdef JEMALLOC_DEBUG +# define JEMALLOC_ALWAYS_INLINE static inline +#else +# ifdef _MSC_VER +# define JEMALLOC_ALWAYS_INLINE static __forceinline +# else +# define JEMALLOC_ALWAYS_INLINE JEMALLOC_ATTR(always_inline) static inline +# endif +#endif +#ifdef _MSC_VER +# define inline _inline +#endif + +#define UNUSED JEMALLOC_ATTR(unused) + +#define ZU(z) ((size_t)z) +#define ZD(z) ((ssize_t)z) +#define QU(q) ((uint64_t)q) +#define QD(q) ((int64_t)q) + +#define KZU(z) ZU(z##ULL) +#define KZD(z) ZD(z##LL) +#define KQU(q) QU(q##ULL) +#define KQD(q) QD(q##LL) + +#ifndef __DECONST +# define __DECONST(type, var) ((type)(uintptr_t)(const void *)(var)) +#endif + +#if !defined(JEMALLOC_HAS_RESTRICT) || defined(__cplusplus) +# define restrict +#endif + +/* Various function pointers are static and immutable except during testing. */ +#ifdef JEMALLOC_JET +# define JET_MUTABLE +# define JET_EXTERN extern +#else +# define JET_MUTABLE const +# define JET_EXTERN static +#endif + +#define JEMALLOC_VA_ARGS_HEAD(head, ...) head +#define JEMALLOC_VA_ARGS_TAIL(head, ...)
__VA_ARGS__ + +/* Diagnostic suppression macros */ +#if defined(_MSC_VER) && !defined(__clang__) +# define JEMALLOC_DIAGNOSTIC_PUSH __pragma(warning(push)) +# define JEMALLOC_DIAGNOSTIC_POP __pragma(warning(pop)) +# define JEMALLOC_DIAGNOSTIC_IGNORE(W) __pragma(warning(disable:W)) +# define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS +# define JEMALLOC_DIAGNOSTIC_IGNORE_FRAME_ADDRESS +# define JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS +# define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN +# define JEMALLOC_DIAGNOSTIC_IGNORE_DEPRECATED +# define JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS +/* #pragma GCC diagnostic first appeared in gcc 4.6. */ +#elif (defined(__GNUC__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && \ + (__GNUC_MINOR__ > 5)))) || defined(__clang__) +/* + * The JEMALLOC_PRAGMA__ macro is an implementation detail of the GCC and Clang + * diagnostic suppression macros and should not be used anywhere else. + */ +# define JEMALLOC_PRAGMA__(X) _Pragma(#X) +# define JEMALLOC_DIAGNOSTIC_PUSH JEMALLOC_PRAGMA__(GCC diagnostic push) +# define JEMALLOC_DIAGNOSTIC_POP JEMALLOC_PRAGMA__(GCC diagnostic pop) +# define JEMALLOC_DIAGNOSTIC_IGNORE(W) \ + JEMALLOC_PRAGMA__(GCC diagnostic ignored W) + +/* + * The -Wmissing-field-initializers warning is buggy in GCC versions < 5.1 and + * all clang versions up to version 7 (currently trunk, unreleased). This macro + * suppresses the warning for the affected compiler versions only. + */ +# if ((defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ < 5)) || \ + defined(__clang__) +# define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS \ + JEMALLOC_DIAGNOSTIC_IGNORE("-Wmissing-field-initializers") +# else +# define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS +# endif + +# define JEMALLOC_DIAGNOSTIC_IGNORE_FRAME_ADDRESS \ + JEMALLOC_DIAGNOSTIC_IGNORE("-Wframe-address") +# define JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS \ + JEMALLOC_DIAGNOSTIC_IGNORE("-Wtype-limits") +# define JEMALLOC_DIAGNOSTIC_IGNORE_UNUSED_PARAMETER \ + JEMALLOC_DIAGNOSTIC_IGNORE("-Wunused-parameter") +# if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ >= 7) +# define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN \ + JEMALLOC_DIAGNOSTIC_IGNORE("-Walloc-size-larger-than=") +# else +# define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN +# endif +# ifdef JEMALLOC_HAVE_ATTR_DEPRECATED +# define JEMALLOC_DIAGNOSTIC_IGNORE_DEPRECATED \ + JEMALLOC_DIAGNOSTIC_IGNORE("-Wdeprecated-declarations") +# else +# define JEMALLOC_DIAGNOSTIC_IGNORE_DEPRECATED +# endif +# define JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS \ + JEMALLOC_DIAGNOSTIC_PUSH \ + JEMALLOC_DIAGNOSTIC_IGNORE_UNUSED_PARAMETER +#else +# define JEMALLOC_DIAGNOSTIC_PUSH +# define JEMALLOC_DIAGNOSTIC_POP +# define JEMALLOC_DIAGNOSTIC_IGNORE(W) +# define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS +# define JEMALLOC_DIAGNOSTIC_IGNORE_FRAME_ADDRESS +# define JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS +# define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN +# define JEMALLOC_DIAGNOSTIC_IGNORE_DEPRECATED +# define JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS +#endif + +#ifdef __clang_analyzer__ +# define JEMALLOC_CLANG_ANALYZER +#endif + +#ifdef JEMALLOC_CLANG_ANALYZER +# define JEMALLOC_CLANG_ANALYZER_SUPPRESS __attribute__((suppress)) +# define JEMALLOC_CLANG_ANALYZER_SILENCE_INIT(v) = v +#else +# define JEMALLOC_CLANG_ANALYZER_SUPPRESS +# define JEMALLOC_CLANG_ANALYZER_SILENCE_INIT(v) +#endif + +#define JEMALLOC_SUPPRESS_WARN_ON_USAGE(...) 
\ + JEMALLOC_DIAGNOSTIC_PUSH \ + JEMALLOC_DIAGNOSTIC_IGNORE_DEPRECATED \ + __VA_ARGS__ \ + JEMALLOC_DIAGNOSTIC_POP + +/* + * Disables spurious diagnostics for all headers. Since these headers are not + * included by users directly, it does not affect their diagnostic settings. + */ +JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS + +#endif /* JEMALLOC_INTERNAL_MACROS_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/jemalloc_internal_overrides.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/jemalloc_internal_overrides.h new file mode 100644 index 000000000..5fbbe2495 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/jemalloc_internal_overrides.h @@ -0,0 +1,21 @@ +#ifndef JEMALLOC_INTERNAL_OVERRIDES_H +#define JEMALLOC_INTERNAL_OVERRIDES_H + +/* + * Under normal circumstances this header serves no purpose, as these settings + * can be customized via the corresponding autoconf options at configure-time. + * Overriding in this fashion is useful when the header files generated by + * autoconf are used as input for another build system. + */ + +#ifdef JEMALLOC_OVERRIDE_LG_PAGE + #undef LG_PAGE + #define LG_PAGE JEMALLOC_OVERRIDE_LG_PAGE +#endif + +#ifdef JEMALLOC_OVERRIDE_JEMALLOC_CONFIG_MALLOC_CONF + #undef JEMALLOC_CONFIG_MALLOC_CONF + #define JEMALLOC_CONFIG_MALLOC_CONF JEMALLOC_OVERRIDE_JEMALLOC_CONFIG_MALLOC_CONF +#endif + +#endif /* JEMALLOC_INTERNAL_OVERRIDES_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/jemalloc_internal_types.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/jemalloc_internal_types.h new file mode 100644 index 000000000..6a81f3cdc --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/jemalloc_internal_types.h @@ -0,0 +1,148 @@ +#ifndef JEMALLOC_INTERNAL_TYPES_H +#define JEMALLOC_INTERNAL_TYPES_H + +#include "jemalloc/internal/quantum.h" + +/* Processor / core id type. */ +typedef int malloc_cpuid_t; + +/* When realloc(non-null-ptr, 0) is called, what happens? */ +enum zero_realloc_action_e { + /* Realloc(ptr, 0) is free(ptr); return malloc(0); */ + zero_realloc_action_alloc = 0, + /* Realloc(ptr, 0) is free(ptr); */ + zero_realloc_action_free = 1, + /* Realloc(ptr, 0) aborts. */ + zero_realloc_action_abort = 2 +}; +typedef enum zero_realloc_action_e zero_realloc_action_t; + +/* Signature of write callback. */ +typedef void (write_cb_t)(void *, const char *); + +enum malloc_init_e { + malloc_init_uninitialized = 3, + malloc_init_a0_initialized = 2, + malloc_init_recursible = 1, + malloc_init_initialized = 0 /* Common case --> jnz. */ +}; +typedef enum malloc_init_e malloc_init_t; + +/* + * Flags bits: + * + * a: arena + * t: tcache + * 0: unused + * z: zero + * n: alignment + * + * aaaaaaaa aaaatttt tttttttt 0znnnnnn + */ +#define MALLOCX_ARENA_BITS 12 +#define MALLOCX_TCACHE_BITS 12 +#define MALLOCX_LG_ALIGN_BITS 6 +#define MALLOCX_ARENA_SHIFT 20 +#define MALLOCX_TCACHE_SHIFT 8 +#define MALLOCX_ARENA_MASK \ + ((unsigned)(((1U << MALLOCX_ARENA_BITS) - 1) << MALLOCX_ARENA_SHIFT)) +/* NB: Arena index bias decreases the maximum number of arenas by 1. 
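+ * Worked example: the public MALLOCX_ARENA(a) macro encodes arena index a as
+ * (a + 1) << MALLOCX_ARENA_SHIFT, so arena 3 is stored as 4 << 20 == 0x400000
+ * and MALLOCX_ARENA_GET() below recovers (0x400000 >> 20) - 1 == 3; a flags
+ * word with no arena bits decodes to (unsigned)-1, i.e. ARENA_IND_AUTOMATIC.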
*/ +#define MALLOCX_ARENA_LIMIT ((unsigned)((1U << MALLOCX_ARENA_BITS) - 1)) +#define MALLOCX_TCACHE_MASK \ + ((unsigned)(((1U << MALLOCX_TCACHE_BITS) - 1) << MALLOCX_TCACHE_SHIFT)) +#define MALLOCX_TCACHE_MAX ((unsigned)((1U << MALLOCX_TCACHE_BITS) - 3)) +#define MALLOCX_LG_ALIGN_MASK ((1 << MALLOCX_LG_ALIGN_BITS) - 1) +/* Use MALLOCX_ALIGN_GET() if alignment may not be specified in flags. */ +#define MALLOCX_ALIGN_GET_SPECIFIED(flags) \ + (ZU(1) << (flags & MALLOCX_LG_ALIGN_MASK)) +#define MALLOCX_ALIGN_GET(flags) \ + (MALLOCX_ALIGN_GET_SPECIFIED(flags) & (SIZE_T_MAX-1)) +#define MALLOCX_ZERO_GET(flags) \ + ((bool)(flags & MALLOCX_ZERO)) + +#define MALLOCX_TCACHE_GET(flags) \ + (((unsigned)((flags & MALLOCX_TCACHE_MASK) >> MALLOCX_TCACHE_SHIFT)) - 2) +#define MALLOCX_ARENA_GET(flags) \ + (((unsigned)(((unsigned)flags) >> MALLOCX_ARENA_SHIFT)) - 1) + +/* Smallest size class to support. */ +#define TINY_MIN (1U << LG_TINY_MIN) + +#define LONG ((size_t)(1U << LG_SIZEOF_LONG)) +#define LONG_MASK (LONG - 1) + +/* Return the smallest long multiple that is >= a. */ +#define LONG_CEILING(a) \ + (((a) + LONG_MASK) & ~LONG_MASK) + +#define SIZEOF_PTR (1U << LG_SIZEOF_PTR) +#define PTR_MASK (SIZEOF_PTR - 1) + +/* Return the smallest (void *) multiple that is >= a. */ +#define PTR_CEILING(a) \ + (((a) + PTR_MASK) & ~PTR_MASK) + +/* + * Maximum size of L1 cache line. This is used to avoid cache line aliasing. + * In addition, this controls the spacing of cacheline-spaced size classes. + * + * CACHELINE cannot be based on LG_CACHELINE because __declspec(align()) can + * only handle raw constants. + */ +#define LG_CACHELINE 6 +#define CACHELINE 64 +#define CACHELINE_MASK (CACHELINE - 1) + +/* Return the smallest cacheline multiple that is >= s. */ +#define CACHELINE_CEILING(s) \ + (((s) + CACHELINE_MASK) & ~CACHELINE_MASK) + +/* Return the nearest aligned address at or below a. */ +#define ALIGNMENT_ADDR2BASE(a, alignment) \ + ((void *)(((byte_t *)(a)) - (((uintptr_t)(a)) - \ + ((uintptr_t)(a) & ((~(alignment)) + 1))))) + +/* Return the offset between a and the nearest aligned address at or below a. */ +#define ALIGNMENT_ADDR2OFFSET(a, alignment) \ + ((size_t)((uintptr_t)(a) & (alignment - 1))) + +/* Return the smallest alignment multiple that is >= s. */ +#define ALIGNMENT_CEILING(s, alignment) \ + (((s) + (alignment - 1)) & ((~(alignment)) + 1)) + +/* + * Return the nearest aligned address at or above a. + * + * While at first glance this would appear to be merely a more complicated + * way to perform the same computation as `ALIGNMENT_CEILING`, + * this has the important additional property of not concealing pointer + * provenance from the compiler. See the block-comment on the + * definition of `byte_t` for more details. + */ +#define ALIGNMENT_ADDR2CEILING(a, alignment) \ + ((void *)(((byte_t *)(a)) + (((((uintptr_t)(a)) + \ + (alignment - 1)) & ((~(alignment)) + 1)) - ((uintptr_t)(a))))) + +/* Declare a variable-length array. 
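+ * Usage sketch: VARIABLE_ARRAY(uint64_t, ids, n) expands to a C99 VLA
+ * (uint64_t ids[n]) where available, and to an alloca() allocation otherwise;
+ * the assert in VARIABLE_ARRAY bounds the stack usage by
+ * VARIABLE_ARRAY_SIZE_MAX bytes.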
*/ +#if __STDC_VERSION__ < 199901L || defined(__STDC_NO_VLA__) +# ifdef _MSC_VER +# include <malloc.h> +# define alloca _alloca +# else +# ifdef JEMALLOC_HAS_ALLOCA_H +# include <alloca.h> +# else +# include <stdlib.h> +# endif +# endif +# define VARIABLE_ARRAY_UNSAFE(type, name, count) \ + type *name = alloca(sizeof(type) * (count)) +#else +# define VARIABLE_ARRAY_UNSAFE(type, name, count) type name[(count)] +#endif +#define VARIABLE_ARRAY_SIZE_MAX 2048 +#define VARIABLE_ARRAY(type, name, count) \ + assert(sizeof(type) * (count) <= VARIABLE_ARRAY_SIZE_MAX); \ + VARIABLE_ARRAY_UNSAFE(type, name, count) + +#endif /* JEMALLOC_INTERNAL_TYPES_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/jemalloc_preamble.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/jemalloc_preamble.h new file mode 100644 index 000000000..3f93c014d --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/jemalloc_preamble.h @@ -0,0 +1,263 @@ +#ifndef JEMALLOC_PREAMBLE_H +#define JEMALLOC_PREAMBLE_H + +#include "jemalloc/internal/jemalloc_internal_defs.h" +#include "jemalloc/internal/jemalloc_internal_decls.h" + +#if defined(JEMALLOC_UTRACE) || defined(JEMALLOC_UTRACE_LABEL) +#include <sys/ktrace.h> +# if defined(JEMALLOC_UTRACE) +# define UTRACE_CALL(p, l) utrace(p, l) +# else +# define UTRACE_CALL(p, l) utrace("jemalloc_process", p, l) +# define JEMALLOC_UTRACE +# endif +#endif + +#define JEMALLOC_NO_DEMANGLE +#ifdef JEMALLOC_JET +# undef JEMALLOC_IS_MALLOC +# define JEMALLOC_N(n) jet_##n +# include "jemalloc/internal/public_namespace.h" +# define JEMALLOC_NO_RENAME +# include "../jemalloc.h" +# undef JEMALLOC_NO_RENAME +#else +# define JEMALLOC_N(n) duckdb_je_##n +# include "../jemalloc.h" +#endif + +#if defined(JEMALLOC_OSATOMIC) +#include <libkern/OSAtomic.h> +#endif + +#ifdef JEMALLOC_ZONE +#include <mach/mach_error.h> +#include <mach/mach_init.h> +#include <mach/vm_map.h> +#endif + +#include "jemalloc/internal/jemalloc_internal_macros.h" + +/* + * Note that the ordering matters here; the hook itself is name-mangled. We + * want the inclusion of hooks to happen early, so that we hook as much as + * possible.
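+ * Name mangling here refers to the JEMALLOC_N() wrapper above: in this build
+ * JEMALLOC_N(arena_init) expands to duckdb_je_arena_init, which keeps the
+ * vendored symbols from colliding with any system jemalloc.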
+ */ +#ifndef JEMALLOC_NO_PRIVATE_NAMESPACE +# ifndef JEMALLOC_JET +# include "jemalloc/internal/private_namespace.h" +# else +# include "jemalloc/internal/private_namespace_jet.h" +# endif +#endif +#include "jemalloc/internal/test_hooks.h" + +#ifdef JEMALLOC_DEFINE_MADVISE_FREE +# define JEMALLOC_MADV_FREE 8 +#endif + +static const bool config_debug = +#ifdef JEMALLOC_DEBUG + true +#else + false +#endif + ; +static const bool have_dss = +#ifdef JEMALLOC_DSS + true +#else + false +#endif + ; +static const bool have_madvise_huge = +#ifdef JEMALLOC_HAVE_MADVISE_HUGE + true +#else + false +#endif + ; +static const bool config_fill = +#ifdef JEMALLOC_FILL + true +#else + false +#endif + ; +static const bool config_lazy_lock = +#ifdef JEMALLOC_LAZY_LOCK + true +#else + false +#endif + ; +static const char * const config_malloc_conf = JEMALLOC_CONFIG_MALLOC_CONF; +static const bool config_prof = +#ifdef JEMALLOC_PROF + true +#else + false +#endif + ; +static const bool config_prof_libgcc = +#ifdef JEMALLOC_PROF_LIBGCC + true +#else + false +#endif + ; +static const bool config_prof_libunwind = +#ifdef JEMALLOC_PROF_LIBUNWIND + true +#else + false +#endif + ; +static const bool maps_coalesce = +#ifdef JEMALLOC_MAPS_COALESCE + true +#else + false +#endif + ; +static const bool config_stats = +#ifdef JEMALLOC_STATS + true +#else + false +#endif + ; +static const bool config_tls = +#ifdef JEMALLOC_TLS + true +#else + false +#endif + ; +static const bool config_utrace = +#ifdef JEMALLOC_UTRACE + true +#else + false +#endif + ; +static const bool config_xmalloc = +#ifdef JEMALLOC_XMALLOC + true +#else + false +#endif + ; +static const bool config_cache_oblivious = +#ifdef JEMALLOC_CACHE_OBLIVIOUS + true +#else + false +#endif + ; +/* + * Undocumented, for jemalloc development use only at the moment. See the note + * in jemalloc/internal/log.h. + */ +static const bool config_log = +#ifdef JEMALLOC_LOG + true +#else + false +#endif + ; +/* + * Are extra safety checks enabled; things like checking the size of sized + * deallocations, double-frees, etc. + */ +static const bool config_opt_safety_checks = +#ifdef JEMALLOC_OPT_SAFETY_CHECKS + true +#elif defined(JEMALLOC_DEBUG) + /* + * This lets us only guard safety checks by one flag instead of two; fast + * checks can guard solely by config_opt_safety_checks and run in debug mode + * too. + */ + true +#else + false +#endif + ; + +/* + * Extra debugging of sized deallocations too onerous to be included in the + * general safety checks. + */ +static const bool config_opt_size_checks = +#if defined(JEMALLOC_OPT_SIZE_CHECKS) || defined(JEMALLOC_DEBUG) + true +#else + false +#endif + ; + +static const bool config_uaf_detection = +#if defined(JEMALLOC_UAF_DETECTION) || defined(JEMALLOC_DEBUG) + true +#else + false +#endif + ; + +/* Whether or not the C++ extensions are enabled. */ +static const bool config_enable_cxx = +#ifdef JEMALLOC_ENABLE_CXX + true +#else + false +#endif +; + +#if defined(_WIN32) || defined(__APPLE__) || defined(JEMALLOC_HAVE_SCHED_GETCPU) +/* Currently percpu_arena depends on sched_getcpu. */ +#define JEMALLOC_PERCPU_ARENA +#endif +static const bool have_percpu_arena = +#ifdef JEMALLOC_PERCPU_ARENA + true +#else + false +#endif + ; +/* + * Undocumented, and not recommended; the application should take full + * responsibility for tracking provenance. 
+ */ +static const bool force_ivsalloc = +#ifdef JEMALLOC_FORCE_IVSALLOC + true +#else + false +#endif + ; +static const bool have_background_thread = +#ifdef JEMALLOC_BACKGROUND_THREAD + true +#else + false +#endif + ; +static const bool config_high_res_timer = +#ifdef JEMALLOC_HAVE_CLOCK_REALTIME + true +#else + false +#endif + ; + +static const bool have_memcntl = +#ifdef JEMALLOC_HAVE_MEMCNTL + true +#else + false +#endif + ; + +#endif /* JEMALLOC_PREAMBLE_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/large_externs.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/large_externs.h new file mode 100644 index 000000000..ce9c86894 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/large_externs.h @@ -0,0 +1,26 @@ +#ifndef JEMALLOC_INTERNAL_LARGE_EXTERNS_H +#define JEMALLOC_INTERNAL_LARGE_EXTERNS_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/edata.h" +#include "jemalloc/internal/hook.h" + +void *large_malloc(tsdn_t *tsdn, arena_t *arena, size_t usize, bool zero); +void *large_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment, + bool zero); +bool large_ralloc_no_move(tsdn_t *tsdn, edata_t *edata, size_t usize_min, + size_t usize_max, bool zero); +void *large_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t usize, + size_t alignment, bool zero, tcache_t *tcache, + hook_ralloc_args_t *hook_args); + +void large_dalloc_prep_locked(tsdn_t *tsdn, edata_t *edata); +void large_dalloc_finish(tsdn_t *tsdn, edata_t *edata); +void large_dalloc(tsdn_t *tsdn, edata_t *edata); +size_t large_salloc(tsdn_t *tsdn, const edata_t *edata); +void large_prof_info_get(tsd_t *tsd, edata_t *edata, prof_info_t *prof_info, + bool reset_recent); +void large_prof_tctx_reset(edata_t *edata); +void large_prof_info_set(edata_t *edata, prof_tctx_t *tctx, size_t size); + +#endif /* JEMALLOC_INTERNAL_LARGE_EXTERNS_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/lockedint.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/lockedint.h new file mode 100644 index 000000000..062dedbfd --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/lockedint.h @@ -0,0 +1,209 @@ +#ifndef JEMALLOC_INTERNAL_LOCKEDINT_H +#define JEMALLOC_INTERNAL_LOCKEDINT_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/tsd_types.h" + +/* + * In those architectures that support 64-bit atomics, we use atomic updates for + * our 64-bit values. Otherwise, we use a plain uint64_t and synchronize + * externally. + */ + +typedef struct locked_u64_s locked_u64_t; +#ifdef JEMALLOC_ATOMIC_U64 +struct locked_u64_s { + atomic_u64_t val; +}; +#else +/* Must hold the associated mutex. 
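+ * Illustrative call pattern for this fallback (names hypothetical):
+ *
+ *   LOCKEDINT_MTX_LOCK(tsdn, stats_mtx);
+ *   locked_inc_u64(tsdn, LOCKEDINT_MTX(stats_mtx), &nmalloc, 1);
+ *   LOCKEDINT_MTX_UNLOCK(tsdn, stats_mtx);
+ *
+ * With JEMALLOC_ATOMIC_U64 defined, the same calls compile down to relaxed
+ * atomics and the mutex macros expand to nothing.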
*/
+struct locked_u64_s {
+ uint64_t val;
+};
+#endif
+
+typedef struct locked_zu_s locked_zu_t;
+struct locked_zu_s {
+ atomic_zu_t val;
+};
+
+#ifndef JEMALLOC_ATOMIC_U64
+# define LOCKEDINT_MTX_DECLARE(name) malloc_mutex_t name;
+# define LOCKEDINT_MTX_INIT(mu, name, rank, rank_mode) \
+ malloc_mutex_init(&(mu), name, rank, rank_mode)
+# define LOCKEDINT_MTX(mtx) (&(mtx))
+# define LOCKEDINT_MTX_LOCK(tsdn, mu) malloc_mutex_lock(tsdn, &(mu))
+# define LOCKEDINT_MTX_UNLOCK(tsdn, mu) malloc_mutex_unlock(tsdn, &(mu))
+# define LOCKEDINT_MTX_PREFORK(tsdn, mu) malloc_mutex_prefork(tsdn, &(mu))
+# define LOCKEDINT_MTX_POSTFORK_PARENT(tsdn, mu) \
+ malloc_mutex_postfork_parent(tsdn, &(mu))
+# define LOCKEDINT_MTX_POSTFORK_CHILD(tsdn, mu) \
+ malloc_mutex_postfork_child(tsdn, &(mu))
+#else
+# define LOCKEDINT_MTX_DECLARE(name)
+# define LOCKEDINT_MTX(mtx) NULL
+# define LOCKEDINT_MTX_INIT(mu, name, rank, rank_mode) false
+# define LOCKEDINT_MTX_LOCK(tsdn, mu)
+# define LOCKEDINT_MTX_UNLOCK(tsdn, mu)
+# define LOCKEDINT_MTX_PREFORK(tsdn, mu)
+# define LOCKEDINT_MTX_POSTFORK_PARENT(tsdn, mu)
+# define LOCKEDINT_MTX_POSTFORK_CHILD(tsdn, mu)
+#endif
+
+#ifdef JEMALLOC_ATOMIC_U64
+# define LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx) assert((mtx) == NULL)
+#else
+# define LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx) \
+ malloc_mutex_assert_owner(tsdn, (mtx))
+#endif
+
+static inline uint64_t
+locked_read_u64(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_u64_t *p) {
+ LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx);
+#ifdef JEMALLOC_ATOMIC_U64
+ return atomic_load_u64(&p->val, ATOMIC_RELAXED);
+#else
+ return p->val;
+#endif
+}
+
+static inline void
+locked_inc_u64(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_u64_t *p,
+ uint64_t x) {
+ LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx);
+#ifdef JEMALLOC_ATOMIC_U64
+ atomic_fetch_add_u64(&p->val, x, ATOMIC_RELAXED);
+#else
+ p->val += x;
+#endif
+}
+
+static inline void
+locked_dec_u64(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_u64_t *p,
+ uint64_t x) {
+ LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx);
+#ifdef JEMALLOC_ATOMIC_U64
+ uint64_t r = atomic_fetch_sub_u64(&p->val, x, ATOMIC_RELAXED);
+ assert(r - x <= r);
+#else
+ p->val -= x;
+ assert(p->val + x >= p->val);
+#endif
+}
+
+/* Increment and take modulus. Returns whether the modulo made any change. */
+static inline bool
+locked_inc_mod_u64(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_u64_t *p,
+ const uint64_t x, const uint64_t modulus) {
+ LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx);
+ uint64_t before, after;
+ bool overflow;
+#ifdef JEMALLOC_ATOMIC_U64
+ before = atomic_load_u64(&p->val, ATOMIC_RELAXED);
+ do {
+ after = before + x;
+ assert(after >= before);
+ overflow = (after >= modulus);
+ if (overflow) {
+ after %= modulus;
+ }
+ } while (!atomic_compare_exchange_weak_u64(&p->val, &before, after,
+ ATOMIC_RELAXED, ATOMIC_RELAXED));
+#else
+ before = p->val;
+ after = before + x;
+ overflow = (after >= modulus);
+ if (overflow) {
+ after %= modulus;
+ }
+ p->val = after;
+#endif
+ return overflow;
+}
+
+/*
+ * Non-atomically sets *dst += src. *dst needs external synchronization.
+ * This lets us avoid the cost of a fetch_add when it's unnecessary (note that
+ * the types here are atomic).
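+ *
+ * Illustrative sketch (merged and thread_total are hypothetical locals, not
+ * part of this API): accumulating into a destination that only the calling
+ * thread writes, so no fetch_add is needed:
+ *
+ *   locked_u64_t merged;
+ *   locked_init_u64_unsynchronized(&merged, 0);
+ *   locked_inc_u64_unsynchronized(&merged, thread_total);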
+ */ +static inline void +locked_inc_u64_unsynchronized(locked_u64_t *dst, uint64_t src) { +#ifdef JEMALLOC_ATOMIC_U64 + uint64_t cur_dst = atomic_load_u64(&dst->val, ATOMIC_RELAXED); + atomic_store_u64(&dst->val, src + cur_dst, ATOMIC_RELAXED); +#else + dst->val += src; +#endif +} + +static inline uint64_t +locked_read_u64_unsynchronized(locked_u64_t *p) { +#ifdef JEMALLOC_ATOMIC_U64 + return atomic_load_u64(&p->val, ATOMIC_RELAXED); +#else + return p->val; +#endif +} + +static inline void +locked_init_u64_unsynchronized(locked_u64_t *p, uint64_t x) { +#ifdef JEMALLOC_ATOMIC_U64 + atomic_store_u64(&p->val, x, ATOMIC_RELAXED); +#else + p->val = x; +#endif +} + +static inline size_t +locked_read_zu(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_zu_t *p) { + LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx); +#ifdef JEMALLOC_ATOMIC_U64 + return atomic_load_zu(&p->val, ATOMIC_RELAXED); +#else + return atomic_load_zu(&p->val, ATOMIC_RELAXED); +#endif +} + +static inline void +locked_inc_zu(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_zu_t *p, + size_t x) { + LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx); +#ifdef JEMALLOC_ATOMIC_U64 + atomic_fetch_add_zu(&p->val, x, ATOMIC_RELAXED); +#else + size_t cur = atomic_load_zu(&p->val, ATOMIC_RELAXED); + atomic_store_zu(&p->val, cur + x, ATOMIC_RELAXED); +#endif +} + +static inline void +locked_dec_zu(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_zu_t *p, + size_t x) { + LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx); +#ifdef JEMALLOC_ATOMIC_U64 + size_t r = atomic_fetch_sub_zu(&p->val, x, ATOMIC_RELAXED); + assert(r - x <= r); +#else + size_t cur = atomic_load_zu(&p->val, ATOMIC_RELAXED); + atomic_store_zu(&p->val, cur - x, ATOMIC_RELAXED); +#endif +} + +/* Like the _u64 variant, needs an externally synchronized *dst. */ +static inline void +locked_inc_zu_unsynchronized(locked_zu_t *dst, size_t src) { + size_t cur_dst = atomic_load_zu(&dst->val, ATOMIC_RELAXED); + atomic_store_zu(&dst->val, src + cur_dst, ATOMIC_RELAXED); +} + +/* + * Unlike the _u64 variant, this is safe to call unconditionally. + */ +static inline size_t +locked_read_atomic_zu(locked_zu_t *p) { + return atomic_load_zu(&p->val, ATOMIC_RELAXED); +} + +#endif /* JEMALLOC_INTERNAL_LOCKEDINT_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/log.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/log.h new file mode 100644 index 000000000..7b074abde --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/log.h @@ -0,0 +1,115 @@ +#ifndef JEMALLOC_INTERNAL_LOG_H +#define JEMALLOC_INTERNAL_LOG_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/malloc_io.h" +#include "jemalloc/internal/mutex.h" + +#ifdef JEMALLOC_LOG +# define JEMALLOC_LOG_VAR_BUFSIZE 1000 +#else +# define JEMALLOC_LOG_VAR_BUFSIZE 1 +#endif + +#define JEMALLOC_LOG_BUFSIZE 4096 + +/* + * The log malloc_conf option is a '|'-delimited list of log_var name segments + * which should be logged. The names are themselves hierarchical, with '.' as + * the delimiter (a "segment" is just a prefix in the log namespace). 
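+ * Matching is by segment prefix: an option segment such as "arena.a" enables
+ * the "arena.a" log var itself and everything beneath it (e.g. "arena.a.a").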
So, if + * you have: + * + * log("arena", "log msg for arena"); // 1 + * log("arena.a", "log msg for arena.a"); // 2 + * log("arena.b", "log msg for arena.b"); // 3 + * log("arena.a.a", "log msg for arena.a.a"); // 4 + * log("extent.a", "log msg for extent.a"); // 5 + * log("extent.b", "log msg for extent.b"); // 6 + * + * And your malloc_conf option is "log:arena.a|extent", then lines 2, 4, 5, and + * 6 will print at runtime. You can enable logging from all log vars by + * writing "log:.". + * + * None of this should be regarded as a stable API for right now. It's intended + * as a debugging interface, to let us keep around some of our printf-debugging + * statements. + */ + +extern char log_var_names[JEMALLOC_LOG_VAR_BUFSIZE]; +extern atomic_b_t log_init_done; + +typedef struct log_var_s log_var_t; +struct log_var_s { + /* + * Lowest bit is "inited", second lowest is "enabled". Putting them in + * a single word lets us avoid any fences on weak architectures. + */ + atomic_u_t state; + const char *name; +}; + +#define LOG_NOT_INITIALIZED 0U +#define LOG_INITIALIZED_NOT_ENABLED 1U +#define LOG_ENABLED 2U + +#define LOG_VAR_INIT(name_str) {ATOMIC_INIT(LOG_NOT_INITIALIZED), name_str} + +/* + * Returns the value we should assume for state (which is not necessarily + * accurate; if logging is done before logging has finished initializing, then + * we default to doing the safe thing by logging everything). + */ +unsigned log_var_update_state(log_var_t *log_var); + +/* We factor out the metadata management to allow us to test more easily. */ +#define log_do_begin(log_var) \ +if (config_log) { \ + unsigned log_state = atomic_load_u(&(log_var).state, \ + ATOMIC_RELAXED); \ + if (unlikely(log_state == LOG_NOT_INITIALIZED)) { \ + log_state = log_var_update_state(&(log_var)); \ + assert(log_state != LOG_NOT_INITIALIZED); \ + } \ + if (log_state == LOG_ENABLED) { \ + { + /* User code executes here. */ +#define log_do_end(log_var) \ + } \ + } \ +} + +/* + * MSVC has some preprocessor bugs in its expansion of __VA_ARGS__ during + * preprocessing. To work around this, we take all potential extra arguments in + * a var-args functions. Since a varargs macro needs at least one argument in + * the "...", we accept the format string there, and require that the first + * argument in this "..." is a const char *. + */ +static inline void +log_impl_varargs(const char *name, ...) { + char buf[JEMALLOC_LOG_BUFSIZE]; + va_list ap; + + va_start(ap, name); + const char *format = va_arg(ap, const char *); + size_t dst_offset = 0; + dst_offset += malloc_snprintf(buf, JEMALLOC_LOG_BUFSIZE, "%s: ", name); + dst_offset += malloc_vsnprintf(buf + dst_offset, + JEMALLOC_LOG_BUFSIZE - dst_offset, format, ap); + malloc_snprintf(buf + dst_offset, JEMALLOC_LOG_BUFSIZE - dst_offset, "\n"); + va_end(ap); + + malloc_write(buf); +} + +/* Call as log("log.var.str", "format_string %d", arg_for_format_string); */ +#define LOG(log_var_str, ...) 
\ +do { \ + static log_var_t log_var = LOG_VAR_INIT(log_var_str); \ + log_do_begin(log_var) \ + log_impl_varargs((log_var).name, __VA_ARGS__); \ + log_do_end(log_var) \ +} while (0) + +#endif /* JEMALLOC_INTERNAL_LOG_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/malloc_io.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/malloc_io.h new file mode 100644 index 000000000..91e7b2ba7 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/malloc_io.h @@ -0,0 +1,137 @@ +#ifndef JEMALLOC_INTERNAL_MALLOC_IO_H +#define JEMALLOC_INTERNAL_MALLOC_IO_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_types.h" + +#ifdef _WIN32 +# ifdef _WIN64 +# define FMT64_PREFIX "ll" +# define FMTPTR_PREFIX "ll" +# else +# define FMT64_PREFIX "ll" +# define FMTPTR_PREFIX "" +# endif +# define FMTd32 "d" +# define FMTu32 "u" +# define FMTx32 "x" +# define FMTd64 FMT64_PREFIX "d" +# define FMTu64 FMT64_PREFIX "u" +# define FMTx64 FMT64_PREFIX "x" +# define FMTdPTR FMTPTR_PREFIX "d" +# define FMTuPTR FMTPTR_PREFIX "u" +# define FMTxPTR FMTPTR_PREFIX "x" +#else +# include +# define FMTd32 PRId32 +# define FMTu32 PRIu32 +# define FMTx32 PRIx32 +# define FMTd64 PRId64 +# define FMTu64 PRIu64 +# define FMTx64 PRIx64 +# define FMTdPTR PRIdPTR +# define FMTuPTR PRIuPTR +# define FMTxPTR PRIxPTR +#endif + +/* Size of stack-allocated buffer passed to buferror(). */ +#define BUFERROR_BUF 64 + +/* + * Size of stack-allocated buffer used by malloc_{,v,vc}printf(). This must be + * large enough for all possible uses within jemalloc. + */ +#define MALLOC_PRINTF_BUFSIZE 4096 + +write_cb_t wrtmessage; +int buferror(int err, char *buf, size_t buflen); +uintmax_t malloc_strtoumax(const char *restrict nptr, char **restrict endptr, + int base); +void malloc_write(const char *s); + +/* + * malloc_vsnprintf() supports a subset of snprintf(3) that avoids floating + * point math. + */ +size_t malloc_vsnprintf(char *str, size_t size, const char *format, + va_list ap); +size_t malloc_snprintf(char *str, size_t size, const char *format, ...) + JEMALLOC_FORMAT_PRINTF(3, 4); +/* + * The caller can set write_cb to null to choose to print with the + * je_malloc_message hook. + */ +void malloc_vcprintf(write_cb_t *write_cb, void *cbopaque, const char *format, + va_list ap); +void malloc_cprintf(write_cb_t *write_cb, void *cbopaque, const char *format, + ...) JEMALLOC_FORMAT_PRINTF(3, 4); +void malloc_printf(const char *format, ...) JEMALLOC_FORMAT_PRINTF(1, 2); + +static inline ssize_t +malloc_write_fd_syscall(int fd, const void *buf, size_t count) { +#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_write) + /* + * Use syscall(2) rather than write(2) when possible in order to avoid + * the possibility of memory allocation within libc. This is necessary + * on FreeBSD; most operating systems do not have this problem though. + * + * syscall() returns long or int, depending on platform, so capture the + * result in the widest plausible type to avoid compiler warnings. 
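+ * (On Linux, for example, syscall(2) returns long; the cast back to ssize_t
+ * on return keeps the wrapper's type uniform across both branches.)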
+ */
+ long result = syscall(SYS_write, fd, buf, count);
+#else
+ ssize_t result = (ssize_t)write(fd, buf,
+#ifdef _WIN32
+ (unsigned int)
+#endif
+ count);
+#endif
+ return (ssize_t)result;
+}
+
+static inline ssize_t
+malloc_write_fd(int fd, const void *buf, size_t count) {
+ size_t bytes_written = 0;
+ do {
+ ssize_t result = malloc_write_fd_syscall(fd,
+ &((const byte_t *)buf)[bytes_written],
+ count - bytes_written);
+ if (result < 0) {
+ return result;
+ }
+ bytes_written += result;
+ } while (bytes_written < count);
+ return bytes_written;
+}
+
+static inline ssize_t
+malloc_read_fd_syscall(int fd, void *buf, size_t count) {
+#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_read)
+ long result = syscall(SYS_read, fd, buf, count);
+#else
+ ssize_t result = read(fd, buf,
+#ifdef _WIN32
+ (unsigned int)
+#endif
+ count);
+#endif
+ return (ssize_t)result;
+}
+
+static inline ssize_t
+malloc_read_fd(int fd, void *buf, size_t count) {
+ size_t bytes_read = 0;
+ do {
+ ssize_t result = malloc_read_fd_syscall(fd,
+ &((byte_t *)buf)[bytes_read], count - bytes_read);
+ if (result < 0) {
+ return result;
+ } else if (result == 0) {
+ break;
+ }
+ bytes_read += result;
+ } while (bytes_read < count);
+ return bytes_read;
+}
+
+#endif /* JEMALLOC_INTERNAL_MALLOC_IO_H */
diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/mpsc_queue.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/mpsc_queue.h
new file mode 100644
index 000000000..d8aa624bb
--- /dev/null
+++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/mpsc_queue.h
@@ -0,0 +1,135 @@
+#ifndef JEMALLOC_INTERNAL_MPSC_QUEUE_H
+#define JEMALLOC_INTERNAL_MPSC_QUEUE_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/atomic.h"
+
+/*
+ * A concurrent implementation of a multi-producer, single-consumer queue. It
+ * supports three concurrent operations:
+ * - Push
+ * - Push batch
+ * - Pop batch
+ *
+ * These operations are all lock-free.
+ *
+ * The implementation is the simple two-stack queue built on a Treiber stack.
+ * It's not terribly efficient, but this isn't expected to end up anywhere near
+ * hot code. In fact, we don't really even need queue semantics in any
+ * anticipated use cases; we could get away with just the stack. But this way
+ * lets us frame the API in terms of the existing list types, which is a nice
+ * convenience. We can save on cache misses by introducing our own (parallel)
+ * singly-linked list type here, and dropping FIFO semantics, if we need this to
+ * get faster. Since we're currently providing queue semantics though, we use
+ * the prev field in the link rather than the next field for Treiber-stack
+ * linkage, so that we can preserve order for batch-pushed lists (recall that
+ * the two-stack trick reverses order in the lock-free first stack).
+ */
+
+#define mpsc_queue(a_type) \
+struct { \
+ atomic_p_t tail; \
+}
+
+#define mpsc_queue_proto(a_attr, a_prefix, a_queue_type, a_type, \
+ a_list_type) \
+/* Initialize a queue. */ \
+a_attr void \
+a_prefix##new(a_queue_type *queue); \
+/* Insert all items in src into the queue, clearing src. */ \
+a_attr void \
+a_prefix##push_batch(a_queue_type *queue, a_list_type *src); \
+/* Insert node into the queue. */ \
+a_attr void \
+a_prefix##push(a_queue_type *queue, a_type *node); \
+/* \
+ * Pop all items in the queue into the list at dst. dst should already \
+ * be initialized (and may contain existing items, which then remain \
+ * in dst).
\ + */ \ +a_attr void \ +a_prefix##pop_batch(a_queue_type *queue, a_list_type *dst); + +#define mpsc_queue_gen(a_attr, a_prefix, a_queue_type, a_type, \ + a_list_type, a_link) \ +a_attr void \ +a_prefix##new(a_queue_type *queue) { \ + atomic_store_p(&queue->tail, NULL, ATOMIC_RELAXED); \ +} \ +a_attr void \ +a_prefix##push_batch(a_queue_type *queue, a_list_type *src) { \ + /* \ + * Reuse the ql list next field as the Treiber stack next \ + * field. \ + */ \ + a_type *first = ql_first(src); \ + a_type *last = ql_last(src, a_link); \ + void* cur_tail = atomic_load_p(&queue->tail, ATOMIC_RELAXED); \ + do { \ + /* \ + * Note that this breaks the queue ring structure; \ + * it's not a ring any more! \ + */ \ + first->a_link.qre_prev = cur_tail; \ + /* \ + * Note: the upcoming CAS doesn't need an atomic; every \ + * push only needs to synchronize with the next pop, \ + * which we get from the release sequence rules. \ + */ \ + } while (!atomic_compare_exchange_weak_p(&queue->tail, \ + &cur_tail, last, ATOMIC_RELEASE, ATOMIC_RELAXED)); \ + ql_new(src); \ +} \ +a_attr void \ +a_prefix##push(a_queue_type *queue, a_type *node) { \ + ql_elm_new(node, a_link); \ + a_list_type list; \ + ql_new(&list); \ + ql_head_insert(&list, node, a_link); \ + a_prefix##push_batch(queue, &list); \ +} \ +a_attr void \ +a_prefix##pop_batch(a_queue_type *queue, a_list_type *dst) { \ + a_type *tail = atomic_load_p(&queue->tail, ATOMIC_RELAXED); \ + if (tail == NULL) { \ + /* \ + * In the common special case where there are no \ + * pending elements, bail early without a costly RMW. \ + */ \ + return; \ + } \ + tail = atomic_exchange_p(&queue->tail, NULL, ATOMIC_ACQUIRE); \ + /* \ + * It's a single-consumer queue, so if cur started non-NULL, \ + * it'd better stay non-NULL. \ + */ \ + assert(tail != NULL); \ + /* \ + * We iterate through the stack and both fix up the link \ + * structure (stack insertion broke the list requirement that \ + * the list be circularly linked). It's just as efficient at \ + * this point to make the queue a "real" queue, so do that as \ + * well. \ + * If this ever gets to be a hot spot, we can omit this fixup \ + * and make the queue a bag (i.e. not necessarily ordered), but \ + * that would mean jettisoning the existing list API as the \ + * batch pushing/popping interface. \ + */ \ + a_list_type reversed; \ + ql_new(&reversed); \ + while (tail != NULL) { \ + /* \ + * Pop an item off the stack, prepend it onto the list \ + * (reversing the order). Recall that we use the \ + * list prev field as the Treiber stack next field to \ + * preserve order of batch-pushed items when reversed. 
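\
+ * For example, pushes of a, b, c leave tail -> c -> b -> a; \
+ * popping prepends each in turn, yielding a, b, c (FIFO). \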
+ */ \
+ a_type *next = tail->a_link.qre_prev; \
+ ql_elm_new(tail, a_link); \
+ ql_head_insert(&reversed, tail, a_link); \
+ tail = next; \
+ } \
+ ql_concat(dst, &reversed, a_link); \
+}
+
+#endif /* JEMALLOC_INTERNAL_MPSC_QUEUE_H */
diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/mutex.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/mutex.h
new file mode 100644
index 000000000..75abf298b
--- /dev/null
+++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/mutex.h
@@ -0,0 +1,320 @@
+#ifndef JEMALLOC_INTERNAL_MUTEX_H
+#define JEMALLOC_INTERNAL_MUTEX_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/atomic.h"
+#include "jemalloc/internal/mutex_prof.h"
+#include "jemalloc/internal/tsd.h"
+#include "jemalloc/internal/witness.h"
+
+extern int64_t opt_mutex_max_spin;
+
+typedef enum {
+ /* Can only acquire one mutex of a given witness rank at a time. */
+ malloc_mutex_rank_exclusive,
+ /*
+ * Can acquire multiple mutexes of the same witness rank, but in
+ * address-ascending order only.
+ */
+ malloc_mutex_address_ordered
+} malloc_mutex_lock_order_t;
+
+typedef struct malloc_mutex_s malloc_mutex_t;
+struct malloc_mutex_s {
+ union {
+ struct {
+ /*
+ * prof_data is defined first to reduce cacheline
+ * bouncing: the data is not touched by the mutex holder
+ * during unlocking, while it might be modified by
+ * contenders. Having it before the mutex itself could
+ * avoid prefetching a modified cacheline (for the
+ * unlocking thread).
+ */
+ mutex_prof_data_t prof_data;
+ /*
+ * Hint flag to avoid exclusive cache line contention
+ * during spin waiting. Placed along with prof_data
+ * since it's always modified even with no contention.
+ */
+ atomic_b_t locked;
+#ifdef _WIN32
+# if _WIN32_WINNT >= 0x0600
+ SRWLOCK lock;
+# else
+ CRITICAL_SECTION lock;
+# endif
+#elif (defined(JEMALLOC_OS_UNFAIR_LOCK))
+ os_unfair_lock lock;
+#elif (defined(JEMALLOC_MUTEX_INIT_CB))
+ pthread_mutex_t lock;
+ malloc_mutex_t *postponed_next;
+#else
+ pthread_mutex_t lock;
+#endif
+ };
+ /*
+ * We only touch witness when configured w/ debug. However we
+ * keep the field in a union when !debug so that we don't have
+ * to pollute the code base with #ifdefs, while avoiding the
+ * memory cost.
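+ * (Concretely: in !debug builds the witness fields below alias the union
+ * storage above and take no extra space; in debug builds they are real
+ * trailing fields, declared after the union.)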
+ */ +#if !defined(JEMALLOC_DEBUG) + witness_t witness; + malloc_mutex_lock_order_t lock_order; +#endif + }; + +#if defined(JEMALLOC_DEBUG) + witness_t witness; + malloc_mutex_lock_order_t lock_order; +#endif +}; + +#ifdef _WIN32 +# if _WIN32_WINNT >= 0x0600 +# define MALLOC_MUTEX_LOCK(m) AcquireSRWLockExclusive(&(m)->lock) +# define MALLOC_MUTEX_UNLOCK(m) ReleaseSRWLockExclusive(&(m)->lock) +# define MALLOC_MUTEX_TRYLOCK(m) (!TryAcquireSRWLockExclusive(&(m)->lock)) +# else +# define MALLOC_MUTEX_LOCK(m) EnterCriticalSection(&(m)->lock) +# define MALLOC_MUTEX_UNLOCK(m) LeaveCriticalSection(&(m)->lock) +# define MALLOC_MUTEX_TRYLOCK(m) (!TryEnterCriticalSection(&(m)->lock)) +# endif +#elif (defined(JEMALLOC_OS_UNFAIR_LOCK)) +# define MALLOC_MUTEX_LOCK(m) os_unfair_lock_lock(&(m)->lock) +# define MALLOC_MUTEX_UNLOCK(m) os_unfair_lock_unlock(&(m)->lock) +# define MALLOC_MUTEX_TRYLOCK(m) (!os_unfair_lock_trylock(&(m)->lock)) +#else +# define MALLOC_MUTEX_LOCK(m) pthread_mutex_lock(&(m)->lock) +# define MALLOC_MUTEX_UNLOCK(m) pthread_mutex_unlock(&(m)->lock) +# define MALLOC_MUTEX_TRYLOCK(m) (pthread_mutex_trylock(&(m)->lock) != 0) +#endif + +#define LOCK_PROF_DATA_INITIALIZER \ + {NSTIME_ZERO_INITIALIZER, NSTIME_ZERO_INITIALIZER, 0, 0, 0, \ + ATOMIC_INIT(0), 0, NULL, 0} + +#ifdef _WIN32 +# define MALLOC_MUTEX_INITIALIZER +#elif (defined(JEMALLOC_OS_UNFAIR_LOCK)) +# if defined(JEMALLOC_DEBUG) +# define MALLOC_MUTEX_INITIALIZER \ + {{{LOCK_PROF_DATA_INITIALIZER, ATOMIC_INIT(false), OS_UNFAIR_LOCK_INIT}}, \ + WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0} +# else +# define MALLOC_MUTEX_INITIALIZER \ + {{{LOCK_PROF_DATA_INITIALIZER, ATOMIC_INIT(false), OS_UNFAIR_LOCK_INIT}}, \ + WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)} +# endif +#elif (defined(JEMALLOC_MUTEX_INIT_CB)) +# if (defined(JEMALLOC_DEBUG)) +# define MALLOC_MUTEX_INITIALIZER \ + {{{LOCK_PROF_DATA_INITIALIZER, ATOMIC_INIT(false), PTHREAD_MUTEX_INITIALIZER, NULL}}, \ + WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0} +# else +# define MALLOC_MUTEX_INITIALIZER \ + {{{LOCK_PROF_DATA_INITIALIZER, ATOMIC_INIT(false), PTHREAD_MUTEX_INITIALIZER, NULL}}, \ + WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)} +# endif + +#else +# define MALLOC_MUTEX_TYPE PTHREAD_MUTEX_DEFAULT +# if defined(JEMALLOC_DEBUG) +# define MALLOC_MUTEX_INITIALIZER \ + {{{LOCK_PROF_DATA_INITIALIZER, ATOMIC_INIT(false), PTHREAD_MUTEX_INITIALIZER}}, \ + WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0} +# else +# define MALLOC_MUTEX_INITIALIZER \ + {{{LOCK_PROF_DATA_INITIALIZER, ATOMIC_INIT(false), PTHREAD_MUTEX_INITIALIZER}}, \ + WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)} +# endif +#endif + +#ifdef JEMALLOC_LAZY_LOCK +extern bool isthreaded; +#else +# undef isthreaded /* Undo private_namespace.h definition. 
*/ +# define isthreaded true +#endif + +bool malloc_mutex_init(malloc_mutex_t *mutex, const char *name, + witness_rank_t rank, malloc_mutex_lock_order_t lock_order); +void malloc_mutex_prefork(tsdn_t *tsdn, malloc_mutex_t *mutex); +void malloc_mutex_postfork_parent(tsdn_t *tsdn, malloc_mutex_t *mutex); +void malloc_mutex_postfork_child(tsdn_t *tsdn, malloc_mutex_t *mutex); +bool malloc_mutex_boot(void); +void malloc_mutex_prof_data_reset(tsdn_t *tsdn, malloc_mutex_t *mutex); + +void malloc_mutex_lock_slow(malloc_mutex_t *mutex); + +static inline void +malloc_mutex_lock_final(malloc_mutex_t *mutex) { + MALLOC_MUTEX_LOCK(mutex); + atomic_store_b(&mutex->locked, true, ATOMIC_RELAXED); +} + +static inline bool +malloc_mutex_trylock_final(malloc_mutex_t *mutex) { + return MALLOC_MUTEX_TRYLOCK(mutex); +} + +static inline void +mutex_owner_stats_update(tsdn_t *tsdn, malloc_mutex_t *mutex) { + if (config_stats) { + mutex_prof_data_t *data = &mutex->prof_data; + data->n_lock_ops++; + if (data->prev_owner != tsdn) { + data->prev_owner = tsdn; + data->n_owner_switches++; + } + } +} + +/* Trylock: return false if the lock is successfully acquired. */ +static inline bool +malloc_mutex_trylock(tsdn_t *tsdn, malloc_mutex_t *mutex) { + witness_assert_not_owner(tsdn_witness_tsdp_get(tsdn), &mutex->witness); + if (isthreaded) { + if (malloc_mutex_trylock_final(mutex)) { + return true; + } + mutex_owner_stats_update(tsdn, mutex); + } + witness_lock(tsdn_witness_tsdp_get(tsdn), &mutex->witness); + + return false; +} + +/* Aggregate lock prof data. */ +static inline void +malloc_mutex_prof_merge(mutex_prof_data_t *sum, mutex_prof_data_t *data) { + nstime_add(&sum->tot_wait_time, &data->tot_wait_time); + if (nstime_compare(&sum->max_wait_time, &data->max_wait_time) < 0) { + nstime_copy(&sum->max_wait_time, &data->max_wait_time); + } + + sum->n_wait_times += data->n_wait_times; + sum->n_spin_acquired += data->n_spin_acquired; + + if (sum->max_n_thds < data->max_n_thds) { + sum->max_n_thds = data->max_n_thds; + } + uint32_t cur_n_waiting_thds = atomic_load_u32(&sum->n_waiting_thds, + ATOMIC_RELAXED); + uint32_t new_n_waiting_thds = cur_n_waiting_thds + atomic_load_u32( + &data->n_waiting_thds, ATOMIC_RELAXED); + atomic_store_u32(&sum->n_waiting_thds, new_n_waiting_thds, + ATOMIC_RELAXED); + sum->n_owner_switches += data->n_owner_switches; + sum->n_lock_ops += data->n_lock_ops; +} + +static inline void +malloc_mutex_lock(tsdn_t *tsdn, malloc_mutex_t *mutex) { + witness_assert_not_owner(tsdn_witness_tsdp_get(tsdn), &mutex->witness); + if (isthreaded) { + if (malloc_mutex_trylock_final(mutex)) { + malloc_mutex_lock_slow(mutex); + atomic_store_b(&mutex->locked, true, ATOMIC_RELAXED); + } + mutex_owner_stats_update(tsdn, mutex); + } + witness_lock(tsdn_witness_tsdp_get(tsdn), &mutex->witness); +} + +static inline void +malloc_mutex_unlock(tsdn_t *tsdn, malloc_mutex_t *mutex) { + atomic_store_b(&mutex->locked, false, ATOMIC_RELAXED); + witness_unlock(tsdn_witness_tsdp_get(tsdn), &mutex->witness); + if (isthreaded) { + MALLOC_MUTEX_UNLOCK(mutex); + } +} + +static inline void +malloc_mutex_assert_owner(tsdn_t *tsdn, malloc_mutex_t *mutex) { + witness_assert_owner(tsdn_witness_tsdp_get(tsdn), &mutex->witness); +} + +static inline void +malloc_mutex_assert_not_owner(tsdn_t *tsdn, malloc_mutex_t *mutex) { + witness_assert_not_owner(tsdn_witness_tsdp_get(tsdn), &mutex->witness); +} + +static inline void +malloc_mutex_prof_copy(mutex_prof_data_t *dst, mutex_prof_data_t *source) { + /* + * Not *really* allowed (we shouldn't be 
doing non-atomic loads of + * atomic data), but the mutex protection makes this safe, and writing + * a member-for-member copy is tedious for this situation. + */ + *dst = *source; + /* n_wait_thds is not reported (modified w/o locking). */ + atomic_store_u32(&dst->n_waiting_thds, 0, ATOMIC_RELAXED); +} + +/* Copy the prof data from mutex for processing. */ +static inline void +malloc_mutex_prof_read(tsdn_t *tsdn, mutex_prof_data_t *data, + malloc_mutex_t *mutex) { + /* Can only read holding the mutex. */ + malloc_mutex_assert_owner(tsdn, mutex); + malloc_mutex_prof_copy(data, &mutex->prof_data); +} + +static inline void +malloc_mutex_prof_accum(tsdn_t *tsdn, mutex_prof_data_t *data, + malloc_mutex_t *mutex) { + mutex_prof_data_t *source = &mutex->prof_data; + /* Can only read holding the mutex. */ + malloc_mutex_assert_owner(tsdn, mutex); + + nstime_add(&data->tot_wait_time, &source->tot_wait_time); + if (nstime_compare(&source->max_wait_time, &data->max_wait_time) > 0) { + nstime_copy(&data->max_wait_time, &source->max_wait_time); + } + data->n_wait_times += source->n_wait_times; + data->n_spin_acquired += source->n_spin_acquired; + if (data->max_n_thds < source->max_n_thds) { + data->max_n_thds = source->max_n_thds; + } + /* n_wait_thds is not reported. */ + atomic_store_u32(&data->n_waiting_thds, 0, ATOMIC_RELAXED); + data->n_owner_switches += source->n_owner_switches; + data->n_lock_ops += source->n_lock_ops; +} + +/* Compare the prof data and update to the maximum. */ +static inline void +malloc_mutex_prof_max_update(tsdn_t *tsdn, mutex_prof_data_t *data, + malloc_mutex_t *mutex) { + mutex_prof_data_t *source = &mutex->prof_data; + /* Can only read holding the mutex. */ + malloc_mutex_assert_owner(tsdn, mutex); + + if (nstime_compare(&source->tot_wait_time, &data->tot_wait_time) > 0) { + nstime_copy(&data->tot_wait_time, &source->tot_wait_time); + } + if (nstime_compare(&source->max_wait_time, &data->max_wait_time) > 0) { + nstime_copy(&data->max_wait_time, &source->max_wait_time); + } + if (source->n_wait_times > data->n_wait_times) { + data->n_wait_times = source->n_wait_times; + } + if (source->n_spin_acquired > data->n_spin_acquired) { + data->n_spin_acquired = source->n_spin_acquired; + } + if (source->max_n_thds > data->max_n_thds) { + data->max_n_thds = source->max_n_thds; + } + if (source->n_owner_switches > data->n_owner_switches) { + data->n_owner_switches = source->n_owner_switches; + } + if (source->n_lock_ops > data->n_lock_ops) { + data->n_lock_ops = source->n_lock_ops; + } + /* n_wait_thds is not reported. 
*/
+}
+
+#endif /* JEMALLOC_INTERNAL_MUTEX_H */
diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/mutex_prof.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/mutex_prof.h
new file mode 100644
index 000000000..14e4340b7
--- /dev/null
+++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/mutex_prof.h
@@ -0,0 +1,118 @@
+#ifndef JEMALLOC_INTERNAL_MUTEX_PROF_H
+#define JEMALLOC_INTERNAL_MUTEX_PROF_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/atomic.h"
+#include "jemalloc/internal/nstime.h"
+#include "jemalloc/internal/tsd_types.h"
+
+#define MUTEX_PROF_GLOBAL_MUTEXES \
+ OP(background_thread) \
+ OP(max_per_bg_thd) \
+ OP(ctl) \
+ OP(prof) \
+ OP(prof_thds_data) \
+ OP(prof_dump) \
+ OP(prof_recent_alloc) \
+ OP(prof_recent_dump) \
+ OP(prof_stats)
+
+typedef enum {
+#define OP(mtx) global_prof_mutex_##mtx,
+ MUTEX_PROF_GLOBAL_MUTEXES
+#undef OP
+ mutex_prof_num_global_mutexes
+} mutex_prof_global_ind_t;
+
+#define MUTEX_PROF_ARENA_MUTEXES \
+ OP(large) \
+ OP(extent_avail) \
+ OP(extents_dirty) \
+ OP(extents_muzzy) \
+ OP(extents_retained) \
+ OP(decay_dirty) \
+ OP(decay_muzzy) \
+ OP(base) \
+ OP(tcache_list) \
+ OP(hpa_shard) \
+ OP(hpa_shard_grow) \
+ OP(hpa_sec)
+
+typedef enum {
+#define OP(mtx) arena_prof_mutex_##mtx,
+ MUTEX_PROF_ARENA_MUTEXES
+#undef OP
+ mutex_prof_num_arena_mutexes
+} mutex_prof_arena_ind_t;
+
+/*
+ * The fourth parameter is a boolean value that is true for derived rate
+ * counters and false for real ones.
+ */
+#define MUTEX_PROF_UINT64_COUNTERS \
+ OP(num_ops, uint64_t, "n_lock_ops", false, num_ops) \
+ OP(num_ops_ps, uint64_t, "(#/sec)", true, num_ops) \
+ OP(num_wait, uint64_t, "n_waiting", false, num_wait) \
+ OP(num_wait_ps, uint64_t, "(#/sec)", true, num_wait) \
+ OP(num_spin_acq, uint64_t, "n_spin_acq", false, num_spin_acq) \
+ OP(num_spin_acq_ps, uint64_t, "(#/sec)", true, num_spin_acq) \
+ OP(num_owner_switch, uint64_t, "n_owner_switch", false, num_owner_switch) \
+ OP(num_owner_switch_ps, uint64_t, "(#/sec)", true, num_owner_switch) \
+ OP(total_wait_time, uint64_t, "total_wait_ns", false, total_wait_time) \
+ OP(total_wait_time_ps, uint64_t, "(#/sec)", true, total_wait_time) \
+ OP(max_wait_time, uint64_t, "max_wait_ns", false, max_wait_time)
+
+#define MUTEX_PROF_UINT32_COUNTERS \
+ OP(max_num_thds, uint32_t, "max_n_thds", false, max_num_thds)
+
+#define MUTEX_PROF_COUNTERS \
+ MUTEX_PROF_UINT64_COUNTERS \
+ MUTEX_PROF_UINT32_COUNTERS
+
+#define OP(counter, type, human, derived, base_counter) mutex_counter_##counter,
+
+#define COUNTER_ENUM(counter_list, t) \
+ typedef enum { \
+ counter_list \
+ mutex_prof_num_##t##_counters \
+ } mutex_prof_##t##_counter_ind_t;
+
+COUNTER_ENUM(MUTEX_PROF_UINT64_COUNTERS, uint64_t)
+COUNTER_ENUM(MUTEX_PROF_UINT32_COUNTERS, uint32_t)
+
+#undef COUNTER_ENUM
+#undef OP
+
+typedef struct {
+ /*
+ * Counters touched on the slow path, i.e. when there is lock
+ * contention. We update them once we have the lock.
+ */
+ /* Total time (in nanoseconds) spent waiting on this mutex. */
+ nstime_t tot_wait_time;
+ /* Max time (in nanoseconds) spent on a single lock operation. */
+ nstime_t max_wait_time;
+ /* # of times we had to wait for this mutex (after spinning). */
+ uint64_t n_wait_times;
+ /* # of times we acquired the mutex through local spinning. */
+ uint64_t n_spin_acquired;
+ /* Max # of threads waiting for the mutex at the same time. */
+ uint32_t max_n_thds;
+ /* Current # of threads waiting on the lock. Atomically synced.
*/ + atomic_u32_t n_waiting_thds; + + /* + * Data touched on the fast path. These are modified right after we + * grab the lock, so it's placed closest to the end (i.e. right before + * the lock) so that we have a higher chance of them being on the same + * cacheline. + */ + /* # of times the mutex holder is different than the previous one. */ + uint64_t n_owner_switches; + /* Previous mutex holder, to facilitate n_owner_switches. */ + tsdn_t *prev_owner; + /* # of lock() operations in total. */ + uint64_t n_lock_ops; +} mutex_prof_data_t; + +#endif /* JEMALLOC_INTERNAL_MUTEX_PROF_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/nstime.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/nstime.h new file mode 100644 index 000000000..440a4d15e --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/nstime.h @@ -0,0 +1,76 @@ +#ifndef JEMALLOC_INTERNAL_NSTIME_H +#define JEMALLOC_INTERNAL_NSTIME_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/assert.h" + +/* Maximum supported number of seconds (~584 years). */ +#define NSTIME_SEC_MAX KQU(18446744072) + +#define NSTIME_MAGIC ((uint32_t)0xb8a9ce37) +#ifdef JEMALLOC_DEBUG +# define NSTIME_ZERO_INITIALIZER {0, NSTIME_MAGIC} +#else +# define NSTIME_ZERO_INITIALIZER {0} +#endif + +typedef struct { + uint64_t ns; +#ifdef JEMALLOC_DEBUG + uint32_t magic; /* Tracks if initialized. */ +#endif +} nstime_t; + +static const nstime_t nstime_zero = NSTIME_ZERO_INITIALIZER; + +void nstime_init(nstime_t *time, uint64_t ns); +void nstime_init2(nstime_t *time, uint64_t sec, uint64_t nsec); +uint64_t nstime_ns(const nstime_t *time); +uint64_t nstime_sec(const nstime_t *time); +uint64_t nstime_msec(const nstime_t *time); +uint64_t nstime_nsec(const nstime_t *time); +void nstime_copy(nstime_t *time, const nstime_t *source); +int nstime_compare(const nstime_t *a, const nstime_t *b); +void nstime_add(nstime_t *time, const nstime_t *addend); +void nstime_iadd(nstime_t *time, uint64_t addend); +void nstime_subtract(nstime_t *time, const nstime_t *subtrahend); +void nstime_isubtract(nstime_t *time, uint64_t subtrahend); +void nstime_imultiply(nstime_t *time, uint64_t multiplier); +void nstime_idivide(nstime_t *time, uint64_t divisor); +uint64_t nstime_divide(const nstime_t *time, const nstime_t *divisor); +uint64_t nstime_ns_since(const nstime_t *past); + +typedef bool (nstime_monotonic_t)(void); +extern nstime_monotonic_t *JET_MUTABLE nstime_monotonic; + +typedef void (nstime_update_t)(nstime_t *); +extern nstime_update_t *JET_MUTABLE nstime_update; + +typedef void (nstime_prof_update_t)(nstime_t *); +extern nstime_prof_update_t *JET_MUTABLE nstime_prof_update; + +void nstime_init_update(nstime_t *time); +void nstime_prof_init_update(nstime_t *time); + +enum prof_time_res_e { + prof_time_res_default = 0, + prof_time_res_high = 1 +}; +typedef enum prof_time_res_e prof_time_res_t; + +extern prof_time_res_t opt_prof_time_res; +extern const char *const prof_time_res_mode_names[]; + +JEMALLOC_ALWAYS_INLINE void +nstime_init_zero(nstime_t *time) { + nstime_copy(time, &nstime_zero); +} + +JEMALLOC_ALWAYS_INLINE bool +nstime_equals_zero(nstime_t *time) { + int diff = nstime_compare(time, &nstime_zero); + assert(diff >= 0); + return diff == 0; +} + +#endif /* JEMALLOC_INTERNAL_NSTIME_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/pa.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/pa.h new file mode 100644 
index 000000000..756267387
--- /dev/null
+++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/pa.h
@@ -0,0 +1,248 @@
+#ifndef JEMALLOC_INTERNAL_PA_H
+#define JEMALLOC_INTERNAL_PA_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/base.h"
+#include "jemalloc/internal/decay.h"
+#include "jemalloc/internal/ecache.h"
+#include "jemalloc/internal/edata_cache.h"
+#include "jemalloc/internal/emap.h"
+#include "jemalloc/internal/hpa.h"
+#include "jemalloc/internal/lockedint.h"
+#include "jemalloc/internal/pac.h"
+#include "jemalloc/internal/pai.h"
+#include "jemalloc/internal/sec.h"
+
+/*
+ * The page allocator; responsible for acquiring pages of memory for
+ * allocations. It picks the implementation of the page allocator interface
+ * (i.e. a pai_t) to handle a given page-level allocation request. For now, the
+ * only such implementation is the PAC code ("page allocator classic"), but
+ * others will be coming soon.
+ */
+
+typedef struct pa_central_s pa_central_t;
+struct pa_central_s {
+ hpa_central_t hpa;
+};
+
+/*
+ * The stats for a particular pa_shard. Because of the way the ctl module
+ * handles stats epoch data collection (it has its own arena_stats, and merges
+ * the stats from each arena into it), this needs to live in the arena_stats_t;
+ * hence we define it here and let the pa_shard have a pointer (rather than the
+ * more natural approach of just embedding it in the pa_shard itself).
+ *
+ * We follow the arena_stats_t approach of marking the derived fields. These
+ * are the ones that are not maintained on their own; instead, their values are
+ * derived during those stats merges.
+ */
+typedef struct pa_shard_stats_s pa_shard_stats_t;
+struct pa_shard_stats_s {
+ /* Number of edata_t structs allocated by base, but not being used. */
+ size_t edata_avail; /* Derived. */
+ /*
+ * Stats specific to the PAC. For now, these are the only stats that
+ * exist, but there will eventually be other page allocators. Things
+ * like edata_avail make sense in a cross-PA sense, but things like
+ * npurges don't.
+ */
+ pac_stats_t pac_stats;
+};
+
+/*
+ * The local allocator handle. Keeps the state necessary to satisfy page-sized
+ * allocations.
+ *
+ * The contents are mostly internal to the PA module. The key exception is that
+ * arena decay code is allowed to grab pointers to the dirty and muzzy ecaches'
+ * decay_ts, for a couple of queries, passing them back to a PA function, or
+ * acquiring decay.mtx and looking at decay.purging. The reasoning is that,
+ * while PA decides what and how to purge, the arena code decides when and where
+ * (e.g. on what thread). It's allowed to use the presence of another purger to
+ * decide.
+ * (The background thread code also touches some other decay internals, but
+ * that's not fundamental; it's just an artifact of a partial refactoring, and
+ * its accesses could be straightforwardly moved inside the decay module).
+ */
+typedef struct pa_shard_s pa_shard_t;
+struct pa_shard_s {
+ /* The central PA this shard is associated with. */
+ pa_central_t *central;
+
+ /*
+ * Number of pages in active extents.
+ *
+ * Synchronization: atomic.
+ */
+ atomic_zu_t nactive;
+
+ /*
+ * Whether or not we should prefer the hugepage allocator. Atomic since
+ * it may be concurrently modified by a thread setting extent hooks.
+ * Note that we still may do HPA operations in this arena; if use_hpa is
+ * changed from true to false, we'll free back to the hugepage allocator
+ * for those allocations.
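+ * (This is the transition performed by pa_shard_disable_hpa(), declared
+ * below, when custom extent hooks get installed.)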
+ */
+ atomic_b_t use_hpa;
+
+ /*
+ * If we never used the HPA to begin with, it wasn't initialized, and so
+ * we shouldn't try to e.g. acquire its mutexes during fork. This
+ * tracks that knowledge.
+ */
+ bool ever_used_hpa;
+
+ /* Allocates from a PAC. */
+ pac_t pac;
+
+ /*
+ * We place a small extent cache in front of the HPA, since we intend
+ * these configurations to use many fewer arenas, and therefore have a
+ * higher risk of hot locks.
+ */
+ sec_t hpa_sec;
+ hpa_shard_t hpa_shard;
+
+ /* The source of edata_t objects. */
+ edata_cache_t edata_cache;
+
+ unsigned ind;
+
+ malloc_mutex_t *stats_mtx;
+ pa_shard_stats_t *stats;
+
+ /* The emap this shard is tied to. */
+ emap_t *emap;
+
+ /* The base from which we get the ehooks and allocate metadata. */
+ base_t *base;
+};
+
+static inline bool
+pa_shard_dont_decay_muzzy(pa_shard_t *shard) {
+ return ecache_npages_get(&shard->pac.ecache_muzzy) == 0 &&
+ pac_decay_ms_get(&shard->pac, extent_state_muzzy) <= 0;
+}
+
+static inline ehooks_t *
+pa_shard_ehooks_get(pa_shard_t *shard) {
+ return base_ehooks_get(shard->base);
+}
+
+/* Returns true on error. */
+bool pa_central_init(pa_central_t *central, base_t *base, bool hpa,
+ const hpa_hooks_t *hpa_hooks);
+
+/* Returns true on error. */
+bool pa_shard_init(tsdn_t *tsdn, pa_shard_t *shard, pa_central_t *central,
+ emap_t *emap, base_t *base, unsigned ind, pa_shard_stats_t *stats,
+ malloc_mutex_t *stats_mtx, nstime_t *cur_time, size_t oversize_threshold,
+ ssize_t dirty_decay_ms, ssize_t muzzy_decay_ms);
+
+/*
+ * This isn't exposed to users; we allow late enablement of the HPA shard so
+ * that we can boot without worrying about the HPA, then turn it on in a0.
+ */
+bool pa_shard_enable_hpa(tsdn_t *tsdn, pa_shard_t *shard,
+ const hpa_shard_opts_t *hpa_opts, const sec_opts_t *hpa_sec_opts);
+
+/*
+ * We stop using the HPA when custom extent hooks are installed, but still
+ * redirect deallocations to it.
+ */
+void pa_shard_disable_hpa(tsdn_t *tsdn, pa_shard_t *shard);
+
+/*
+ * This does the PA-specific parts of arena reset (i.e. freeing all active
+ * allocations).
+ */
+void pa_shard_reset(tsdn_t *tsdn, pa_shard_t *shard);
+
+/*
+ * Destroy all the remaining retained extents. Should only be called after
+ * decaying all active, dirty, and muzzy extents to the retained state, as the
+ * last step in destroying the shard.
+ */
+void pa_shard_destroy(tsdn_t *tsdn, pa_shard_t *shard);
+
+/* Gets an edata for the given allocation. */
+edata_t *pa_alloc(tsdn_t *tsdn, pa_shard_t *shard, size_t size,
+ size_t alignment, bool slab, szind_t szind, bool zero, bool guarded,
+ bool *deferred_work_generated);
+/* Returns true on error, in which case nothing changed. */
+bool pa_expand(tsdn_t *tsdn, pa_shard_t *shard, edata_t *edata, size_t old_size,
+ size_t new_size, szind_t szind, bool zero, bool *deferred_work_generated);
+/*
+ * The same. Sets *generated_dirty to true if we produced new dirty pages, and
+ * false otherwise.
+ */
+bool pa_shrink(tsdn_t *tsdn, pa_shard_t *shard, edata_t *edata, size_t old_size,
+ size_t new_size, szind_t szind, bool *deferred_work_generated);
+/*
+ * Frees the given edata back to the pa. Sets *generated_dirty if we produced
+ * new dirty pages (well, we always set it for now; but this need not be the
+ * case).
+ * (We could make generated_dirty the return value of course, but this is more
+ * consistent with the shrink pathway and our error codes here).
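+ *
+ * Illustrative round trip through a shard (arguments sketched, not
+ * prescriptive; szind stands in for a real size-class index; the three
+ * bools are slab, zero, and guarded):
+ *
+ *   bool deferred;
+ *   edata_t *e = pa_alloc(tsdn, shard, PAGE, PAGE, false, szind, false,
+ *       false, &deferred);
+ *   if (e != NULL) {
+ *       pa_dalloc(tsdn, shard, e, &deferred);
+ *   }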
+ */ +void pa_dalloc(tsdn_t *tsdn, pa_shard_t *shard, edata_t *edata, + bool *deferred_work_generated); +bool pa_decay_ms_set(tsdn_t *tsdn, pa_shard_t *shard, extent_state_t state, + ssize_t decay_ms, pac_purge_eagerness_t eagerness); +ssize_t pa_decay_ms_get(pa_shard_t *shard, extent_state_t state); + +/* + * Do deferred work on this PA shard. + * + * Morally, this should do both PAC decay and the HPA deferred work. For now, + * though, the arena, background thread, and PAC modules are tightly interwoven + * in a way that's tricky to extricate, so we only do the HPA-specific parts. + */ +void pa_shard_set_deferral_allowed(tsdn_t *tsdn, pa_shard_t *shard, + bool deferral_allowed); +void pa_shard_do_deferred_work(tsdn_t *tsdn, pa_shard_t *shard); +void pa_shard_try_deferred_work(tsdn_t *tsdn, pa_shard_t *shard); +uint64_t pa_shard_time_until_deferred_work(tsdn_t *tsdn, pa_shard_t *shard); + +/******************************************************************************/ +/* + * Various bits of "boring" functionality that are still part of this module, + * but that we relegate to pa_extra.c, to keep the core logic in pa.c as + * readable as possible. + */ + +/* + * These fork phases are synchronized with the arena fork phase numbering to + * make it easy to keep straight. That's why there's no prefork1. + */ +void pa_shard_prefork0(tsdn_t *tsdn, pa_shard_t *shard); +void pa_shard_prefork2(tsdn_t *tsdn, pa_shard_t *shard); +void pa_shard_prefork3(tsdn_t *tsdn, pa_shard_t *shard); +void pa_shard_prefork4(tsdn_t *tsdn, pa_shard_t *shard); +void pa_shard_prefork5(tsdn_t *tsdn, pa_shard_t *shard); +void pa_shard_postfork_parent(tsdn_t *tsdn, pa_shard_t *shard); +void pa_shard_postfork_child(tsdn_t *tsdn, pa_shard_t *shard); + +size_t pa_shard_nactive(pa_shard_t *shard); +size_t pa_shard_ndirty(pa_shard_t *shard); +size_t pa_shard_nmuzzy(pa_shard_t *shard); + +void pa_shard_basic_stats_merge(pa_shard_t *shard, size_t *nactive, + size_t *ndirty, size_t *nmuzzy); + +void pa_shard_stats_merge(tsdn_t *tsdn, pa_shard_t *shard, + pa_shard_stats_t *pa_shard_stats_out, pac_estats_t *estats_out, + hpa_shard_stats_t *hpa_stats_out, sec_stats_t *sec_stats_out, + size_t *resident); + +/* + * Reads the PA-owned mutex stats into the output stats array, at the + * appropriate positions. Morally, these stats should really live in + * pa_shard_stats_t, but the indices are sort of baked into the various mutex + * prof macros. This would be a good thing to do at some point. + */ +void pa_shard_mtx_stats_read(tsdn_t *tsdn, pa_shard_t *shard, + mutex_prof_data_t mutex_prof_data[mutex_prof_num_arena_mutexes]); + +#endif /* JEMALLOC_INTERNAL_PA_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/pac.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/pac.h new file mode 100644 index 000000000..0b173a584 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/pac.h @@ -0,0 +1,183 @@ +#ifndef JEMALLOC_INTERNAL_PAC_H +#define JEMALLOC_INTERNAL_PAC_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/decay.h" +#include "jemalloc/internal/ecache.h" +#include "jemalloc/internal/edata_cache.h" +#include "jemalloc/internal/exp_grow.h" +#include "jemalloc/internal/lockedint.h" +#include "jemalloc/internal/pai.h" +#include "san_bump.h" + +/* + * Page allocator classic; an implementation of the PAI interface that: + * - Can be used for arenas with custom extent hooks. 
+ * - Can always satisfy any allocation request (including highly-fragmentary + * ones). + * - Can use efficient OS-level zeroing primitives for demand-filled pages. + */ + +/* How "eager" decay/purging should be. */ +enum pac_purge_eagerness_e { + PAC_PURGE_ALWAYS, + PAC_PURGE_NEVER, + PAC_PURGE_ON_EPOCH_ADVANCE +}; +typedef enum pac_purge_eagerness_e pac_purge_eagerness_t; + +typedef struct pac_decay_stats_s pac_decay_stats_t; +struct pac_decay_stats_s { + /* Total number of purge sweeps. */ + locked_u64_t npurge; + /* Total number of madvise calls made. */ + locked_u64_t nmadvise; + /* Total number of pages purged. */ + locked_u64_t purged; +}; + +typedef struct pac_estats_s pac_estats_t; +struct pac_estats_s { + /* + * Stats for a given index in the range [0, SC_NPSIZES] in the various + * ecache_ts. + * We track both bytes and # of extents: two extents in the same bucket + * may have different sizes if adjacent size classes differ by more than + * a page, so bytes cannot always be derived from # of extents. + */ + size_t ndirty; + size_t dirty_bytes; + size_t nmuzzy; + size_t muzzy_bytes; + size_t nretained; + size_t retained_bytes; +}; + +typedef struct pac_stats_s pac_stats_t; +struct pac_stats_s { + pac_decay_stats_t decay_dirty; + pac_decay_stats_t decay_muzzy; + + /* + * Number of unused virtual memory bytes currently retained. Retained + * bytes are technically mapped (though always decommitted or purged), + * but they are excluded from the mapped statistic (above). + */ + size_t retained; /* Derived. */ + + /* + * Number of bytes currently mapped, excluding retained memory (and any + * base-allocated memory, which is tracked by the arena stats). + * + * We name this "pac_mapped" to avoid confusion with the arena_stats + * "mapped". + */ + atomic_zu_t pac_mapped; + + /* VM space had to be leaked (undocumented). Normally 0. */ + atomic_zu_t abandoned_vm; +}; + +typedef struct pac_s pac_t; +struct pac_s { + /* + * Must be the first member (we convert it to a PAC given only a + * pointer). The handle to the allocation interface. + */ + pai_t pai; + /* + * Collections of extents that were previously allocated. These are + * used when allocating extents, in an attempt to re-use address space. + * + * Synchronization: internal. + */ + ecache_t ecache_dirty; + ecache_t ecache_muzzy; + ecache_t ecache_retained; + + base_t *base; + emap_t *emap; + edata_cache_t *edata_cache; + + /* The grow info for the retained ecache. */ + exp_grow_t exp_grow; + malloc_mutex_t grow_mtx; + + /* Special allocator for guarded frequently reused extents. */ + san_bump_alloc_t sba; + + /* How large extents should be before getting auto-purged. */ + atomic_zu_t oversize_threshold; + + /* + * Decay-based purging state, responsible for scheduling extent state + * transitions. + * + * Synchronization: via the internal mutex. + */ + decay_t decay_dirty; /* dirty --> muzzy */ + decay_t decay_muzzy; /* muzzy --> retained */ + + malloc_mutex_t *stats_mtx; + pac_stats_t *stats; + + /* Extent serial number generator state. 
*/
+ atomic_zu_t extent_sn_next;
+};
+
+bool pac_init(tsdn_t *tsdn, pac_t *pac, base_t *base, emap_t *emap,
+ edata_cache_t *edata_cache, nstime_t *cur_time, size_t oversize_threshold,
+ ssize_t dirty_decay_ms, ssize_t muzzy_decay_ms, pac_stats_t *pac_stats,
+ malloc_mutex_t *stats_mtx);
+
+static inline size_t
+pac_mapped(pac_t *pac) {
+ return atomic_load_zu(&pac->stats->pac_mapped, ATOMIC_RELAXED);
+}
+
+static inline ehooks_t *
+pac_ehooks_get(pac_t *pac) {
+ return base_ehooks_get(pac->base);
+}
+
+/*
+ * All purging functions require holding decay->mtx. This is one of the few
+ * places external modules are allowed to peek inside pa_shard_t internals.
+ */
+
+/*
+ * Decays the number of pages currently in the ecache. This might not leave the
+ * ecache empty if other threads are inserting dirty objects into it
+ * concurrently with the call.
+ */
+void pac_decay_all(tsdn_t *tsdn, pac_t *pac, decay_t *decay,
+ pac_decay_stats_t *decay_stats, ecache_t *ecache, bool fully_decay);
+/*
+ * Updates decay settings for the current time, and conditionally purges in
+ * response (depending on the eagerness setting). Returns whether or not the
+ * epoch advanced.
+ */
+bool pac_maybe_decay_purge(tsdn_t *tsdn, pac_t *pac, decay_t *decay,
+ pac_decay_stats_t *decay_stats, ecache_t *ecache,
+ pac_purge_eagerness_t eagerness);
+
+/*
+ * Gets / sets the maximum amount that we'll grow an arena down the
+ * grow-retained pathways (unless forced to by an allocation request).
+ *
+ * Set new_limit to NULL if it's just a query, or old_limit to NULL if you don't
+ * care about the previous value.
+ *
+ * Returns true on error (if the new limit is not valid).
+ */
+bool pac_retain_grow_limit_get_set(tsdn_t *tsdn, pac_t *pac, size_t *old_limit,
+ size_t *new_limit);
+
+bool pac_decay_ms_set(tsdn_t *tsdn, pac_t *pac, extent_state_t state,
+ ssize_t decay_ms, pac_purge_eagerness_t eagerness);
+ssize_t pac_decay_ms_get(pac_t *pac, extent_state_t state);
+
+void pac_reset(tsdn_t *tsdn, pac_t *pac);
+void pac_destroy(tsdn_t *tsdn, pac_t *pac);
+
+#endif /* JEMALLOC_INTERNAL_PAC_H */
diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/pages.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/pages.h
new file mode 100644
index 000000000..b4e9678e9
--- /dev/null
+++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/pages.h
@@ -0,0 +1,125 @@
+#ifndef JEMALLOC_INTERNAL_PAGES_EXTERNS_H
+#define JEMALLOC_INTERNAL_PAGES_EXTERNS_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/jemalloc_internal_types.h"
+
+/* Actual operating system page size, detected during bootstrap, <= PAGE. */
+extern size_t os_page;
+
+/* Page size. LG_PAGE is determined by the configure script. */
+#ifdef PAGE_MASK
+# undef PAGE_MASK
+#endif
+#define PAGE ((size_t)(1U << LG_PAGE))
+#define PAGE_MASK ((size_t)(PAGE - 1))
+/* Return the page base address for the page containing address a. */
+#define PAGE_ADDR2BASE(a) \
+ ALIGNMENT_ADDR2BASE(a, PAGE)
+/* Return the smallest pagesize multiple that is >= s. */
+#define PAGE_CEILING(s) \
+ (((s) + PAGE_MASK) & ~PAGE_MASK)
+/* Return the largest pagesize multiple that is <= s. */
+#define PAGE_FLOOR(s) \
+ ((s) & ~PAGE_MASK)
+
+/* Huge page size. LG_HUGEPAGE is determined by the configure script.
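+ * (For example, LG_HUGEPAGE == 21 yields a 2 MiB HUGEPAGE and a
+ * HUGEPAGE_MASK of 0x1fffff.)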
*/
+#define HUGEPAGE ((size_t)(1U << LG_HUGEPAGE))
+#define HUGEPAGE_MASK ((size_t)(HUGEPAGE - 1))
+
+#if LG_HUGEPAGE != 0
+# define HUGEPAGE_PAGES (HUGEPAGE / PAGE)
+#else
+/*
+ * It's convenient to define arrays (or bitmaps) of HUGEPAGE_PAGES lengths. If
+ * we can't autodetect the hugepage size, it gets treated as 0, in which case
+ * we'll trigger a compiler error in those arrays. Avoid this case by ensuring
+ * that this value is at least 1. (We won't ever run in this degraded state;
+ * hpa_supported() returns false in this case.)
+ */
+# define HUGEPAGE_PAGES 1
+#endif
+
+/* Return the huge page base address for the huge page containing address a. */
+#define HUGEPAGE_ADDR2BASE(a) \
+ ALIGNMENT_ADDR2BASE(a, HUGEPAGE)
+/* Return the smallest huge page size multiple that is >= s. */
+#define HUGEPAGE_CEILING(s) \
+ (((s) + HUGEPAGE_MASK) & ~HUGEPAGE_MASK)
+
+/* PAGES_CAN_PURGE_LAZY is defined if lazy purging is supported. */
+#if defined(_WIN32) || defined(JEMALLOC_PURGE_MADVISE_FREE)
+# define PAGES_CAN_PURGE_LAZY
+#endif
+/*
+ * PAGES_CAN_PURGE_FORCED is defined if forced purging is supported.
+ *
+ * The only supported way to hard-purge on Windows is to decommit and then
+ * re-commit, but doing so is racy, and if re-commit fails it's a pain to
+ * propagate the "poisoned" memory state. Since we typically decommit as the
+ * next step after purging on Windows anyway, there's no point in adding such
+ * complexity.
+ */
+#if !defined(_WIN32) && ((defined(JEMALLOC_PURGE_MADVISE_DONTNEED) && \
+ defined(JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS)) || \
+ defined(JEMALLOC_MAPS_COALESCE))
+# define PAGES_CAN_PURGE_FORCED
+#endif
+
+static const bool pages_can_purge_lazy =
+#ifdef PAGES_CAN_PURGE_LAZY
+ true
+#else
+ false
+#endif
+ ;
+static const bool pages_can_purge_forced =
+#ifdef PAGES_CAN_PURGE_FORCED
+ true
+#else
+ false
+#endif
+ ;
+
+#if defined(JEMALLOC_HAVE_MADVISE_HUGE) || defined(JEMALLOC_HAVE_MEMCNTL)
+# define PAGES_CAN_HUGIFY
+#endif
+
+static const bool pages_can_hugify =
+#ifdef PAGES_CAN_HUGIFY
+ true
+#else
+ false
+#endif
+ ;
+
+typedef enum {
+ thp_mode_default = 0, /* Do not change hugepage settings. */
+ thp_mode_always = 1, /* Always set MADV_HUGEPAGE. */
+ thp_mode_never = 2, /* Always set MADV_NOHUGEPAGE. */
+
+ thp_mode_names_limit = 3, /* Used for option processing. */
+ thp_mode_not_supported = 3 /* No THP support detected. */
+} thp_mode_t;
+
+#define THP_MODE_DEFAULT thp_mode_default
+extern thp_mode_t opt_thp;
+extern thp_mode_t init_system_thp_mode; /* Initial system-wide state.
*/ +extern const char *const thp_mode_names[]; + +void *pages_map(void *addr, size_t size, size_t alignment, bool *commit); +void pages_unmap(void *addr, size_t size); +bool pages_commit(void *addr, size_t size); +bool pages_decommit(void *addr, size_t size); +bool pages_purge_lazy(void *addr, size_t size); +bool pages_purge_forced(void *addr, size_t size); +bool pages_huge(void *addr, size_t size); +bool pages_nohuge(void *addr, size_t size); +bool pages_dontdump(void *addr, size_t size); +bool pages_dodump(void *addr, size_t size); +bool pages_boot(void); +void pages_set_thp_state (void *ptr, size_t size); +void pages_mark_guards(void *head, void *tail); +void pages_unmark_guards(void *head, void *tail); + +#endif /* JEMALLOC_INTERNAL_PAGES_EXTERNS_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/pai.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/pai.h new file mode 100644 index 000000000..557d30d1e --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/pai.h @@ -0,0 +1,101 @@ +#ifndef JEMALLOC_INTERNAL_PAI_H +#define JEMALLOC_INTERNAL_PAI_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/edata.h" +#include "jemalloc/internal/tsd_types.h" + +/* An interface for page allocation. */ + +typedef struct pai_s pai_t; +struct pai_s { + /* Returns NULL on failure. */ + edata_t *(*alloc)(tsdn_t *tsdn, pai_t *self, size_t size, + size_t alignment, bool zero, bool guarded, bool frequent_reuse, + bool *deferred_work_generated); + /* + * Returns the number of extents added to the list (which may be fewer + * than requested, in case of OOM). The list should already be + * initialized. The only alignment guarantee is page-alignment, and + * the results are not necessarily zeroed. + */ + size_t (*alloc_batch)(tsdn_t *tsdn, pai_t *self, size_t size, + size_t nallocs, edata_list_active_t *results, bool frequent_reuse, + bool *deferred_work_generated); + bool (*expand)(tsdn_t *tsdn, pai_t *self, edata_t *edata, + size_t old_size, size_t new_size, bool zero, + bool *deferred_work_generated); + bool (*shrink)(tsdn_t *tsdn, pai_t *self, edata_t *edata, + size_t old_size, size_t new_size, bool *deferred_work_generated); + void (*dalloc)(tsdn_t *tsdn, pai_t *self, edata_t *edata, + bool *deferred_work_generated); + /* This function empties out list as a side-effect of being called. */ + void (*dalloc_batch)(tsdn_t *tsdn, pai_t *self, + edata_list_active_t *list, bool *deferred_work_generated); + uint64_t (*time_until_deferred_work)(tsdn_t *tsdn, pai_t *self); +}; + +/* + * These are just simple convenience functions to avoid having to reference the + * same pai_t twice on every invocation. 
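+ * (That is, one writes pai_alloc(tsdn, pai, ...) rather than
+ * pai->alloc(tsdn, pai, ...); each wrapper below simply forwards.)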
diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/pai.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/pai.h new file mode 100644 index 000000000..557d30d1e --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/pai.h @@ -0,0 +1,101 @@ +#ifndef JEMALLOC_INTERNAL_PAI_H +#define JEMALLOC_INTERNAL_PAI_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/edata.h" +#include "jemalloc/internal/tsd_types.h" + +/* An interface for page allocation. */ + +typedef struct pai_s pai_t; +struct pai_s { + /* Returns NULL on failure. */ + edata_t *(*alloc)(tsdn_t *tsdn, pai_t *self, size_t size, + size_t alignment, bool zero, bool guarded, bool frequent_reuse, + bool *deferred_work_generated); + /* + * Returns the number of extents added to the list (which may be fewer + * than requested, in case of OOM). The list should already be + * initialized. The only alignment guarantee is page-alignment, and + * the results are not necessarily zeroed. + */ + size_t (*alloc_batch)(tsdn_t *tsdn, pai_t *self, size_t size, + size_t nallocs, edata_list_active_t *results, bool frequent_reuse, + bool *deferred_work_generated); + bool (*expand)(tsdn_t *tsdn, pai_t *self, edata_t *edata, + size_t old_size, size_t new_size, bool zero, + bool *deferred_work_generated); + bool (*shrink)(tsdn_t *tsdn, pai_t *self, edata_t *edata, + size_t old_size, size_t new_size, bool *deferred_work_generated); + void (*dalloc)(tsdn_t *tsdn, pai_t *self, edata_t *edata, + bool *deferred_work_generated); + /* This function empties out the list as a side effect of being called. */ + void (*dalloc_batch)(tsdn_t *tsdn, pai_t *self, + edata_list_active_t *list, bool *deferred_work_generated); + uint64_t (*time_until_deferred_work)(tsdn_t *tsdn, pai_t *self); +}; + +/* + * These are just simple convenience functions to avoid having to reference the + * same pai_t twice on every invocation. + */ + +static inline edata_t * +pai_alloc(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment, + bool zero, bool guarded, bool frequent_reuse, + bool *deferred_work_generated) { + return self->alloc(tsdn, self, size, alignment, zero, guarded, + frequent_reuse, deferred_work_generated); +} + +static inline size_t +pai_alloc_batch(tsdn_t *tsdn, pai_t *self, size_t size, size_t nallocs, + edata_list_active_t *results, bool frequent_reuse, + bool *deferred_work_generated) { + return self->alloc_batch(tsdn, self, size, nallocs, results, + frequent_reuse, deferred_work_generated); +} + +static inline bool +pai_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size, + size_t new_size, bool zero, bool *deferred_work_generated) { + return self->expand(tsdn, self, edata, old_size, new_size, zero, + deferred_work_generated); +} + +static inline bool +pai_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size, + size_t new_size, bool *deferred_work_generated) { + return self->shrink(tsdn, self, edata, old_size, new_size, + deferred_work_generated); +} + +static inline void +pai_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata, + bool *deferred_work_generated) { + self->dalloc(tsdn, self, edata, deferred_work_generated); +} + +static inline void +pai_dalloc_batch(tsdn_t *tsdn, pai_t *self, edata_list_active_t *list, + bool *deferred_work_generated) { + self->dalloc_batch(tsdn, self, list, deferred_work_generated); +} + +static inline uint64_t +pai_time_until_deferred_work(tsdn_t *tsdn, pai_t *self) { + return self->time_until_deferred_work(tsdn, self); +} + +/* + * An implementation of batch allocation that simply calls alloc once for + * each item in the list. + */ +size_t pai_alloc_batch_default(tsdn_t *tsdn, pai_t *self, size_t size, + size_t nallocs, edata_list_active_t *results, bool frequent_reuse, + bool *deferred_work_generated); +/* Ditto, for dalloc. */ +void pai_dalloc_batch_default(tsdn_t *tsdn, pai_t *self, + edata_list_active_t *list, bool *deferred_work_generated); + +#endif /* JEMALLOC_INTERNAL_PAI_H */
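The pai_s struct is a hand-rolled vtable: concrete page allocators elsewhere in this patch (e.g. the pac, hpa, and sec modules, whose symbols appear in the namespace lists below) embed a pai_t and downcast the self argument back to their own type. A toy sketch of that shape, with all toy_* names hypothetical and the surrounding internal headers assumed:

    typedef struct toy_pai_s {
        toy_pai_t *unused_fwd; /* (illustrative only) */
        pai_t pai;      /* conventionally embedded first, so self casts back */
        size_t nallocs; /* hypothetical implementation state */
    } toy_pai_t;

    static edata_t *
    toy_alloc(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment,
        bool zero, bool guarded, bool frequent_reuse,
        bool *deferred_work_generated) {
        toy_pai_t *toy = (toy_pai_t *)self;
        toy->nallocs++;
        *deferred_work_generated = false;
        return NULL; /* stub: report allocation failure */
    }

    static void
    toy_pai_init(toy_pai_t *toy) {
        toy->nallocs = 0;
        toy->pai.alloc = toy_alloc;
        /* Batch ops can fall back to the loop-based defaults declared above. */
        toy->pai.alloc_batch = pai_alloc_batch_default;
        toy->pai.dalloc_batch = pai_dalloc_batch_default;
        /* expand/shrink/dalloc/time_until_deferred_work omitted here; a real
         * implementation must fill in every slot before use. */
    }

(For the cast in toy_alloc to be valid, the pai_t must be the first member; the extra leading field above is only there to label the sketch and would be removed in practice.)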
diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/peak.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/peak.h new file mode 100644 index 000000000..2a973cb83 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/peak.h @@ -0,0 +1,39 @@ +#ifndef JEMALLOC_INTERNAL_PEAK_H +#define JEMALLOC_INTERNAL_PEAK_H + +#include "jemalloc/internal/jemalloc_preamble.h" + +typedef struct peak_s peak_t; +struct peak_s { + /* The highest recorded peak value, after adjustment (see below). */ + uint64_t cur_max; + /* + * The difference between alloc and dalloc at the last set_zero call; + * this lets us cancel out the appropriate amount of excess. + */ + uint64_t adjustment; +}; + +#define PEAK_INITIALIZER {0, 0} + +static inline uint64_t +peak_max(peak_t *peak) { + return peak->cur_max; +} + +static inline void +peak_update(peak_t *peak, uint64_t alloc, uint64_t dalloc) { + int64_t candidate_max = (int64_t)(alloc - dalloc - peak->adjustment); + if (candidate_max > (int64_t)peak->cur_max) { + peak->cur_max = candidate_max; + } +} + +/* Resets the counter to zero; all peaks are now relative to this point. */ +static inline void +peak_set_zero(peak_t *peak, uint64_t alloc, uint64_t dalloc) { + peak->cur_max = 0; + peak->adjustment = alloc - dalloc; +} + +#endif /* JEMALLOC_INTERNAL_PEAK_H */
diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/peak_event.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/peak_event.h new file mode 100644 index 000000000..cc2a14018 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/peak_event.h @@ -0,0 +1,27 @@ +#ifndef JEMALLOC_INTERNAL_PEAK_EVENT_H +#define JEMALLOC_INTERNAL_PEAK_EVENT_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/tsd_types.h" + +/* + * While peak.h contains the simple helper struct that tracks state, this + * contains the allocator tie-ins (and knows about tsd, the event module, etc.). + */ + +/* Update the peak with current tsd state. */ +void peak_event_update(tsd_t *tsd); +/* Set current state to zero. */ +void peak_event_zero(tsd_t *tsd); +uint64_t peak_event_max(tsd_t *tsd); + +/* Manual hooks. */ +/* The activity-triggered hooks. */ +uint64_t peak_alloc_new_event_wait(tsd_t *tsd); +uint64_t peak_alloc_postponed_event_wait(tsd_t *tsd); +void peak_alloc_event_handler(tsd_t *tsd, uint64_t elapsed); +uint64_t peak_dalloc_new_event_wait(tsd_t *tsd); +uint64_t peak_dalloc_postponed_event_wait(tsd_t *tsd); +void peak_dalloc_event_handler(tsd_t *tsd, uint64_t elapsed); + +#endif /* JEMALLOC_INTERNAL_PEAK_EVENT_H */
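The adjustment field in peak_t is the subtle part: peak_set_zero() does not discard history so much as record the current live size, so later peaks are measured relative to that point. A short worked trace with illustrative counter values (in the allocator, alloc and dalloc come from cumulative tsd state, per peak_event.h):

    peak_t p = PEAK_INITIALIZER;
    peak_update(&p, 100, 40);    /* live = 60; cur_max becomes 60 */
    peak_update(&p, 150, 100);   /* live = 50; cur_max stays 60 */
    peak_set_zero(&p, 150, 100); /* cur_max = 0, adjustment = 50 */
    peak_update(&p, 180, 100);   /* live = 80, minus adjustment = 30; cur_max = 30 */
    /* peak_max(&p) now reports 30: the growth since the set_zero call. */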
diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/ph.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/ph.h new file mode 100644 index 000000000..ef9634be7 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/ph.h @@ -0,0 +1,496 @@ +#ifndef JEMALLOC_INTERNAL_PH_H +#define JEMALLOC_INTERNAL_PH_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/bit_util.h" + +/* + * A Pairing Heap implementation. + * + * "The Pairing Heap: A New Form of Self-Adjusting Heap" + * https://www.cs.cmu.edu/~sleator/papers/pairing-heaps.pdf + * + * With an auxiliary two-pass list, described in a follow-on paper. + * + * "Pairing Heaps: Experiments and Analysis" + * http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.106.2988&rep=rep1&type=pdf + * + ******************************************************************************* + * + * We include a non-obvious optimization: + * - First, we introduce a new pop-and-link operation; pop the two most + * recently-inserted items off the aux-list, link them, and push the resulting + * heap. + * - We maintain a count of the number of insertions since the last time we + * merged the aux-list (i.e. via first() or remove_first()). After N inserts, + * we do ffs(N) pop-and-link operations. + * + * One way to think of this is that we're progressively building up a tree in + * the aux-list, rather than a linked-list (think of the series of merges that + * will be performed as the aux-count grows). + * + * There are a couple of reasons we benefit from this: + * - Ordinarily, after N insertions, the aux-list is of size N. With our + * strategy, it's of size O(log(N)). So we decrease the worst-case time of + * first() calls, and reduce the average cost of remove_min calls. Since + * these almost always occur while holding a lock, we practically reduce the + * frequency of unusually long hold times. + * - This moves the bulk of the work of merging the aux-list onto the threads + * that are inserting into the heap. In some common scenarios, insertions + * happen in bulk, from a single thread (think tcache flushing; we potentially + * move many slabs from slabs_full to slabs_nonfull). All the nodes in this + * case are in the inserting thread's cache, and linking them is very cheap + * (cache misses dominate linking cost). Without this optimization, linking + * happens on the next call to remove_first. Since that remove_first call + * likely happens on a different thread (or at least, after the cache has + * gotten cold if done on the same thread), deferring linking trades cheap + * link operations now for expensive ones later. + * + * The ffs trick keeps amortized insert cost at constant time. Similar + * strategies based on periodically sorting the list after a batch of operations + * perform worse than this in practice, even with various fancy tricks; they + * all take the amortized complexity of an insert from O(1) to O(log(n)). + */ + +typedef int (*ph_cmp_t)(void *, void *); + +/* Node structure. */ +typedef struct phn_link_s phn_link_t; +struct phn_link_s { + void *prev; + void *next; + void *lchild; +}; + +typedef struct ph_s ph_t; +struct ph_s { + void *root; + /* + * Inserts done since the last aux-list merge. This is not necessarily + * the size of the aux-list, since it's possible that removals have + * happened since, and we don't track whether or not those removals are + * from the aux list. + */ + size_t auxcount; +}; + +JEMALLOC_ALWAYS_INLINE phn_link_t * +phn_link_get(void *phn, size_t offset) { + return (phn_link_t *)(((char *)phn) + offset); +} + +JEMALLOC_ALWAYS_INLINE void +phn_link_init(void *phn, size_t offset) { + phn_link_get(phn, offset)->prev = NULL; + phn_link_get(phn, offset)->next = NULL; + phn_link_get(phn, offset)->lchild = NULL; +}
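Everything below operates on a void pointer plus a caller-supplied byte offset, so this single implementation can serve any node type that embeds a phn_link_t. A minimal illustration with a hypothetical node type:

    #include <stddef.h>

    struct my_node {
        int key;
        phn_link_t ph_link; /* embedded link; no separate allocation needed */
    };

    /* What phn_link_get(n, offsetof(struct my_node, ph_link)) computes: */
    static inline phn_link_t *
    my_node_link(void *n) {
        return (phn_link_t *)((char *)n + offsetof(struct my_node, ph_link));
    }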
+ +/* Internal utility helpers. */ +JEMALLOC_ALWAYS_INLINE void * +phn_lchild_get(void *phn, size_t offset) { + return phn_link_get(phn, offset)->lchild; +} + +JEMALLOC_ALWAYS_INLINE void +phn_lchild_set(void *phn, void *lchild, size_t offset) { + phn_link_get(phn, offset)->lchild = lchild; +} + +JEMALLOC_ALWAYS_INLINE void * +phn_next_get(void *phn, size_t offset) { + return phn_link_get(phn, offset)->next; +} + +JEMALLOC_ALWAYS_INLINE void +phn_next_set(void *phn, void *next, size_t offset) { + phn_link_get(phn, offset)->next = next; +} + +JEMALLOC_ALWAYS_INLINE void * +phn_prev_get(void *phn, size_t offset) { + return phn_link_get(phn, offset)->prev; +} + +JEMALLOC_ALWAYS_INLINE void +phn_prev_set(void *phn, void *prev, size_t offset) { + phn_link_get(phn, offset)->prev = prev; +} + +JEMALLOC_ALWAYS_INLINE void +phn_merge_ordered(void *phn0, void *phn1, size_t offset, + ph_cmp_t cmp) { + void *phn0child; + + assert(phn0 != NULL); + assert(phn1 != NULL); + assert(cmp(phn0, phn1) <= 0); + + phn_prev_set(phn1, phn0, offset); + phn0child = phn_lchild_get(phn0, offset); + phn_next_set(phn1, phn0child, offset); + if (phn0child != NULL) { + /* NOLINTNEXTLINE(readability-suspicious-call-argument) */ + phn_prev_set(phn0child, phn1, offset); + } + phn_lchild_set(phn0, phn1, offset); +} + +JEMALLOC_ALWAYS_INLINE void * +phn_merge(void *phn0, void *phn1, size_t offset, ph_cmp_t cmp) { + void *result; + if (phn0 == NULL) { + result = phn1; + } else if (phn1 == NULL) { + result = phn0; + } else if (cmp(phn0, phn1) < 0) { + phn_merge_ordered(phn0, phn1, offset, cmp); + result = phn0; + } else { + /* NOLINTNEXTLINE(readability-suspicious-call-argument) */ + phn_merge_ordered(phn1, phn0, offset, cmp); + result = phn1; + } + return result; +} + +JEMALLOC_ALWAYS_INLINE void * +phn_merge_siblings(void *phn, size_t offset, ph_cmp_t cmp) { + void *head = NULL; + void *tail = NULL; + void *phn0 = phn; + void *phn1 = phn_next_get(phn0, offset); + + if (phn1 == NULL) { + return phn0; + } + + /* + * Multipass merge, wherein the first two elements of a FIFO + * are repeatedly merged, and each result is appended to the + * singly linked FIFO, until the FIFO contains only a single + * element. We start with a sibling list but no reference to + * its tail, so we do a single pass over the sibling list to + * populate the FIFO.
+ */ + void *phnrest = phn_next_get(phn1, offset); + if (phnrest != NULL) { + phn_prev_set(phnrest, NULL, offset); + } + phn_prev_set(phn0, NULL, offset); + phn_next_set(phn0, NULL, offset); + phn_prev_set(phn1, NULL, offset); + phn_next_set(phn1, NULL, offset); + phn0 = phn_merge(phn0, phn1, offset, cmp); + head = tail = phn0; + phn0 = phnrest; + while (phn0 != NULL) { + phn1 = phn_next_get(phn0, offset); + if (phn1 != NULL) { + phnrest = phn_next_get(phn1, offset); + if (phnrest != NULL) { + phn_prev_set(phnrest, NULL, offset); + } + phn_prev_set(phn0, NULL, offset); + phn_next_set(phn0, NULL, offset); + phn_prev_set(phn1, NULL, offset); + phn_next_set(phn1, NULL, offset); + phn0 = phn_merge(phn0, phn1, offset, cmp); + /* NOLINTNEXTLINE(readability-suspicious-call-argument) */ + phn_next_set(tail, phn0, offset); + tail = phn0; + phn0 = phnrest; + } else { + /* NOLINTNEXTLINE(readability-suspicious-call-argument) */ + phn_next_set(tail, phn0, offset); + tail = phn0; + phn0 = NULL; + } + } + phn0 = head; + phn1 = phn_next_get(phn0, offset); + if (phn1 != NULL) { + while (true) { + head = phn_next_get(phn1, offset); + assert(phn_prev_get(phn0, offset) == NULL); + phn_next_set(phn0, NULL, offset); + assert(phn_prev_get(phn1, offset) == NULL); + phn_next_set(phn1, NULL, offset); + phn0 = phn_merge(phn0, phn1, offset, cmp); + if (head == NULL) { + break; + } + /* NOLINTNEXTLINE(readability-suspicious-call-argument) */ + phn_next_set(tail, phn0, offset); + tail = phn0; + phn0 = head; + phn1 = phn_next_get(phn0, offset); + } + } + + return phn0; +} + +JEMALLOC_ALWAYS_INLINE void +ph_merge_aux(ph_t *ph, size_t offset, ph_cmp_t cmp) { + ph->auxcount = 0; + void *phn = phn_next_get(ph->root, offset); + if (phn != NULL) { + phn_prev_set(ph->root, NULL, offset); + phn_next_set(ph->root, NULL, offset); + phn_prev_set(phn, NULL, offset); + phn = phn_merge_siblings(phn, offset, cmp); + assert(phn_next_get(phn, offset) == NULL); + phn_merge_ordered(ph->root, phn, offset, cmp); + } +} + +JEMALLOC_ALWAYS_INLINE void * +ph_merge_children(void *phn, size_t offset, ph_cmp_t cmp) { + void *result; + void *lchild = phn_lchild_get(phn, offset); + if (lchild == NULL) { + result = NULL; + } else { + result = phn_merge_siblings(lchild, offset, cmp); + } + return result; +} + +JEMALLOC_ALWAYS_INLINE void +ph_new(ph_t *ph) { + ph->root = NULL; + ph->auxcount = 0; +} + +JEMALLOC_ALWAYS_INLINE bool +ph_empty(ph_t *ph) { + return ph->root == NULL; +} + +JEMALLOC_ALWAYS_INLINE void * +ph_first(ph_t *ph, size_t offset, ph_cmp_t cmp) { + if (ph->root == NULL) { + return NULL; + } + ph_merge_aux(ph, offset, cmp); + return ph->root; +} + +JEMALLOC_ALWAYS_INLINE void * +ph_any(ph_t *ph, size_t offset) { + if (ph->root == NULL) { + return NULL; + } + void *aux = phn_next_get(ph->root, offset); + if (aux != NULL) { + return aux; + } + return ph->root; +} + +/* Returns true if we should stop trying to merge. 
*/ +JEMALLOC_ALWAYS_INLINE bool +ph_try_aux_merge_pair(ph_t *ph, size_t offset, ph_cmp_t cmp) { + assert(ph->root != NULL); + void *phn0 = phn_next_get(ph->root, offset); + if (phn0 == NULL) { + return true; + } + void *phn1 = phn_next_get(phn0, offset); + if (phn1 == NULL) { + return true; + } + void *next_phn1 = phn_next_get(phn1, offset); + phn_next_set(phn0, NULL, offset); + phn_prev_set(phn0, NULL, offset); + phn_next_set(phn1, NULL, offset); + phn_prev_set(phn1, NULL, offset); + phn0 = phn_merge(phn0, phn1, offset, cmp); + phn_next_set(phn0, next_phn1, offset); + if (next_phn1 != NULL) { + /* NOLINTNEXTLINE(readability-suspicious-call-argument) */ + phn_prev_set(next_phn1, phn0, offset); + } + phn_next_set(ph->root, phn0, offset); + phn_prev_set(phn0, ph->root, offset); + return next_phn1 == NULL; +} + +JEMALLOC_ALWAYS_INLINE void +ph_insert(ph_t *ph, void *phn, size_t offset, ph_cmp_t cmp) { + phn_link_init(phn, offset); + + /* + * Treat the root as an aux list during insertion, and lazily merge + * during a_prefix##remove_first(). For elements that are inserted, + * then removed via a_prefix##remove() before the aux list is ever + * processed, this makes insert/remove constant-time, whereas eager + * merging would make insert O(log n). + */ + if (ph->root == NULL) { + ph->root = phn; + return; + } + + /* + * As a special case, check to see if we can replace the root. + * This is common in practice in some important cases, and lets + * us defer some insertions (hopefully, until the point where + * some of the items in the aux list have been removed, saving + * us from linking them at all). + */ + if (cmp(phn, ph->root) < 0) { + phn_lchild_set(phn, ph->root, offset); + phn_prev_set(ph->root, phn, offset); + ph->root = phn; + ph->auxcount = 0; + return; + } + + phn_next_set(phn, phn_next_get(ph->root, offset), offset); + if (phn_next_get(ph->root, offset) != NULL) { + phn_prev_set(phn_next_get(ph->root, offset), phn, + offset); + } + phn_prev_set(phn, ph->root, offset); + phn_next_set(ph->root, phn, offset); + + ph->auxcount++; + unsigned nmerges = ffs_zu(ph->auxcount); + bool done = false; + for (unsigned i = 0; i < nmerges && !done; i++) { + done = ph_try_aux_merge_pair(ph, offset, cmp); + } +} + +JEMALLOC_ALWAYS_INLINE void * +ph_remove_first(ph_t *ph, size_t offset, ph_cmp_t cmp) { + void *ret; + + if (ph->root == NULL) { + return NULL; + } + ph_merge_aux(ph, offset, cmp); + ret = ph->root; + ph->root = ph_merge_children(ph->root, offset, cmp); + + return ret; +} + +JEMALLOC_ALWAYS_INLINE void +ph_remove(ph_t *ph, void *phn, size_t offset, ph_cmp_t cmp) { + if (ph->root == phn) { + ph_merge_aux(ph, offset, cmp); + ph->root = ph_merge_children(phn, offset, cmp); + return; + } + + void *prev = phn_prev_get(phn, offset); + void *next = phn_next_get(phn, offset); + + /* If we have children, then we integrate them back in the heap.
*/ + void *replace = ph_merge_children(phn, offset, cmp); + if (replace != NULL) { + phn_next_set(replace, next, offset); + if (next != NULL) { + phn_prev_set(next, replace, offset); + } + + next = replace; + } + + if (next != NULL) { + phn_prev_set(next, prev, offset); + } + + assert(prev != NULL); + if (phn_lchild_get(prev, offset) == phn) { + phn_lchild_set(prev, next, offset); + } else { + phn_next_set(prev, next, offset); + } +} + +#define ph_structs(a_prefix, a_type) \ +typedef struct { \ + phn_link_t link; \ +} a_prefix##_link_t; \ + \ +typedef struct { \ + ph_t ph; \ +} a_prefix##_t; + +/* + * The ph_proto() macro generates function prototypes that correspond to the + * functions generated by an equivalently parameterized call to ph_gen(). + */ +#define ph_proto(a_attr, a_prefix, a_type) \ + \ +a_attr void a_prefix##_new(a_prefix##_t *ph); \ +a_attr bool a_prefix##_empty(a_prefix##_t *ph); \ +a_attr a_type *a_prefix##_first(a_prefix##_t *ph); \ +a_attr a_type *a_prefix##_any(a_prefix##_t *ph); \ +a_attr void a_prefix##_insert(a_prefix##_t *ph, a_type *phn); \ +a_attr a_type *a_prefix##_remove_first(a_prefix##_t *ph); \ +a_attr void a_prefix##_remove(a_prefix##_t *ph, a_type *phn); \ +a_attr a_type *a_prefix##_remove_any(a_prefix##_t *ph); + +/* The ph_gen() macro generates a type-specific pairing heap implementation. */ +#define ph_gen(a_attr, a_prefix, a_type, a_field, a_cmp) \ +JEMALLOC_ALWAYS_INLINE int \ +a_prefix##_ph_cmp(void *a, void *b) { \ + return a_cmp((a_type *)a, (a_type *)b); \ +} \ + \ +a_attr void \ +a_prefix##_new(a_prefix##_t *ph) { \ + ph_new(&ph->ph); \ +} \ + \ +a_attr bool \ +a_prefix##_empty(a_prefix##_t *ph) { \ + return ph_empty(&ph->ph); \ +} \ + \ +a_attr a_type * \ +a_prefix##_first(a_prefix##_t *ph) { \ + return ph_first(&ph->ph, offsetof(a_type, a_field), \ + &a_prefix##_ph_cmp); \ +} \ + \ +a_attr a_type * \ +a_prefix##_any(a_prefix##_t *ph) { \ + return ph_any(&ph->ph, offsetof(a_type, a_field)); \ +} \ + \ +a_attr void \ +a_prefix##_insert(a_prefix##_t *ph, a_type *phn) { \ + ph_insert(&ph->ph, phn, offsetof(a_type, a_field), \ + a_prefix##_ph_cmp); \ +} \ + \ +a_attr a_type * \ +a_prefix##_remove_first(a_prefix##_t *ph) { \ + return ph_remove_first(&ph->ph, offsetof(a_type, a_field), \ + a_prefix##_ph_cmp); \ +} \ + \ +a_attr void \ +a_prefix##_remove(a_prefix##_t *ph, a_type *phn) { \ + ph_remove(&ph->ph, phn, offsetof(a_type, a_field), \ + a_prefix##_ph_cmp); \ +} \ + \ +a_attr a_type * \ +a_prefix##_remove_any(a_prefix##_t *ph) { \ + a_type *ret = a_prefix##_any(ph); \ + if (ret != NULL) { \ + a_prefix##_remove(ph, ret); \ + } \ + return ret; \ +} + +#endif /* JEMALLOC_INTERNAL_PH_H */
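To make the macro layer above concrete, here is a minimal sketch of instantiating a typed heap; node_t, its key field, and the comparator are hypothetical, and the jemalloc internal include path is assumed:

    #include <stdint.h>
    #include "jemalloc/internal/ph.h"

    typedef struct node_s node_t;
    ph_structs(node_heap, node_t); /* defines node_heap_link_t and node_heap_t */

    struct node_s {
        uint64_t key;
        node_heap_link_t link; /* intrusive link lives inside the node */
    };

    /* Comparator following the ph_cmp_t contract: <0, 0, or >0. */
    static int
    node_cmp(node_t *a, node_t *b) {
        return (a->key > b->key) - (a->key < b->key);
    }

    /* Generates node_heap_new/_insert/_first/_remove_first/..., each passing
     * offsetof(node_t, link) down to the void-based core. */
    ph_gen(static, node_heap, node_t, link, node_cmp)

jemalloc instantiates this same template for its own types (for example edata_heap and hpdata_age_heap, whose generated symbols appear in the private_namespace lists below).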
diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/private_namespace.gen.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/private_namespace.gen.h new file mode 100644 index 000000000..c68f27fd2 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/private_namespace.gen.h @@ -0,0 +1,726 @@ +#define a0dalloc JEMALLOC_N(a0dalloc) +#define a0malloc JEMALLOC_N(a0malloc) +#define arena_choose_hard JEMALLOC_N(arena_choose_hard) +#define arena_cleanup JEMALLOC_N(arena_cleanup) +#define arena_init JEMALLOC_N(arena_init) +#define arena_migrate JEMALLOC_N(arena_migrate) +#define arena_set JEMALLOC_N(arena_set) +#define arenas JEMALLOC_N(arenas) +#define arenas_lock JEMALLOC_N(arenas_lock) +#define batch_alloc JEMALLOC_N(batch_alloc) +#define bootstrap_calloc JEMALLOC_N(bootstrap_calloc) +#define bootstrap_free JEMALLOC_N(bootstrap_free) +#define bootstrap_malloc JEMALLOC_N(bootstrap_malloc) +#define free_default JEMALLOC_N(free_default) +#define iarena_cleanup JEMALLOC_N(iarena_cleanup) +#define je_sdallocx_noflags JEMALLOC_N(je_sdallocx_noflags) +#define jemalloc_postfork_child JEMALLOC_N(jemalloc_postfork_child) +#define jemalloc_postfork_parent JEMALLOC_N(jemalloc_postfork_parent) +#define jemalloc_prefork JEMALLOC_N(jemalloc_prefork) +#define junk_alloc_callback JEMALLOC_N(junk_alloc_callback) +#define junk_free_callback JEMALLOC_N(junk_free_callback) +#define malloc_default JEMALLOC_N(malloc_default) +#define malloc_init_state JEMALLOC_N(malloc_init_state) +#define malloc_slow JEMALLOC_N(malloc_slow) +#define manual_arena_base JEMALLOC_N(manual_arena_base) +#define narenas_auto JEMALLOC_N(narenas_auto) +#define narenas_total_get JEMALLOC_N(narenas_total_get) +#define ncpus JEMALLOC_N(ncpus) +#define opt_abort JEMALLOC_N(opt_abort) +#define opt_abort_conf JEMALLOC_N(opt_abort_conf) +#define opt_cache_oblivious JEMALLOC_N(opt_cache_oblivious) +#define opt_confirm_conf JEMALLOC_N(opt_confirm_conf) +#define opt_experimental_infallible_new JEMALLOC_N(opt_experimental_infallible_new) +#define opt_hpa JEMALLOC_N(opt_hpa) +#define opt_hpa_opts JEMALLOC_N(opt_hpa_opts) +#define opt_hpa_sec_opts JEMALLOC_N(opt_hpa_sec_opts) +#define opt_junk JEMALLOC_N(opt_junk) +#define opt_junk_alloc JEMALLOC_N(opt_junk_alloc) +#define opt_junk_free JEMALLOC_N(opt_junk_free) +#define opt_narenas JEMALLOC_N(opt_narenas) +#define opt_narenas_ratio JEMALLOC_N(opt_narenas_ratio) +#define opt_trust_madvise JEMALLOC_N(opt_trust_madvise) +#define opt_utrace JEMALLOC_N(opt_utrace) +#define opt_xmalloc JEMALLOC_N(opt_xmalloc) +#define opt_zero JEMALLOC_N(opt_zero) +#define opt_zero_realloc_action JEMALLOC_N(opt_zero_realloc_action) +#define sdallocx_default JEMALLOC_N(sdallocx_default) +#define zero_realloc_count JEMALLOC_N(zero_realloc_count) +#define zero_realloc_mode_names JEMALLOC_N(zero_realloc_mode_names) +#define arena_basic_stats_merge JEMALLOC_N(arena_basic_stats_merge) +#define arena_bin_choose JEMALLOC_N(arena_bin_choose) +#define arena_bin_offsets JEMALLOC_N(arena_bin_offsets) +#define arena_binind_div_info JEMALLOC_N(arena_binind_div_info) +#define arena_boot JEMALLOC_N(arena_boot) +#define arena_cache_bin_fill_small JEMALLOC_N(arena_cache_bin_fill_small) +#define arena_choose_huge JEMALLOC_N(arena_choose_huge) +#define arena_config_default JEMALLOC_N(arena_config_default) +#define arena_dalloc_bin_locked_handle_newly_empty JEMALLOC_N(arena_dalloc_bin_locked_handle_newly_empty) +#define arena_dalloc_bin_locked_handle_newly_nonempty JEMALLOC_N(arena_dalloc_bin_locked_handle_newly_nonempty) +#define arena_dalloc_promoted JEMALLOC_N(arena_dalloc_promoted) +#define arena_dalloc_small JEMALLOC_N(arena_dalloc_small) +#define arena_decay JEMALLOC_N(arena_decay) +#define arena_decay_ms_get JEMALLOC_N(arena_decay_ms_get) +#define arena_decay_ms_set JEMALLOC_N(arena_decay_ms_set) +#define arena_destroy JEMALLOC_N(arena_destroy) +#define arena_dirty_decay_ms_default_get JEMALLOC_N(arena_dirty_decay_ms_default_get) +#define arena_dirty_decay_ms_default_set JEMALLOC_N(arena_dirty_decay_ms_default_set) +#define arena_do_deferred_work JEMALLOC_N(arena_do_deferred_work) +#define arena_dss_prec_get JEMALLOC_N(arena_dss_prec_get) +#define arena_dss_prec_set JEMALLOC_N(arena_dss_prec_set) +#define arena_emap_global JEMALLOC_N(arena_emap_global) +#define arena_extent_alloc_large JEMALLOC_N(arena_extent_alloc_large)
+#define arena_extent_dalloc_large_prep JEMALLOC_N(arena_extent_dalloc_large_prep) +#define arena_extent_ralloc_large_expand JEMALLOC_N(arena_extent_ralloc_large_expand) +#define arena_extent_ralloc_large_shrink JEMALLOC_N(arena_extent_ralloc_large_shrink) +#define arena_fill_small_fresh JEMALLOC_N(arena_fill_small_fresh) +#define arena_get_ehooks JEMALLOC_N(arena_get_ehooks) +#define arena_handle_deferred_work JEMALLOC_N(arena_handle_deferred_work) +#define arena_init_huge JEMALLOC_N(arena_init_huge) +#define arena_is_huge JEMALLOC_N(arena_is_huge) +#define arena_malloc_hard JEMALLOC_N(arena_malloc_hard) +#define arena_muzzy_decay_ms_default_get JEMALLOC_N(arena_muzzy_decay_ms_default_get) +#define arena_muzzy_decay_ms_default_set JEMALLOC_N(arena_muzzy_decay_ms_default_set) +#define arena_new JEMALLOC_N(arena_new) +#define arena_nthreads_dec JEMALLOC_N(arena_nthreads_dec) +#define arena_nthreads_get JEMALLOC_N(arena_nthreads_get) +#define arena_nthreads_inc JEMALLOC_N(arena_nthreads_inc) +#define arena_pa_central_global JEMALLOC_N(arena_pa_central_global) +#define arena_palloc JEMALLOC_N(arena_palloc) +#define arena_postfork_child JEMALLOC_N(arena_postfork_child) +#define arena_postfork_parent JEMALLOC_N(arena_postfork_parent) +#define arena_prefork0 JEMALLOC_N(arena_prefork0) +#define arena_prefork1 JEMALLOC_N(arena_prefork1) +#define arena_prefork2 JEMALLOC_N(arena_prefork2) +#define arena_prefork3 JEMALLOC_N(arena_prefork3) +#define arena_prefork4 JEMALLOC_N(arena_prefork4) +#define arena_prefork5 JEMALLOC_N(arena_prefork5) +#define arena_prefork6 JEMALLOC_N(arena_prefork6) +#define arena_prefork7 JEMALLOC_N(arena_prefork7) +#define arena_prefork8 JEMALLOC_N(arena_prefork8) +#define arena_prof_promote JEMALLOC_N(arena_prof_promote) +#define arena_ralloc JEMALLOC_N(arena_ralloc) +#define arena_ralloc_no_move JEMALLOC_N(arena_ralloc_no_move) +#define arena_reset JEMALLOC_N(arena_reset) +#define arena_retain_grow_limit_get_set JEMALLOC_N(arena_retain_grow_limit_get_set) +#define arena_set_extent_hooks JEMALLOC_N(arena_set_extent_hooks) +#define arena_slab_dalloc JEMALLOC_N(arena_slab_dalloc) +#define arena_stats_merge JEMALLOC_N(arena_stats_merge) +#define opt_dirty_decay_ms JEMALLOC_N(opt_dirty_decay_ms) +#define opt_muzzy_decay_ms JEMALLOC_N(opt_muzzy_decay_ms) +#define opt_oversize_threshold JEMALLOC_N(opt_oversize_threshold) +#define opt_percpu_arena JEMALLOC_N(opt_percpu_arena) +#define oversize_threshold JEMALLOC_N(oversize_threshold) +#define percpu_arena_mode_names JEMALLOC_N(percpu_arena_mode_names) +#define background_thread_boot0 JEMALLOC_N(background_thread_boot0) +#define background_thread_boot1 JEMALLOC_N(background_thread_boot1) +#define background_thread_create JEMALLOC_N(background_thread_create) +#define background_thread_ctl_init JEMALLOC_N(background_thread_ctl_init) +#define background_thread_enabled_state JEMALLOC_N(background_thread_enabled_state) +#define background_thread_info JEMALLOC_N(background_thread_info) +#define background_thread_is_started JEMALLOC_N(background_thread_is_started) +#define background_thread_lock JEMALLOC_N(background_thread_lock) +#define background_thread_postfork_child JEMALLOC_N(background_thread_postfork_child) +#define background_thread_postfork_parent JEMALLOC_N(background_thread_postfork_parent) +#define background_thread_prefork0 JEMALLOC_N(background_thread_prefork0) +#define background_thread_prefork1 JEMALLOC_N(background_thread_prefork1) +#define background_thread_stats_read JEMALLOC_N(background_thread_stats_read) +#define 
background_thread_wakeup_early JEMALLOC_N(background_thread_wakeup_early) +#define background_threads_disable JEMALLOC_N(background_threads_disable) +#define background_threads_enable JEMALLOC_N(background_threads_enable) +#define max_background_threads JEMALLOC_N(max_background_threads) +#define n_background_threads JEMALLOC_N(n_background_threads) +#define opt_background_thread JEMALLOC_N(opt_background_thread) +#define opt_max_background_threads JEMALLOC_N(opt_max_background_threads) +#define b0get JEMALLOC_N(b0get) +#define base_alloc JEMALLOC_N(base_alloc) +#define base_alloc_edata JEMALLOC_N(base_alloc_edata) +#define base_boot JEMALLOC_N(base_boot) +#define base_delete JEMALLOC_N(base_delete) +#define base_ehooks_get JEMALLOC_N(base_ehooks_get) +#define base_ehooks_get_for_metadata JEMALLOC_N(base_ehooks_get_for_metadata) +#define base_extent_hooks_set JEMALLOC_N(base_extent_hooks_set) +#define base_new JEMALLOC_N(base_new) +#define base_postfork_child JEMALLOC_N(base_postfork_child) +#define base_postfork_parent JEMALLOC_N(base_postfork_parent) +#define base_prefork JEMALLOC_N(base_prefork) +#define base_stats_get JEMALLOC_N(base_stats_get) +#define metadata_thp_mode_names JEMALLOC_N(metadata_thp_mode_names) +#define opt_metadata_thp JEMALLOC_N(opt_metadata_thp) +#define bin_init JEMALLOC_N(bin_init) +#define bin_postfork_child JEMALLOC_N(bin_postfork_child) +#define bin_postfork_parent JEMALLOC_N(bin_postfork_parent) +#define bin_prefork JEMALLOC_N(bin_prefork) +#define bin_shard_sizes_boot JEMALLOC_N(bin_shard_sizes_boot) +#define bin_update_shard_size JEMALLOC_N(bin_update_shard_size) +#define bin_info_boot JEMALLOC_N(bin_info_boot) +#define bin_infos JEMALLOC_N(bin_infos) +#define bitmap_info_init JEMALLOC_N(bitmap_info_init) +#define bitmap_init JEMALLOC_N(bitmap_init) +#define bitmap_size JEMALLOC_N(bitmap_size) +#define buf_writer_cb JEMALLOC_N(buf_writer_cb) +#define buf_writer_flush JEMALLOC_N(buf_writer_flush) +#define buf_writer_init JEMALLOC_N(buf_writer_init) +#define buf_writer_pipe JEMALLOC_N(buf_writer_pipe) +#define buf_writer_terminate JEMALLOC_N(buf_writer_terminate) +#define cache_bin_info_compute_alloc JEMALLOC_N(cache_bin_info_compute_alloc) +#define cache_bin_info_init JEMALLOC_N(cache_bin_info_init) +#define cache_bin_init JEMALLOC_N(cache_bin_init) +#define cache_bin_postincrement JEMALLOC_N(cache_bin_postincrement) +#define cache_bin_preincrement JEMALLOC_N(cache_bin_preincrement) +#define cache_bin_still_zero_initialized JEMALLOC_N(cache_bin_still_zero_initialized) +#define ckh_count JEMALLOC_N(ckh_count) +#define ckh_delete JEMALLOC_N(ckh_delete) +#define ckh_insert JEMALLOC_N(ckh_insert) +#define ckh_iter JEMALLOC_N(ckh_iter) +#define ckh_new JEMALLOC_N(ckh_new) +#define ckh_pointer_hash JEMALLOC_N(ckh_pointer_hash) +#define ckh_pointer_keycomp JEMALLOC_N(ckh_pointer_keycomp) +#define ckh_remove JEMALLOC_N(ckh_remove) +#define ckh_search JEMALLOC_N(ckh_search) +#define ckh_string_hash JEMALLOC_N(ckh_string_hash) +#define ckh_string_keycomp JEMALLOC_N(ckh_string_keycomp) +#define counter_accum_init JEMALLOC_N(counter_accum_init) +#define counter_postfork_child JEMALLOC_N(counter_postfork_child) +#define counter_postfork_parent JEMALLOC_N(counter_postfork_parent) +#define counter_prefork JEMALLOC_N(counter_prefork) +#define ctl_boot JEMALLOC_N(ctl_boot) +#define ctl_bymib JEMALLOC_N(ctl_bymib) +#define ctl_bymibname JEMALLOC_N(ctl_bymibname) +#define ctl_byname JEMALLOC_N(ctl_byname) +#define ctl_mibnametomib JEMALLOC_N(ctl_mibnametomib) +#define 
ctl_mtx_assert_held JEMALLOC_N(ctl_mtx_assert_held) +#define ctl_nametomib JEMALLOC_N(ctl_nametomib) +#define ctl_postfork_child JEMALLOC_N(ctl_postfork_child) +#define ctl_postfork_parent JEMALLOC_N(ctl_postfork_parent) +#define ctl_prefork JEMALLOC_N(ctl_prefork) +#define decay_deadline_init JEMALLOC_N(decay_deadline_init) +#define decay_init JEMALLOC_N(decay_init) +#define decay_maybe_advance_epoch JEMALLOC_N(decay_maybe_advance_epoch) +#define decay_ms_valid JEMALLOC_N(decay_ms_valid) +#define decay_npages_purge_in JEMALLOC_N(decay_npages_purge_in) +#define decay_ns_until_purge JEMALLOC_N(decay_ns_until_purge) +#define decay_reinit JEMALLOC_N(decay_reinit) +#define div_init JEMALLOC_N(div_init) +#define ecache_init JEMALLOC_N(ecache_init) +#define ecache_postfork_child JEMALLOC_N(ecache_postfork_child) +#define ecache_postfork_parent JEMALLOC_N(ecache_postfork_parent) +#define ecache_prefork JEMALLOC_N(ecache_prefork) +#define edata_avail_any JEMALLOC_N(edata_avail_any) +#define edata_avail_empty JEMALLOC_N(edata_avail_empty) +#define edata_avail_first JEMALLOC_N(edata_avail_first) +#define edata_avail_insert JEMALLOC_N(edata_avail_insert) +#define edata_avail_new JEMALLOC_N(edata_avail_new) +#define edata_avail_remove JEMALLOC_N(edata_avail_remove) +#define edata_avail_remove_any JEMALLOC_N(edata_avail_remove_any) +#define edata_avail_remove_first JEMALLOC_N(edata_avail_remove_first) +#define edata_heap_any JEMALLOC_N(edata_heap_any) +#define edata_heap_empty JEMALLOC_N(edata_heap_empty) +#define edata_heap_first JEMALLOC_N(edata_heap_first) +#define edata_heap_insert JEMALLOC_N(edata_heap_insert) +#define edata_heap_new JEMALLOC_N(edata_heap_new) +#define edata_heap_remove JEMALLOC_N(edata_heap_remove) +#define edata_heap_remove_any JEMALLOC_N(edata_heap_remove_any) +#define edata_heap_remove_first JEMALLOC_N(edata_heap_remove_first) +#define edata_cache_fast_disable JEMALLOC_N(edata_cache_fast_disable) +#define edata_cache_fast_get JEMALLOC_N(edata_cache_fast_get) +#define edata_cache_fast_init JEMALLOC_N(edata_cache_fast_init) +#define edata_cache_fast_put JEMALLOC_N(edata_cache_fast_put) +#define edata_cache_get JEMALLOC_N(edata_cache_get) +#define edata_cache_init JEMALLOC_N(edata_cache_init) +#define edata_cache_postfork_child JEMALLOC_N(edata_cache_postfork_child) +#define edata_cache_postfork_parent JEMALLOC_N(edata_cache_postfork_parent) +#define edata_cache_prefork JEMALLOC_N(edata_cache_prefork) +#define edata_cache_put JEMALLOC_N(edata_cache_put) +#define ehooks_default_alloc_impl JEMALLOC_N(ehooks_default_alloc_impl) +#define ehooks_default_commit_impl JEMALLOC_N(ehooks_default_commit_impl) +#define ehooks_default_dalloc_impl JEMALLOC_N(ehooks_default_dalloc_impl) +#define ehooks_default_decommit_impl JEMALLOC_N(ehooks_default_decommit_impl) +#define ehooks_default_destroy_impl JEMALLOC_N(ehooks_default_destroy_impl) +#define ehooks_default_extent_hooks JEMALLOC_N(ehooks_default_extent_hooks) +#define ehooks_default_guard_impl JEMALLOC_N(ehooks_default_guard_impl) +#define ehooks_default_merge JEMALLOC_N(ehooks_default_merge) +#define ehooks_default_merge_impl JEMALLOC_N(ehooks_default_merge_impl) +#define ehooks_default_purge_forced_impl JEMALLOC_N(ehooks_default_purge_forced_impl) +#define ehooks_default_purge_lazy_impl JEMALLOC_N(ehooks_default_purge_lazy_impl) +#define ehooks_default_split_impl JEMALLOC_N(ehooks_default_split_impl) +#define ehooks_default_unguard_impl JEMALLOC_N(ehooks_default_unguard_impl) +#define ehooks_default_zero_impl 
JEMALLOC_N(ehooks_default_zero_impl) +#define ehooks_init JEMALLOC_N(ehooks_init) +#define emap_deregister_boundary JEMALLOC_N(emap_deregister_boundary) +#define emap_deregister_interior JEMALLOC_N(emap_deregister_interior) +#define emap_do_assert_mapped JEMALLOC_N(emap_do_assert_mapped) +#define emap_do_assert_not_mapped JEMALLOC_N(emap_do_assert_not_mapped) +#define emap_init JEMALLOC_N(emap_init) +#define emap_merge_commit JEMALLOC_N(emap_merge_commit) +#define emap_merge_prepare JEMALLOC_N(emap_merge_prepare) +#define emap_register_boundary JEMALLOC_N(emap_register_boundary) +#define emap_register_interior JEMALLOC_N(emap_register_interior) +#define emap_release_edata JEMALLOC_N(emap_release_edata) +#define emap_remap JEMALLOC_N(emap_remap) +#define emap_split_commit JEMALLOC_N(emap_split_commit) +#define emap_split_prepare JEMALLOC_N(emap_split_prepare) +#define emap_try_acquire_edata_neighbor JEMALLOC_N(emap_try_acquire_edata_neighbor) +#define emap_try_acquire_edata_neighbor_expand JEMALLOC_N(emap_try_acquire_edata_neighbor_expand) +#define emap_update_edata_state JEMALLOC_N(emap_update_edata_state) +#define eset_fit JEMALLOC_N(eset_fit) +#define eset_init JEMALLOC_N(eset_init) +#define eset_insert JEMALLOC_N(eset_insert) +#define eset_nbytes_get JEMALLOC_N(eset_nbytes_get) +#define eset_nextents_get JEMALLOC_N(eset_nextents_get) +#define eset_npages_get JEMALLOC_N(eset_npages_get) +#define eset_remove JEMALLOC_N(eset_remove) +#define exp_grow_init JEMALLOC_N(exp_grow_init) +#define ecache_alloc JEMALLOC_N(ecache_alloc) +#define ecache_alloc_grow JEMALLOC_N(ecache_alloc_grow) +#define ecache_dalloc JEMALLOC_N(ecache_dalloc) +#define ecache_evict JEMALLOC_N(ecache_evict) +#define extent_alloc_wrapper JEMALLOC_N(extent_alloc_wrapper) +#define extent_boot JEMALLOC_N(extent_boot) +#define extent_commit_wrapper JEMALLOC_N(extent_commit_wrapper) +#define extent_commit_zero JEMALLOC_N(extent_commit_zero) +#define extent_dalloc_gap JEMALLOC_N(extent_dalloc_gap) +#define extent_dalloc_wrapper JEMALLOC_N(extent_dalloc_wrapper) +#define extent_decommit_wrapper JEMALLOC_N(extent_decommit_wrapper) +#define extent_destroy_wrapper JEMALLOC_N(extent_destroy_wrapper) +#define extent_gdump_add JEMALLOC_N(extent_gdump_add) +#define extent_merge_wrapper JEMALLOC_N(extent_merge_wrapper) +#define extent_purge_forced_wrapper JEMALLOC_N(extent_purge_forced_wrapper) +#define extent_purge_lazy_wrapper JEMALLOC_N(extent_purge_lazy_wrapper) +#define extent_record JEMALLOC_N(extent_record) +#define extent_sn_next JEMALLOC_N(extent_sn_next) +#define extent_split_wrapper JEMALLOC_N(extent_split_wrapper) +#define opt_lg_extent_max_active_fit JEMALLOC_N(opt_lg_extent_max_active_fit) +#define dss_prec_names JEMALLOC_N(dss_prec_names) +#define extent_alloc_dss JEMALLOC_N(extent_alloc_dss) +#define extent_dss_boot JEMALLOC_N(extent_dss_boot) +#define extent_dss_mergeable JEMALLOC_N(extent_dss_mergeable) +#define extent_dss_prec_get JEMALLOC_N(extent_dss_prec_get) +#define extent_dss_prec_set JEMALLOC_N(extent_dss_prec_set) +#define extent_in_dss JEMALLOC_N(extent_in_dss) +#define opt_dss JEMALLOC_N(opt_dss) +#define extent_alloc_mmap JEMALLOC_N(extent_alloc_mmap) +#define extent_dalloc_mmap JEMALLOC_N(extent_dalloc_mmap) +#define opt_retain JEMALLOC_N(opt_retain) +#define fxp_parse JEMALLOC_N(fxp_parse) +#define fxp_print JEMALLOC_N(fxp_print) +#define opt_lg_san_uaf_align JEMALLOC_N(opt_lg_san_uaf_align) +#define opt_san_guard_large JEMALLOC_N(opt_san_guard_large) +#define opt_san_guard_small 
JEMALLOC_N(opt_san_guard_small) +#define san_cache_bin_nonfast_mask JEMALLOC_N(san_cache_bin_nonfast_mask) +#define san_check_stashed_ptrs JEMALLOC_N(san_check_stashed_ptrs) +#define san_guard_pages JEMALLOC_N(san_guard_pages) +#define san_init JEMALLOC_N(san_init) +#define san_unguard_pages JEMALLOC_N(san_unguard_pages) +#define san_unguard_pages_pre_destroy JEMALLOC_N(san_unguard_pages_pre_destroy) +#define tsd_san_init JEMALLOC_N(tsd_san_init) +#define san_bump_alloc JEMALLOC_N(san_bump_alloc) +#define hook_boot JEMALLOC_N(hook_boot) +#define hook_install JEMALLOC_N(hook_install) +#define hook_invoke_alloc JEMALLOC_N(hook_invoke_alloc) +#define hook_invoke_dalloc JEMALLOC_N(hook_invoke_dalloc) +#define hook_invoke_expand JEMALLOC_N(hook_invoke_expand) +#define hook_remove JEMALLOC_N(hook_remove) +#define hpa_central_extract JEMALLOC_N(hpa_central_extract) +#define hpa_central_init JEMALLOC_N(hpa_central_init) +#define hpa_shard_destroy JEMALLOC_N(hpa_shard_destroy) +#define hpa_shard_disable JEMALLOC_N(hpa_shard_disable) +#define hpa_shard_do_deferred_work JEMALLOC_N(hpa_shard_do_deferred_work) +#define hpa_shard_init JEMALLOC_N(hpa_shard_init) +#define hpa_shard_postfork_child JEMALLOC_N(hpa_shard_postfork_child) +#define hpa_shard_postfork_parent JEMALLOC_N(hpa_shard_postfork_parent) +#define hpa_shard_prefork3 JEMALLOC_N(hpa_shard_prefork3) +#define hpa_shard_prefork4 JEMALLOC_N(hpa_shard_prefork4) +#define hpa_shard_set_deferral_allowed JEMALLOC_N(hpa_shard_set_deferral_allowed) +#define hpa_shard_stats_accum JEMALLOC_N(hpa_shard_stats_accum) +#define hpa_shard_stats_merge JEMALLOC_N(hpa_shard_stats_merge) +#define hpa_supported JEMALLOC_N(hpa_supported) +#define hpa_hooks_default JEMALLOC_N(hpa_hooks_default) +#define hpdata_age_heap_any JEMALLOC_N(hpdata_age_heap_any) +#define hpdata_age_heap_empty JEMALLOC_N(hpdata_age_heap_empty) +#define hpdata_age_heap_first JEMALLOC_N(hpdata_age_heap_first) +#define hpdata_age_heap_insert JEMALLOC_N(hpdata_age_heap_insert) +#define hpdata_age_heap_new JEMALLOC_N(hpdata_age_heap_new) +#define hpdata_age_heap_remove JEMALLOC_N(hpdata_age_heap_remove) +#define hpdata_age_heap_remove_any JEMALLOC_N(hpdata_age_heap_remove_any) +#define hpdata_age_heap_remove_first JEMALLOC_N(hpdata_age_heap_remove_first) +#define hpdata_dehugify JEMALLOC_N(hpdata_dehugify) +#define hpdata_hugify JEMALLOC_N(hpdata_hugify) +#define hpdata_init JEMALLOC_N(hpdata_init) +#define hpdata_purge_begin JEMALLOC_N(hpdata_purge_begin) +#define hpdata_purge_end JEMALLOC_N(hpdata_purge_end) +#define hpdata_purge_next JEMALLOC_N(hpdata_purge_next) +#define hpdata_reserve_alloc JEMALLOC_N(hpdata_reserve_alloc) +#define hpdata_unreserve JEMALLOC_N(hpdata_unreserve) +#define inspect_extent_util_stats_get JEMALLOC_N(inspect_extent_util_stats_get) +#define inspect_extent_util_stats_verbose_get JEMALLOC_N(inspect_extent_util_stats_verbose_get) +#define large_dalloc JEMALLOC_N(large_dalloc) +#define large_dalloc_finish JEMALLOC_N(large_dalloc_finish) +#define large_dalloc_prep_locked JEMALLOC_N(large_dalloc_prep_locked) +#define large_malloc JEMALLOC_N(large_malloc) +#define large_palloc JEMALLOC_N(large_palloc) +#define large_prof_info_get JEMALLOC_N(large_prof_info_get) +#define large_prof_info_set JEMALLOC_N(large_prof_info_set) +#define large_prof_tctx_reset JEMALLOC_N(large_prof_tctx_reset) +#define large_ralloc JEMALLOC_N(large_ralloc) +#define large_ralloc_no_move JEMALLOC_N(large_ralloc_no_move) +#define large_salloc JEMALLOC_N(large_salloc) +#define log_init_done 
JEMALLOC_N(log_init_done) +#define log_var_names JEMALLOC_N(log_var_names) +#define log_var_update_state JEMALLOC_N(log_var_update_state) +#define buferror JEMALLOC_N(buferror) +#define malloc_cprintf JEMALLOC_N(malloc_cprintf) +#define malloc_printf JEMALLOC_N(malloc_printf) +#define malloc_snprintf JEMALLOC_N(malloc_snprintf) +#define malloc_strtoumax JEMALLOC_N(malloc_strtoumax) +#define malloc_vcprintf JEMALLOC_N(malloc_vcprintf) +#define malloc_vsnprintf JEMALLOC_N(malloc_vsnprintf) +#define malloc_write JEMALLOC_N(malloc_write) +#define wrtmessage JEMALLOC_N(wrtmessage) +#define malloc_mutex_boot JEMALLOC_N(malloc_mutex_boot) +#define malloc_mutex_init JEMALLOC_N(malloc_mutex_init) +#define malloc_mutex_lock_slow JEMALLOC_N(malloc_mutex_lock_slow) +#define malloc_mutex_postfork_child JEMALLOC_N(malloc_mutex_postfork_child) +#define malloc_mutex_postfork_parent JEMALLOC_N(malloc_mutex_postfork_parent) +#define malloc_mutex_prefork JEMALLOC_N(malloc_mutex_prefork) +#define malloc_mutex_prof_data_reset JEMALLOC_N(malloc_mutex_prof_data_reset) +#define opt_mutex_max_spin JEMALLOC_N(opt_mutex_max_spin) +#define nstime_add JEMALLOC_N(nstime_add) +#define nstime_compare JEMALLOC_N(nstime_compare) +#define nstime_copy JEMALLOC_N(nstime_copy) +#define nstime_divide JEMALLOC_N(nstime_divide) +#define nstime_iadd JEMALLOC_N(nstime_iadd) +#define nstime_idivide JEMALLOC_N(nstime_idivide) +#define nstime_imultiply JEMALLOC_N(nstime_imultiply) +#define nstime_init JEMALLOC_N(nstime_init) +#define nstime_init2 JEMALLOC_N(nstime_init2) +#define nstime_init_update JEMALLOC_N(nstime_init_update) +#define nstime_isubtract JEMALLOC_N(nstime_isubtract) +#define nstime_monotonic JEMALLOC_N(nstime_monotonic) +#define nstime_msec JEMALLOC_N(nstime_msec) +#define nstime_ns JEMALLOC_N(nstime_ns) +#define nstime_ns_since JEMALLOC_N(nstime_ns_since) +#define nstime_nsec JEMALLOC_N(nstime_nsec) +#define nstime_prof_init_update JEMALLOC_N(nstime_prof_init_update) +#define nstime_prof_update JEMALLOC_N(nstime_prof_update) +#define nstime_sec JEMALLOC_N(nstime_sec) +#define nstime_subtract JEMALLOC_N(nstime_subtract) +#define nstime_update JEMALLOC_N(nstime_update) +#define opt_prof_time_res JEMALLOC_N(opt_prof_time_res) +#define prof_time_res_mode_names JEMALLOC_N(prof_time_res_mode_names) +#define pa_alloc JEMALLOC_N(pa_alloc) +#define pa_central_init JEMALLOC_N(pa_central_init) +#define pa_dalloc JEMALLOC_N(pa_dalloc) +#define pa_decay_ms_get JEMALLOC_N(pa_decay_ms_get) +#define pa_decay_ms_set JEMALLOC_N(pa_decay_ms_set) +#define pa_expand JEMALLOC_N(pa_expand) +#define pa_shard_destroy JEMALLOC_N(pa_shard_destroy) +#define pa_shard_disable_hpa JEMALLOC_N(pa_shard_disable_hpa) +#define pa_shard_do_deferred_work JEMALLOC_N(pa_shard_do_deferred_work) +#define pa_shard_enable_hpa JEMALLOC_N(pa_shard_enable_hpa) +#define pa_shard_init JEMALLOC_N(pa_shard_init) +#define pa_shard_reset JEMALLOC_N(pa_shard_reset) +#define pa_shard_retain_grow_limit_get_set JEMALLOC_N(pa_shard_retain_grow_limit_get_set) +#define pa_shard_set_deferral_allowed JEMALLOC_N(pa_shard_set_deferral_allowed) +#define pa_shard_time_until_deferred_work JEMALLOC_N(pa_shard_time_until_deferred_work) +#define pa_shrink JEMALLOC_N(pa_shrink) +#define pa_shard_basic_stats_merge JEMALLOC_N(pa_shard_basic_stats_merge) +#define pa_shard_mtx_stats_read JEMALLOC_N(pa_shard_mtx_stats_read) +#define pa_shard_postfork_child JEMALLOC_N(pa_shard_postfork_child) +#define pa_shard_postfork_parent JEMALLOC_N(pa_shard_postfork_parent) +#define pa_shard_prefork0 
JEMALLOC_N(pa_shard_prefork0) +#define pa_shard_prefork2 JEMALLOC_N(pa_shard_prefork2) +#define pa_shard_prefork3 JEMALLOC_N(pa_shard_prefork3) +#define pa_shard_prefork4 JEMALLOC_N(pa_shard_prefork4) +#define pa_shard_prefork5 JEMALLOC_N(pa_shard_prefork5) +#define pa_shard_stats_merge JEMALLOC_N(pa_shard_stats_merge) +#define pai_alloc_batch_default JEMALLOC_N(pai_alloc_batch_default) +#define pai_dalloc_batch_default JEMALLOC_N(pai_dalloc_batch_default) +#define pac_decay_all JEMALLOC_N(pac_decay_all) +#define pac_decay_ms_get JEMALLOC_N(pac_decay_ms_get) +#define pac_decay_ms_set JEMALLOC_N(pac_decay_ms_set) +#define pac_destroy JEMALLOC_N(pac_destroy) +#define pac_init JEMALLOC_N(pac_init) +#define pac_maybe_decay_purge JEMALLOC_N(pac_maybe_decay_purge) +#define pac_reset JEMALLOC_N(pac_reset) +#define pac_retain_grow_limit_get_set JEMALLOC_N(pac_retain_grow_limit_get_set) +#define init_system_thp_mode JEMALLOC_N(init_system_thp_mode) +#define opt_thp JEMALLOC_N(opt_thp) +#define pages_boot JEMALLOC_N(pages_boot) +#define pages_commit JEMALLOC_N(pages_commit) +#define pages_decommit JEMALLOC_N(pages_decommit) +#define pages_dodump JEMALLOC_N(pages_dodump) +#define pages_dontdump JEMALLOC_N(pages_dontdump) +#define pages_huge JEMALLOC_N(pages_huge) +#define pages_map JEMALLOC_N(pages_map) +#define pages_mark_guards JEMALLOC_N(pages_mark_guards) +#define pages_nohuge JEMALLOC_N(pages_nohuge) +#define pages_purge_forced JEMALLOC_N(pages_purge_forced) +#define pages_purge_lazy JEMALLOC_N(pages_purge_lazy) +#define pages_set_thp_state JEMALLOC_N(pages_set_thp_state) +#define pages_unmap JEMALLOC_N(pages_unmap) +#define pages_unmark_guards JEMALLOC_N(pages_unmark_guards) +#define thp_mode_names JEMALLOC_N(thp_mode_names) +#define peak_alloc_event_handler JEMALLOC_N(peak_alloc_event_handler) +#define peak_alloc_new_event_wait JEMALLOC_N(peak_alloc_new_event_wait) +#define peak_alloc_postponed_event_wait JEMALLOC_N(peak_alloc_postponed_event_wait) +#define peak_dalloc_event_handler JEMALLOC_N(peak_dalloc_event_handler) +#define peak_dalloc_new_event_wait JEMALLOC_N(peak_dalloc_new_event_wait) +#define peak_dalloc_postponed_event_wait JEMALLOC_N(peak_dalloc_postponed_event_wait) +#define peak_event_max JEMALLOC_N(peak_event_max) +#define peak_event_update JEMALLOC_N(peak_event_update) +#define peak_event_zero JEMALLOC_N(peak_event_zero) +#define lg_prof_sample JEMALLOC_N(lg_prof_sample) +#define opt_lg_prof_interval JEMALLOC_N(opt_lg_prof_interval) +#define opt_lg_prof_sample JEMALLOC_N(opt_lg_prof_sample) +#define opt_prof JEMALLOC_N(opt_prof) +#define opt_prof_accum JEMALLOC_N(opt_prof_accum) +#define opt_prof_active JEMALLOC_N(opt_prof_active) +#define opt_prof_final JEMALLOC_N(opt_prof_final) +#define opt_prof_gdump JEMALLOC_N(opt_prof_gdump) +#define opt_prof_leak JEMALLOC_N(opt_prof_leak) +#define opt_prof_leak_error JEMALLOC_N(opt_prof_leak_error) +#define opt_prof_prefix JEMALLOC_N(opt_prof_prefix) +#define opt_prof_sys_thread_name JEMALLOC_N(opt_prof_sys_thread_name) +#define opt_prof_thread_active_init JEMALLOC_N(opt_prof_thread_active_init) +#define opt_prof_unbias JEMALLOC_N(opt_prof_unbias) +#define prof_active_get JEMALLOC_N(prof_active_get) +#define prof_active_set JEMALLOC_N(prof_active_set) +#define prof_active_state JEMALLOC_N(prof_active_state) +#define prof_alloc_rollback JEMALLOC_N(prof_alloc_rollback) +#define prof_backtrace_hook JEMALLOC_N(prof_backtrace_hook) +#define prof_backtrace_hook_get JEMALLOC_N(prof_backtrace_hook_get) +#define prof_backtrace_hook_set 
JEMALLOC_N(prof_backtrace_hook_set) +#define prof_boot0 JEMALLOC_N(prof_boot0) +#define prof_boot1 JEMALLOC_N(prof_boot1) +#define prof_boot2 JEMALLOC_N(prof_boot2) +#define prof_booted JEMALLOC_N(prof_booted) +#define prof_dump_hook JEMALLOC_N(prof_dump_hook) +#define prof_dump_hook_get JEMALLOC_N(prof_dump_hook_get) +#define prof_dump_hook_set JEMALLOC_N(prof_dump_hook_set) +#define prof_free_sampled_object JEMALLOC_N(prof_free_sampled_object) +#define prof_gdump JEMALLOC_N(prof_gdump) +#define prof_gdump_get JEMALLOC_N(prof_gdump_get) +#define prof_gdump_set JEMALLOC_N(prof_gdump_set) +#define prof_gdump_val JEMALLOC_N(prof_gdump_val) +#define prof_idump JEMALLOC_N(prof_idump) +#define prof_interval JEMALLOC_N(prof_interval) +#define prof_malloc_sample_object JEMALLOC_N(prof_malloc_sample_object) +#define prof_mdump JEMALLOC_N(prof_mdump) +#define prof_postfork_child JEMALLOC_N(prof_postfork_child) +#define prof_postfork_parent JEMALLOC_N(prof_postfork_parent) +#define prof_prefork0 JEMALLOC_N(prof_prefork0) +#define prof_prefork1 JEMALLOC_N(prof_prefork1) +#define prof_sample_event_handler JEMALLOC_N(prof_sample_event_handler) +#define prof_sample_new_event_wait JEMALLOC_N(prof_sample_new_event_wait) +#define prof_sample_postponed_event_wait JEMALLOC_N(prof_sample_postponed_event_wait) +#define prof_tctx_create JEMALLOC_N(prof_tctx_create) +#define prof_tdata_cleanup JEMALLOC_N(prof_tdata_cleanup) +#define prof_tdata_init JEMALLOC_N(prof_tdata_init) +#define prof_tdata_reinit JEMALLOC_N(prof_tdata_reinit) +#define prof_thread_active_get JEMALLOC_N(prof_thread_active_get) +#define prof_thread_active_init_get JEMALLOC_N(prof_thread_active_init_get) +#define prof_thread_active_init_set JEMALLOC_N(prof_thread_active_init_set) +#define prof_thread_active_set JEMALLOC_N(prof_thread_active_set) +#define prof_thread_name_get JEMALLOC_N(prof_thread_name_get) +#define prof_thread_name_set JEMALLOC_N(prof_thread_name_set) +#define bt2gctx_mtx JEMALLOC_N(bt2gctx_mtx) +#define gctx_locks JEMALLOC_N(gctx_locks) +#define prof_bt_count JEMALLOC_N(prof_bt_count) +#define prof_bt_hash JEMALLOC_N(prof_bt_hash) +#define prof_bt_keycomp JEMALLOC_N(prof_bt_keycomp) +#define prof_cnt_all JEMALLOC_N(prof_cnt_all) +#define prof_data_init JEMALLOC_N(prof_data_init) +#define prof_dump_impl JEMALLOC_N(prof_dump_impl) +#define prof_dump_mtx JEMALLOC_N(prof_dump_mtx) +#define prof_lookup JEMALLOC_N(prof_lookup) +#define prof_reset JEMALLOC_N(prof_reset) +#define prof_shifted_unbiased_cnt JEMALLOC_N(prof_shifted_unbiased_cnt) +#define prof_tctx_try_destroy JEMALLOC_N(prof_tctx_try_destroy) +#define prof_tdata_count JEMALLOC_N(prof_tdata_count) +#define prof_tdata_detach JEMALLOC_N(prof_tdata_detach) +#define prof_tdata_init_impl JEMALLOC_N(prof_tdata_init_impl) +#define prof_thread_name_alloc JEMALLOC_N(prof_thread_name_alloc) +#define prof_thread_name_set_impl JEMALLOC_N(prof_thread_name_set_impl) +#define prof_unbias_map_init JEMALLOC_N(prof_unbias_map_init) +#define prof_unbiased_sz JEMALLOC_N(prof_unbiased_sz) +#define tdata_locks JEMALLOC_N(tdata_locks) +#define tdatas_mtx JEMALLOC_N(tdatas_mtx) +#define log_mtx JEMALLOC_N(log_mtx) +#define opt_prof_log JEMALLOC_N(opt_prof_log) +#define prof_log_alloc_count JEMALLOC_N(prof_log_alloc_count) +#define prof_log_bt_count JEMALLOC_N(prof_log_bt_count) +#define prof_log_dummy_set JEMALLOC_N(prof_log_dummy_set) +#define prof_log_init JEMALLOC_N(prof_log_init) +#define prof_log_is_logging JEMALLOC_N(prof_log_is_logging) +#define prof_log_rep_check 
JEMALLOC_N(prof_log_rep_check) +#define prof_log_start JEMALLOC_N(prof_log_start) +#define prof_log_stop JEMALLOC_N(prof_log_stop) +#define prof_log_thr_count JEMALLOC_N(prof_log_thr_count) +#define prof_logging_state JEMALLOC_N(prof_logging_state) +#define prof_try_log JEMALLOC_N(prof_try_log) +#define edata_prof_recent_alloc_get_no_lock_test JEMALLOC_N(edata_prof_recent_alloc_get_no_lock_test) +#define edata_prof_recent_alloc_init JEMALLOC_N(edata_prof_recent_alloc_init) +#define opt_prof_recent_alloc_max JEMALLOC_N(opt_prof_recent_alloc_max) +#define prof_recent_alloc JEMALLOC_N(prof_recent_alloc) +#define prof_recent_alloc_dump JEMALLOC_N(prof_recent_alloc_dump) +#define prof_recent_alloc_edata_get_no_lock_test JEMALLOC_N(prof_recent_alloc_edata_get_no_lock_test) +#define prof_recent_alloc_list JEMALLOC_N(prof_recent_alloc_list) +#define prof_recent_alloc_max_ctl_read JEMALLOC_N(prof_recent_alloc_max_ctl_read) +#define prof_recent_alloc_max_ctl_write JEMALLOC_N(prof_recent_alloc_max_ctl_write) +#define prof_recent_alloc_mtx JEMALLOC_N(prof_recent_alloc_mtx) +#define prof_recent_alloc_prepare JEMALLOC_N(prof_recent_alloc_prepare) +#define prof_recent_alloc_reset JEMALLOC_N(prof_recent_alloc_reset) +#define prof_recent_dump_mtx JEMALLOC_N(prof_recent_dump_mtx) +#define prof_recent_init JEMALLOC_N(prof_recent_init) +#define opt_prof_stats JEMALLOC_N(opt_prof_stats) +#define prof_stats_dec JEMALLOC_N(prof_stats_dec) +#define prof_stats_get_accum JEMALLOC_N(prof_stats_get_accum) +#define prof_stats_get_live JEMALLOC_N(prof_stats_get_live) +#define prof_stats_inc JEMALLOC_N(prof_stats_inc) +#define prof_stats_mtx JEMALLOC_N(prof_stats_mtx) +#define bt_init JEMALLOC_N(bt_init) +#define prof_backtrace JEMALLOC_N(prof_backtrace) +#define prof_base JEMALLOC_N(prof_base) +#define prof_do_mock JEMALLOC_N(prof_do_mock) +#define prof_dump_filename_mtx JEMALLOC_N(prof_dump_filename_mtx) +#define prof_dump_open_file JEMALLOC_N(prof_dump_open_file) +#define prof_dump_open_maps JEMALLOC_N(prof_dump_open_maps) +#define prof_dump_write_file JEMALLOC_N(prof_dump_write_file) +#define prof_fdump_impl JEMALLOC_N(prof_fdump_impl) +#define prof_gdump_impl JEMALLOC_N(prof_gdump_impl) +#define prof_get_default_filename JEMALLOC_N(prof_get_default_filename) +#define prof_getpid JEMALLOC_N(prof_getpid) +#define prof_hooks_init JEMALLOC_N(prof_hooks_init) +#define prof_idump_impl JEMALLOC_N(prof_idump_impl) +#define prof_mdump_impl JEMALLOC_N(prof_mdump_impl) +#define prof_prefix_set JEMALLOC_N(prof_prefix_set) +#define prof_sys_thread_name_fetch JEMALLOC_N(prof_sys_thread_name_fetch) +#define prof_sys_thread_name_read JEMALLOC_N(prof_sys_thread_name_read) +#define prof_unwind_init JEMALLOC_N(prof_unwind_init) +#define psset_init JEMALLOC_N(psset_init) +#define psset_insert JEMALLOC_N(psset_insert) +#define psset_pick_alloc JEMALLOC_N(psset_pick_alloc) +#define psset_pick_hugify JEMALLOC_N(psset_pick_hugify) +#define psset_pick_purge JEMALLOC_N(psset_pick_purge) +#define psset_remove JEMALLOC_N(psset_remove) +#define psset_stats_accum JEMALLOC_N(psset_stats_accum) +#define psset_update_begin JEMALLOC_N(psset_update_begin) +#define psset_update_end JEMALLOC_N(psset_update_end) +#define rtree_ctx_data_init JEMALLOC_N(rtree_ctx_data_init) +#define rtree_leaf_elm_lookup_hard JEMALLOC_N(rtree_leaf_elm_lookup_hard) +#define rtree_new JEMALLOC_N(rtree_new) +#define safety_check_fail JEMALLOC_N(safety_check_fail) +#define safety_check_fail_sized_dealloc JEMALLOC_N(safety_check_fail_sized_dealloc) +#define 
safety_check_set_abort JEMALLOC_N(safety_check_set_abort) +#define reg_size_compute JEMALLOC_N(reg_size_compute) +#define sc_boot JEMALLOC_N(sc_boot) +#define sc_data_init JEMALLOC_N(sc_data_init) +#define sc_data_update_slab_size JEMALLOC_N(sc_data_update_slab_size) +#define sec_disable JEMALLOC_N(sec_disable) +#define sec_flush JEMALLOC_N(sec_flush) +#define sec_init JEMALLOC_N(sec_init) +#define sec_mutex_stats_read JEMALLOC_N(sec_mutex_stats_read) +#define sec_postfork_child JEMALLOC_N(sec_postfork_child) +#define sec_postfork_parent JEMALLOC_N(sec_postfork_parent) +#define sec_prefork2 JEMALLOC_N(sec_prefork2) +#define sec_stats_merge JEMALLOC_N(sec_stats_merge) +#define arena_mutex_names JEMALLOC_N(arena_mutex_names) +#define global_mutex_names JEMALLOC_N(global_mutex_names) +#define opt_stats_interval JEMALLOC_N(opt_stats_interval) +#define opt_stats_interval_opts JEMALLOC_N(opt_stats_interval_opts) +#define opt_stats_print JEMALLOC_N(opt_stats_print) +#define opt_stats_print_opts JEMALLOC_N(opt_stats_print_opts) +#define stats_boot JEMALLOC_N(stats_boot) +#define stats_interval_event_handler JEMALLOC_N(stats_interval_event_handler) +#define stats_interval_new_event_wait JEMALLOC_N(stats_interval_new_event_wait) +#define stats_interval_postponed_event_wait JEMALLOC_N(stats_interval_postponed_event_wait) +#define stats_postfork_child JEMALLOC_N(stats_postfork_child) +#define stats_postfork_parent JEMALLOC_N(stats_postfork_parent) +#define stats_prefork JEMALLOC_N(stats_prefork) +#define stats_print JEMALLOC_N(stats_print) +#define sz_boot JEMALLOC_N(sz_boot) +#define sz_index2size_tab JEMALLOC_N(sz_index2size_tab) +#define sz_large_pad JEMALLOC_N(sz_large_pad) +#define sz_pind2sz_tab JEMALLOC_N(sz_pind2sz_tab) +#define sz_psz_quantize_ceil JEMALLOC_N(sz_psz_quantize_ceil) +#define sz_psz_quantize_floor JEMALLOC_N(sz_psz_quantize_floor) +#define sz_size2index_tab JEMALLOC_N(sz_size2index_tab) +#define nhbins JEMALLOC_N(nhbins) +#define opt_lg_tcache_flush_large_div JEMALLOC_N(opt_lg_tcache_flush_large_div) +#define opt_lg_tcache_flush_small_div JEMALLOC_N(opt_lg_tcache_flush_small_div) +#define opt_lg_tcache_nslots_mul JEMALLOC_N(opt_lg_tcache_nslots_mul) +#define opt_tcache JEMALLOC_N(opt_tcache) +#define opt_tcache_gc_delay_bytes JEMALLOC_N(opt_tcache_gc_delay_bytes) +#define opt_tcache_gc_incr_bytes JEMALLOC_N(opt_tcache_gc_incr_bytes) +#define opt_tcache_max JEMALLOC_N(opt_tcache_max) +#define opt_tcache_nslots_large JEMALLOC_N(opt_tcache_nslots_large) +#define opt_tcache_nslots_small_max JEMALLOC_N(opt_tcache_nslots_small_max) +#define opt_tcache_nslots_small_min JEMALLOC_N(opt_tcache_nslots_small_min) +#define tcache_alloc_small_hard JEMALLOC_N(tcache_alloc_small_hard) +#define tcache_arena_associate JEMALLOC_N(tcache_arena_associate) +#define tcache_arena_reassociate JEMALLOC_N(tcache_arena_reassociate) +#define tcache_assert_initialized JEMALLOC_N(tcache_assert_initialized) +#define tcache_bin_flush_large JEMALLOC_N(tcache_bin_flush_large) +#define tcache_bin_flush_small JEMALLOC_N(tcache_bin_flush_small) +#define tcache_bin_flush_stashed JEMALLOC_N(tcache_bin_flush_stashed) +#define tcache_bin_info JEMALLOC_N(tcache_bin_info) +#define tcache_boot JEMALLOC_N(tcache_boot) +#define tcache_cleanup JEMALLOC_N(tcache_cleanup) +#define tcache_create_explicit JEMALLOC_N(tcache_create_explicit) +#define tcache_flush JEMALLOC_N(tcache_flush) +#define tcache_gc_dalloc_event_handler JEMALLOC_N(tcache_gc_dalloc_event_handler) +#define tcache_gc_dalloc_new_event_wait 
JEMALLOC_N(tcache_gc_dalloc_new_event_wait) +#define tcache_gc_dalloc_postponed_event_wait JEMALLOC_N(tcache_gc_dalloc_postponed_event_wait) +#define tcache_gc_event_handler JEMALLOC_N(tcache_gc_event_handler) +#define tcache_gc_new_event_wait JEMALLOC_N(tcache_gc_new_event_wait) +#define tcache_gc_postponed_event_wait JEMALLOC_N(tcache_gc_postponed_event_wait) +#define tcache_maxclass JEMALLOC_N(tcache_maxclass) +#define tcache_postfork_child JEMALLOC_N(tcache_postfork_child) +#define tcache_postfork_parent JEMALLOC_N(tcache_postfork_parent) +#define tcache_prefork JEMALLOC_N(tcache_prefork) +#define tcache_salloc JEMALLOC_N(tcache_salloc) +#define tcache_stats_merge JEMALLOC_N(tcache_stats_merge) +#define tcaches JEMALLOC_N(tcaches) +#define tcaches_create JEMALLOC_N(tcaches_create) +#define tcaches_destroy JEMALLOC_N(tcaches_destroy) +#define tcaches_flush JEMALLOC_N(tcaches_flush) +#define tsd_tcache_data_init JEMALLOC_N(tsd_tcache_data_init) +#define tsd_tcache_enabled_data_init JEMALLOC_N(tsd_tcache_enabled_data_init) +#define test_hooks_arena_new_hook JEMALLOC_N(test_hooks_arena_new_hook) +#define test_hooks_libc_hook JEMALLOC_N(test_hooks_libc_hook) +#define te_assert_invariants_debug JEMALLOC_N(te_assert_invariants_debug) +#define te_event_trigger JEMALLOC_N(te_event_trigger) +#define te_recompute_fast_threshold JEMALLOC_N(te_recompute_fast_threshold) +#define tsd_te_init JEMALLOC_N(tsd_te_init) +#define ticker_geom_table JEMALLOC_N(ticker_geom_table) +#define malloc_tsd_boot0 JEMALLOC_N(malloc_tsd_boot0) +#define malloc_tsd_boot1 JEMALLOC_N(malloc_tsd_boot1) +#define malloc_tsd_dalloc JEMALLOC_N(malloc_tsd_dalloc) +#define malloc_tsd_malloc JEMALLOC_N(malloc_tsd_malloc) +#define tsd_boot_wrapper JEMALLOC_N(tsd_boot_wrapper) +#define tsd_booted JEMALLOC_N(tsd_booted) +#define tsd_cleanup JEMALLOC_N(tsd_cleanup) +#define tsd_fetch_slow JEMALLOC_N(tsd_fetch_slow) +#define tsd_global_slow JEMALLOC_N(tsd_global_slow) +#define tsd_global_slow_dec JEMALLOC_N(tsd_global_slow_dec) +#define tsd_global_slow_inc JEMALLOC_N(tsd_global_slow_inc) +#define tsd_init_check_recursion JEMALLOC_N(tsd_init_check_recursion) +#define tsd_init_finish JEMALLOC_N(tsd_init_finish) +#define tsd_init_head JEMALLOC_N(tsd_init_head) +#define tsd_postfork_child JEMALLOC_N(tsd_postfork_child) +#define tsd_postfork_parent JEMALLOC_N(tsd_postfork_parent) +#define tsd_prefork JEMALLOC_N(tsd_prefork) +#define tsd_slow_update JEMALLOC_N(tsd_slow_update) +#define tsd_state_set JEMALLOC_N(tsd_state_set) +#define tsd_tsd JEMALLOC_N(tsd_tsd) +#define witness_depth_error JEMALLOC_N(witness_depth_error) +#define witness_init JEMALLOC_N(witness_init) +#define witness_lock_error JEMALLOC_N(witness_lock_error) +#define witness_not_owner_error JEMALLOC_N(witness_not_owner_error) +#define witness_owner_error JEMALLOC_N(witness_owner_error) +#define witness_postfork_child JEMALLOC_N(witness_postfork_child) +#define witness_postfork_parent JEMALLOC_N(witness_postfork_parent) +#define witness_prefork JEMALLOC_N(witness_prefork) +#define witnesses_cleanup JEMALLOC_N(witnesses_cleanup) +#define zone_register JEMALLOC_N(zone_register) diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/private_namespace.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/private_namespace.h new file mode 100644 index 000000000..efe904660 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/private_namespace.h @@ -0,0 +1,776 @@ +#define a0dalloc JEMALLOC_N(a0dalloc) +#define 
a0malloc JEMALLOC_N(a0malloc) +#define arena_choose_hard JEMALLOC_N(arena_choose_hard) +#define arena_cleanup JEMALLOC_N(arena_cleanup) +#define arena_init JEMALLOC_N(arena_init) +#define arena_migrate JEMALLOC_N(arena_migrate) +#define arena_set JEMALLOC_N(arena_set) +#define arenas JEMALLOC_N(arenas) +#define arenas_lock JEMALLOC_N(arenas_lock) +#define batch_alloc JEMALLOC_N(batch_alloc) +#define bootstrap_calloc JEMALLOC_N(bootstrap_calloc) +#define bootstrap_free JEMALLOC_N(bootstrap_free) +#define bootstrap_malloc JEMALLOC_N(bootstrap_malloc) +#define free_default JEMALLOC_N(free_default) +#define iarena_cleanup JEMALLOC_N(iarena_cleanup) +#define je_sdallocx_noflags JEMALLOC_N(je_sdallocx_noflags) +#define jemalloc_postfork_child JEMALLOC_N(jemalloc_postfork_child) +#define jemalloc_postfork_parent JEMALLOC_N(jemalloc_postfork_parent) +#define jemalloc_prefork JEMALLOC_N(jemalloc_prefork) +#define junk_alloc_callback JEMALLOC_N(junk_alloc_callback) +#define junk_free_callback JEMALLOC_N(junk_free_callback) +#define malloc_default JEMALLOC_N(malloc_default) +#define malloc_init_state JEMALLOC_N(malloc_init_state) +#define malloc_slow JEMALLOC_N(malloc_slow) +#define manual_arena_base JEMALLOC_N(manual_arena_base) +#define narenas_auto JEMALLOC_N(narenas_auto) +#define narenas_total_get JEMALLOC_N(narenas_total_get) +#define ncpus JEMALLOC_N(ncpus) +#define opt_abort JEMALLOC_N(opt_abort) +#define opt_abort_conf JEMALLOC_N(opt_abort_conf) +#define opt_cache_oblivious JEMALLOC_N(opt_cache_oblivious) +#define opt_confirm_conf JEMALLOC_N(opt_confirm_conf) +#define opt_experimental_infallible_new JEMALLOC_N(opt_experimental_infallible_new) +#define opt_hpa JEMALLOC_N(opt_hpa) +#define opt_hpa_opts JEMALLOC_N(opt_hpa_opts) +#define opt_hpa_sec_opts JEMALLOC_N(opt_hpa_sec_opts) +#define opt_junk JEMALLOC_N(opt_junk) +#define opt_junk_alloc JEMALLOC_N(opt_junk_alloc) +#define opt_junk_free JEMALLOC_N(opt_junk_free) +#define opt_narenas JEMALLOC_N(opt_narenas) +#define opt_narenas_ratio JEMALLOC_N(opt_narenas_ratio) +#define opt_trust_madvise JEMALLOC_N(opt_trust_madvise) +#define opt_utrace JEMALLOC_N(opt_utrace) +#define opt_xmalloc JEMALLOC_N(opt_xmalloc) +#define opt_zero JEMALLOC_N(opt_zero) +#define opt_zero_realloc_action JEMALLOC_N(opt_zero_realloc_action) +#define sdallocx_default JEMALLOC_N(sdallocx_default) +#define zero_realloc_count JEMALLOC_N(zero_realloc_count) +#define zero_realloc_mode_names JEMALLOC_N(zero_realloc_mode_names) +#define arena_basic_stats_merge JEMALLOC_N(arena_basic_stats_merge) +#define arena_bin_choose JEMALLOC_N(arena_bin_choose) +#define arena_bin_offsets JEMALLOC_N(arena_bin_offsets) +#define arena_binind_div_info JEMALLOC_N(arena_binind_div_info) +#define arena_boot JEMALLOC_N(arena_boot) +#define arena_cache_bin_fill_small JEMALLOC_N(arena_cache_bin_fill_small) +#define arena_choose_huge JEMALLOC_N(arena_choose_huge) +#define arena_config_default JEMALLOC_N(arena_config_default) +#define arena_dalloc_bin_locked_handle_newly_empty JEMALLOC_N(arena_dalloc_bin_locked_handle_newly_empty) +#define arena_dalloc_bin_locked_handle_newly_nonempty JEMALLOC_N(arena_dalloc_bin_locked_handle_newly_nonempty) +#define arena_dalloc_promoted JEMALLOC_N(arena_dalloc_promoted) +#define arena_dalloc_small JEMALLOC_N(arena_dalloc_small) +#define arena_decay JEMALLOC_N(arena_decay) +#define arena_decay_ms_get JEMALLOC_N(arena_decay_ms_get) +#define arena_decay_ms_set JEMALLOC_N(arena_decay_ms_set) +#define arena_destroy JEMALLOC_N(arena_destroy) +#define 
arena_dirty_decay_ms_default_get JEMALLOC_N(arena_dirty_decay_ms_default_get) +#define arena_dirty_decay_ms_default_set JEMALLOC_N(arena_dirty_decay_ms_default_set) +#define arena_do_deferred_work JEMALLOC_N(arena_do_deferred_work) +#define arena_dss_prec_get JEMALLOC_N(arena_dss_prec_get) +#define arena_dss_prec_set JEMALLOC_N(arena_dss_prec_set) +#define arena_emap_global JEMALLOC_N(arena_emap_global) +#define arena_extent_alloc_large JEMALLOC_N(arena_extent_alloc_large) +#define arena_extent_dalloc_large_prep JEMALLOC_N(arena_extent_dalloc_large_prep) +#define arena_extent_ralloc_large_expand JEMALLOC_N(arena_extent_ralloc_large_expand) +#define arena_extent_ralloc_large_shrink JEMALLOC_N(arena_extent_ralloc_large_shrink) +#define arena_fill_small_fresh JEMALLOC_N(arena_fill_small_fresh) +#define arena_get_ehooks JEMALLOC_N(arena_get_ehooks) +#define arena_handle_deferred_work JEMALLOC_N(arena_handle_deferred_work) +#define arena_init_huge JEMALLOC_N(arena_init_huge) +#define arena_is_huge JEMALLOC_N(arena_is_huge) +#define arena_malloc_hard JEMALLOC_N(arena_malloc_hard) +#define arena_muzzy_decay_ms_default_get JEMALLOC_N(arena_muzzy_decay_ms_default_get) +#define arena_muzzy_decay_ms_default_set JEMALLOC_N(arena_muzzy_decay_ms_default_set) +#define arena_new JEMALLOC_N(arena_new) +#define arena_nthreads_dec JEMALLOC_N(arena_nthreads_dec) +#define arena_nthreads_get JEMALLOC_N(arena_nthreads_get) +#define arena_nthreads_inc JEMALLOC_N(arena_nthreads_inc) +#define arena_pa_central_global JEMALLOC_N(arena_pa_central_global) +#define arena_palloc JEMALLOC_N(arena_palloc) +#define arena_postfork_child JEMALLOC_N(arena_postfork_child) +#define arena_postfork_parent JEMALLOC_N(arena_postfork_parent) +#define arena_prefork0 JEMALLOC_N(arena_prefork0) +#define arena_prefork1 JEMALLOC_N(arena_prefork1) +#define arena_prefork2 JEMALLOC_N(arena_prefork2) +#define arena_prefork3 JEMALLOC_N(arena_prefork3) +#define arena_prefork4 JEMALLOC_N(arena_prefork4) +#define arena_prefork5 JEMALLOC_N(arena_prefork5) +#define arena_prefork6 JEMALLOC_N(arena_prefork6) +#define arena_prefork7 JEMALLOC_N(arena_prefork7) +#define arena_prefork8 JEMALLOC_N(arena_prefork8) +#define arena_prof_promote JEMALLOC_N(arena_prof_promote) +#define arena_ralloc JEMALLOC_N(arena_ralloc) +#define arena_ralloc_no_move JEMALLOC_N(arena_ralloc_no_move) +#define arena_reset JEMALLOC_N(arena_reset) +#define arena_retain_grow_limit_get_set JEMALLOC_N(arena_retain_grow_limit_get_set) +#define arena_set_extent_hooks JEMALLOC_N(arena_set_extent_hooks) +#define arena_slab_dalloc JEMALLOC_N(arena_slab_dalloc) +#define arena_stats_merge JEMALLOC_N(arena_stats_merge) +#define opt_dirty_decay_ms JEMALLOC_N(opt_dirty_decay_ms) +#define opt_muzzy_decay_ms JEMALLOC_N(opt_muzzy_decay_ms) +#define opt_oversize_threshold JEMALLOC_N(opt_oversize_threshold) +#define opt_percpu_arena JEMALLOC_N(opt_percpu_arena) +#define oversize_threshold JEMALLOC_N(oversize_threshold) +#define percpu_arena_mode_names JEMALLOC_N(percpu_arena_mode_names) +#define background_thread_boot0 JEMALLOC_N(background_thread_boot0) +#define background_thread_boot1 JEMALLOC_N(background_thread_boot1) +#define background_thread_create JEMALLOC_N(background_thread_create) +#define background_thread_ctl_init JEMALLOC_N(background_thread_ctl_init) +#define background_thread_enabled_state JEMALLOC_N(background_thread_enabled_state) +#define background_thread_info JEMALLOC_N(background_thread_info) +#define background_thread_is_started JEMALLOC_N(background_thread_is_started) 
+#define background_thread_lock JEMALLOC_N(background_thread_lock) +#define background_thread_postfork_child JEMALLOC_N(background_thread_postfork_child) +#define background_thread_postfork_parent JEMALLOC_N(background_thread_postfork_parent) +#define background_thread_prefork0 JEMALLOC_N(background_thread_prefork0) +#define background_thread_prefork1 JEMALLOC_N(background_thread_prefork1) +#define background_thread_stats_read JEMALLOC_N(background_thread_stats_read) +#define background_thread_wakeup_early JEMALLOC_N(background_thread_wakeup_early) +#define background_threads_disable JEMALLOC_N(background_threads_disable) +#define background_threads_enable JEMALLOC_N(background_threads_enable) +#define max_background_threads JEMALLOC_N(max_background_threads) +#define n_background_threads JEMALLOC_N(n_background_threads) +#define opt_background_thread JEMALLOC_N(opt_background_thread) +#define opt_max_background_threads JEMALLOC_N(opt_max_background_threads) +#define b0get JEMALLOC_N(b0get) +#define base_alloc JEMALLOC_N(base_alloc) +#define base_alloc_edata JEMALLOC_N(base_alloc_edata) +#define base_boot JEMALLOC_N(base_boot) +#define base_delete JEMALLOC_N(base_delete) +#define base_ehooks_get JEMALLOC_N(base_ehooks_get) +#define base_ehooks_get_for_metadata JEMALLOC_N(base_ehooks_get_for_metadata) +#define base_extent_hooks_set JEMALLOC_N(base_extent_hooks_set) +#define base_new JEMALLOC_N(base_new) +#define base_postfork_child JEMALLOC_N(base_postfork_child) +#define base_postfork_parent JEMALLOC_N(base_postfork_parent) +#define base_prefork JEMALLOC_N(base_prefork) +#define base_stats_get JEMALLOC_N(base_stats_get) +#define metadata_thp_mode_names JEMALLOC_N(metadata_thp_mode_names) +#define opt_metadata_thp JEMALLOC_N(opt_metadata_thp) +#define bin_init JEMALLOC_N(bin_init) +#define bin_postfork_child JEMALLOC_N(bin_postfork_child) +#define bin_postfork_parent JEMALLOC_N(bin_postfork_parent) +#define bin_prefork JEMALLOC_N(bin_prefork) +#define bin_shard_sizes_boot JEMALLOC_N(bin_shard_sizes_boot) +#define bin_update_shard_size JEMALLOC_N(bin_update_shard_size) +#define bin_info_boot JEMALLOC_N(bin_info_boot) +#define bin_infos JEMALLOC_N(bin_infos) +#define bitmap_info_init JEMALLOC_N(bitmap_info_init) +#define bitmap_init JEMALLOC_N(bitmap_init) +#define bitmap_size JEMALLOC_N(bitmap_size) +#define buf_writer_cb JEMALLOC_N(buf_writer_cb) +#define buf_writer_flush JEMALLOC_N(buf_writer_flush) +#define buf_writer_init JEMALLOC_N(buf_writer_init) +#define buf_writer_pipe JEMALLOC_N(buf_writer_pipe) +#define buf_writer_terminate JEMALLOC_N(buf_writer_terminate) +#define cache_bin_info_compute_alloc JEMALLOC_N(cache_bin_info_compute_alloc) +#define cache_bin_info_init JEMALLOC_N(cache_bin_info_init) +#define cache_bin_init JEMALLOC_N(cache_bin_init) +#define cache_bin_postincrement JEMALLOC_N(cache_bin_postincrement) +#define cache_bin_preincrement JEMALLOC_N(cache_bin_preincrement) +#define cache_bin_still_zero_initialized JEMALLOC_N(cache_bin_still_zero_initialized) +#define ckh_count JEMALLOC_N(ckh_count) +#define ckh_delete JEMALLOC_N(ckh_delete) +#define ckh_insert JEMALLOC_N(ckh_insert) +#define ckh_iter JEMALLOC_N(ckh_iter) +#define ckh_new JEMALLOC_N(ckh_new) +#define ckh_pointer_hash JEMALLOC_N(ckh_pointer_hash) +#define ckh_pointer_keycomp JEMALLOC_N(ckh_pointer_keycomp) +#define ckh_remove JEMALLOC_N(ckh_remove) +#define ckh_search JEMALLOC_N(ckh_search) +#define ckh_string_hash JEMALLOC_N(ckh_string_hash) +#define ckh_string_keycomp JEMALLOC_N(ckh_string_keycomp) +#define 
counter_accum_init JEMALLOC_N(counter_accum_init) +#define counter_postfork_child JEMALLOC_N(counter_postfork_child) +#define counter_postfork_parent JEMALLOC_N(counter_postfork_parent) +#define counter_prefork JEMALLOC_N(counter_prefork) +#define ctl_boot JEMALLOC_N(ctl_boot) +#define ctl_bymib JEMALLOC_N(ctl_bymib) +#define ctl_bymibname JEMALLOC_N(ctl_bymibname) +#define ctl_byname JEMALLOC_N(ctl_byname) +#define ctl_mibnametomib JEMALLOC_N(ctl_mibnametomib) +#define ctl_mtx_assert_held JEMALLOC_N(ctl_mtx_assert_held) +#define ctl_nametomib JEMALLOC_N(ctl_nametomib) +#define ctl_postfork_child JEMALLOC_N(ctl_postfork_child) +#define ctl_postfork_parent JEMALLOC_N(ctl_postfork_parent) +#define ctl_prefork JEMALLOC_N(ctl_prefork) +#define decay_deadline_init JEMALLOC_N(decay_deadline_init) +#define decay_init JEMALLOC_N(decay_init) +#define decay_maybe_advance_epoch JEMALLOC_N(decay_maybe_advance_epoch) +#define decay_ms_valid JEMALLOC_N(decay_ms_valid) +#define decay_npages_purge_in JEMALLOC_N(decay_npages_purge_in) +#define decay_ns_until_purge JEMALLOC_N(decay_ns_until_purge) +#define decay_reinit JEMALLOC_N(decay_reinit) +#define div_init JEMALLOC_N(div_init) +#define ecache_init JEMALLOC_N(ecache_init) +#define ecache_postfork_child JEMALLOC_N(ecache_postfork_child) +#define ecache_postfork_parent JEMALLOC_N(ecache_postfork_parent) +#define ecache_prefork JEMALLOC_N(ecache_prefork) +#define edata_avail_any JEMALLOC_N(edata_avail_any) +#define edata_avail_empty JEMALLOC_N(edata_avail_empty) +#define edata_avail_first JEMALLOC_N(edata_avail_first) +#define edata_avail_insert JEMALLOC_N(edata_avail_insert) +#define edata_avail_new JEMALLOC_N(edata_avail_new) +#define edata_avail_remove JEMALLOC_N(edata_avail_remove) +#define edata_avail_remove_any JEMALLOC_N(edata_avail_remove_any) +#define edata_avail_remove_first JEMALLOC_N(edata_avail_remove_first) +#define edata_heap_any JEMALLOC_N(edata_heap_any) +#define edata_heap_empty JEMALLOC_N(edata_heap_empty) +#define edata_heap_first JEMALLOC_N(edata_heap_first) +#define edata_heap_insert JEMALLOC_N(edata_heap_insert) +#define edata_heap_new JEMALLOC_N(edata_heap_new) +#define edata_heap_remove JEMALLOC_N(edata_heap_remove) +#define edata_heap_remove_any JEMALLOC_N(edata_heap_remove_any) +#define edata_heap_remove_first JEMALLOC_N(edata_heap_remove_first) +#define edata_cache_fast_disable JEMALLOC_N(edata_cache_fast_disable) +#define edata_cache_fast_get JEMALLOC_N(edata_cache_fast_get) +#define edata_cache_fast_init JEMALLOC_N(edata_cache_fast_init) +#define edata_cache_fast_put JEMALLOC_N(edata_cache_fast_put) +#define edata_cache_get JEMALLOC_N(edata_cache_get) +#define edata_cache_init JEMALLOC_N(edata_cache_init) +#define edata_cache_postfork_child JEMALLOC_N(edata_cache_postfork_child) +#define edata_cache_postfork_parent JEMALLOC_N(edata_cache_postfork_parent) +#define edata_cache_prefork JEMALLOC_N(edata_cache_prefork) +#define edata_cache_put JEMALLOC_N(edata_cache_put) +#define ehooks_default_alloc_impl JEMALLOC_N(ehooks_default_alloc_impl) +#define ehooks_default_commit_impl JEMALLOC_N(ehooks_default_commit_impl) +#define ehooks_default_dalloc_impl JEMALLOC_N(ehooks_default_dalloc_impl) +#define ehooks_default_decommit_impl JEMALLOC_N(ehooks_default_decommit_impl) +#define ehooks_default_destroy_impl JEMALLOC_N(ehooks_default_destroy_impl) +#define ehooks_default_extent_hooks JEMALLOC_N(ehooks_default_extent_hooks) +#define ehooks_default_guard_impl JEMALLOC_N(ehooks_default_guard_impl) +#define ehooks_default_merge 
JEMALLOC_N(ehooks_default_merge) +#define ehooks_default_merge_impl JEMALLOC_N(ehooks_default_merge_impl) +#define ehooks_default_purge_forced_impl JEMALLOC_N(ehooks_default_purge_forced_impl) +#define ehooks_default_purge_lazy_impl JEMALLOC_N(ehooks_default_purge_lazy_impl) +#define ehooks_default_split_impl JEMALLOC_N(ehooks_default_split_impl) +#define ehooks_default_unguard_impl JEMALLOC_N(ehooks_default_unguard_impl) +#define ehooks_default_zero_impl JEMALLOC_N(ehooks_default_zero_impl) +#define ehooks_init JEMALLOC_N(ehooks_init) +#define emap_deregister_boundary JEMALLOC_N(emap_deregister_boundary) +#define emap_deregister_interior JEMALLOC_N(emap_deregister_interior) +#define emap_do_assert_mapped JEMALLOC_N(emap_do_assert_mapped) +#define emap_do_assert_not_mapped JEMALLOC_N(emap_do_assert_not_mapped) +#define emap_init JEMALLOC_N(emap_init) +#define emap_merge_commit JEMALLOC_N(emap_merge_commit) +#define emap_merge_prepare JEMALLOC_N(emap_merge_prepare) +#define emap_register_boundary JEMALLOC_N(emap_register_boundary) +#define emap_register_interior JEMALLOC_N(emap_register_interior) +#define emap_release_edata JEMALLOC_N(emap_release_edata) +#define emap_remap JEMALLOC_N(emap_remap) +#define emap_split_commit JEMALLOC_N(emap_split_commit) +#define emap_split_prepare JEMALLOC_N(emap_split_prepare) +#define emap_try_acquire_edata_neighbor JEMALLOC_N(emap_try_acquire_edata_neighbor) +#define emap_try_acquire_edata_neighbor_expand JEMALLOC_N(emap_try_acquire_edata_neighbor_expand) +#define emap_update_edata_state JEMALLOC_N(emap_update_edata_state) +#define eset_fit JEMALLOC_N(eset_fit) +#define eset_init JEMALLOC_N(eset_init) +#define eset_insert JEMALLOC_N(eset_insert) +#define eset_nbytes_get JEMALLOC_N(eset_nbytes_get) +#define eset_nextents_get JEMALLOC_N(eset_nextents_get) +#define eset_npages_get JEMALLOC_N(eset_npages_get) +#define eset_remove JEMALLOC_N(eset_remove) +#define exp_grow_init JEMALLOC_N(exp_grow_init) +#define ecache_alloc JEMALLOC_N(ecache_alloc) +#define ecache_alloc_grow JEMALLOC_N(ecache_alloc_grow) +#define ecache_dalloc JEMALLOC_N(ecache_dalloc) +#define ecache_evict JEMALLOC_N(ecache_evict) +#define extent_alloc_wrapper JEMALLOC_N(extent_alloc_wrapper) +#define extent_boot JEMALLOC_N(extent_boot) +#define extent_commit_wrapper JEMALLOC_N(extent_commit_wrapper) +#define extent_commit_zero JEMALLOC_N(extent_commit_zero) +#define extent_dalloc_gap JEMALLOC_N(extent_dalloc_gap) +#define extent_dalloc_wrapper JEMALLOC_N(extent_dalloc_wrapper) +#define extent_decommit_wrapper JEMALLOC_N(extent_decommit_wrapper) +#define extent_destroy_wrapper JEMALLOC_N(extent_destroy_wrapper) +#define extent_gdump_add JEMALLOC_N(extent_gdump_add) +#define extent_merge_wrapper JEMALLOC_N(extent_merge_wrapper) +#define extent_purge_forced_wrapper JEMALLOC_N(extent_purge_forced_wrapper) +#define extent_purge_lazy_wrapper JEMALLOC_N(extent_purge_lazy_wrapper) +#define extent_record JEMALLOC_N(extent_record) +#define extent_sn_next JEMALLOC_N(extent_sn_next) +#define extent_split_wrapper JEMALLOC_N(extent_split_wrapper) +#define opt_lg_extent_max_active_fit JEMALLOC_N(opt_lg_extent_max_active_fit) +#define dss_prec_names JEMALLOC_N(dss_prec_names) +#define extent_alloc_dss JEMALLOC_N(extent_alloc_dss) +#define extent_dss_boot JEMALLOC_N(extent_dss_boot) +#define extent_dss_mergeable JEMALLOC_N(extent_dss_mergeable) +#define extent_dss_prec_get JEMALLOC_N(extent_dss_prec_get) +#define extent_dss_prec_set JEMALLOC_N(extent_dss_prec_set) +#define extent_in_dss 
JEMALLOC_N(extent_in_dss) +#define opt_dss JEMALLOC_N(opt_dss) +#define extent_alloc_mmap JEMALLOC_N(extent_alloc_mmap) +#define extent_dalloc_mmap JEMALLOC_N(extent_dalloc_mmap) +#define opt_retain JEMALLOC_N(opt_retain) +#define fxp_parse JEMALLOC_N(fxp_parse) +#define fxp_print JEMALLOC_N(fxp_print) +#define opt_lg_san_uaf_align JEMALLOC_N(opt_lg_san_uaf_align) +#define opt_san_guard_large JEMALLOC_N(opt_san_guard_large) +#define opt_san_guard_small JEMALLOC_N(opt_san_guard_small) +#define san_cache_bin_nonfast_mask JEMALLOC_N(san_cache_bin_nonfast_mask) +#define san_check_stashed_ptrs JEMALLOC_N(san_check_stashed_ptrs) +#define san_guard_pages JEMALLOC_N(san_guard_pages) +#define san_init JEMALLOC_N(san_init) +#define san_unguard_pages JEMALLOC_N(san_unguard_pages) +#define san_unguard_pages_pre_destroy JEMALLOC_N(san_unguard_pages_pre_destroy) +#define tsd_san_init JEMALLOC_N(tsd_san_init) +#define san_bump_alloc JEMALLOC_N(san_bump_alloc) +#define hook_boot JEMALLOC_N(hook_boot) +#define hook_install JEMALLOC_N(hook_install) +#define hook_invoke_alloc JEMALLOC_N(hook_invoke_alloc) +#define hook_invoke_dalloc JEMALLOC_N(hook_invoke_dalloc) +#define hook_invoke_expand JEMALLOC_N(hook_invoke_expand) +#define hook_remove JEMALLOC_N(hook_remove) +#define hpa_central_extract JEMALLOC_N(hpa_central_extract) +#define hpa_central_init JEMALLOC_N(hpa_central_init) +#define hpa_shard_destroy JEMALLOC_N(hpa_shard_destroy) +#define hpa_shard_disable JEMALLOC_N(hpa_shard_disable) +#define hpa_shard_do_deferred_work JEMALLOC_N(hpa_shard_do_deferred_work) +#define hpa_shard_init JEMALLOC_N(hpa_shard_init) +#define hpa_shard_postfork_child JEMALLOC_N(hpa_shard_postfork_child) +#define hpa_shard_postfork_parent JEMALLOC_N(hpa_shard_postfork_parent) +#define hpa_shard_prefork3 JEMALLOC_N(hpa_shard_prefork3) +#define hpa_shard_prefork4 JEMALLOC_N(hpa_shard_prefork4) +#define hpa_shard_set_deferral_allowed JEMALLOC_N(hpa_shard_set_deferral_allowed) +#define hpa_shard_stats_accum JEMALLOC_N(hpa_shard_stats_accum) +#define hpa_shard_stats_merge JEMALLOC_N(hpa_shard_stats_merge) +#define hpa_supported JEMALLOC_N(hpa_supported) +#define hpa_hooks_default JEMALLOC_N(hpa_hooks_default) +#define hpdata_age_heap_any JEMALLOC_N(hpdata_age_heap_any) +#define hpdata_age_heap_empty JEMALLOC_N(hpdata_age_heap_empty) +#define hpdata_age_heap_first JEMALLOC_N(hpdata_age_heap_first) +#define hpdata_age_heap_insert JEMALLOC_N(hpdata_age_heap_insert) +#define hpdata_age_heap_new JEMALLOC_N(hpdata_age_heap_new) +#define hpdata_age_heap_remove JEMALLOC_N(hpdata_age_heap_remove) +#define hpdata_age_heap_remove_any JEMALLOC_N(hpdata_age_heap_remove_any) +#define hpdata_age_heap_remove_first JEMALLOC_N(hpdata_age_heap_remove_first) +#define hpdata_dehugify JEMALLOC_N(hpdata_dehugify) +#define hpdata_hugify JEMALLOC_N(hpdata_hugify) +#define hpdata_init JEMALLOC_N(hpdata_init) +#define hpdata_purge_begin JEMALLOC_N(hpdata_purge_begin) +#define hpdata_purge_end JEMALLOC_N(hpdata_purge_end) +#define hpdata_purge_next JEMALLOC_N(hpdata_purge_next) +#define hpdata_reserve_alloc JEMALLOC_N(hpdata_reserve_alloc) +#define hpdata_unreserve JEMALLOC_N(hpdata_unreserve) +#define inspect_extent_util_stats_get JEMALLOC_N(inspect_extent_util_stats_get) +#define inspect_extent_util_stats_verbose_get JEMALLOC_N(inspect_extent_util_stats_verbose_get) +#define large_dalloc JEMALLOC_N(large_dalloc) +#define large_dalloc_finish JEMALLOC_N(large_dalloc_finish) +#define large_dalloc_prep_locked JEMALLOC_N(large_dalloc_prep_locked) +#define 
large_malloc JEMALLOC_N(large_malloc) +#define large_palloc JEMALLOC_N(large_palloc) +#define large_prof_info_get JEMALLOC_N(large_prof_info_get) +#define large_prof_info_set JEMALLOC_N(large_prof_info_set) +#define large_prof_tctx_reset JEMALLOC_N(large_prof_tctx_reset) +#define large_ralloc JEMALLOC_N(large_ralloc) +#define large_ralloc_no_move JEMALLOC_N(large_ralloc_no_move) +#define large_salloc JEMALLOC_N(large_salloc) +#define log_init_done JEMALLOC_N(log_init_done) +#define log_var_names JEMALLOC_N(log_var_names) +#define log_var_update_state JEMALLOC_N(log_var_update_state) +#define buferror JEMALLOC_N(buferror) +#define malloc_cprintf JEMALLOC_N(malloc_cprintf) +#define malloc_printf JEMALLOC_N(malloc_printf) +#define malloc_snprintf JEMALLOC_N(malloc_snprintf) +#define malloc_strtoumax JEMALLOC_N(malloc_strtoumax) +#define malloc_vcprintf JEMALLOC_N(malloc_vcprintf) +#define malloc_vsnprintf JEMALLOC_N(malloc_vsnprintf) +#define malloc_write JEMALLOC_N(malloc_write) +#define wrtmessage JEMALLOC_N(wrtmessage) +#define malloc_mutex_boot JEMALLOC_N(malloc_mutex_boot) +#define malloc_mutex_init JEMALLOC_N(malloc_mutex_init) +#define malloc_mutex_lock_slow JEMALLOC_N(malloc_mutex_lock_slow) +#define malloc_mutex_postfork_child JEMALLOC_N(malloc_mutex_postfork_child) +#define malloc_mutex_postfork_parent JEMALLOC_N(malloc_mutex_postfork_parent) +#define malloc_mutex_prefork JEMALLOC_N(malloc_mutex_prefork) +#define malloc_mutex_prof_data_reset JEMALLOC_N(malloc_mutex_prof_data_reset) +#define opt_mutex_max_spin JEMALLOC_N(opt_mutex_max_spin) +#define nstime_add JEMALLOC_N(nstime_add) +#define nstime_compare JEMALLOC_N(nstime_compare) +#define nstime_copy JEMALLOC_N(nstime_copy) +#define nstime_divide JEMALLOC_N(nstime_divide) +#define nstime_iadd JEMALLOC_N(nstime_iadd) +#define nstime_idivide JEMALLOC_N(nstime_idivide) +#define nstime_imultiply JEMALLOC_N(nstime_imultiply) +#define nstime_init JEMALLOC_N(nstime_init) +#define nstime_init2 JEMALLOC_N(nstime_init2) +#define nstime_init_update JEMALLOC_N(nstime_init_update) +#define nstime_isubtract JEMALLOC_N(nstime_isubtract) +#define nstime_monotonic JEMALLOC_N(nstime_monotonic) +#define nstime_msec JEMALLOC_N(nstime_msec) +#define nstime_ns JEMALLOC_N(nstime_ns) +#define nstime_ns_since JEMALLOC_N(nstime_ns_since) +#define nstime_nsec JEMALLOC_N(nstime_nsec) +#define nstime_prof_init_update JEMALLOC_N(nstime_prof_init_update) +#define nstime_prof_update JEMALLOC_N(nstime_prof_update) +#define nstime_sec JEMALLOC_N(nstime_sec) +#define nstime_subtract JEMALLOC_N(nstime_subtract) +#define nstime_update JEMALLOC_N(nstime_update) +#define opt_prof_time_res JEMALLOC_N(opt_prof_time_res) +#define prof_time_res_mode_names JEMALLOC_N(prof_time_res_mode_names) +#define pa_alloc JEMALLOC_N(pa_alloc) +#define pa_central_init JEMALLOC_N(pa_central_init) +#define pa_dalloc JEMALLOC_N(pa_dalloc) +#define pa_decay_ms_get JEMALLOC_N(pa_decay_ms_get) +#define pa_decay_ms_set JEMALLOC_N(pa_decay_ms_set) +#define pa_expand JEMALLOC_N(pa_expand) +#define pa_shard_destroy JEMALLOC_N(pa_shard_destroy) +#define pa_shard_disable_hpa JEMALLOC_N(pa_shard_disable_hpa) +#define pa_shard_do_deferred_work JEMALLOC_N(pa_shard_do_deferred_work) +#define pa_shard_enable_hpa JEMALLOC_N(pa_shard_enable_hpa) +#define pa_shard_init JEMALLOC_N(pa_shard_init) +#define pa_shard_reset JEMALLOC_N(pa_shard_reset) +#define pa_shard_retain_grow_limit_get_set JEMALLOC_N(pa_shard_retain_grow_limit_get_set) +#define pa_shard_set_deferral_allowed 
JEMALLOC_N(pa_shard_set_deferral_allowed) +#define pa_shard_time_until_deferred_work JEMALLOC_N(pa_shard_time_until_deferred_work) +#define pa_shrink JEMALLOC_N(pa_shrink) +#define pa_shard_basic_stats_merge JEMALLOC_N(pa_shard_basic_stats_merge) +#define pa_shard_mtx_stats_read JEMALLOC_N(pa_shard_mtx_stats_read) +#define pa_shard_postfork_child JEMALLOC_N(pa_shard_postfork_child) +#define pa_shard_postfork_parent JEMALLOC_N(pa_shard_postfork_parent) +#define pa_shard_prefork0 JEMALLOC_N(pa_shard_prefork0) +#define pa_shard_prefork2 JEMALLOC_N(pa_shard_prefork2) +#define pa_shard_prefork3 JEMALLOC_N(pa_shard_prefork3) +#define pa_shard_prefork4 JEMALLOC_N(pa_shard_prefork4) +#define pa_shard_prefork5 JEMALLOC_N(pa_shard_prefork5) +#define pa_shard_stats_merge JEMALLOC_N(pa_shard_stats_merge) +#define pai_alloc_batch_default JEMALLOC_N(pai_alloc_batch_default) +#define pai_dalloc_batch_default JEMALLOC_N(pai_dalloc_batch_default) +#define pac_decay_all JEMALLOC_N(pac_decay_all) +#define pac_decay_ms_get JEMALLOC_N(pac_decay_ms_get) +#define pac_decay_ms_set JEMALLOC_N(pac_decay_ms_set) +#define pac_destroy JEMALLOC_N(pac_destroy) +#define pac_init JEMALLOC_N(pac_init) +#define pac_maybe_decay_purge JEMALLOC_N(pac_maybe_decay_purge) +#define pac_reset JEMALLOC_N(pac_reset) +#define pac_retain_grow_limit_get_set JEMALLOC_N(pac_retain_grow_limit_get_set) +#define init_system_thp_mode JEMALLOC_N(init_system_thp_mode) +#define opt_thp JEMALLOC_N(opt_thp) +#define pages_boot JEMALLOC_N(pages_boot) +#define pages_commit JEMALLOC_N(pages_commit) +#define pages_decommit JEMALLOC_N(pages_decommit) +#define pages_dodump JEMALLOC_N(pages_dodump) +#define pages_dontdump JEMALLOC_N(pages_dontdump) +#define pages_huge JEMALLOC_N(pages_huge) +#define pages_map JEMALLOC_N(pages_map) +#define pages_mark_guards JEMALLOC_N(pages_mark_guards) +#define pages_nohuge JEMALLOC_N(pages_nohuge) +#define pages_purge_forced JEMALLOC_N(pages_purge_forced) +#define pages_purge_lazy JEMALLOC_N(pages_purge_lazy) +#define pages_set_thp_state JEMALLOC_N(pages_set_thp_state) +#define pages_unmap JEMALLOC_N(pages_unmap) +#define pages_unmark_guards JEMALLOC_N(pages_unmark_guards) +#define thp_mode_names JEMALLOC_N(thp_mode_names) +#define peak_alloc_event_handler JEMALLOC_N(peak_alloc_event_handler) +#define peak_alloc_new_event_wait JEMALLOC_N(peak_alloc_new_event_wait) +#define peak_alloc_postponed_event_wait JEMALLOC_N(peak_alloc_postponed_event_wait) +#define peak_dalloc_event_handler JEMALLOC_N(peak_dalloc_event_handler) +#define peak_dalloc_new_event_wait JEMALLOC_N(peak_dalloc_new_event_wait) +#define peak_dalloc_postponed_event_wait JEMALLOC_N(peak_dalloc_postponed_event_wait) +#define peak_event_max JEMALLOC_N(peak_event_max) +#define peak_event_update JEMALLOC_N(peak_event_update) +#define peak_event_zero JEMALLOC_N(peak_event_zero) +#define lg_prof_sample JEMALLOC_N(lg_prof_sample) +#define opt_lg_prof_interval JEMALLOC_N(opt_lg_prof_interval) +#define opt_lg_prof_sample JEMALLOC_N(opt_lg_prof_sample) +#define opt_prof JEMALLOC_N(opt_prof) +#define opt_prof_accum JEMALLOC_N(opt_prof_accum) +#define opt_prof_active JEMALLOC_N(opt_prof_active) +#define opt_prof_final JEMALLOC_N(opt_prof_final) +#define opt_prof_gdump JEMALLOC_N(opt_prof_gdump) +#define opt_prof_leak JEMALLOC_N(opt_prof_leak) +#define opt_prof_leak_error JEMALLOC_N(opt_prof_leak_error) +#define opt_prof_prefix JEMALLOC_N(opt_prof_prefix) +#define opt_prof_sys_thread_name JEMALLOC_N(opt_prof_sys_thread_name) +#define opt_prof_thread_active_init 
JEMALLOC_N(opt_prof_thread_active_init) +#define opt_prof_unbias JEMALLOC_N(opt_prof_unbias) +#define prof_active_get JEMALLOC_N(prof_active_get) +#define prof_active_set JEMALLOC_N(prof_active_set) +#define prof_active_state JEMALLOC_N(prof_active_state) +#define prof_alloc_rollback JEMALLOC_N(prof_alloc_rollback) +#define prof_backtrace_hook JEMALLOC_N(prof_backtrace_hook) +#define prof_backtrace_hook_get JEMALLOC_N(prof_backtrace_hook_get) +#define prof_backtrace_hook_set JEMALLOC_N(prof_backtrace_hook_set) +#define prof_boot0 JEMALLOC_N(prof_boot0) +#define prof_boot1 JEMALLOC_N(prof_boot1) +#define prof_boot2 JEMALLOC_N(prof_boot2) +#define prof_booted JEMALLOC_N(prof_booted) +#define prof_dump_hook JEMALLOC_N(prof_dump_hook) +#define prof_dump_hook_get JEMALLOC_N(prof_dump_hook_get) +#define prof_dump_hook_set JEMALLOC_N(prof_dump_hook_set) +#define prof_free_sampled_object JEMALLOC_N(prof_free_sampled_object) +#define prof_gdump JEMALLOC_N(prof_gdump) +#define prof_gdump_get JEMALLOC_N(prof_gdump_get) +#define prof_gdump_set JEMALLOC_N(prof_gdump_set) +#define prof_gdump_val JEMALLOC_N(prof_gdump_val) +#define prof_idump JEMALLOC_N(prof_idump) +#define prof_interval JEMALLOC_N(prof_interval) +#define prof_malloc_sample_object JEMALLOC_N(prof_malloc_sample_object) +#define prof_mdump JEMALLOC_N(prof_mdump) +#define prof_postfork_child JEMALLOC_N(prof_postfork_child) +#define prof_postfork_parent JEMALLOC_N(prof_postfork_parent) +#define prof_prefork0 JEMALLOC_N(prof_prefork0) +#define prof_prefork1 JEMALLOC_N(prof_prefork1) +#define prof_sample_event_handler JEMALLOC_N(prof_sample_event_handler) +#define prof_sample_new_event_wait JEMALLOC_N(prof_sample_new_event_wait) +#define prof_sample_postponed_event_wait JEMALLOC_N(prof_sample_postponed_event_wait) +#define prof_tctx_create JEMALLOC_N(prof_tctx_create) +#define prof_tdata_cleanup JEMALLOC_N(prof_tdata_cleanup) +#define prof_tdata_init JEMALLOC_N(prof_tdata_init) +#define prof_tdata_reinit JEMALLOC_N(prof_tdata_reinit) +#define prof_thread_active_get JEMALLOC_N(prof_thread_active_get) +#define prof_thread_active_init_get JEMALLOC_N(prof_thread_active_init_get) +#define prof_thread_active_init_set JEMALLOC_N(prof_thread_active_init_set) +#define prof_thread_active_set JEMALLOC_N(prof_thread_active_set) +#define prof_thread_name_get JEMALLOC_N(prof_thread_name_get) +#define prof_thread_name_set JEMALLOC_N(prof_thread_name_set) +#define bt2gctx_mtx JEMALLOC_N(bt2gctx_mtx) +#define gctx_locks JEMALLOC_N(gctx_locks) +#define prof_bt_count JEMALLOC_N(prof_bt_count) +#define prof_bt_hash JEMALLOC_N(prof_bt_hash) +#define prof_bt_keycomp JEMALLOC_N(prof_bt_keycomp) +#define prof_cnt_all JEMALLOC_N(prof_cnt_all) +#define prof_data_init JEMALLOC_N(prof_data_init) +#define prof_dump_impl JEMALLOC_N(prof_dump_impl) +#define prof_dump_mtx JEMALLOC_N(prof_dump_mtx) +#define prof_lookup JEMALLOC_N(prof_lookup) +#define prof_reset JEMALLOC_N(prof_reset) +#define prof_shifted_unbiased_cnt JEMALLOC_N(prof_shifted_unbiased_cnt) +#define prof_tctx_try_destroy JEMALLOC_N(prof_tctx_try_destroy) +#define prof_tdata_count JEMALLOC_N(prof_tdata_count) +#define prof_tdata_detach JEMALLOC_N(prof_tdata_detach) +#define prof_tdata_init_impl JEMALLOC_N(prof_tdata_init_impl) +#define prof_thread_name_alloc JEMALLOC_N(prof_thread_name_alloc) +#define prof_thread_name_set_impl JEMALLOC_N(prof_thread_name_set_impl) +#define prof_unbias_map_init JEMALLOC_N(prof_unbias_map_init) +#define prof_unbiased_sz JEMALLOC_N(prof_unbiased_sz) +#define tdata_locks 
JEMALLOC_N(tdata_locks) +#define tdatas_mtx JEMALLOC_N(tdatas_mtx) +#define log_mtx JEMALLOC_N(log_mtx) +#define opt_prof_log JEMALLOC_N(opt_prof_log) +#define prof_log_alloc_count JEMALLOC_N(prof_log_alloc_count) +#define prof_log_bt_count JEMALLOC_N(prof_log_bt_count) +#define prof_log_dummy_set JEMALLOC_N(prof_log_dummy_set) +#define prof_log_init JEMALLOC_N(prof_log_init) +#define prof_log_is_logging JEMALLOC_N(prof_log_is_logging) +#define prof_log_rep_check JEMALLOC_N(prof_log_rep_check) +#define prof_log_start JEMALLOC_N(prof_log_start) +#define prof_log_stop JEMALLOC_N(prof_log_stop) +#define prof_log_thr_count JEMALLOC_N(prof_log_thr_count) +#define prof_logging_state JEMALLOC_N(prof_logging_state) +#define prof_try_log JEMALLOC_N(prof_try_log) +#define edata_prof_recent_alloc_get_no_lock_test JEMALLOC_N(edata_prof_recent_alloc_get_no_lock_test) +#define edata_prof_recent_alloc_init JEMALLOC_N(edata_prof_recent_alloc_init) +#define opt_prof_recent_alloc_max JEMALLOC_N(opt_prof_recent_alloc_max) +#define prof_recent_alloc JEMALLOC_N(prof_recent_alloc) +#define prof_recent_alloc_dump JEMALLOC_N(prof_recent_alloc_dump) +#define prof_recent_alloc_edata_get_no_lock_test JEMALLOC_N(prof_recent_alloc_edata_get_no_lock_test) +#define prof_recent_alloc_list JEMALLOC_N(prof_recent_alloc_list) +#define prof_recent_alloc_max_ctl_read JEMALLOC_N(prof_recent_alloc_max_ctl_read) +#define prof_recent_alloc_max_ctl_write JEMALLOC_N(prof_recent_alloc_max_ctl_write) +#define prof_recent_alloc_mtx JEMALLOC_N(prof_recent_alloc_mtx) +#define prof_recent_alloc_prepare JEMALLOC_N(prof_recent_alloc_prepare) +#define prof_recent_alloc_reset JEMALLOC_N(prof_recent_alloc_reset) +#define prof_recent_dump_mtx JEMALLOC_N(prof_recent_dump_mtx) +#define prof_recent_init JEMALLOC_N(prof_recent_init) +#define opt_prof_stats JEMALLOC_N(opt_prof_stats) +#define prof_stats_dec JEMALLOC_N(prof_stats_dec) +#define prof_stats_get_accum JEMALLOC_N(prof_stats_get_accum) +#define prof_stats_get_live JEMALLOC_N(prof_stats_get_live) +#define prof_stats_inc JEMALLOC_N(prof_stats_inc) +#define prof_stats_mtx JEMALLOC_N(prof_stats_mtx) +#define bt_init JEMALLOC_N(bt_init) +#define prof_backtrace JEMALLOC_N(prof_backtrace) +#define prof_base JEMALLOC_N(prof_base) +#define prof_do_mock JEMALLOC_N(prof_do_mock) +#define prof_dump_filename_mtx JEMALLOC_N(prof_dump_filename_mtx) +#define prof_dump_open_file JEMALLOC_N(prof_dump_open_file) +#define prof_dump_open_maps JEMALLOC_N(prof_dump_open_maps) +#define prof_dump_write_file JEMALLOC_N(prof_dump_write_file) +#define prof_fdump_impl JEMALLOC_N(prof_fdump_impl) +#define prof_gdump_impl JEMALLOC_N(prof_gdump_impl) +#define prof_get_default_filename JEMALLOC_N(prof_get_default_filename) +#define prof_getpid JEMALLOC_N(prof_getpid) +#define prof_hooks_init JEMALLOC_N(prof_hooks_init) +#define prof_idump_impl JEMALLOC_N(prof_idump_impl) +#define prof_mdump_impl JEMALLOC_N(prof_mdump_impl) +#define prof_prefix_set JEMALLOC_N(prof_prefix_set) +#define prof_sys_thread_name_fetch JEMALLOC_N(prof_sys_thread_name_fetch) +#define prof_sys_thread_name_read JEMALLOC_N(prof_sys_thread_name_read) +#define prof_unwind_init JEMALLOC_N(prof_unwind_init) +#define psset_init JEMALLOC_N(psset_init) +#define psset_insert JEMALLOC_N(psset_insert) +#define psset_pick_alloc JEMALLOC_N(psset_pick_alloc) +#define psset_pick_hugify JEMALLOC_N(psset_pick_hugify) +#define psset_pick_purge JEMALLOC_N(psset_pick_purge) +#define psset_remove JEMALLOC_N(psset_remove) +#define psset_stats_accum 
JEMALLOC_N(psset_stats_accum) +#define psset_update_begin JEMALLOC_N(psset_update_begin) +#define psset_update_end JEMALLOC_N(psset_update_end) +#define rtree_ctx_data_init JEMALLOC_N(rtree_ctx_data_init) +#define rtree_leaf_elm_lookup_hard JEMALLOC_N(rtree_leaf_elm_lookup_hard) +#define rtree_new JEMALLOC_N(rtree_new) +#define safety_check_fail JEMALLOC_N(safety_check_fail) +#define safety_check_fail_sized_dealloc JEMALLOC_N(safety_check_fail_sized_dealloc) +#define safety_check_set_abort JEMALLOC_N(safety_check_set_abort) +#define reg_size_compute JEMALLOC_N(reg_size_compute) +#define sc_boot JEMALLOC_N(sc_boot) +#define sc_data_init JEMALLOC_N(sc_data_init) +#define sc_data_update_slab_size JEMALLOC_N(sc_data_update_slab_size) +#define sec_disable JEMALLOC_N(sec_disable) +#define sec_flush JEMALLOC_N(sec_flush) +#define sec_init JEMALLOC_N(sec_init) +#define sec_mutex_stats_read JEMALLOC_N(sec_mutex_stats_read) +#define sec_postfork_child JEMALLOC_N(sec_postfork_child) +#define sec_postfork_parent JEMALLOC_N(sec_postfork_parent) +#define sec_prefork2 JEMALLOC_N(sec_prefork2) +#define sec_stats_merge JEMALLOC_N(sec_stats_merge) +#define arena_mutex_names JEMALLOC_N(arena_mutex_names) +#define global_mutex_names JEMALLOC_N(global_mutex_names) +#define opt_stats_interval JEMALLOC_N(opt_stats_interval) +#define opt_stats_interval_opts JEMALLOC_N(opt_stats_interval_opts) +#define opt_stats_print JEMALLOC_N(opt_stats_print) +#define opt_stats_print_opts JEMALLOC_N(opt_stats_print_opts) +#define stats_boot JEMALLOC_N(stats_boot) +#define stats_interval_event_handler JEMALLOC_N(stats_interval_event_handler) +#define stats_interval_new_event_wait JEMALLOC_N(stats_interval_new_event_wait) +#define stats_interval_postponed_event_wait JEMALLOC_N(stats_interval_postponed_event_wait) +#define stats_postfork_child JEMALLOC_N(stats_postfork_child) +#define stats_postfork_parent JEMALLOC_N(stats_postfork_parent) +#define stats_prefork JEMALLOC_N(stats_prefork) +#define stats_print JEMALLOC_N(stats_print) +#define sz_boot JEMALLOC_N(sz_boot) +#define sz_index2size_tab JEMALLOC_N(sz_index2size_tab) +#define sz_large_pad JEMALLOC_N(sz_large_pad) +#define sz_pind2sz_tab JEMALLOC_N(sz_pind2sz_tab) +#define sz_psz_quantize_ceil JEMALLOC_N(sz_psz_quantize_ceil) +#define sz_psz_quantize_floor JEMALLOC_N(sz_psz_quantize_floor) +#define sz_size2index_tab JEMALLOC_N(sz_size2index_tab) +#define nhbins JEMALLOC_N(nhbins) +#define opt_lg_tcache_flush_large_div JEMALLOC_N(opt_lg_tcache_flush_large_div) +#define opt_lg_tcache_flush_small_div JEMALLOC_N(opt_lg_tcache_flush_small_div) +#define opt_lg_tcache_nslots_mul JEMALLOC_N(opt_lg_tcache_nslots_mul) +#define opt_tcache JEMALLOC_N(opt_tcache) +#define opt_tcache_gc_delay_bytes JEMALLOC_N(opt_tcache_gc_delay_bytes) +#define opt_tcache_gc_incr_bytes JEMALLOC_N(opt_tcache_gc_incr_bytes) +#define opt_tcache_max JEMALLOC_N(opt_tcache_max) +#define opt_tcache_nslots_large JEMALLOC_N(opt_tcache_nslots_large) +#define opt_tcache_nslots_small_max JEMALLOC_N(opt_tcache_nslots_small_max) +#define opt_tcache_nslots_small_min JEMALLOC_N(opt_tcache_nslots_small_min) +#define tcache_alloc_small_hard JEMALLOC_N(tcache_alloc_small_hard) +#define tcache_arena_associate JEMALLOC_N(tcache_arena_associate) +#define tcache_arena_reassociate JEMALLOC_N(tcache_arena_reassociate) +#define tcache_assert_initialized JEMALLOC_N(tcache_assert_initialized) +#define tcache_bin_flush_large JEMALLOC_N(tcache_bin_flush_large) +#define tcache_bin_flush_small JEMALLOC_N(tcache_bin_flush_small) +#define 
tcache_bin_flush_stashed JEMALLOC_N(tcache_bin_flush_stashed) +#define tcache_bin_info JEMALLOC_N(tcache_bin_info) +#define tcache_boot JEMALLOC_N(tcache_boot) +#define tcache_cleanup JEMALLOC_N(tcache_cleanup) +#define tcache_create_explicit JEMALLOC_N(tcache_create_explicit) +#define tcache_flush JEMALLOC_N(tcache_flush) +#define tcache_gc_dalloc_event_handler JEMALLOC_N(tcache_gc_dalloc_event_handler) +#define tcache_gc_dalloc_new_event_wait JEMALLOC_N(tcache_gc_dalloc_new_event_wait) +#define tcache_gc_dalloc_postponed_event_wait JEMALLOC_N(tcache_gc_dalloc_postponed_event_wait) +#define tcache_gc_event_handler JEMALLOC_N(tcache_gc_event_handler) +#define tcache_gc_new_event_wait JEMALLOC_N(tcache_gc_new_event_wait) +#define tcache_gc_postponed_event_wait JEMALLOC_N(tcache_gc_postponed_event_wait) +#define tcache_maxclass JEMALLOC_N(tcache_maxclass) +#define tcache_postfork_child JEMALLOC_N(tcache_postfork_child) +#define tcache_postfork_parent JEMALLOC_N(tcache_postfork_parent) +#define tcache_prefork JEMALLOC_N(tcache_prefork) +#define tcache_salloc JEMALLOC_N(tcache_salloc) +#define tcache_stats_merge JEMALLOC_N(tcache_stats_merge) +#define tcaches JEMALLOC_N(tcaches) +#define tcaches_create JEMALLOC_N(tcaches_create) +#define tcaches_destroy JEMALLOC_N(tcaches_destroy) +#define tcaches_flush JEMALLOC_N(tcaches_flush) +#define tsd_tcache_data_init JEMALLOC_N(tsd_tcache_data_init) +#define tsd_tcache_enabled_data_init JEMALLOC_N(tsd_tcache_enabled_data_init) +#define test_hooks_arena_new_hook JEMALLOC_N(test_hooks_arena_new_hook) +#define test_hooks_libc_hook JEMALLOC_N(test_hooks_libc_hook) +#define te_assert_invariants_debug JEMALLOC_N(te_assert_invariants_debug) +#define te_event_trigger JEMALLOC_N(te_event_trigger) +#define te_recompute_fast_threshold JEMALLOC_N(te_recompute_fast_threshold) +#define tsd_te_init JEMALLOC_N(tsd_te_init) +#define ticker_geom_table JEMALLOC_N(ticker_geom_table) +#define malloc_tsd_boot0 JEMALLOC_N(malloc_tsd_boot0) +#define malloc_tsd_boot1 JEMALLOC_N(malloc_tsd_boot1) +#define malloc_tsd_dalloc JEMALLOC_N(malloc_tsd_dalloc) +#define malloc_tsd_malloc JEMALLOC_N(malloc_tsd_malloc) +#define tsd_boot_wrapper JEMALLOC_N(tsd_boot_wrapper) +#define tsd_booted JEMALLOC_N(tsd_booted) +#define tsd_cleanup JEMALLOC_N(tsd_cleanup) +#define tsd_fetch_slow JEMALLOC_N(tsd_fetch_slow) +#define tsd_global_slow JEMALLOC_N(tsd_global_slow) +#define tsd_global_slow_dec JEMALLOC_N(tsd_global_slow_dec) +#define tsd_global_slow_inc JEMALLOC_N(tsd_global_slow_inc) +#define tsd_init_check_recursion JEMALLOC_N(tsd_init_check_recursion) +#define tsd_init_finish JEMALLOC_N(tsd_init_finish) +#define tsd_init_head JEMALLOC_N(tsd_init_head) +#define tsd_postfork_child JEMALLOC_N(tsd_postfork_child) +#define tsd_postfork_parent JEMALLOC_N(tsd_postfork_parent) +#define tsd_prefork JEMALLOC_N(tsd_prefork) +#define tsd_slow_update JEMALLOC_N(tsd_slow_update) +#define tsd_state_set JEMALLOC_N(tsd_state_set) +#define tsd_tsd JEMALLOC_N(tsd_tsd) +#define witness_depth_error JEMALLOC_N(witness_depth_error) +#define witness_init JEMALLOC_N(witness_init) +#define witness_lock_error JEMALLOC_N(witness_lock_error) +#define witness_not_owner_error JEMALLOC_N(witness_not_owner_error) +#define witness_owner_error JEMALLOC_N(witness_owner_error) +#define witness_postfork_child JEMALLOC_N(witness_postfork_child) +#define witness_postfork_parent JEMALLOC_N(witness_postfork_parent) +#define witness_prefork JEMALLOC_N(witness_prefork) +#define witnesses_cleanup JEMALLOC_N(witnesses_cleanup) 
+#define zone_register JEMALLOC_N(zone_register) + +// DuckDB: added these so we can pass "exported_symbols_check.py" +#define JE_MALLOC_CONF_BUFFER JEMALLOC_N(JE_MALLOC_CONF_BUFFER) +#define arena_name_get JEMALLOC_N(arena_name_get) +#define arena_name_set JEMALLOC_N(arena_name_set) +#define b0_alloc_tcache_stack JEMALLOC_N(b0_alloc_tcache_stack) +#define b0_dalloc_tcache_stack JEMALLOC_N(b0_dalloc_tcache_stack) +#define base_alloc_rtree JEMALLOC_N(base_alloc_rtree) +#define cache_bin_stack_use_thp JEMALLOC_N(cache_bin_stack_use_thp) +#define disabled_bin JEMALLOC_N(disabled_bin) +#define global_do_not_change_tcache_maxclass JEMALLOC_N(global_do_not_change_tcache_maxclass) +#define global_do_not_change_tcache_nbins JEMALLOC_N(global_do_not_change_tcache_nbins) +#define invalid_conf_abort JEMALLOC_N(invalid_conf_abort) +#define je_free_aligned_sized JEMALLOC_N(je_free_aligned_sized) +#define je_free_sized JEMALLOC_N(je_free_sized) +#define _malloc_thread_cleanup JEMALLOC_N(_malloc_thread_cleanup) +#define _malloc_tsd_cleanup_register JEMALLOC_N(_malloc_tsd_cleanup_register) +#define multi_setting_parse_next JEMALLOC_N(multi_setting_parse_next) +#define opt_calloc_madvise_threshold JEMALLOC_N(opt_calloc_madvise_threshold) +#define opt_debug_double_free_max_scan JEMALLOC_N(opt_debug_double_free_max_scan) +#define opt_malloc_conf_env_var JEMALLOC_N(opt_malloc_conf_env_var) +#define opt_malloc_conf_symlink JEMALLOC_N(opt_malloc_conf_symlink) +#define opt_prof_bt_max JEMALLOC_N(opt_prof_bt_max) +#define opt_prof_pid_namespace JEMALLOC_N(opt_prof_pid_namespace) +#define os_page JEMALLOC_N(os_page) +#define pa_shard_nactive JEMALLOC_N(pa_shard_nactive) +#define pa_shard_ndirty JEMALLOC_N(pa_shard_ndirty) +#define pa_shard_nmuzzy JEMALLOC_N(pa_shard_nmuzzy) +#define prof_sample_free_hook_get JEMALLOC_N(prof_sample_free_hook_get) +#define prof_sample_free_hook_set JEMALLOC_N(prof_sample_free_hook_set) +#define prof_sample_hook_get JEMALLOC_N(prof_sample_hook_get) +#define prof_sample_hook_set JEMALLOC_N(prof_sample_hook_set) +#define pthread_create_wrapper JEMALLOC_N(pthread_create_wrapper) +#define tcache_bin_ncached_max_read JEMALLOC_N(tcache_bin_ncached_max_read) +#define tcache_bins_ncached_max_write JEMALLOC_N(tcache_bins_ncached_max_write) +#define tcache_enabled_set JEMALLOC_N(tcache_enabled_set) +#define thread_tcache_max_set JEMALLOC_N(thread_tcache_max_set) +#define tsd_tls JEMALLOC_N(tsd_tls) +#define batcher_pop_begin JEMALLOC_N(batcher_pop_begin) +#define batcher_pop_get_pushes JEMALLOC_N(batcher_pop_get_pushes) +#define batcher_postfork_child JEMALLOC_N(batcher_postfork_child) +#define batcher_postfork_parent JEMALLOC_N(batcher_postfork_parent) +#define batcher_prefork JEMALLOC_N(batcher_prefork) +#define batcher_push_begin JEMALLOC_N(batcher_push_begin) +#define bin_info_nbatched_bins JEMALLOC_N(bin_info_nbatched_bins) +#define bin_info_nbatched_sizes JEMALLOC_N(bin_info_nbatched_sizes) +#define bin_info_nunbatched_bins JEMALLOC_N(bin_info_nunbatched_bins) +#define opt_bin_info_max_batched_size JEMALLOC_N(opt_bin_info_max_batched_size) +#define opt_bin_info_remote_free_max JEMALLOC_N(opt_bin_info_remote_free_max) +#define opt_bin_info_remote_free_max_batch JEMALLOC_N(opt_bin_info_remote_free_max_batch) diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/prng.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/prng.h new file mode 100644 index 000000000..81060d320 --- /dev/null +++ 
b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/prng.h @@ -0,0 +1,169 @@ +#ifndef JEMALLOC_INTERNAL_PRNG_H +#define JEMALLOC_INTERNAL_PRNG_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/bit_util.h" + +/* + * Simple linear congruential pseudo-random number generator: + * + * x' = (a*x + c) % m + * + * where the following constants ensure maximal period: + * + * a == Odd number (relatively prime to 2^n), and (a-1) is a multiple of 4. + * c == Odd number (relatively prime to 2^n). + * m == 2^n (i.e. 2^32 or 2^64, matching the state width). + * + * See Knuth's TAOCP 3rd Ed., Vol. 2, pg. 17 for details on these constraints. + * + * This choice of m has the disadvantage that the quality of the bits is + * proportional to bit position. For example, the lowest bit has a cycle of 2, + * the next has a cycle of 4, etc. For this reason, we prefer to use the upper + * bits. + */ + +/******************************************************************************/ +/* INTERNAL DEFINITIONS -- IGNORE */ +/******************************************************************************/ +#define PRNG_A_32 UINT32_C(1103515241) +#define PRNG_C_32 UINT32_C(12347) + +#define PRNG_A_64 UINT64_C(6364136223846793005) +#define PRNG_C_64 UINT64_C(1442695040888963407) + +JEMALLOC_ALWAYS_INLINE uint32_t +prng_state_next_u32(uint32_t state) { + return (state * PRNG_A_32) + PRNG_C_32; +} + +JEMALLOC_ALWAYS_INLINE uint64_t +prng_state_next_u64(uint64_t state) { + return (state * PRNG_A_64) + PRNG_C_64; +} + +JEMALLOC_ALWAYS_INLINE size_t +prng_state_next_zu(size_t state) { +#if LG_SIZEOF_PTR == 2 + return (state * PRNG_A_32) + PRNG_C_32; +#elif LG_SIZEOF_PTR == 3 + return (state * PRNG_A_64) + PRNG_C_64; +#else +#error Unsupported pointer size +#endif +} + +/******************************************************************************/ +/* BEGIN PUBLIC API */ +/******************************************************************************/ + +/* + * The prng_lg_range functions give a uniform int in the half-open range [0, + * 2**lg_range). + */ + +JEMALLOC_ALWAYS_INLINE uint32_t +prng_lg_range_u32(uint32_t *state, unsigned lg_range) { + assert(lg_range > 0); + assert(lg_range <= 32); + + *state = prng_state_next_u32(*state); + uint32_t ret = *state >> (32 - lg_range); + + return ret; +} + +JEMALLOC_ALWAYS_INLINE uint64_t +prng_lg_range_u64(uint64_t *state, unsigned lg_range) { + assert(lg_range > 0); + assert(lg_range <= 64); + + *state = prng_state_next_u64(*state); + uint64_t ret = *state >> (64 - lg_range); + + return ret; +} + +JEMALLOC_ALWAYS_INLINE size_t +prng_lg_range_zu(size_t *state, unsigned lg_range) { + assert(lg_range > 0); + assert(lg_range <= ZU(1) << (3 + LG_SIZEOF_PTR)); + + *state = prng_state_next_zu(*state); + size_t ret = *state >> ((ZU(1) << (3 + LG_SIZEOF_PTR)) - lg_range); + + return ret; +} + +/* + * The prng_range functions behave like the prng_lg_range, but return a result + * in [0, range) instead of [0, 2**lg_range). + */ + +JEMALLOC_ALWAYS_INLINE uint32_t +prng_range_u32(uint32_t *state, uint32_t range) { + assert(range != 0); + /* + * If range were 1, lg_range would be 0, so the shift in + * prng_lg_range_u32 would be a shift of a 32-bit variable by 32 bits, + * which is UB. Just handle this case as a one-off. + */ + if (range == 1) { + return 0; + } + + /* Compute the ceiling of lg(range). */ + unsigned lg_range = ffs_u32(pow2_ceil_u32(range)); + + /* Generate a result in [0..range) via repeated trial.
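Since pow2_ceil_u32 rounds range up to the next power of two, each trial draws uniformly from [0, 2^lg_range) with 2^lg_range < 2*range, so a trial succeeds with probability greater than 1/2 and the expected number of trials is below 2.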
*/ + uint32_t ret; + do { + ret = prng_lg_range_u32(state, lg_range); + } while (ret >= range); + + return ret; +} + +JEMALLOC_ALWAYS_INLINE uint64_t +prng_range_u64(uint64_t *state, uint64_t range) { + assert(range != 0); + + /* See the note in prng_range_u32. */ + if (range == 1) { + return 0; + } + + /* Compute the ceiling of lg(range). */ + unsigned lg_range = ffs_u64(pow2_ceil_u64(range)); + + /* Generate a result in [0..range) via repeated trial. */ + uint64_t ret; + do { + ret = prng_lg_range_u64(state, lg_range); + } while (ret >= range); + + return ret; +} + +JEMALLOC_ALWAYS_INLINE size_t +prng_range_zu(size_t *state, size_t range) { + assert(range != 0); + + /* See the note in prng_range_u32. */ + if (range == 1) { + return 0; + } + + /* Compute the ceiling of lg(range). */ + unsigned lg_range = ffs_u64(pow2_ceil_u64(range)); + + /* Generate a result in [0..range) via repeated trial. */ + size_t ret; + do { + ret = prng_lg_range_zu(state, lg_range); + } while (ret >= range); + + return ret; +} + +#endif /* JEMALLOC_INTERNAL_PRNG_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/prof_data.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/prof_data.h new file mode 100644 index 000000000..43e8d7e70 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/prof_data.h @@ -0,0 +1,37 @@ +#ifndef JEMALLOC_INTERNAL_PROF_DATA_H +#define JEMALLOC_INTERNAL_PROF_DATA_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/mutex.h" + +extern malloc_mutex_t bt2gctx_mtx; +extern malloc_mutex_t tdatas_mtx; +extern malloc_mutex_t prof_dump_mtx; + +extern malloc_mutex_t *gctx_locks; +extern malloc_mutex_t *tdata_locks; + +extern size_t prof_unbiased_sz[PROF_SC_NSIZES]; +extern size_t prof_shifted_unbiased_cnt[PROF_SC_NSIZES]; + +void prof_bt_hash(const void *key, size_t r_hash[2]); +bool prof_bt_keycomp(const void *k1, const void *k2); + +bool prof_data_init(tsd_t *tsd); +prof_tctx_t *prof_lookup(tsd_t *tsd, prof_bt_t *bt); +int prof_thread_name_set_impl(tsd_t *tsd, const char *thread_name); +void prof_unbias_map_init(void); +void prof_dump_impl(tsd_t *tsd, write_cb_t *prof_dump_write, void *cbopaque, + prof_tdata_t *tdata, bool leakcheck); +prof_tdata_t * prof_tdata_init_impl(tsd_t *tsd, uint64_t thr_uid, + uint64_t thr_discrim, char *thread_name, bool active); +void prof_tdata_detach(tsd_t *tsd, prof_tdata_t *tdata); +void prof_reset(tsd_t *tsd, size_t lg_sample); +void prof_tctx_try_destroy(tsd_t *tsd, prof_tctx_t *tctx); + +/* Used in unit tests. */ +size_t prof_tdata_count(void); +size_t prof_bt_count(void); +void prof_cnt_all(prof_cnt_t *cnt_all); + +#endif /* JEMALLOC_INTERNAL_PROF_DATA_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/prof_externs.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/prof_externs.h new file mode 100644 index 000000000..952ace7db --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/prof_externs.h @@ -0,0 +1,108 @@ +#ifndef JEMALLOC_INTERNAL_PROF_EXTERNS_H +#define JEMALLOC_INTERNAL_PROF_EXTERNS_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/base.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/prof_hook.h" + +extern bool opt_prof; +extern bool opt_prof_active; +extern bool opt_prof_thread_active_init; +extern unsigned opt_prof_bt_max; +extern size_t opt_lg_prof_sample; /* Mean bytes between samples. 
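(Stored as the lg value: the mean interval is 2^opt_lg_prof_sample bytes of allocation activity.)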
*/ +extern ssize_t opt_lg_prof_interval; /* lg(prof_interval). */ +extern bool opt_prof_gdump; /* High-water memory dumping. */ +extern bool opt_prof_final; /* Final profile dumping. */ +extern bool opt_prof_leak; /* Dump leak summary at exit. */ +extern bool opt_prof_leak_error; /* Exit with error code if memory leaked */ +extern bool opt_prof_accum; /* Report cumulative bytes. */ +extern bool opt_prof_log; /* Turn logging on at boot. */ +extern char opt_prof_prefix[ + /* Minimize memory bloat for non-prof builds. */ +#ifdef JEMALLOC_PROF + PATH_MAX + +#endif + 1]; +extern bool opt_prof_unbias; + +/* Include pid namespace in profile file names. */ +extern bool opt_prof_pid_namespace; + +/* For recording recent allocations */ +extern ssize_t opt_prof_recent_alloc_max; + +/* Whether to use thread name provided by the system or by mallctl. */ +extern bool opt_prof_sys_thread_name; + +/* Whether to record per size class counts and request size totals. */ +extern bool opt_prof_stats; + +/* Accessed via prof_active_[gs]et{_unlocked,}(). */ +extern bool prof_active_state; + +/* Accessed via prof_gdump_[gs]et{_unlocked,}(). */ +extern bool prof_gdump_val; + +/* Profile dump interval, measured in bytes allocated. */ +extern uint64_t prof_interval; + +/* + * Initialized as opt_lg_prof_sample, and potentially modified during profiling + * resets. + */ +extern size_t lg_prof_sample; + +extern bool prof_booted; + +void prof_backtrace_hook_set(prof_backtrace_hook_t hook); +prof_backtrace_hook_t prof_backtrace_hook_get(void); + +void prof_dump_hook_set(prof_dump_hook_t hook); +prof_dump_hook_t prof_dump_hook_get(void); + +void prof_sample_hook_set(prof_sample_hook_t hook); +prof_sample_hook_t prof_sample_hook_get(void); + +void prof_sample_free_hook_set(prof_sample_free_hook_t hook); +prof_sample_free_hook_t prof_sample_free_hook_get(void); + +/* Functions only accessed in prof_inlines.h */ +prof_tdata_t *prof_tdata_init(tsd_t *tsd); +prof_tdata_t *prof_tdata_reinit(tsd_t *tsd, prof_tdata_t *tdata); + +void prof_alloc_rollback(tsd_t *tsd, prof_tctx_t *tctx); +void prof_malloc_sample_object(tsd_t *tsd, const void *ptr, size_t size, + size_t usize, prof_tctx_t *tctx); +void prof_free_sampled_object(tsd_t *tsd, const void *ptr, size_t usize, + prof_info_t *prof_info); +prof_tctx_t *prof_tctx_create(tsd_t *tsd); +void prof_idump(tsdn_t *tsdn); +bool prof_mdump(tsd_t *tsd, const char *filename); +void prof_gdump(tsdn_t *tsdn); + +void prof_tdata_cleanup(tsd_t *tsd); +bool prof_active_get(tsdn_t *tsdn); +bool prof_active_set(tsdn_t *tsdn, bool active); +const char *prof_thread_name_get(tsd_t *tsd); +int prof_thread_name_set(tsd_t *tsd, const char *thread_name); +bool prof_thread_active_get(tsd_t *tsd); +bool prof_thread_active_set(tsd_t *tsd, bool active); +bool prof_thread_active_init_get(tsdn_t *tsdn); +bool prof_thread_active_init_set(tsdn_t *tsdn, bool active_init); +bool prof_gdump_get(tsdn_t *tsdn); +bool prof_gdump_set(tsdn_t *tsdn, bool active); +void prof_boot0(void); +void prof_boot1(void); +bool prof_boot2(tsd_t *tsd, base_t *base); +void prof_prefork0(tsdn_t *tsdn); +void prof_prefork1(tsdn_t *tsdn); +void prof_postfork_parent(tsdn_t *tsdn); +void prof_postfork_child(tsdn_t *tsdn); + +/* Only accessed by thread event. 
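The thread event module computes these wait times and invokes prof_sample_event_handler once the sampled-bytes counter has elapsed.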
*/ +uint64_t prof_sample_new_event_wait(tsd_t *tsd); +uint64_t prof_sample_postponed_event_wait(tsd_t *tsd); +void prof_sample_event_handler(tsd_t *tsd, uint64_t elapsed); + +#endif /* JEMALLOC_INTERNAL_PROF_EXTERNS_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/prof_hook.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/prof_hook.h new file mode 100644 index 000000000..087dadc63 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/prof_hook.h @@ -0,0 +1,29 @@ +#ifndef JEMALLOC_INTERNAL_PROF_HOOK_H +#define JEMALLOC_INTERNAL_PROF_HOOK_H + +#include "jemalloc/internal/jemalloc_preamble.h" + +/* + * The hooks types of which are declared in this file are experimental and + * undocumented, thus the typedefs are located in an 'internal' header. + */ + +/* + * A hook to mock out backtrace functionality. This can be handy, since it's + * otherwise difficult to guarantee that two allocations are reported as coming + * from the exact same stack trace in the presence of an optimizing compiler. + */ +typedef void (*prof_backtrace_hook_t)(void **, unsigned *, unsigned); + +/* + * A callback hook that notifies about recently dumped heap profile. + */ +typedef void (*prof_dump_hook_t)(const char *filename); + +/* ptr, size, backtrace vector, backtrace vector length, usize */ +typedef void (*prof_sample_hook_t)(const void *ptr, size_t size, void **backtrace, unsigned backtrace_length, size_t usize); + +/* ptr, size */ +typedef void (*prof_sample_free_hook_t)(const void *, size_t); + +#endif /* JEMALLOC_INTERNAL_PROF_HOOK_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/prof_inlines.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/prof_inlines.h new file mode 100644 index 000000000..75300ee40 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/prof_inlines.h @@ -0,0 +1,302 @@ +#ifndef JEMALLOC_INTERNAL_PROF_INLINES_H +#define JEMALLOC_INTERNAL_PROF_INLINES_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/arena_inlines_b.h" +#include "jemalloc/internal/jemalloc_internal_inlines_c.h" +#include "jemalloc/internal/prof_externs.h" +#include "jemalloc/internal/prof_structs.h" +#include "jemalloc/internal/safety_check.h" +#include "jemalloc/internal/sz.h" +#include "jemalloc/internal/thread_event.h" + +JEMALLOC_ALWAYS_INLINE void +prof_active_assert(void) { + cassert(config_prof); + /* + * If opt_prof is off, then prof_active must always be off, regardless + * of whether prof_active_mtx is in effect or not. + */ + assert(opt_prof || !prof_active_state); +} + +JEMALLOC_ALWAYS_INLINE bool +prof_active_get_unlocked(void) { + prof_active_assert(); + /* + * Even if opt_prof is true, sampling can be temporarily disabled by + * setting prof_active to false. No locking is used when reading + * prof_active in the fast path, so there are no guarantees regarding + * how long it will take for all threads to notice state changes. + */ + return prof_active_state; +} + +JEMALLOC_ALWAYS_INLINE bool +prof_gdump_get_unlocked(void) { + /* + * No locking is used when reading prof_gdump_val in the fast path, so + * there are no guarantees regarding how long it will take for all + * threads to notice state changes. 
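The function-pointer typedefs in prof_hook.h above pair with the prof_*_hook_set()/prof_*_hook_get() declarations in prof_externs.h. A sketch of wiring up a dump hook, assuming it is compiled inside jemalloc's own tree where the internal headers resolve; my_dump_hook and install_dump_hook are hypothetical names, not part of jemalloc.

#include <assert.h>
#include <stdio.h>
#include "jemalloc/internal/prof_externs.h"

/* Matches prof_dump_hook_t: called with the path of a freshly written
 * heap profile. */
static void
my_dump_hook(const char *filename) {
	fprintf(stderr, "heap profile dumped to %s\n", filename);
}

static void
install_dump_hook(void) {
	prof_dump_hook_set(my_dump_hook);
	assert(prof_dump_hook_get() == my_dump_hook);
}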
+ */ + return prof_gdump_val; +} + +JEMALLOC_ALWAYS_INLINE void +prof_thread_name_assert(prof_tdata_t *tdata) { + if (!config_debug) { + return; + } + prof_active_assert(); + + bool terminated = false; + for (unsigned i = 0; i < PROF_THREAD_NAME_MAX_LEN; i++) { + if (tdata->thread_name[i] == '\0') { + terminated = true; + } + } + assert(terminated); +} + +JEMALLOC_ALWAYS_INLINE prof_tdata_t * +prof_tdata_get(tsd_t *tsd, bool create) { + prof_tdata_t *tdata; + + cassert(config_prof); + + tdata = tsd_prof_tdata_get(tsd); + if (create) { + assert(tsd_reentrancy_level_get(tsd) == 0); + if (unlikely(tdata == NULL)) { + if (tsd_nominal(tsd)) { + tdata = prof_tdata_init(tsd); + tsd_prof_tdata_set(tsd, tdata); + } + } else if (unlikely(tdata->expired)) { + tdata = prof_tdata_reinit(tsd, tdata); + tsd_prof_tdata_set(tsd, tdata); + } + assert(tdata == NULL || tdata->attached); + } + + if (tdata != NULL) { + prof_thread_name_assert(tdata); + } + + return tdata; +} + +JEMALLOC_ALWAYS_INLINE void +prof_info_get(tsd_t *tsd, const void *ptr, emap_alloc_ctx_t *alloc_ctx, + prof_info_t *prof_info) { + cassert(config_prof); + assert(ptr != NULL); + assert(prof_info != NULL); + + arena_prof_info_get(tsd, ptr, alloc_ctx, prof_info, false); +} + +JEMALLOC_ALWAYS_INLINE void +prof_info_get_and_reset_recent(tsd_t *tsd, const void *ptr, + emap_alloc_ctx_t *alloc_ctx, prof_info_t *prof_info) { + cassert(config_prof); + assert(ptr != NULL); + assert(prof_info != NULL); + + arena_prof_info_get(tsd, ptr, alloc_ctx, prof_info, true); +} + +JEMALLOC_ALWAYS_INLINE bool +prof_tctx_is_valid(const prof_tctx_t *tctx) { + return tctx != NULL && tctx != PROF_TCTX_SENTINEL; +} + +JEMALLOC_ALWAYS_INLINE void +prof_tctx_reset(tsd_t *tsd, const void *ptr, emap_alloc_ctx_t *alloc_ctx) { + cassert(config_prof); + assert(ptr != NULL); + + arena_prof_tctx_reset(tsd, ptr, alloc_ctx); +} + +JEMALLOC_ALWAYS_INLINE void +prof_tctx_reset_sampled(tsd_t *tsd, const void *ptr) { + cassert(config_prof); + assert(ptr != NULL); + + arena_prof_tctx_reset_sampled(tsd, ptr); +} + +JEMALLOC_ALWAYS_INLINE void +prof_info_set(tsd_t *tsd, edata_t *edata, prof_tctx_t *tctx, size_t size) { + cassert(config_prof); + assert(edata != NULL); + assert(prof_tctx_is_valid(tctx)); + + arena_prof_info_set(tsd, edata, tctx, size); +} + +JEMALLOC_ALWAYS_INLINE bool +prof_sample_should_skip(tsd_t *tsd, bool sample_event) { + cassert(config_prof); + + /* Fastpath: no need to load tdata */ + if (likely(!sample_event)) { + return true; + } + + /* + * sample_event is always obtained from the thread event module, and + * whenever it's true, it means that the thread event module has + * already checked the reentrancy level. 
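prof_tdata_get() above is the canonical lazy-init shape for per-thread profiling state: a NULL thread-local pointer is populated on first use, and a record marked expired is rebuilt before being handed back. A standalone analogue of that shape (none of these names are jemalloc's; C11 _Thread_local stands in for jemalloc's tsd machinery):

#include <stdbool.h>
#include <stdlib.h>

typedef struct {
	bool expired;
	unsigned uses;
} tdata_t;

static _Thread_local tdata_t *tls_tdata = NULL;

tdata_t *
tdata_get(bool create) {
	tdata_t *td = tls_tdata;
	if (!create) {
		return td;
	}
	if (td == NULL) {
		/* First use on this thread: allocate and cache. */
		td = calloc(1, sizeof(*td));
		tls_tdata = td;
	} else if (td->expired) {
		/* Stale record: reinitialize in place. */
		td->uses = 0;
		td->expired = false;
	}
	return td;
}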
+ */ + assert(tsd_reentrancy_level_get(tsd) == 0); + + prof_tdata_t *tdata = prof_tdata_get(tsd, true); + if (unlikely(tdata == NULL)) { + return true; + } + + return !tdata->active; +} + +JEMALLOC_ALWAYS_INLINE prof_tctx_t * +prof_alloc_prep(tsd_t *tsd, bool prof_active, bool sample_event) { + prof_tctx_t *ret; + + if (!prof_active || + likely(prof_sample_should_skip(tsd, sample_event))) { + ret = PROF_TCTX_SENTINEL; + } else { + ret = prof_tctx_create(tsd); + } + + return ret; +} + +JEMALLOC_ALWAYS_INLINE void +prof_malloc(tsd_t *tsd, const void *ptr, size_t size, size_t usize, + emap_alloc_ctx_t *alloc_ctx, prof_tctx_t *tctx) { + cassert(config_prof); + assert(ptr != NULL); + assert(usize == isalloc(tsd_tsdn(tsd), ptr)); + + if (unlikely(prof_tctx_is_valid(tctx))) { + prof_malloc_sample_object(tsd, ptr, size, usize, tctx); + } else { + prof_tctx_reset(tsd, ptr, alloc_ctx); + } +} + +JEMALLOC_ALWAYS_INLINE void +prof_realloc(tsd_t *tsd, const void *ptr, size_t size, size_t usize, + prof_tctx_t *tctx, bool prof_active, const void *old_ptr, size_t old_usize, + prof_info_t *old_prof_info, bool sample_event) { + bool sampled, old_sampled, moved; + + cassert(config_prof); + assert(ptr != NULL || !prof_tctx_is_valid(tctx)); + + if (prof_active && ptr != NULL) { + assert(usize == isalloc(tsd_tsdn(tsd), ptr)); + if (prof_sample_should_skip(tsd, sample_event)) { + /* + * Don't sample. The usize passed to prof_alloc_prep() + * was larger than what actually got allocated, so a + * backtrace was captured for this allocation, even + * though its actual usize was insufficient to cross the + * sample threshold. + */ + prof_alloc_rollback(tsd, tctx); + tctx = PROF_TCTX_SENTINEL; + } + } + + sampled = prof_tctx_is_valid(tctx); + old_sampled = prof_tctx_is_valid(old_prof_info->alloc_tctx); + moved = (ptr != old_ptr); + + if (unlikely(sampled)) { + prof_malloc_sample_object(tsd, ptr, size, usize, tctx); + } else if (moved) { + prof_tctx_reset(tsd, ptr, NULL); + } else if (unlikely(old_sampled)) { + /* + * prof_tctx_reset() would work for the !moved case as well, + * but prof_tctx_reset_sampled() is slightly cheaper, and the + * proper thing to do here in the presence of explicit + * knowledge re: moved state. + */ + prof_tctx_reset_sampled(tsd, ptr); + } else { + prof_info_t prof_info; + prof_info_get(tsd, ptr, NULL, &prof_info); + assert(prof_info.alloc_tctx == PROF_TCTX_SENTINEL); + } + + /* + * The prof_free_sampled_object() call must come after the + * prof_malloc_sample_object() call, because tctx and old_tctx may be + * the same, in which case reversing the call order could cause the tctx + * to be prematurely destroyed as a side effect of momentarily zeroed + * counters. + */ + if (unlikely(old_sampled)) { + prof_free_sampled_object(tsd, old_ptr, old_usize, + old_prof_info); + } +} + +JEMALLOC_ALWAYS_INLINE size_t +prof_sample_align(size_t usize, size_t orig_align) { + /* + * Enforce alignment, so that sampled allocations can be identified + * w/o metadata lookup. + */ + assert(opt_prof); + return (orig_align < PROF_SAMPLE_ALIGNMENT && + (sz_can_use_slab(usize) || opt_cache_oblivious)) ? 
+ PROF_SAMPLE_ALIGNMENT : orig_align; +} + +JEMALLOC_ALWAYS_INLINE bool +prof_sampled(tsd_t *tsd, const void *ptr) { + prof_info_t prof_info; + prof_info_get(tsd, ptr, NULL, &prof_info); + bool sampled = prof_tctx_is_valid(prof_info.alloc_tctx); + if (sampled) { + assert(prof_sample_aligned(ptr)); + } + return sampled; +} + +JEMALLOC_ALWAYS_INLINE void +prof_free(tsd_t *tsd, const void *ptr, size_t usize, + emap_alloc_ctx_t *alloc_ctx) { + prof_info_t prof_info; + prof_info_get_and_reset_recent(tsd, ptr, alloc_ctx, &prof_info); + + cassert(config_prof); + assert(usize == isalloc(tsd_tsdn(tsd), ptr)); + + if (unlikely(prof_tctx_is_valid(prof_info.alloc_tctx))) { + assert(prof_sample_aligned(ptr)); + prof_free_sampled_object(tsd, ptr, usize, &prof_info); + } +} + +JEMALLOC_ALWAYS_INLINE bool +prof_thread_name_empty(prof_tdata_t *tdata) { + prof_active_assert(); + + return (tdata->thread_name[0] == '\0'); +} + +JEMALLOC_ALWAYS_INLINE void +prof_thread_name_clear(prof_tdata_t *tdata) { + prof_active_assert(); + + tdata->thread_name[0] = '\0'; +} + +#endif /* JEMALLOC_INTERNAL_PROF_INLINES_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/prof_log.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/prof_log.h new file mode 100644 index 000000000..0b1271c89 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/prof_log.h @@ -0,0 +1,23 @@ +#ifndef JEMALLOC_INTERNAL_PROF_LOG_H +#define JEMALLOC_INTERNAL_PROF_LOG_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/mutex.h" + +extern malloc_mutex_t log_mtx; + +void prof_try_log(tsd_t *tsd, size_t usize, prof_info_t *prof_info); +bool prof_log_init(tsd_t *tsdn); + +/* Used in unit tests. */ +size_t prof_log_bt_count(void); +size_t prof_log_alloc_count(void); +size_t prof_log_thr_count(void); +bool prof_log_is_logging(void); +bool prof_log_rep_check(void); +void prof_log_dummy_set(bool new_value); + +bool prof_log_start(tsdn_t *tsdn, const char *filename); +bool prof_log_stop(tsdn_t *tsdn); + +#endif /* JEMALLOC_INTERNAL_PROF_LOG_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/prof_recent.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/prof_recent.h new file mode 100644 index 000000000..33649e6da --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/prof_recent.h @@ -0,0 +1,27 @@ +#ifndef JEMALLOC_INTERNAL_PROF_RECENT_H +#define JEMALLOC_INTERNAL_PROF_RECENT_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/edata.h" +#include "jemalloc/internal/mutex.h" + +extern malloc_mutex_t prof_recent_alloc_mtx; +extern malloc_mutex_t prof_recent_dump_mtx; + +bool prof_recent_alloc_prepare(tsd_t *tsd, prof_tctx_t *tctx); +void prof_recent_alloc(tsd_t *tsd, edata_t *edata, size_t size, size_t usize); +void prof_recent_alloc_reset(tsd_t *tsd, edata_t *edata); +bool prof_recent_init(void); +void edata_prof_recent_alloc_init(edata_t *edata); + +/* Used in unit tests. 
*/ +typedef ql_head(prof_recent_t) prof_recent_list_t; +extern prof_recent_list_t prof_recent_alloc_list; +edata_t *prof_recent_alloc_edata_get_no_lock_test(const prof_recent_t *node); +prof_recent_t *edata_prof_recent_alloc_get_no_lock_test(const edata_t *edata); + +ssize_t prof_recent_alloc_max_ctl_read(void); +ssize_t prof_recent_alloc_max_ctl_write(tsd_t *tsd, ssize_t max); +void prof_recent_alloc_dump(tsd_t *tsd, write_cb_t *write_cb, void *cbopaque); + +#endif /* JEMALLOC_INTERNAL_PROF_RECENT_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/prof_stats.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/prof_stats.h new file mode 100644 index 000000000..c4d269e54 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/prof_stats.h @@ -0,0 +1,20 @@ +#ifndef JEMALLOC_INTERNAL_PROF_STATS_H +#define JEMALLOC_INTERNAL_PROF_STATS_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/mutex.h" + +typedef struct prof_stats_s prof_stats_t; +struct prof_stats_s { + uint64_t req_sum; + uint64_t count; +}; + +extern malloc_mutex_t prof_stats_mtx; + +void prof_stats_inc(tsd_t *tsd, szind_t ind, size_t size); +void prof_stats_dec(tsd_t *tsd, szind_t ind, size_t size); +void prof_stats_get_live(tsd_t *tsd, szind_t ind, prof_stats_t *stats); +void prof_stats_get_accum(tsd_t *tsd, szind_t ind, prof_stats_t *stats); + +#endif /* JEMALLOC_INTERNAL_PROF_STATS_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/prof_structs.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/prof_structs.h new file mode 100644 index 000000000..084a549dc --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/prof_structs.h @@ -0,0 +1,222 @@ +#ifndef JEMALLOC_INTERNAL_PROF_STRUCTS_H +#define JEMALLOC_INTERNAL_PROF_STRUCTS_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/ckh.h" +#include "jemalloc/internal/edata.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/prng.h" +#include "jemalloc/internal/rb.h" + +struct prof_bt_s { + /* Backtrace, stored as len program counters. */ + void **vec; + unsigned len; +}; + +#ifdef JEMALLOC_PROF_LIBGCC +/* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */ +typedef struct { + void **vec; + unsigned *len; + unsigned max; +} prof_unwind_data_t; +#endif + +struct prof_cnt_s { + /* Profiling counters. */ + uint64_t curobjs; + uint64_t curobjs_shifted_unbiased; + uint64_t curbytes; + uint64_t curbytes_unbiased; + uint64_t accumobjs; + uint64_t accumobjs_shifted_unbiased; + uint64_t accumbytes; + uint64_t accumbytes_unbiased; +}; + +typedef enum { + prof_tctx_state_initializing, + prof_tctx_state_nominal, + prof_tctx_state_dumping, + prof_tctx_state_purgatory /* Dumper must finish destroying. */ +} prof_tctx_state_t; + +struct prof_tctx_s { + /* Thread data for thread that performed the allocation. */ + prof_tdata_t *tdata; + + /* + * Copy of tdata->thr_{uid,discrim}, necessary because tdata may be + * defunct during teardown. + */ + uint64_t thr_uid; + uint64_t thr_discrim; + + /* + * Reference count of how many times this tctx object is referenced in + * recent allocation / deallocation records, protected by tdata->lock. + */ + uint64_t recent_count; + + /* Profiling counters, protected by tdata->lock. */ + prof_cnt_t cnts; + + /* Associated global context. 
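Given the req_sum/count pairs that prof_stats.h accumulates per size class, the mean requested size falls out directly. A hypothetical helper (not part of the API), assuming jemalloc's internal headers are available:

#include "jemalloc/internal/prof_stats.h"

/* Mean request size for one size class; 0 if nothing was recorded. */
double
prof_stats_mean_req(const prof_stats_t *stats) {
	return stats->count == 0 ? 0.0
	    : (double)stats->req_sum / (double)stats->count;
}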
*/ + prof_gctx_t *gctx; + + /* + * UID that distinguishes multiple tctx's created by the same thread, + * but coexisting in gctx->tctxs. There are two ways that such + * coexistence can occur: + * - A dumper thread can cause a tctx to be retained in the purgatory + * state. + * - Although a single "producer" thread must create all tctx's which + * share the same thr_uid, multiple "consumers" can each concurrently + * execute portions of prof_tctx_destroy(). prof_tctx_destroy() only + * gets called once each time cnts.cur{objs,bytes} drop to 0, but this + * threshold can be hit again before the first consumer finishes + * executing prof_tctx_destroy(). + */ + uint64_t tctx_uid; + + /* Linkage into gctx's tctxs. */ + rb_node(prof_tctx_t) tctx_link; + + /* + * True during prof_alloc_prep()..prof_malloc_sample_object(), prevents + * sample vs destroy race. + */ + bool prepared; + + /* Current dump-related state, protected by gctx->lock. */ + prof_tctx_state_t state; + + /* + * Copy of cnts snapshotted during early dump phase, protected by + * dump_mtx. + */ + prof_cnt_t dump_cnts; +}; +typedef rb_tree(prof_tctx_t) prof_tctx_tree_t; + +struct prof_info_s { + /* Time when the allocation was made. */ + nstime_t alloc_time; + /* Points to the prof_tctx_t corresponding to the allocation. */ + prof_tctx_t *alloc_tctx; + /* Allocation request size. */ + size_t alloc_size; +}; + +struct prof_gctx_s { + /* Protects nlimbo, cnt_summed, and tctxs. */ + malloc_mutex_t *lock; + + /* + * Number of threads that currently cause this gctx to be in a state of + * limbo due to one of: + * - Initializing this gctx. + * - Initializing per thread counters associated with this gctx. + * - Preparing to destroy this gctx. + * - Dumping a heap profile that includes this gctx. + * nlimbo must be 1 (single destroyer) in order to safely destroy the + * gctx. + */ + unsigned nlimbo; + + /* + * Tree of profile counters, one for each thread that has allocated in + * this context. + */ + prof_tctx_tree_t tctxs; + + /* Linkage for tree of contexts to be dumped. */ + rb_node(prof_gctx_t) dump_link; + + /* Temporary storage for summation during dump. */ + prof_cnt_t cnt_summed; + + /* Associated backtrace. */ + prof_bt_t bt; + + /* Backtrace vector, variable size, referred to by bt. */ + void *vec[1]; +}; +typedef rb_tree(prof_gctx_t) prof_gctx_tree_t; + +struct prof_tdata_s { + malloc_mutex_t *lock; + + /* Monotonically increasing unique thread identifier. */ + uint64_t thr_uid; + + /* + * Monotonically increasing discriminator among tdata structures + * associated with the same thr_uid. + */ + uint64_t thr_discrim; + + rb_node(prof_tdata_t) tdata_link; + + /* + * Counter used to initialize prof_tctx_t's tctx_uid. No locking is + * necessary when incrementing this field, because only one thread ever + * does so. + */ + uint64_t tctx_uid_next; + + /* + * Hash of (prof_bt_t *)-->(prof_tctx_t *). Each thread tracks + * backtraces for which it has non-zero allocation/deallocation counters + * associated with thread-specific prof_tctx_t objects. Other threads + * may write to prof_tctx_t contents when freeing associated objects. + */ + ckh_t bt2tctx; + + /* Included in heap profile dumps if has content. */ + char thread_name[PROF_THREAD_NAME_MAX_LEN]; + + /* State used to avoid dumping while operating on prof internals. */ + bool enq; + bool enq_idump; + bool enq_gdump; + + /* + * Set to true during an early dump phase for tdata's which are + * currently being dumped. 
New threads' tdata's have this initialized + * to false so that they aren't accidentally included in later dump + * phases. + */ + bool dumping; + + /* + * True if profiling is active for this tdata's thread + * (thread.prof.active mallctl). + */ + bool active; + + bool attached; + bool expired; + + /* Temporary storage for summation during dump. */ + prof_cnt_t cnt_summed; + + /* Backtrace vector, used for calls to prof_backtrace(). */ + void **vec; +}; +typedef rb_tree(prof_tdata_t) prof_tdata_tree_t; + +struct prof_recent_s { + nstime_t alloc_time; + nstime_t dalloc_time; + + ql_elm(prof_recent_t) link; + size_t size; + size_t usize; + atomic_p_t alloc_edata; /* NULL means allocation has been freed. */ + prof_tctx_t *alloc_tctx; + prof_tctx_t *dalloc_tctx; +}; + +#endif /* JEMALLOC_INTERNAL_PROF_STRUCTS_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/prof_sys.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/prof_sys.h new file mode 100644 index 000000000..e6e7f06fb --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/prof_sys.h @@ -0,0 +1,34 @@ +#ifndef JEMALLOC_INTERNAL_PROF_SYS_H +#define JEMALLOC_INTERNAL_PROF_SYS_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/base.h" +#include "jemalloc/internal/mutex.h" + +extern malloc_mutex_t prof_dump_filename_mtx; +extern base_t *prof_base; + +void bt_init(prof_bt_t *bt, void **vec); +void prof_backtrace(tsd_t *tsd, prof_bt_t *bt); +void prof_hooks_init(void); +void prof_unwind_init(void); +void prof_sys_thread_name_fetch(tsd_t *tsd); +int prof_getpid(void); +void prof_get_default_filename(tsdn_t *tsdn, char *filename, uint64_t ind); +bool prof_prefix_set(tsdn_t *tsdn, const char *prefix); +void prof_fdump_impl(tsd_t *tsd); +void prof_idump_impl(tsd_t *tsd); +bool prof_mdump_impl(tsd_t *tsd, const char *filename); +void prof_gdump_impl(tsd_t *tsd); + +/* Used in unit tests. */ +typedef int (prof_sys_thread_name_read_t)(char *buf, size_t limit); +extern prof_sys_thread_name_read_t *JET_MUTABLE prof_sys_thread_name_read; +typedef int (prof_dump_open_file_t)(const char *, int); +extern prof_dump_open_file_t *JET_MUTABLE prof_dump_open_file; +typedef ssize_t (prof_dump_write_file_t)(int, const void *, size_t); +extern prof_dump_write_file_t *JET_MUTABLE prof_dump_write_file; +typedef int (prof_dump_open_maps_t)(void); +extern prof_dump_open_maps_t *JET_MUTABLE prof_dump_open_maps; + +#endif /* JEMALLOC_INTERNAL_PROF_SYS_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/prof_types.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/prof_types.h new file mode 100644 index 000000000..a27f7fb33 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/prof_types.h @@ -0,0 +1,94 @@ +#ifndef JEMALLOC_INTERNAL_PROF_TYPES_H +#define JEMALLOC_INTERNAL_PROF_TYPES_H + +typedef struct prof_bt_s prof_bt_t; +typedef struct prof_cnt_s prof_cnt_t; +typedef struct prof_tctx_s prof_tctx_t; +typedef struct prof_info_s prof_info_t; +typedef struct prof_gctx_s prof_gctx_t; +typedef struct prof_tdata_s prof_tdata_t; +typedef struct prof_recent_s prof_recent_t; + +/* Option defaults. */ +#ifdef JEMALLOC_PROF +# define PROF_PREFIX_DEFAULT "jeprof" +#else +# define PROF_PREFIX_DEFAULT "" +#endif +#define LG_PROF_SAMPLE_DEFAULT 19 +#define LG_PROF_INTERVAL_DEFAULT -1 + +/* + * Hard limit on stack backtrace depth. 
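LG_PROF_SAMPLE_DEFAULT above is the lg of the mean byte distance between sampled allocations, so the default of 19 samples roughly once per 2^19 bytes = 512 KiB allocated. A standalone check of that arithmetic:

#include <stdio.h>

int
main(void) {
	/* 19 is LG_PROF_SAMPLE_DEFAULT; neighbors shown for scale. */
	for (unsigned lg = 17; lg <= 21; lg++) {
		printf("lg_prof_sample=%u -> mean %u KiB between samples\n",
		    lg, 1U << (lg - 10));
	}
	return 0;
}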
The version of prof_backtrace() that + * is based on __builtin_return_address() necessarily has a hard-coded number + * of backtrace frame handlers, and should be kept in sync with this setting. + */ +#ifdef JEMALLOC_PROF_GCC +# define PROF_BT_MAX_LIMIT 256 +#else +# define PROF_BT_MAX_LIMIT UINT_MAX +#endif +#define PROF_BT_MAX_DEFAULT 128 + +/* Initial hash table size. */ +#define PROF_CKH_MINITEMS 64 + +/* Size of memory buffer to use when writing dump files. */ +#ifndef JEMALLOC_PROF +/* Minimize memory bloat for non-prof builds. */ +# define PROF_DUMP_BUFSIZE 1 +#elif defined(JEMALLOC_DEBUG) +/* Use a small buffer size in debug build, mainly to facilitate testing. */ +# define PROF_DUMP_BUFSIZE 16 +#else +# define PROF_DUMP_BUFSIZE 65536 +#endif + +/* Size of size class related tables */ +#ifdef JEMALLOC_PROF +# define PROF_SC_NSIZES SC_NSIZES +#else +/* Minimize memory bloat for non-prof builds. */ +# define PROF_SC_NSIZES 1 +#endif + +/* Size of stack-allocated buffer used by prof_printf(). */ +#define PROF_PRINTF_BUFSIZE 128 + +/* + * Number of mutexes shared among all gctx's. No space is allocated for these + * unless profiling is enabled, so it's okay to over-provision. + */ +#define PROF_NCTX_LOCKS 1024 + +/* + * Number of mutexes shared among all tdata's. No space is allocated for these + * unless profiling is enabled, so it's okay to over-provision. + */ +#define PROF_NTDATA_LOCKS 256 + +/* Minimize memory bloat for non-prof builds. */ +#ifdef JEMALLOC_PROF +#define PROF_DUMP_FILENAME_LEN (PATH_MAX + 1) +#else +#define PROF_DUMP_FILENAME_LEN 1 +#endif + +/* Default number of recent allocations to record. */ +#define PROF_RECENT_ALLOC_MAX_DEFAULT 0 + +/* Thread name storage size limit. */ +#define PROF_THREAD_NAME_MAX_LEN 16 + +/* + * Minimum required alignment for sampled allocations. Over-aligning sampled + * allocations allows us to quickly identify them on the dalloc path without + * resorting to metadata lookup. + */ +#define PROF_SAMPLE_ALIGNMENT PAGE +#define PROF_SAMPLE_ALIGNMENT_MASK PAGE_MASK + +/* NOLINTNEXTLINE(performance-no-int-to-ptr) */ +#define PROF_TCTX_SENTINEL ((prof_tctx_t *)((uintptr_t)1U)) + +#endif /* JEMALLOC_INTERNAL_PROF_TYPES_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/psset.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/psset.h new file mode 100644 index 000000000..7e510b7f6 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/psset.h @@ -0,0 +1,132 @@ +#ifndef JEMALLOC_INTERNAL_PSSET_H +#define JEMALLOC_INTERNAL_PSSET_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/hpdata.h" + +/* + * A page-slab set. What the eset is to PAC, the psset is to HPA. It maintains + * a collection of page-slabs (the intent being that they are backed by + * hugepages, or at least could be), and handles allocation and deallocation + * requests. + */ + +/* + * One more than the maximum pszind_t we will serve out of the HPA. + * Practically, we expect only the first few to be actually used. This + * corresponds to a maximum size of 512MB on systems with 4k pages and + * SC_NGROUP == 4, which is already an unreasonably large maximum. Morally, you + * can think of this as being SC_NPSIZES, but there's no sense in wasting that + * much space in the arena, making bitmaps that much larger, etc.
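PROF_TCTX_SENTINEL above relies on address 1 never being a valid allocation: it marks "allocation not sampled" while staying distinct from both NULL and every real prof_tctx_t pointer, which is exactly what prof_tctx_is_valid() in prof_inlines.h tests. A standalone sketch of the idiom with stand-in names:

#include <stdbool.h>
#include <stdint.h>

typedef struct tctx_s tctx_t;

/* Address 1 is never returned by any allocator, so it is free to act
 * as a third state alongside NULL and valid pointers. */
#define TCTX_SENTINEL ((tctx_t *)(uintptr_t)1U)

bool
tctx_is_valid(const tctx_t *tctx) {
	return tctx != NULL && tctx != TCTX_SENTINEL;
}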
+ */ +#define PSSET_NPSIZES 64 + +/* + * We keep two purge lists per page size class; one for hugified hpdatas (at + * index 2*pszind), and one for the non-hugified hpdatas (at index 2*pszind + + * 1). This lets us implement a preference for purging non-hugified hpdatas + * among similarly-dirty ones. + * We reserve the last two indices for empty slabs, in that case purging + * hugified ones (which are definitionally all waste) before non-hugified ones + * (i.e. reversing the order). + */ +#define PSSET_NPURGE_LISTS (2 * PSSET_NPSIZES) + +typedef struct psset_bin_stats_s psset_bin_stats_t; +struct psset_bin_stats_s { + /* How many pageslabs are in this bin? */ + size_t npageslabs; + /* Of them, how many pages are active? */ + size_t nactive; + /* And how many are dirty? */ + size_t ndirty; +}; + +typedef struct psset_stats_s psset_stats_t; +struct psset_stats_s { + /* + * The second index is huge stats; nonfull_slabs[pszind][0] contains + * stats for the non-huge slabs in bucket pszind, while + * nonfull_slabs[pszind][1] contains stats for the huge slabs. + */ + psset_bin_stats_t nonfull_slabs[PSSET_NPSIZES][2]; + + /* + * Full slabs don't live in any edata heap, but we still track their + * stats. + */ + psset_bin_stats_t full_slabs[2]; + + /* Empty slabs are similar. */ + psset_bin_stats_t empty_slabs[2]; +}; + +typedef struct psset_s psset_t; +struct psset_s { + /* + * The pageslabs, quantized by the size class of the largest contiguous + * free run of pages in a pageslab. + */ + hpdata_age_heap_t pageslabs[PSSET_NPSIZES]; + /* Bitmap for which set bits correspond to non-empty heaps. */ + fb_group_t pageslab_bitmap[FB_NGROUPS(PSSET_NPSIZES)]; + /* + * The sum of all bin stats in stats. This lets us quickly answer + * queries for the number of dirty, active, and retained pages in the + * entire set. + */ + psset_bin_stats_t merged_stats; + psset_stats_t stats; + /* + * Slabs with no active allocations, but which are allowed to serve new + * allocations. + */ + hpdata_empty_list_t empty; + /* + * Slabs which are available to be purged, ordered by how much we want + * to purge them (with later indices indicating slabs we want to purge + * more). + */ + hpdata_purge_list_t to_purge[PSSET_NPURGE_LISTS]; + /* Bitmap for which set bits correspond to non-empty purge lists. */ + fb_group_t purge_bitmap[FB_NGROUPS(PSSET_NPURGE_LISTS)]; + /* Slabs which are available to be hugified. */ + hpdata_hugify_list_t to_hugify; +}; + +void psset_init(psset_t *psset); +void psset_stats_accum(psset_stats_t *dst, psset_stats_t *src); + +/* + * Begin or end updating the given pageslab's metadata. While the pageslab is + * being updated, it won't be returned from psset_fit calls. + */ +void psset_update_begin(psset_t *psset, hpdata_t *ps); +void psset_update_end(psset_t *psset, hpdata_t *ps); + +/* Analogous to the eset_fit; pick a hpdata to serve the request. */ +hpdata_t *psset_pick_alloc(psset_t *psset, size_t size); +/* Pick one to purge. */ +hpdata_t *psset_pick_purge(psset_t *psset); +/* Pick one to hugify. 
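The purge-list layout described above interleaves two lists per page size class, with later indices purged more eagerly. A standalone sketch of the index computation (stand-in names, not the psset implementation):

#include <stdbool.h>
#include <stddef.h>

/*
 * Hugified slabs sit at 2*pszind, non-hugified at 2*pszind + 1; since
 * later indices are purged more eagerly, non-hugified slabs of a given
 * size class are the preferred purge victims. (Empty slabs use the
 * last two indices with the preference reversed.)
 */
size_t
purge_list_index(size_t pszind, bool hugified) {
	return 2 * pszind + (hugified ? 0 : 1);
}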
*/ +hpdata_t *psset_pick_hugify(psset_t *psset); + +void psset_insert(psset_t *psset, hpdata_t *ps); +void psset_remove(psset_t *psset, hpdata_t *ps); + +static inline size_t +psset_npageslabs(psset_t *psset) { + return psset->merged_stats.npageslabs; +} + +static inline size_t +psset_nactive(psset_t *psset) { + return psset->merged_stats.nactive; +} + +static inline size_t +psset_ndirty(psset_t *psset) { + return psset->merged_stats.ndirty; +} + +#endif /* JEMALLOC_INTERNAL_PSSET_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/public_namespace.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/public_namespace.h new file mode 100644 index 000000000..64cedadfb --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/public_namespace.h @@ -0,0 +1,26 @@ +#define je_aligned_alloc JEMALLOC_N(aligned_alloc) +#define je_calloc JEMALLOC_N(calloc) +#define je_dallocx JEMALLOC_N(dallocx) +#define je_free JEMALLOC_N(free) +#define je_free_sized JEMALLOC_N(free_sized) +#define je_free_aligned_sized JEMALLOC_N(free_aligned_sized) +#define je_mallctl JEMALLOC_N(mallctl) +#define je_mallctlbymib JEMALLOC_N(mallctlbymib) +#define je_mallctlnametomib JEMALLOC_N(mallctlnametomib) +#define je_malloc JEMALLOC_N(malloc) +#define je_malloc_conf JEMALLOC_N(malloc_conf) +#define je_malloc_conf_2_conf_harder JEMALLOC_N(malloc_conf_2_conf_harder) +#define je_malloc_message JEMALLOC_N(malloc_message) +#define je_malloc_stats_print JEMALLOC_N(malloc_stats_print) +#define je_malloc_usable_size JEMALLOC_N(malloc_usable_size) +#define je_mallocx JEMALLOC_N(mallocx) +#define je_smallocx_a25b9b8ba91881964be3083db349991bbbbf1661 JEMALLOC_N(smallocx_a25b9b8ba91881964be3083db349991bbbbf1661) +#define je_nallocx JEMALLOC_N(nallocx) +#define je_posix_memalign JEMALLOC_N(posix_memalign) +#define je_rallocx JEMALLOC_N(rallocx) +#define je_realloc JEMALLOC_N(realloc) +#define je_sallocx JEMALLOC_N(sallocx) +#define je_sdallocx JEMALLOC_N(sdallocx) +#define je_xallocx JEMALLOC_N(xallocx) +#define je_valloc JEMALLOC_N(valloc) +#define je_malloc_size JEMALLOC_N(malloc_size) diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/public_unnamespace.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/public_unnamespace.h new file mode 100644 index 000000000..7709fd7d2 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/public_unnamespace.h @@ -0,0 +1,26 @@ +#undef je_aligned_alloc +#undef je_calloc +#undef je_dallocx +#undef je_free +#undef je_free_sized +#undef je_free_aligned_sized +#undef je_mallctl +#undef je_mallctlbymib +#undef je_mallctlnametomib +#undef je_malloc +#undef je_malloc_conf +#undef je_malloc_conf_2_conf_harder +#undef je_malloc_message +#undef je_malloc_stats_print +#undef je_malloc_usable_size +#undef je_mallocx +#undef je_smallocx_a25b9b8ba91881964be3083db349991bbbbf1661 +#undef je_nallocx +#undef je_posix_memalign +#undef je_rallocx +#undef je_realloc +#undef je_sallocx +#undef je_sdallocx +#undef je_xallocx +#undef je_valloc +#undef je_malloc_size diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/ql.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/ql.h new file mode 100644 index 000000000..ebe69988a --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/ql.h @@ -0,0 +1,198 @@ +#ifndef JEMALLOC_INTERNAL_QL_H +#define JEMALLOC_INTERNAL_QL_H + +#include 
"jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/qr.h" + +/* + * A linked-list implementation. + * + * This is built on top of the ring implementation, but that can be viewed as an + * implementation detail (i.e. trying to advance past the tail of the list + * doesn't wrap around). + * + * You define a struct like so: + * typedef strucy my_s my_t; + * struct my_s { + * int data; + * ql_elm(my_t) my_link; + * }; + * + * // We wobble between "list" and "head" for this type; we're now mostly + * // heading towards "list". + * typedef ql_head(my_t) my_list_t; + * + * You then pass a my_list_t * for a_head arguments, a my_t * for a_elm + * arguments, the token "my_link" for a_field arguments, and the token "my_t" + * for a_type arguments. + */ + +/* List definitions. */ +#define ql_head(a_type) \ +struct { \ + a_type *qlh_first; \ +} + +/* Static initializer for an empty list. */ +#define ql_head_initializer(a_head) {NULL} + +/* The field definition. */ +#define ql_elm(a_type) qr(a_type) + +/* A pointer to the first element in the list, or NULL if the list is empty. */ +#define ql_first(a_head) ((a_head)->qlh_first) + +/* Dynamically initializes a list. */ +#define ql_new(a_head) do { \ + ql_first(a_head) = NULL; \ +} while (0) + +/* + * Sets dest to be the contents of src (overwriting any elements there), leaving + * src empty. + */ +#define ql_move(a_head_dest, a_head_src) do { \ + ql_first(a_head_dest) = ql_first(a_head_src); \ + ql_new(a_head_src); \ +} while (0) + +/* True if the list is empty, otherwise false. */ +#define ql_empty(a_head) (ql_first(a_head) == NULL) + +/* + * Initializes a ql_elm. Must be called even if the field is about to be + * overwritten. + */ +#define ql_elm_new(a_elm, a_field) qr_new((a_elm), a_field) + +/* + * Obtains the last item in the list. + */ +#define ql_last(a_head, a_field) \ + (ql_empty(a_head) ? NULL : qr_prev(ql_first(a_head), a_field)) + +/* + * Gets a pointer to the next/prev element in the list. Trying to advance past + * the end or retreat before the beginning of the list returns NULL. + */ +#define ql_next(a_head, a_elm, a_field) \ + ((ql_last(a_head, a_field) != (a_elm)) \ + ? qr_next((a_elm), a_field) : NULL) +#define ql_prev(a_head, a_elm, a_field) \ + ((ql_first(a_head) != (a_elm)) ? qr_prev((a_elm), a_field) \ + : NULL) + +/* Inserts a_elm before a_qlelm in the list. */ +#define ql_before_insert(a_head, a_qlelm, a_elm, a_field) do { \ + qr_before_insert((a_qlelm), (a_elm), a_field); \ + if (ql_first(a_head) == (a_qlelm)) { \ + ql_first(a_head) = (a_elm); \ + } \ +} while (0) + +/* Inserts a_elm after a_qlelm in the list. */ +#define ql_after_insert(a_qlelm, a_elm, a_field) \ + qr_after_insert((a_qlelm), (a_elm), a_field) + +/* Inserts a_elm as the first item in the list. */ +#define ql_head_insert(a_head, a_elm, a_field) do { \ + if (!ql_empty(a_head)) { \ + qr_before_insert(ql_first(a_head), (a_elm), a_field); \ + } \ + ql_first(a_head) = (a_elm); \ +} while (0) + +/* Inserts a_elm as the last item in the list. */ +#define ql_tail_insert(a_head, a_elm, a_field) do { \ + if (!ql_empty(a_head)) { \ + qr_before_insert(ql_first(a_head), (a_elm), a_field); \ + } \ + ql_first(a_head) = qr_next((a_elm), a_field); \ +} while (0) + +/* + * Given lists a = [a_1, ..., a_n] and [b_1, ..., b_n], results in: + * a = [a1, ..., a_n, b_1, ..., b_n] and b = []. 
+ */ +#define ql_concat(a_head_a, a_head_b, a_field) do { \ + if (ql_empty(a_head_a)) { \ + ql_move(a_head_a, a_head_b); \ + } else if (!ql_empty(a_head_b)) { \ + qr_meld(ql_first(a_head_a), ql_first(a_head_b), \ + a_field); \ + ql_new(a_head_b); \ + } \ +} while (0) + +/* Removes a_elm from the list. */ +#define ql_remove(a_head, a_elm, a_field) do { \ + if (ql_first(a_head) == (a_elm)) { \ + ql_first(a_head) = qr_next(ql_first(a_head), a_field); \ + } \ + if (ql_first(a_head) != (a_elm)) { \ + qr_remove((a_elm), a_field); \ + } else { \ + ql_new(a_head); \ + } \ +} while (0) + +/* Removes the first item in the list. */ +#define ql_head_remove(a_head, a_type, a_field) do { \ + a_type *t = ql_first(a_head); \ + ql_remove((a_head), t, a_field); \ +} while (0) + +/* Removes the last item in the list. */ +#define ql_tail_remove(a_head, a_type, a_field) do { \ + a_type *t = ql_last(a_head, a_field); \ + ql_remove((a_head), t, a_field); \ +} while (0) + +/* + * Given a = [a_1, a_2, ..., a_n-1, a_n, a_n+1, ...], + * ql_split(a, a_n, b, some_field) results in + * a = [a_1, a_2, ..., a_n-1] + * and replaces b's contents with: + * b = [a_n, a_n+1, ...] + */ +#define ql_split(a_head_a, a_elm, a_head_b, a_field) do { \ + if (ql_first(a_head_a) == (a_elm)) { \ + ql_move(a_head_b, a_head_a); \ + } else { \ + qr_split(ql_first(a_head_a), (a_elm), a_field); \ + ql_first(a_head_b) = (a_elm); \ + } \ +} while (0) + +/* + * An optimized version of: + * a_type *t = ql_first(a_head); + * ql_remove((a_head), t, a_field); + * ql_tail_insert((a_head), t, a_field); + */ +#define ql_rotate(a_head, a_field) do { \ + ql_first(a_head) = qr_next(ql_first(a_head), a_field); \ +} while (0) + +/* + * Helper macro to iterate over each element in a list in order, starting from + * the head (or in reverse order, starting from the tail). The usage is + * (assuming my_t and my_list_t defined as above). + * + * int sum(my_list_t *list) { + * int sum = 0; + * my_t *iter; + * ql_foreach(iter, list, link) { + * sum += iter->data; + * } + * return sum; + * } + */ + +#define ql_foreach(a_var, a_head, a_field) \ + qr_foreach((a_var), ql_first(a_head), a_field) + +#define ql_reverse_foreach(a_var, a_head, a_field) \ + qr_reverse_foreach((a_var), ql_first(a_head), a_field) + +#endif /* JEMALLOC_INTERNAL_QL_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/qr.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/qr.h new file mode 100644 index 000000000..ece4f5568 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/qr.h @@ -0,0 +1,140 @@ +#ifndef JEMALLOC_INTERNAL_QR_H +#define JEMALLOC_INTERNAL_QR_H + +/* + * A ring implementation based on an embedded circular doubly-linked list. + * + * You define your struct like so: + * + * typedef struct my_s my_t; + * struct my_s { + * int data; + * qr(my_t) my_link; + * }; + * + * And then pass a my_t * into macros for a_qr arguments, and the token + * "my_link" into a_field fields. + */ + +/* Ring definitions. */ +#define qr(a_type) \ +struct { \ + a_type *qre_next; \ + a_type *qre_prev; \ +} + +/* + * Initialize a qr link. Every link must be initialized before being used, even + * if that initialization is going to be immediately overwritten (say, by being + * passed into an insertion macro). + */ +#define qr_new(a_qr, a_field) do { \ + (a_qr)->a_field.qre_next = (a_qr); \ + (a_qr)->a_field.qre_prev = (a_qr); \ +} while (0) + +/* + * Go forwards or backwards in the ring. 
Note that (the ring being circular), this + * always succeeds -- you just keep looping around and around the ring if you + * chase pointers without end. + */ +#define qr_next(a_qr, a_field) ((a_qr)->a_field.qre_next) +#define qr_prev(a_qr, a_field) ((a_qr)->a_field.qre_prev) + +/* + * Given two rings: + * a -> a_1 -> ... -> a_n -- + * ^ | + * |------------------------ + * + * b -> b_1 -> ... -> b_n -- + * ^ | + * |------------------------ + * + * Results in the ring: + * a -> a_1 -> ... -> a_n -> b -> b_1 -> ... -> b_n -- + * ^ | + * |-------------------------------------------------| + * + * a_qr_a can directly be a qr_next() macro, but a_qr_b cannot. + */ +#define qr_meld(a_qr_a, a_qr_b, a_field) do { \ + (a_qr_b)->a_field.qre_prev->a_field.qre_next = \ + (a_qr_a)->a_field.qre_prev; \ + (a_qr_a)->a_field.qre_prev = (a_qr_b)->a_field.qre_prev; \ + (a_qr_b)->a_field.qre_prev = \ + (a_qr_b)->a_field.qre_prev->a_field.qre_next; \ + (a_qr_a)->a_field.qre_prev->a_field.qre_next = (a_qr_a); \ + (a_qr_b)->a_field.qre_prev->a_field.qre_next = (a_qr_b); \ +} while (0) + +/* + * Logically, this is just a meld. The intent, though, is that a_qrelm is a + * single-element ring, so that "before" has a more obvious interpretation than + * meld. + */ +#define qr_before_insert(a_qrelm, a_qr, a_field) \ + qr_meld((a_qrelm), (a_qr), a_field) + +/* Ditto, but inserting after rather than before. */ +#define qr_after_insert(a_qrelm, a_qr, a_field) \ + qr_before_insert(qr_next(a_qrelm, a_field), (a_qr), a_field) + +/* + * Inverts meld; given the ring: + * a -> a_1 -> ... -> a_n -> b -> b_1 -> ... -> b_n -- + * ^ | + * |-------------------------------------------------| + * + * Results in two rings: + * a -> a_1 -> ... -> a_n -- + * ^ | + * |------------------------ + * + * b -> b_1 -> ... -> b_n -- + * ^ | + * |------------------------ + * + * qr_meld() and qr_split() are functionally equivalent, so there's no need to + * have two copies of the code. + */ +#define qr_split(a_qr_a, a_qr_b, a_field) \ + qr_meld((a_qr_a), (a_qr_b), a_field) + +/* + * Splits off a_qr from the rest of its ring, so that it becomes a + * single-element ring. + */ +#define qr_remove(a_qr, a_field) \ + qr_split(qr_next(a_qr, a_field), (a_qr), a_field) + +/* + * Helper macro to iterate over each element in a ring exactly once, starting + * with a_qr. The usage is (assuming my_t defined as above): + * + * int sum(my_t *item) { + * int sum = 0; + * my_t *iter; + * qr_foreach(iter, item, link) { + * sum += iter->data; + * } + * return sum; + * } + */ +#define qr_foreach(var, a_qr, a_field) \ + for ((var) = (a_qr); \ + (var) != NULL; \ + (var) = (((var)->a_field.qre_next != (a_qr)) \ + ? (var)->a_field.qre_next : NULL)) + +/* + * The same (and with the same usage) as qr_foreach, but in the opposite order, + * ending with a_qr. + */ +#define qr_reverse_foreach(var, a_qr, a_field) \ + for ((var) = ((a_qr) != NULL) ? qr_prev(a_qr, a_field) : NULL; \ + (var) != NULL; \ + (var) = (((var) != (a_qr)) \ + ? 
(var)->a_field.qre_prev : NULL)) + +#endif /* JEMALLOC_INTERNAL_QR_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/quantum.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/quantum.h new file mode 100644 index 000000000..a97f54caf --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/quantum.h @@ -0,0 +1,87 @@ +#ifndef JEMALLOC_INTERNAL_QUANTUM_H +#define JEMALLOC_INTERNAL_QUANTUM_H + +/* + * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size + * classes). + */ +#ifndef LG_QUANTUM +# if (defined(__i386__) || defined(_M_IX86)) +# define LG_QUANTUM 4 +# endif +# ifdef __ia64__ +# define LG_QUANTUM 4 +# endif +# ifdef __alpha__ +# define LG_QUANTUM 4 +# endif +# if (defined(__sparc64__) || defined(__sparcv9) || defined(__sparc_v9__)) +# define LG_QUANTUM 4 +# endif +# if (defined(__amd64__) || defined(__x86_64__) || defined(_M_X64)) +# define LG_QUANTUM 4 +# endif +# ifdef __arm__ +# define LG_QUANTUM 3 +# endif +# ifdef __aarch64__ +# define LG_QUANTUM 4 +# endif +# ifdef __hppa__ +# define LG_QUANTUM 4 +# endif +# ifdef __loongarch__ +# define LG_QUANTUM 4 +# endif +# ifdef __m68k__ +# define LG_QUANTUM 3 +# endif +# ifdef __mips__ +# if defined(__mips_n32) || defined(__mips_n64) +# define LG_QUANTUM 4 +# else +# define LG_QUANTUM 3 +# endif +# endif +# ifdef __nios2__ +# define LG_QUANTUM 3 +# endif +# ifdef __or1k__ +# define LG_QUANTUM 3 +# endif +# if defined(__powerpc__) || defined(__ppc__) || defined(__powerpc64__) || defined(__ppc64__) +# define LG_QUANTUM 4 +# endif +# if defined(__riscv) || defined(__riscv__) +# define LG_QUANTUM 4 +# endif +# ifdef __s390__ +# define LG_QUANTUM 4 +# endif +# if (defined (__SH3E__) || defined(__SH4_SINGLE__) || defined(__SH4__) || \ + defined(__SH4_SINGLE_ONLY__)) +# define LG_QUANTUM 4 +# endif +# ifdef __tile__ +# define LG_QUANTUM 4 +# endif +# ifdef __le32__ +# define LG_QUANTUM 4 +# endif +# ifdef __arc__ +# define LG_QUANTUM 3 +# endif +# ifndef LG_QUANTUM +# error "Unknown minimum alignment for architecture; specify via " + "--with-lg-quantum" +# endif +#endif + +#define QUANTUM ((size_t)(1U << LG_QUANTUM)) +#define QUANTUM_MASK (QUANTUM - 1) + +/* Return the smallest quantum multiple that is >= a. */ +#define QUANTUM_CEILING(a) \ + (((a) + QUANTUM_MASK) & ~QUANTUM_MASK) + +#endif /* JEMALLOC_INTERNAL_QUANTUM_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/rb.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/rb.h new file mode 100644 index 000000000..5f2771a9c --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/rb.h @@ -0,0 +1,1875 @@ +#ifndef JEMALLOC_INTERNAL_RB_H +#define JEMALLOC_INTERNAL_RB_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/safety_check.h" + +/*- + ******************************************************************************* + * + * cpp macro implementation of left-leaning 2-3 red-black trees. Parent + * pointers are not used, and color bits are stored in the least significant + * bit of right-child pointers (if RB_COMPACT is defined), thus making node + * linkage as compact as is possible for red-black trees. + * + * Usage: + * + * #include <stdbool.h> + * #include <stdint.h> + * #define NDEBUG // (Optional, see assert(3).) + * #include <assert.h> + * #define RB_COMPACT // (Optional, embed color bits in right-child pointers.) + * #include <rb.h> + * ...
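QUANTUM_CEILING() in quantum.h above is the usual round-up-to-a-multiple trick for power-of-two quanta: add the mask, then clear the low bits. A standalone check with LG_QUANTUM fixed at 4 (a 16-byte quantum):

#include <assert.h>
#include <stddef.h>

#define LG_QUANTUM	4
#define QUANTUM		((size_t)(1U << LG_QUANTUM))
#define QUANTUM_MASK	(QUANTUM - 1)
/* Round a up to the next multiple of QUANTUM. */
#define QUANTUM_CEILING(a)	(((a) + QUANTUM_MASK) & ~QUANTUM_MASK)

int
main(void) {
	assert(QUANTUM_CEILING(1) == 16);
	assert(QUANTUM_CEILING(16) == 16);	/* Already aligned. */
	assert(QUANTUM_CEILING(17) == 32);
	return 0;
}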
+ * + ******************************************************************************* + */ + +#ifndef __PGI +#define RB_COMPACT +#endif + +/* + * Each node in the RB tree consumes at least 1 byte of space (for the linkage + * if nothing else, so there are a maximum of sizeof(void *) << 3 rb tree nodes + * in any process (and thus, at most sizeof(void *) << 3 nodes in any rb tree). + * The choice of algorithm bounds the depth of a tree to twice the binary log of + * the number of elements in the tree; the following bound follows. + */ +#define RB_MAX_DEPTH (sizeof(void *) << 4) + +#ifdef RB_COMPACT +/* Node structure. */ +#define rb_node(a_type) \ +struct { \ + a_type *rbn_left; \ + a_type *rbn_right_red; \ +} +#else +#define rb_node(a_type) \ +struct { \ + a_type *rbn_left; \ + a_type *rbn_right; \ + bool rbn_red; \ +} +#endif + +/* Root structure. */ +#define rb_tree(a_type) \ +struct { \ + a_type *rbt_root; \ +} + +/* Left accessors. */ +#define rbtn_left_get(a_type, a_field, a_node) \ + ((a_node)->a_field.rbn_left) +#define rbtn_left_set(a_type, a_field, a_node, a_left) do { \ + (a_node)->a_field.rbn_left = a_left; \ +} while (0) + +#ifdef RB_COMPACT +/* Right accessors. */ +#define rbtn_right_get(a_type, a_field, a_node) \ + ((a_type *) (((intptr_t) (a_node)->a_field.rbn_right_red) \ + & ((ssize_t)-2))) +#define rbtn_right_set(a_type, a_field, a_node, a_right) do { \ + (a_node)->a_field.rbn_right_red = (a_type *) (((uintptr_t) a_right) \ + | (((uintptr_t) (a_node)->a_field.rbn_right_red) & ((size_t)1))); \ +} while (0) + +/* Color accessors. */ +#define rbtn_red_get(a_type, a_field, a_node) \ + ((bool) (((uintptr_t) (a_node)->a_field.rbn_right_red) \ + & ((size_t)1))) +#define rbtn_color_set(a_type, a_field, a_node, a_red) do { \ + (a_node)->a_field.rbn_right_red = (a_type *) ((((intptr_t) \ + (a_node)->a_field.rbn_right_red) & ((ssize_t)-2)) \ + | ((ssize_t)a_red)); \ +} while (0) +#define rbtn_red_set(a_type, a_field, a_node) do { \ + (a_node)->a_field.rbn_right_red = (a_type *) (((uintptr_t) \ + (a_node)->a_field.rbn_right_red) | ((size_t)1)); \ +} while (0) +#define rbtn_black_set(a_type, a_field, a_node) do { \ + (a_node)->a_field.rbn_right_red = (a_type *) (((intptr_t) \ + (a_node)->a_field.rbn_right_red) & ((ssize_t)-2)); \ +} while (0) + +/* Node initializer. */ +#define rbt_node_new(a_type, a_field, a_rbt, a_node) do { \ + /* Bookkeeping bit cannot be used by node pointer. */ \ + assert(((uintptr_t)(a_node) & 0x1) == 0); \ + rbtn_left_set(a_type, a_field, (a_node), NULL); \ + rbtn_right_set(a_type, a_field, (a_node), NULL); \ + rbtn_red_set(a_type, a_field, (a_node)); \ +} while (0) +#else +/* Right accessors. */ +#define rbtn_right_get(a_type, a_field, a_node) \ + ((a_node)->a_field.rbn_right) +#define rbtn_right_set(a_type, a_field, a_node, a_right) do { \ + (a_node)->a_field.rbn_right = a_right; \ +} while (0) + +/* Color accessors. */ +#define rbtn_red_get(a_type, a_field, a_node) \ + ((a_node)->a_field.rbn_red) +#define rbtn_color_set(a_type, a_field, a_node, a_red) do { \ + (a_node)->a_field.rbn_red = (a_red); \ +} while (0) +#define rbtn_red_set(a_type, a_field, a_node) do { \ + (a_node)->a_field.rbn_red = true; \ +} while (0) +#define rbtn_black_set(a_type, a_field, a_node) do { \ + (a_node)->a_field.rbn_red = false; \ +} while (0) + +/* Node initializer. 
*/ +#define rbt_node_new(a_type, a_field, a_rbt, a_node) do { \ + rbtn_left_set(a_type, a_field, (a_node), NULL); \ + rbtn_right_set(a_type, a_field, (a_node), NULL); \ + rbtn_red_set(a_type, a_field, (a_node)); \ +} while (0) +#endif + +/* Tree initializer. */ +#define rb_new(a_type, a_field, a_rbt) do { \ + (a_rbt)->rbt_root = NULL; \ +} while (0) + +/* Internal utility macros. */ +#define rbtn_first(a_type, a_field, a_rbt, a_root, r_node) do { \ + (r_node) = (a_root); \ + if ((r_node) != NULL) { \ + for (; \ + rbtn_left_get(a_type, a_field, (r_node)) != NULL; \ + (r_node) = rbtn_left_get(a_type, a_field, (r_node))) { \ + } \ + } \ +} while (0) + +#define rbtn_last(a_type, a_field, a_rbt, a_root, r_node) do { \ + (r_node) = (a_root); \ + if ((r_node) != NULL) { \ + for (; rbtn_right_get(a_type, a_field, (r_node)) != NULL; \ + (r_node) = rbtn_right_get(a_type, a_field, (r_node))) { \ + } \ + } \ +} while (0) + +#define rbtn_rotate_left(a_type, a_field, a_node, r_node) do { \ + (r_node) = rbtn_right_get(a_type, a_field, (a_node)); \ + rbtn_right_set(a_type, a_field, (a_node), \ + rbtn_left_get(a_type, a_field, (r_node))); \ + rbtn_left_set(a_type, a_field, (r_node), (a_node)); \ +} while (0) + +#define rbtn_rotate_right(a_type, a_field, a_node, r_node) do { \ + (r_node) = rbtn_left_get(a_type, a_field, (a_node)); \ + rbtn_left_set(a_type, a_field, (a_node), \ + rbtn_right_get(a_type, a_field, (r_node))); \ + rbtn_right_set(a_type, a_field, (r_node), (a_node)); \ +} while (0) + +#define rb_summarized_only_false(...) +#define rb_summarized_only_true(...) __VA_ARGS__ +#define rb_empty_summarize(a_node, a_lchild, a_rchild) false + +/* + * The rb_proto() and rb_summarized_proto() macros generate function prototypes + * that correspond to the functions generated by an equivalently parameterized + * call to rb_gen() or rb_summarized_gen(), respectively. 
+ */ + +#define rb_proto(a_attr, a_prefix, a_rbt_type, a_type) \ + rb_proto_impl(a_attr, a_prefix, a_rbt_type, a_type, false) +#define rb_summarized_proto(a_attr, a_prefix, a_rbt_type, a_type) \ + rb_proto_impl(a_attr, a_prefix, a_rbt_type, a_type, true) +#define rb_proto_impl(a_attr, a_prefix, a_rbt_type, a_type, \ + a_is_summarized) \ +a_attr void \ +a_prefix##new(a_rbt_type *rbtree); \ +a_attr bool \ +a_prefix##empty(a_rbt_type *rbtree); \ +a_attr a_type * \ +a_prefix##first(a_rbt_type *rbtree); \ +a_attr a_type * \ +a_prefix##last(a_rbt_type *rbtree); \ +a_attr a_type * \ +a_prefix##next(a_rbt_type *rbtree, a_type *node); \ +a_attr a_type * \ +a_prefix##prev(a_rbt_type *rbtree, a_type *node); \ +a_attr a_type * \ +a_prefix##search(a_rbt_type *rbtree, const a_type *key); \ +a_attr a_type * \ +a_prefix##nsearch(a_rbt_type *rbtree, const a_type *key); \ +a_attr a_type * \ +a_prefix##psearch(a_rbt_type *rbtree, const a_type *key); \ +a_attr void \ +a_prefix##insert(a_rbt_type *rbtree, a_type *node); \ +a_attr void \ +a_prefix##remove(a_rbt_type *rbtree, a_type *node); \ +a_attr a_type * \ +a_prefix##iter(a_rbt_type *rbtree, a_type *start, a_type *(*cb)( \ + a_rbt_type *, a_type *, void *), void *arg); \ +a_attr a_type * \ +a_prefix##reverse_iter(a_rbt_type *rbtree, a_type *start, \ + a_type *(*cb)(a_rbt_type *, a_type *, void *), void *arg); \ +a_attr void \ +a_prefix##destroy(a_rbt_type *rbtree, void (*cb)(a_type *, void *), \ + void *arg); \ +/* Extended API */ \ +rb_summarized_only_##a_is_summarized( \ +a_attr void \ +a_prefix##update_summaries(a_rbt_type *rbtree, a_type *node); \ +a_attr bool \ +a_prefix##empty_filtered(a_rbt_type *rbtree, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx); \ +a_attr a_type * \ +a_prefix##first_filtered(a_rbt_type *rbtree, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx); \ +a_attr a_type * \ +a_prefix##last_filtered(a_rbt_type *rbtree, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx); \ +a_attr a_type * \ +a_prefix##next_filtered(a_rbt_type *rbtree, a_type *node, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx); \ +a_attr a_type * \ +a_prefix##prev_filtered(a_rbt_type *rbtree, a_type *node, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx); \ +a_attr a_type * \ +a_prefix##search_filtered(a_rbt_type *rbtree, const a_type *key, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx); \ +a_attr a_type * \ +a_prefix##nsearch_filtered(a_rbt_type *rbtree, const a_type *key, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx); \ +a_attr a_type * \ +a_prefix##psearch_filtered(a_rbt_type *rbtree, const a_type *key, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx); \ +a_attr a_type * \ +a_prefix##iter_filtered(a_rbt_type *rbtree, a_type *start, \ + a_type *(*cb)(a_rbt_type *, a_type *, void *), void *arg, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx); \ +a_attr a_type * \ +a_prefix##reverse_iter_filtered(a_rbt_type *rbtree, a_type *start, \ + a_type *(*cb)(a_rbt_type *, a_type *, void *), void *arg, \ + bool 
(*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx); \ +) + +/* + * The rb_gen() macro generates a type-specific red-black tree implementation, + * based on the above cpp macros. + * Arguments: + * + * a_attr: + * Function attribute for generated functions (ex: static). + * a_prefix: + * Prefix for generated functions (ex: ex_). + * a_rb_type: + * Type for red-black tree data structure (ex: ex_t). + * a_type: + * Type for red-black tree node data structure (ex: ex_node_t). + * a_field: + * Name of red-black tree node linkage (ex: ex_link). + * a_cmp: + * Node comparison function name, with the following prototype: + * + * int a_cmp(a_type *a_node, a_type *a_other); + * ^^^^^^ + * or a_key + * Interpretation of comparison function return values: + * -1 : a_node < a_other + * 0 : a_node == a_other + * 1 : a_node > a_other + * In all cases, the a_node or a_key macro argument is the first argument to + * the comparison function, which makes it possible to write comparison + * functions that treat the first argument specially. a_cmp must be a total + * order on values inserted into the tree -- duplicates are not allowed. + * + * Assuming the following setup: + * + * typedef struct ex_node_s ex_node_t; + * struct ex_node_s { + * rb_node(ex_node_t) ex_link; + * }; + * typedef rb_tree(ex_node_t) ex_t; + * rb_gen(static, ex_, ex_t, ex_node_t, ex_link, ex_cmp) + * + * The following API is generated: + * + * static void + * ex_new(ex_t *tree); + * Description: Initialize a red-black tree structure. + * Args: + * tree: Pointer to an uninitialized red-black tree object. + * + * static bool + * ex_empty(ex_t *tree); + * Description: Determine whether tree is empty. + * Args: + * tree: Pointer to an initialized red-black tree object. + * Ret: True if tree is empty, false otherwise. + * + * static ex_node_t * + * ex_first(ex_t *tree); + * static ex_node_t * + * ex_last(ex_t *tree); + * Description: Get the first/last node in tree. + * Args: + * tree: Pointer to an initialized red-black tree object. + * Ret: First/last node in tree, or NULL if tree is empty. + * + * static ex_node_t * + * ex_next(ex_t *tree, ex_node_t *node); + * static ex_node_t * + * ex_prev(ex_t *tree, ex_node_t *node); + * Description: Get node's successor/predecessor. + * Args: + * tree: Pointer to an initialized red-black tree object. + * node: A node in tree. + * Ret: node's successor/predecessor in tree, or NULL if node is + * last/first. + * + * static ex_node_t * + * ex_search(ex_t *tree, const ex_node_t *key); + * Description: Search for node that matches key. + * Args: + * tree: Pointer to an initialized red-black tree object. + * key : Search key. + * Ret: Node in tree that matches key, or NULL if no match. + * + * static ex_node_t * + * ex_nsearch(ex_t *tree, const ex_node_t *key); + * static ex_node_t * + * ex_psearch(ex_t *tree, const ex_node_t *key); + * Description: Search for node that matches key. If no match is found, + * return what would be key's successor/predecessor, were + * key in tree. + * Args: + * tree: Pointer to an initialized red-black tree object. + * key : Search key. + * Ret: Node in tree that matches key, or if no match, hypothetical node's + * successor/predecessor (NULL if no successor/predecessor). + * + * static void + * ex_insert(ex_t *tree, ex_node_t *node); + * Description: Insert node into tree. + * Args: + * tree: Pointer to an initialized red-black tree object. + * node: Node to be inserted into tree. 
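A compile-time sketch of the generated API described above, mirroring the documented ex_* setup; it assumes jemalloc's internal headers are available in context, and the demo keys are distinct since a_cmp must be a total order with no duplicates. With a_attr set to static, compilers may warn about generated functions a translation unit does not use.

#include <stddef.h>
#include "jemalloc/internal/rb.h"

typedef struct ex_node_s ex_node_t;
struct ex_node_s {
	int key;
	rb_node(ex_node_t) ex_link;
};
typedef rb_tree(ex_node_t) ex_t;

/* Total order on keys: -1, 0, or 1. */
static int
ex_cmp(ex_node_t *a, ex_node_t *b) {
	return (a->key > b->key) - (a->key < b->key);
}

rb_gen(static, ex_, ex_t, ex_node_t, ex_link, ex_cmp)

static ex_node_t *
demo(ex_t *tree, ex_node_t *n1, ex_node_t *n2) {
	ex_new(tree);
	ex_insert(tree, n1);
	ex_insert(tree, n2);
	ex_node_t key = {.key = n1->key};
	return ex_search(tree, &key);	/* Returns n1. */
}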
+ *
+ *   static void
+ *   ex_remove(ex_t *tree, ex_node_t *node);
+ *       Description: Remove node from tree.
+ *       Args:
+ *         tree: Pointer to an initialized red-black tree object.
+ *         node: Node in tree to be removed.
+ *
+ *   static ex_node_t *
+ *   ex_iter(ex_t *tree, ex_node_t *start, ex_node_t *(*cb)(ex_t *,
+ *       ex_node_t *, void *), void *arg);
+ *   static ex_node_t *
+ *   ex_reverse_iter(ex_t *tree, ex_node_t *start, ex_node_t *(*cb)(ex_t *,
+ *       ex_node_t *, void *), void *arg);
+ *       Description: Iterate forward/backward over tree, starting at node.
+ *                    If tree is modified, iteration must be immediately
+ *                    terminated by the callback function that causes the
+ *                    modification.
+ *       Args:
+ *         tree : Pointer to an initialized red-black tree object.
+ *         start: Node at which to start iteration, or NULL to start at
+ *                first/last node.
+ *         cb   : Callback function, which is called for each node during
+ *                iteration.  Under normal circumstances the callback function
+ *                should return NULL, which causes iteration to continue.  If a
+ *                callback function returns non-NULL, iteration is immediately
+ *                terminated and the non-NULL return value is returned by the
+ *                iterator.  This is useful for re-starting iteration after
+ *                modifying tree.
+ *         arg  : Opaque pointer passed to cb().
+ *       Ret: NULL if iteration completed, or the non-NULL callback return
+ *            value that caused termination of the iteration.
+ *
+ *   static void
+ *   ex_destroy(ex_t *tree, void (*cb)(ex_node_t *, void *), void *arg);
+ *       Description: Iterate over the tree with post-order traversal, remove
+ *                    each node, and run the callback if non-null.  This is
+ *                    used for destroying a tree without paying the cost to
+ *                    rebalance it.  The tree must not be otherwise altered
+ *                    during traversal.
+ *       Args:
+ *         tree: Pointer to an initialized red-black tree object.
+ *         cb  : Callback function, which, if non-null, is called for each node
+ *               during iteration.  There is no way to stop iteration once it
+ *               has begun.
+ *         arg : Opaque pointer passed to cb().
+ *
+ * The rb_summarized_gen() macro generates all the functions above, but has an
+ * expanded interface.  It introduces the notion of summarizing subtrees, and
+ * of filtering searches in the tree according to the information contained in
+ * those summaries.
+ * The extra macro argument is:
+ *   a_summarize:
+ *     Tree summarization function name, with the following prototype:
+ *
+ *       bool a_summarize(a_type *a_node, const a_type *a_left_child,
+ *           const a_type *a_right_child);
+ *
+ *     This function should update a_node with the summary of the subtree
+ *     rooted there, using the data contained in it and the summaries in
+ *     a_left_child and a_right_child.  One or both of them may be NULL.  When
+ *     the tree changes due to an insertion or removal, the generated
+ *     implementation updates the summaries of all nodes whose subtrees have
+ *     changed (always updating the summaries of children before their
+ *     parents).  If the user alters a node in the tree in a way that may
+ *     change its summary, they can call the generated update_summaries
+ *     function to bubble up the summary changes to the root.  It should
+ *     return true if the summary changed (or may have changed), and false if
+ *     it didn't (which will allow the implementation to terminate "bubbling
+ *     up" the summaries early).
+ *     As the parameter names indicate, the children are ordered as they are
+ *     in the tree: a_left_child, if it is not NULL, compares less than
+ *     a_node, which in turn compares less than a_right_child (if
+ *     a_right_child is not NULL).
+ *
+ * Using the same setup as above but replacing the macro with
+ *   rb_summarized_gen(static, ex_, ex_t, ex_node_t, ex_link, ex_cmp,
+ *       ex_summarize)
+ *
+ * Generates all the previous functions, but adds some more:
+ *
+ *   static void
+ *   ex_update_summaries(ex_t *tree, ex_node_t *node);
+ *       Description: Recompute all summaries of ancestors of node.
+ *       Args:
+ *         tree: Pointer to an initialized red-black tree object.
+ *         node: The element of the tree whose summary may have changed.
+ *
+ * For each of ex_empty, ex_first, ex_last, ex_next, ex_prev, ex_search,
+ * ex_nsearch, ex_psearch, ex_iter, and ex_reverse_iter, an additional function
+ * is generated as well, with the suffix _filtered (e.g. ex_empty_filtered,
+ * ex_first_filtered, etc.).  These use the concept of a "filter": a binary
+ * property some node either satisfies or does not satisfy.  Clever use of the
+ * a_summarize argument to rb_summarized_gen can allow efficient computation
+ * of these predicates across whole subtrees of the tree.
+ * The extended API functions accept three additional arguments after the
+ * arguments to the corresponding non-extended equivalent.
+ *
+ *   ex_fn(..., bool (*filter_node)(void *, ex_node_t *),
+ *       bool (*filter_subtree)(void *, ex_node_t *), void *filter_ctx);
+ *     filter_node    : Returns true if the node passes the filter.
+ *     filter_subtree : Returns true if some node in the subtree rooted at
+ *                      node passes the filter.
+ *     filter_ctx     : A context argument passed to the filters.
+ *
+ * For a more concrete example of summarizing and filtering, suppose we're
+ * using the red-black tree to track a set of integers:
+ *
+ *   struct ex_node_s {
+ *       rb_node(ex_node_t) ex_link;
+ *       unsigned data;
+ *   };
+ *
+ * Suppose, for some application-specific reason, we want to be able to
+ * quickly find numbers in the set which are divisible by large powers of 2
+ * (say, for aligned allocation purposes).  We augment the node with a summary
+ * field:
+ *
+ *   struct ex_node_s {
+ *       rb_node(ex_node_t) ex_link;
+ *       unsigned data;
+ *       unsigned max_subtree_ffs;
+ *   };
+ *
+ * and define our summarization function as follows:
+ *
+ *   bool
+ *   ex_summarize(ex_node_t *node, const ex_node_t *lchild,
+ *       const ex_node_t *rchild) {
+ *       unsigned new_max_subtree_ffs = ffs(node->data);
+ *       if (lchild != NULL && lchild->max_subtree_ffs > new_max_subtree_ffs) {
+ *           new_max_subtree_ffs = lchild->max_subtree_ffs;
+ *       }
+ *       if (rchild != NULL && rchild->max_subtree_ffs > new_max_subtree_ffs) {
+ *           new_max_subtree_ffs = rchild->max_subtree_ffs;
+ *       }
+ *       bool changed = (node->max_subtree_ffs != new_max_subtree_ffs);
+ *       node->max_subtree_ffs = new_max_subtree_ffs;
+ *       // This could be "return true" without any correctness or big-O
+ *       // performance changes; but practically, precisely reporting summary
+ *       // changes reduces the amount of work that has to be done when
+ *       // "bubbling up" summary changes.
+ *       return changed;
+ *   }
+ *
+ * We can now implement our filter functions as follows:
+ *   bool
+ *   ex_filter_node(void *filter_ctx, ex_node_t *node) {
+ *       unsigned required_ffs = *(unsigned *)filter_ctx;
+ *       return ffs(node->data) >= required_ffs;
+ *   }
+ *   bool
+ *   ex_filter_subtree(void *filter_ctx, ex_node_t *node) {
+ *       unsigned required_ffs = *(unsigned *)filter_ctx;
+ *       return node->max_subtree_ffs >= required_ffs;
+ *   }
+ *
+ * We can now easily search for, e.g., the smallest integer in the set that's
+ * divisible by 128 (i.e. whose 1-based ffs is at least 8):
+ *   ex_node_t *
+ *   find_div_128(ex_t *tree) {
+ *       unsigned min_ffs = 8;
+ *       return ex_first_filtered(tree, &ex_filter_node, &ex_filter_subtree,
+ *           &min_ffs);
+ *   }
+ *
+ * We could with similar ease:
+ * - Find the next multiple of 128 in the set that's larger than 12345 (with
+ *   ex_nsearch_filtered)
+ * - Iterate over just those multiples of 64 that are in the set (with
+ *   ex_iter_filtered)
+ * - Determine if the set contains any multiples of 1024 (with
+ *   ex_empty_filtered).
+ *
+ * Some possibly subtle API notes:
+ * - The node argument to ex_next_filtered and ex_prev_filtered need not pass
+ *   the filter; it will find the next/prev node that passes the filter.
+ * - ex_search_filtered will fail even for a node in the tree, if that node
+ *   does not pass the filter.  ex_psearch_filtered and ex_nsearch_filtered
+ *   behave similarly; they may return a node larger/smaller than the key,
+ *   even if a node equivalent to the key is in the tree (but does not pass
+ *   the filter).
+ * - Similarly, if the start argument to a filtered iteration function does
+ *   not pass the filter, the callback won't be invoked on it.
+ *
+ * These should make sense after a moment's reflection; each post-condition is
+ * the same as with the unfiltered version, with the added constraint that the
+ * returned node must pass the filter.
+ */
+JEMALLOC_ALWAYS_INLINE void
+rb_remove_safety_checks(const void *nodep, const char *function_name) {
+    if (!config_opt_safety_checks) {
+        return;
+    }
+    if (unlikely(nodep == NULL)) {
+        safety_check_fail(
+            ": Invalid deallocation detected in %s: "
+            "attempting to remove node from tree but node was "
+            "not found. Possibly caused by double free bugs.",
+            function_name);
+    }
+}
+
+#define rb_gen(a_attr, a_prefix, a_rbt_type, a_type, a_field, a_cmp) \
+    rb_gen_impl(a_attr, a_prefix, a_rbt_type, a_type, a_field, a_cmp, \
+        rb_empty_summarize, false)
+#define rb_summarized_gen(a_attr, a_prefix, a_rbt_type, a_type, \
+    a_field, a_cmp, a_summarize) \
+    rb_gen_impl(a_attr, a_prefix, a_rbt_type, a_type, a_field, a_cmp, \
+        a_summarize, true)
+
+#define rb_gen_impl(a_attr, a_prefix, a_rbt_type, a_type, \
+    a_field, a_cmp, a_summarize, a_is_summarized) \
+typedef struct { \
+    a_type *node; \
+    int cmp; \
+} a_prefix##path_entry_t; \
+static inline void \
+a_prefix##summarize_range(a_prefix##path_entry_t *rfirst, \
+    a_prefix##path_entry_t *rlast) { \
+    while ((uintptr_t)rlast >= (uintptr_t)rfirst) { \
+        a_type *node = rlast->node; \
+        /* Avoid a warning when a_summarize is rb_empty_summarize.
*/ \
+        (void)node; \
+        bool changed = a_summarize(node, rbtn_left_get(a_type, a_field, \
+            node), rbtn_right_get(a_type, a_field, node)); \
+        if (!changed) { \
+            break; \
+        } \
+        rlast--; \
+    } \
+} \
+/* On the remove pathways, we sometimes swap the node being removed  */\
+/* and its first successor; in such cases we need to do two range    */\
+/* updates: one from the node to its (former) swapped successor, the */\
+/* next from that successor to the root (with either allowed to bail */\
+/* out early if appropriate).                                        */\
+static inline void \
+a_prefix##summarize_swapped_range(a_prefix##path_entry_t *rfirst, \
+    a_prefix##path_entry_t *rlast, a_prefix##path_entry_t *swap_loc) { \
+    if (swap_loc == NULL || rlast <= swap_loc) { \
+        a_prefix##summarize_range(rfirst, rlast); \
+    } else { \
+        a_prefix##summarize_range(swap_loc + 1, rlast); \
+        (void)a_summarize(swap_loc->node, \
+            rbtn_left_get(a_type, a_field, swap_loc->node), \
+            rbtn_right_get(a_type, a_field, swap_loc->node)); \
+        a_prefix##summarize_range(rfirst, swap_loc - 1); \
+    } \
+} \
+a_attr void \
+a_prefix##new(a_rbt_type *rbtree) { \
+    rb_new(a_type, a_field, rbtree); \
+} \
+a_attr bool \
+a_prefix##empty(a_rbt_type *rbtree) { \
+    return (rbtree->rbt_root == NULL); \
+} \
+a_attr a_type * \
+a_prefix##first(a_rbt_type *rbtree) { \
+    a_type *ret; \
+    rbtn_first(a_type, a_field, rbtree, rbtree->rbt_root, ret); \
+    return ret; \
+} \
+a_attr a_type * \
+a_prefix##last(a_rbt_type *rbtree) { \
+    a_type *ret; \
+    rbtn_last(a_type, a_field, rbtree, rbtree->rbt_root, ret); \
+    return ret; \
+} \
+a_attr a_type * \
+a_prefix##next(a_rbt_type *rbtree, a_type *node) { \
+    a_type *ret; \
+    if (rbtn_right_get(a_type, a_field, node) != NULL) { \
+        rbtn_first(a_type, a_field, rbtree, rbtn_right_get(a_type, \
+            a_field, node), ret); \
+    } else { \
+        a_type *tnode = rbtree->rbt_root; \
+        assert(tnode != NULL); \
+        ret = NULL; \
+        while (true) { \
+            int cmp = (a_cmp)(node, tnode); \
+            if (cmp < 0) { \
+                ret = tnode; \
+                tnode = rbtn_left_get(a_type, a_field, tnode); \
+            } else if (cmp > 0) { \
+                tnode = rbtn_right_get(a_type, a_field, tnode); \
+            } else { \
+                break; \
+            } \
+            assert(tnode != NULL); \
+        } \
+    } \
+    return ret; \
+} \
+a_attr a_type * \
+a_prefix##prev(a_rbt_type *rbtree, a_type *node) { \
+    a_type *ret; \
+    if (rbtn_left_get(a_type, a_field, node) != NULL) { \
+        rbtn_last(a_type, a_field, rbtree, rbtn_left_get(a_type, \
+            a_field, node), ret); \
+    } else { \
+        a_type *tnode = rbtree->rbt_root; \
+        assert(tnode != NULL); \
+        ret = NULL; \
+        while (true) { \
+            int cmp = (a_cmp)(node, tnode); \
+            if (cmp < 0) { \
+                tnode = rbtn_left_get(a_type, a_field, tnode); \
+            } else if (cmp > 0) { \
+                ret = tnode; \
+                tnode = rbtn_right_get(a_type, a_field, tnode); \
+            } else { \
+                break; \
+            } \
+            assert(tnode != NULL); \
+        } \
+    } \
+    return ret; \
+} \
+a_attr a_type * \
+a_prefix##search(a_rbt_type *rbtree, const a_type *key) { \
+    a_type *ret; \
+    int cmp; \
+    ret = rbtree->rbt_root; \
+    while (ret != NULL \
+        && (cmp = (a_cmp)(key, ret)) != 0) { \
+        if (cmp < 0) { \
+            ret = rbtn_left_get(a_type, a_field, ret); \
+        } else { \
+            ret = rbtn_right_get(a_type, a_field, ret); \
+        } \
+    } \
+    return ret; \
+} \
+a_attr a_type * \
+a_prefix##nsearch(a_rbt_type *rbtree, const a_type *key) { \
+    a_type *ret; \
+    a_type *tnode = rbtree->rbt_root; \
+    ret = NULL; \
+    while (tnode != NULL) { \
+        int cmp = (a_cmp)(key, tnode); \
+        if (cmp < 0) { \
+            ret = tnode; \
+            tnode = rbtn_left_get(a_type, a_field, tnode); \
+        } else if (cmp > 0) { \
+            tnode =
rbtn_right_get(a_type, a_field, tnode); \ + } else { \ + ret = tnode; \ + break; \ + } \ + } \ + return ret; \ +} \ +a_attr a_type * \ +a_prefix##psearch(a_rbt_type *rbtree, const a_type *key) { \ + a_type *ret; \ + a_type *tnode = rbtree->rbt_root; \ + ret = NULL; \ + while (tnode != NULL) { \ + int cmp = (a_cmp)(key, tnode); \ + if (cmp < 0) { \ + tnode = rbtn_left_get(a_type, a_field, tnode); \ + } else if (cmp > 0) { \ + ret = tnode; \ + tnode = rbtn_right_get(a_type, a_field, tnode); \ + } else { \ + ret = tnode; \ + break; \ + } \ + } \ + return ret; \ +} \ +a_attr void \ +a_prefix##insert(a_rbt_type *rbtree, a_type *node) { \ + a_prefix##path_entry_t path[RB_MAX_DEPTH]; \ + a_prefix##path_entry_t *pathp; \ + rbt_node_new(a_type, a_field, rbtree, node); \ + /* Wind. */ \ + path->node = rbtree->rbt_root; \ + for (pathp = path; pathp->node != NULL; pathp++) { \ + int cmp = pathp->cmp = a_cmp(node, pathp->node); \ + assert(cmp != 0); \ + if (cmp < 0) { \ + pathp[1].node = rbtn_left_get(a_type, a_field, \ + pathp->node); \ + } else { \ + pathp[1].node = rbtn_right_get(a_type, a_field, \ + pathp->node); \ + } \ + } \ + pathp->node = node; \ + /* A loop invariant we maintain is that all nodes with */\ + /* out-of-date summaries live in path[0], path[1], ..., *pathp. */\ + /* To maintain this, we have to summarize node, since we */\ + /* decrement pathp before the first iteration. */\ + assert(rbtn_left_get(a_type, a_field, node) == NULL); \ + assert(rbtn_right_get(a_type, a_field, node) == NULL); \ + (void)a_summarize(node, NULL, NULL); \ + /* Unwind. */ \ + for (pathp--; (uintptr_t)pathp >= (uintptr_t)path; pathp--) { \ + a_type *cnode = pathp->node; \ + if (pathp->cmp < 0) { \ + a_type *left = pathp[1].node; \ + rbtn_left_set(a_type, a_field, cnode, left); \ + if (rbtn_red_get(a_type, a_field, left)) { \ + a_type *leftleft = rbtn_left_get(a_type, a_field, left);\ + if (leftleft != NULL && rbtn_red_get(a_type, a_field, \ + leftleft)) { \ + /* Fix up 4-node. */ \ + a_type *tnode; \ + rbtn_black_set(a_type, a_field, leftleft); \ + rbtn_rotate_right(a_type, a_field, cnode, tnode); \ + (void)a_summarize(cnode, \ + rbtn_left_get(a_type, a_field, cnode), \ + rbtn_right_get(a_type, a_field, cnode)); \ + cnode = tnode; \ + } \ + } else { \ + a_prefix##summarize_range(path, pathp); \ + return; \ + } \ + } else { \ + a_type *right = pathp[1].node; \ + rbtn_right_set(a_type, a_field, cnode, right); \ + if (rbtn_red_get(a_type, a_field, right)) { \ + a_type *left = rbtn_left_get(a_type, a_field, cnode); \ + if (left != NULL && rbtn_red_get(a_type, a_field, \ + left)) { \ + /* Split 4-node. */ \ + rbtn_black_set(a_type, a_field, left); \ + rbtn_black_set(a_type, a_field, right); \ + rbtn_red_set(a_type, a_field, cnode); \ + } else { \ + /* Lean left. */ \ + a_type *tnode; \ + bool tred = rbtn_red_get(a_type, a_field, cnode); \ + rbtn_rotate_left(a_type, a_field, cnode, tnode); \ + rbtn_color_set(a_type, a_field, tnode, tred); \ + rbtn_red_set(a_type, a_field, cnode); \ + (void)a_summarize(cnode, \ + rbtn_left_get(a_type, a_field, cnode), \ + rbtn_right_get(a_type, a_field, cnode)); \ + cnode = tnode; \ + } \ + } else { \ + a_prefix##summarize_range(path, pathp); \ + return; \ + } \ + } \ + pathp->node = cnode; \ + (void)a_summarize(cnode, \ + rbtn_left_get(a_type, a_field, cnode), \ + rbtn_right_get(a_type, a_field, cnode)); \ + } \ + /* Set root, and make it black. 
*/ \ + rbtree->rbt_root = path->node; \ + rbtn_black_set(a_type, a_field, rbtree->rbt_root); \ +} \ +a_attr void \ +a_prefix##remove(a_rbt_type *rbtree, a_type *node) { \ + a_prefix##path_entry_t path[RB_MAX_DEPTH]; \ + a_prefix##path_entry_t *pathp; \ + a_prefix##path_entry_t *nodep; \ + a_prefix##path_entry_t *swap_loc; \ + /* This is a "real" sentinel -- NULL means we didn't swap the */\ + /* node to be pruned with one of its successors, and so */\ + /* summarization can terminate early whenever some summary */\ + /* doesn't change. */\ + swap_loc = NULL; \ + /* This is just to silence a compiler warning. */ \ + nodep = NULL; \ + /* Wind. */ \ + path->node = rbtree->rbt_root; \ + for (pathp = path; pathp->node != NULL; pathp++) { \ + int cmp = pathp->cmp = a_cmp(node, pathp->node); \ + if (cmp < 0) { \ + pathp[1].node = rbtn_left_get(a_type, a_field, \ + pathp->node); \ + } else { \ + pathp[1].node = rbtn_right_get(a_type, a_field, \ + pathp->node); \ + if (cmp == 0) { \ + /* Find node's successor, in preparation for swap. */ \ + pathp->cmp = 1; \ + nodep = pathp; \ + for (pathp++; pathp->node != NULL; pathp++) { \ + pathp->cmp = -1; \ + pathp[1].node = rbtn_left_get(a_type, a_field, \ + pathp->node); \ + } \ + break; \ + } \ + } \ + } \ + rb_remove_safety_checks(nodep, __func__); \ + assert(nodep != NULL); \ + assert(nodep->node == node); \ + pathp--; \ + if (pathp->node != node) { \ + /* Swap node with its successor. */ \ + swap_loc = nodep; \ + bool tred = rbtn_red_get(a_type, a_field, pathp->node); \ + rbtn_color_set(a_type, a_field, pathp->node, \ + rbtn_red_get(a_type, a_field, node)); \ + rbtn_left_set(a_type, a_field, pathp->node, \ + rbtn_left_get(a_type, a_field, node)); \ + /* If node's successor is its right child, the following code */\ + /* will do the wrong thing for the right child pointer. */\ + /* However, it doesn't matter, because the pointer will be */\ + /* properly set when the successor is pruned. */\ + rbtn_right_set(a_type, a_field, pathp->node, \ + rbtn_right_get(a_type, a_field, node)); \ + rbtn_color_set(a_type, a_field, node, tred); \ + /* The pruned leaf node's child pointers are never accessed */\ + /* again, so don't bother setting them to nil. */\ + nodep->node = pathp->node; \ + pathp->node = node; \ + if (nodep == path) { \ + rbtree->rbt_root = nodep->node; \ + } else { \ + if (nodep[-1].cmp < 0) { \ + rbtn_left_set(a_type, a_field, nodep[-1].node, \ + nodep->node); \ + } else { \ + rbtn_right_set(a_type, a_field, nodep[-1].node, \ + nodep->node); \ + } \ + } \ + } else { \ + a_type *left = rbtn_left_get(a_type, a_field, node); \ + if (left != NULL) { \ + /* node has no successor, but it has a left child. */\ + /* Splice node out, without losing the left child. */\ + assert(!rbtn_red_get(a_type, a_field, node)); \ + assert(rbtn_red_get(a_type, a_field, left)); \ + rbtn_black_set(a_type, a_field, left); \ + if (pathp == path) { \ + rbtree->rbt_root = left; \ + /* Nothing to summarize -- the subtree rooted at the */\ + /* node's left child hasn't changed, and it's now the */\ + /* root. */\ + } else { \ + if (pathp[-1].cmp < 0) { \ + rbtn_left_set(a_type, a_field, pathp[-1].node, \ + left); \ + } else { \ + rbtn_right_set(a_type, a_field, pathp[-1].node, \ + left); \ + } \ + a_prefix##summarize_swapped_range(path, &pathp[-1], \ + swap_loc); \ + } \ + return; \ + } else if (pathp == path) { \ + /* The tree only contained one node. 
*/ \ + rbtree->rbt_root = NULL; \ + return; \ + } \ + } \ + /* We've now established the invariant that the node has no right */\ + /* child (well, morally; we didn't bother nulling it out if we */\ + /* swapped it with its successor), and that the only nodes with */\ + /* out-of-date summaries live in path[0], path[1], ..., pathp[-1].*/\ + if (rbtn_red_get(a_type, a_field, pathp->node)) { \ + /* Prune red node, which requires no fixup. */ \ + assert(pathp[-1].cmp < 0); \ + rbtn_left_set(a_type, a_field, pathp[-1].node, NULL); \ + a_prefix##summarize_swapped_range(path, &pathp[-1], swap_loc); \ + return; \ + } \ + /* The node to be pruned is black, so unwind until balance is */\ + /* restored. */\ + pathp->node = NULL; \ + for (pathp--; (uintptr_t)pathp >= (uintptr_t)path; pathp--) { \ + assert(pathp->cmp != 0); \ + if (pathp->cmp < 0) { \ + rbtn_left_set(a_type, a_field, pathp->node, \ + pathp[1].node); \ + if (rbtn_red_get(a_type, a_field, pathp->node)) { \ + a_type *right = rbtn_right_get(a_type, a_field, \ + pathp->node); \ + a_type *rightleft = rbtn_left_get(a_type, a_field, \ + right); \ + a_type *tnode; \ + if (rightleft != NULL && rbtn_red_get(a_type, a_field, \ + rightleft)) { \ + /* In the following diagrams, ||, //, and \\ */\ + /* indicate the path to the removed node. */\ + /* */\ + /* || */\ + /* pathp(r) */\ + /* // \ */\ + /* (b) (b) */\ + /* / */\ + /* (r) */\ + /* */\ + rbtn_black_set(a_type, a_field, pathp->node); \ + rbtn_rotate_right(a_type, a_field, right, tnode); \ + rbtn_right_set(a_type, a_field, pathp->node, tnode);\ + rbtn_rotate_left(a_type, a_field, pathp->node, \ + tnode); \ + (void)a_summarize(pathp->node, \ + rbtn_left_get(a_type, a_field, pathp->node), \ + rbtn_right_get(a_type, a_field, pathp->node)); \ + (void)a_summarize(right, \ + rbtn_left_get(a_type, a_field, right), \ + rbtn_right_get(a_type, a_field, right)); \ + } else { \ + /* || */\ + /* pathp(r) */\ + /* // \ */\ + /* (b) (b) */\ + /* / */\ + /* (b) */\ + /* */\ + rbtn_rotate_left(a_type, a_field, pathp->node, \ + tnode); \ + (void)a_summarize(pathp->node, \ + rbtn_left_get(a_type, a_field, pathp->node), \ + rbtn_right_get(a_type, a_field, pathp->node)); \ + } \ + (void)a_summarize(tnode, rbtn_left_get(a_type, a_field, \ + tnode), rbtn_right_get(a_type, a_field, tnode)); \ + /* Balance restored, but rotation modified subtree */\ + /* root. 
*/\ + assert((uintptr_t)pathp > (uintptr_t)path); \ + if (pathp[-1].cmp < 0) { \ + rbtn_left_set(a_type, a_field, pathp[-1].node, \ + tnode); \ + } else { \ + rbtn_right_set(a_type, a_field, pathp[-1].node, \ + tnode); \ + } \ + a_prefix##summarize_swapped_range(path, &pathp[-1], \ + swap_loc); \ + return; \ + } else { \ + a_type *right = rbtn_right_get(a_type, a_field, \ + pathp->node); \ + a_type *rightleft = rbtn_left_get(a_type, a_field, \ + right); \ + if (rightleft != NULL && rbtn_red_get(a_type, a_field, \ + rightleft)) { \ + /* || */\ + /* pathp(b) */\ + /* // \ */\ + /* (b) (b) */\ + /* / */\ + /* (r) */\ + a_type *tnode; \ + rbtn_black_set(a_type, a_field, rightleft); \ + rbtn_rotate_right(a_type, a_field, right, tnode); \ + rbtn_right_set(a_type, a_field, pathp->node, tnode);\ + rbtn_rotate_left(a_type, a_field, pathp->node, \ + tnode); \ + (void)a_summarize(pathp->node, \ + rbtn_left_get(a_type, a_field, pathp->node), \ + rbtn_right_get(a_type, a_field, pathp->node)); \ + (void)a_summarize(right, \ + rbtn_left_get(a_type, a_field, right), \ + rbtn_right_get(a_type, a_field, right)); \ + (void)a_summarize(tnode, \ + rbtn_left_get(a_type, a_field, tnode), \ + rbtn_right_get(a_type, a_field, tnode)); \ + /* Balance restored, but rotation modified */\ + /* subtree root, which may actually be the tree */\ + /* root. */\ + if (pathp == path) { \ + /* Set root. */ \ + rbtree->rbt_root = tnode; \ + } else { \ + if (pathp[-1].cmp < 0) { \ + rbtn_left_set(a_type, a_field, \ + pathp[-1].node, tnode); \ + } else { \ + rbtn_right_set(a_type, a_field, \ + pathp[-1].node, tnode); \ + } \ + a_prefix##summarize_swapped_range(path, \ + &pathp[-1], swap_loc); \ + } \ + return; \ + } else { \ + /* || */\ + /* pathp(b) */\ + /* // \ */\ + /* (b) (b) */\ + /* / */\ + /* (b) */\ + a_type *tnode; \ + rbtn_red_set(a_type, a_field, pathp->node); \ + rbtn_rotate_left(a_type, a_field, pathp->node, \ + tnode); \ + (void)a_summarize(pathp->node, \ + rbtn_left_get(a_type, a_field, pathp->node), \ + rbtn_right_get(a_type, a_field, pathp->node)); \ + (void)a_summarize(tnode, \ + rbtn_left_get(a_type, a_field, tnode), \ + rbtn_right_get(a_type, a_field, tnode)); \ + pathp->node = tnode; \ + } \ + } \ + } else { \ + a_type *left; \ + rbtn_right_set(a_type, a_field, pathp->node, \ + pathp[1].node); \ + left = rbtn_left_get(a_type, a_field, pathp->node); \ + if (rbtn_red_get(a_type, a_field, left)) { \ + a_type *tnode; \ + a_type *leftright = rbtn_right_get(a_type, a_field, \ + left); \ + a_type *leftrightleft = rbtn_left_get(a_type, a_field, \ + leftright); \ + if (leftrightleft != NULL && rbtn_red_get(a_type, \ + a_field, leftrightleft)) { \ + /* || */\ + /* pathp(b) */\ + /* / \\ */\ + /* (r) (b) */\ + /* \ */\ + /* (b) */\ + /* / */\ + /* (r) */\ + a_type *unode; \ + rbtn_black_set(a_type, a_field, leftrightleft); \ + rbtn_rotate_right(a_type, a_field, pathp->node, \ + unode); \ + rbtn_rotate_right(a_type, a_field, pathp->node, \ + tnode); \ + rbtn_right_set(a_type, a_field, unode, tnode); \ + rbtn_rotate_left(a_type, a_field, unode, tnode); \ + (void)a_summarize(pathp->node, \ + rbtn_left_get(a_type, a_field, pathp->node), \ + rbtn_right_get(a_type, a_field, pathp->node)); \ + (void)a_summarize(unode, \ + rbtn_left_get(a_type, a_field, unode), \ + rbtn_right_get(a_type, a_field, unode)); \ + } else { \ + /* || */\ + /* pathp(b) */\ + /* / \\ */\ + /* (r) (b) */\ + /* \ */\ + /* (b) */\ + /* / */\ + /* (b) */\ + assert(leftright != NULL); \ + rbtn_red_set(a_type, a_field, leftright); \ + 
rbtn_rotate_right(a_type, a_field, pathp->node, \ + tnode); \ + rbtn_black_set(a_type, a_field, tnode); \ + (void)a_summarize(pathp->node, \ + rbtn_left_get(a_type, a_field, pathp->node), \ + rbtn_right_get(a_type, a_field, pathp->node)); \ + } \ + (void)a_summarize(tnode, \ + rbtn_left_get(a_type, a_field, tnode), \ + rbtn_right_get(a_type, a_field, tnode)); \ + /* Balance restored, but rotation modified subtree */\ + /* root, which may actually be the tree root. */\ + if (pathp == path) { \ + /* Set root. */ \ + rbtree->rbt_root = tnode; \ + } else { \ + if (pathp[-1].cmp < 0) { \ + rbtn_left_set(a_type, a_field, pathp[-1].node, \ + tnode); \ + } else { \ + rbtn_right_set(a_type, a_field, pathp[-1].node, \ + tnode); \ + } \ + a_prefix##summarize_swapped_range(path, &pathp[-1], \ + swap_loc); \ + } \ + return; \ + } else if (rbtn_red_get(a_type, a_field, pathp->node)) { \ + a_type *leftleft = rbtn_left_get(a_type, a_field, left);\ + if (leftleft != NULL && rbtn_red_get(a_type, a_field, \ + leftleft)) { \ + /* || */\ + /* pathp(r) */\ + /* / \\ */\ + /* (b) (b) */\ + /* / */\ + /* (r) */\ + a_type *tnode; \ + rbtn_black_set(a_type, a_field, pathp->node); \ + rbtn_red_set(a_type, a_field, left); \ + rbtn_black_set(a_type, a_field, leftleft); \ + rbtn_rotate_right(a_type, a_field, pathp->node, \ + tnode); \ + (void)a_summarize(pathp->node, \ + rbtn_left_get(a_type, a_field, pathp->node), \ + rbtn_right_get(a_type, a_field, pathp->node)); \ + (void)a_summarize(tnode, \ + rbtn_left_get(a_type, a_field, tnode), \ + rbtn_right_get(a_type, a_field, tnode)); \ + /* Balance restored, but rotation modified */\ + /* subtree root. */\ + assert((uintptr_t)pathp > (uintptr_t)path); \ + if (pathp[-1].cmp < 0) { \ + rbtn_left_set(a_type, a_field, pathp[-1].node, \ + tnode); \ + } else { \ + rbtn_right_set(a_type, a_field, pathp[-1].node, \ + tnode); \ + } \ + a_prefix##summarize_swapped_range(path, &pathp[-1], \ + swap_loc); \ + return; \ + } else { \ + /* || */\ + /* pathp(r) */\ + /* / \\ */\ + /* (b) (b) */\ + /* / */\ + /* (b) */\ + rbtn_red_set(a_type, a_field, left); \ + rbtn_black_set(a_type, a_field, pathp->node); \ + /* Balance restored. */ \ + a_prefix##summarize_swapped_range(path, pathp, \ + swap_loc); \ + return; \ + } \ + } else { \ + a_type *leftleft = rbtn_left_get(a_type, a_field, left);\ + if (leftleft != NULL && rbtn_red_get(a_type, a_field, \ + leftleft)) { \ + /* || */\ + /* pathp(b) */\ + /* / \\ */\ + /* (b) (b) */\ + /* / */\ + /* (r) */\ + a_type *tnode; \ + rbtn_black_set(a_type, a_field, leftleft); \ + rbtn_rotate_right(a_type, a_field, pathp->node, \ + tnode); \ + (void)a_summarize(pathp->node, \ + rbtn_left_get(a_type, a_field, pathp->node), \ + rbtn_right_get(a_type, a_field, pathp->node)); \ + (void)a_summarize(tnode, \ + rbtn_left_get(a_type, a_field, tnode), \ + rbtn_right_get(a_type, a_field, tnode)); \ + /* Balance restored, but rotation modified */\ + /* subtree root, which may actually be the tree */\ + /* root. */\ + if (pathp == path) { \ + /* Set root. 
*/ \ + rbtree->rbt_root = tnode; \ + } else { \ + if (pathp[-1].cmp < 0) { \ + rbtn_left_set(a_type, a_field, \ + pathp[-1].node, tnode); \ + } else { \ + rbtn_right_set(a_type, a_field, \ + pathp[-1].node, tnode); \ + } \ + a_prefix##summarize_swapped_range(path, \ + &pathp[-1], swap_loc); \ + } \ + return; \ + } else { \ + /* || */\ + /* pathp(b) */\ + /* / \\ */\ + /* (b) (b) */\ + /* / */\ + /* (b) */\ + rbtn_red_set(a_type, a_field, left); \ + (void)a_summarize(pathp->node, \ + rbtn_left_get(a_type, a_field, pathp->node), \ + rbtn_right_get(a_type, a_field, pathp->node)); \ + } \ + } \ + } \ + } \ + /* Set root. */ \ + rbtree->rbt_root = path->node; \ + assert(!rbtn_red_get(a_type, a_field, rbtree->rbt_root)); \ +} \ +a_attr a_type * \ +a_prefix##iter_recurse(a_rbt_type *rbtree, a_type *node, \ + a_type *(*cb)(a_rbt_type *, a_type *, void *), void *arg) { \ + if (node == NULL) { \ + return NULL; \ + } else { \ + a_type *ret; \ + if ((ret = a_prefix##iter_recurse(rbtree, rbtn_left_get(a_type, \ + a_field, node), cb, arg)) != NULL || (ret = cb(rbtree, node, \ + arg)) != NULL) { \ + return ret; \ + } \ + return a_prefix##iter_recurse(rbtree, rbtn_right_get(a_type, \ + a_field, node), cb, arg); \ + } \ +} \ +a_attr a_type * \ +a_prefix##iter_start(a_rbt_type *rbtree, a_type *start, a_type *node, \ + a_type *(*cb)(a_rbt_type *, a_type *, void *), void *arg) { \ + int cmp = a_cmp(start, node); \ + if (cmp < 0) { \ + a_type *ret; \ + if ((ret = a_prefix##iter_start(rbtree, start, \ + rbtn_left_get(a_type, a_field, node), cb, arg)) != NULL || \ + (ret = cb(rbtree, node, arg)) != NULL) { \ + return ret; \ + } \ + return a_prefix##iter_recurse(rbtree, rbtn_right_get(a_type, \ + a_field, node), cb, arg); \ + } else if (cmp > 0) { \ + return a_prefix##iter_start(rbtree, start, \ + rbtn_right_get(a_type, a_field, node), cb, arg); \ + } else { \ + a_type *ret; \ + if ((ret = cb(rbtree, node, arg)) != NULL) { \ + return ret; \ + } \ + return a_prefix##iter_recurse(rbtree, rbtn_right_get(a_type, \ + a_field, node), cb, arg); \ + } \ +} \ +a_attr a_type * \ +a_prefix##iter(a_rbt_type *rbtree, a_type *start, a_type *(*cb)( \ + a_rbt_type *, a_type *, void *), void *arg) { \ + a_type *ret; \ + if (start != NULL) { \ + ret = a_prefix##iter_start(rbtree, start, rbtree->rbt_root, \ + cb, arg); \ + } else { \ + ret = a_prefix##iter_recurse(rbtree, rbtree->rbt_root, cb, arg);\ + } \ + return ret; \ +} \ +a_attr a_type * \ +a_prefix##reverse_iter_recurse(a_rbt_type *rbtree, a_type *node, \ + a_type *(*cb)(a_rbt_type *, a_type *, void *), void *arg) { \ + if (node == NULL) { \ + return NULL; \ + } else { \ + a_type *ret; \ + if ((ret = a_prefix##reverse_iter_recurse(rbtree, \ + rbtn_right_get(a_type, a_field, node), cb, arg)) != NULL || \ + (ret = cb(rbtree, node, arg)) != NULL) { \ + return ret; \ + } \ + return a_prefix##reverse_iter_recurse(rbtree, \ + rbtn_left_get(a_type, a_field, node), cb, arg); \ + } \ +} \ +a_attr a_type * \ +a_prefix##reverse_iter_start(a_rbt_type *rbtree, a_type *start, \ + a_type *node, a_type *(*cb)(a_rbt_type *, a_type *, void *), \ + void *arg) { \ + int cmp = a_cmp(start, node); \ + if (cmp > 0) { \ + a_type *ret; \ + if ((ret = a_prefix##reverse_iter_start(rbtree, start, \ + rbtn_right_get(a_type, a_field, node), cb, arg)) != NULL || \ + (ret = cb(rbtree, node, arg)) != NULL) { \ + return ret; \ + } \ + return a_prefix##reverse_iter_recurse(rbtree, \ + rbtn_left_get(a_type, a_field, node), cb, arg); \ + } else if (cmp < 0) { \ + return a_prefix##reverse_iter_start(rbtree, 
start, \ + rbtn_left_get(a_type, a_field, node), cb, arg); \ + } else { \ + a_type *ret; \ + if ((ret = cb(rbtree, node, arg)) != NULL) { \ + return ret; \ + } \ + return a_prefix##reverse_iter_recurse(rbtree, \ + rbtn_left_get(a_type, a_field, node), cb, arg); \ + } \ +} \ +a_attr a_type * \ +a_prefix##reverse_iter(a_rbt_type *rbtree, a_type *start, \ + a_type *(*cb)(a_rbt_type *, a_type *, void *), void *arg) { \ + a_type *ret; \ + if (start != NULL) { \ + ret = a_prefix##reverse_iter_start(rbtree, start, \ + rbtree->rbt_root, cb, arg); \ + } else { \ + ret = a_prefix##reverse_iter_recurse(rbtree, rbtree->rbt_root, \ + cb, arg); \ + } \ + return ret; \ +} \ +a_attr void \ +a_prefix##destroy_recurse(a_rbt_type *rbtree, a_type *node, void (*cb)( \ + a_type *, void *), void *arg) { \ + if (node == NULL) { \ + return; \ + } \ + a_prefix##destroy_recurse(rbtree, rbtn_left_get(a_type, a_field, \ + node), cb, arg); \ + rbtn_left_set(a_type, a_field, (node), NULL); \ + a_prefix##destroy_recurse(rbtree, rbtn_right_get(a_type, a_field, \ + node), cb, arg); \ + rbtn_right_set(a_type, a_field, (node), NULL); \ + if (cb) { \ + cb(node, arg); \ + } \ +} \ +a_attr void \ +a_prefix##destroy(a_rbt_type *rbtree, void (*cb)(a_type *, void *), \ + void *arg) { \ + a_prefix##destroy_recurse(rbtree, rbtree->rbt_root, cb, arg); \ + rbtree->rbt_root = NULL; \ +} \ +/* BEGIN SUMMARIZED-ONLY IMPLEMENTATION */ \ +rb_summarized_only_##a_is_summarized( \ +static inline a_prefix##path_entry_t * \ +a_prefix##wind(a_rbt_type *rbtree, \ + a_prefix##path_entry_t path[RB_MAX_DEPTH], a_type *node) { \ + a_prefix##path_entry_t *pathp; \ + path->node = rbtree->rbt_root; \ + for (pathp = path; ; pathp++) { \ + assert((size_t)(pathp - path) < RB_MAX_DEPTH); \ + pathp->cmp = a_cmp(node, pathp->node); \ + if (pathp->cmp < 0) { \ + pathp[1].node = rbtn_left_get(a_type, a_field, \ + pathp->node); \ + } else if (pathp->cmp == 0) { \ + return pathp; \ + } else { \ + pathp[1].node = rbtn_right_get(a_type, a_field, \ + pathp->node); \ + } \ + } \ + unreachable(); \ +} \ +a_attr void \ +a_prefix##update_summaries(a_rbt_type *rbtree, a_type *node) { \ + a_prefix##path_entry_t path[RB_MAX_DEPTH]; \ + a_prefix##path_entry_t *pathp = a_prefix##wind(rbtree, path, node); \ + a_prefix##summarize_range(path, pathp); \ +} \ +a_attr bool \ +a_prefix##empty_filtered(a_rbt_type *rbtree, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx) { \ + a_type *node = rbtree->rbt_root; \ + return node == NULL || !filter_subtree(filter_ctx, node); \ +} \ +static inline a_type * \ +a_prefix##first_filtered_from_node(a_type *node, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx) { \ + assert(node != NULL && filter_subtree(filter_ctx, node)); \ + while (true) { \ + a_type *left = rbtn_left_get(a_type, a_field, node); \ + a_type *right = rbtn_right_get(a_type, a_field, node); \ + if (left != NULL && filter_subtree(filter_ctx, left)) { \ + node = left; \ + } else if (filter_node(filter_ctx, node)) { \ + return node; \ + } else { \ + assert(right != NULL \ + && filter_subtree(filter_ctx, right)); \ + node = right; \ + } \ + } \ + unreachable(); \ +} \ +a_attr a_type * \ +a_prefix##first_filtered(a_rbt_type *rbtree, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx) { \ + a_type *node = rbtree->rbt_root; \ + if (node == NULL || !filter_subtree(filter_ctx, node)) { \ + return NULL; \ + } 
\ + return a_prefix##first_filtered_from_node(node, filter_node, \ + filter_subtree, filter_ctx); \ +} \ +static inline a_type * \ +a_prefix##last_filtered_from_node(a_type *node, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx) { \ + assert(node != NULL && filter_subtree(filter_ctx, node)); \ + while (true) { \ + a_type *left = rbtn_left_get(a_type, a_field, node); \ + a_type *right = rbtn_right_get(a_type, a_field, node); \ + if (right != NULL && filter_subtree(filter_ctx, right)) { \ + node = right; \ + } else if (filter_node(filter_ctx, node)) { \ + return node; \ + } else { \ + assert(left != NULL \ + && filter_subtree(filter_ctx, left)); \ + node = left; \ + } \ + } \ + unreachable(); \ +} \ +a_attr a_type * \ +a_prefix##last_filtered(a_rbt_type *rbtree, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx) { \ + a_type *node = rbtree->rbt_root; \ + if (node == NULL || !filter_subtree(filter_ctx, node)) { \ + return NULL; \ + } \ + return a_prefix##last_filtered_from_node(node, filter_node, \ + filter_subtree, filter_ctx); \ +} \ +/* Internal implementation function. Search for a node comparing */\ +/* equal to key matching the filter. If such a node is in the tree, */\ +/* return it. Additionally, the caller has the option to ask for */\ +/* bounds on the next / prev node in the tree passing the filter. */\ +/* If nextbound is true, then this function will do one of the */\ +/* following: */\ +/* - Fill in *nextbound_node with the smallest node in the tree */\ +/* greater than key passing the filter, and NULL-out */\ +/* *nextbound_subtree. */\ +/* - Fill in *nextbound_subtree with a parent of that node which is */\ +/* not a parent of the searched-for node, and NULL-out */\ +/* *nextbound_node. */\ +/* - NULL-out both *nextbound_node and *nextbound_subtree, in which */\ +/* case no node greater than key but passing the filter is in the */\ +/* tree. */\ +/* The prevbound case is similar. If the caller knows that key is in */\ +/* the tree and that the subtree rooted at key does not contain a */\ +/* node satisfying the bound being searched for, then they can pass */\ +/* false for include_subtree, in which case we won't bother searching */\ +/* there (risking a cache miss). */\ +/* */\ +/* This API is unfortunately complex; but the logic for filtered */\ +/* searches is very subtle, and otherwise we would have to repeat it */\ +/* multiple times for filtered search, nsearch, psearch, next, and */\ +/* prev. 
*/\ +static inline a_type * \ +a_prefix##search_with_filter_bounds(a_rbt_type *rbtree, \ + const a_type *key, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx, \ + bool include_subtree, \ + bool nextbound, a_type **nextbound_node, a_type **nextbound_subtree, \ + bool prevbound, a_type **prevbound_node, a_type **prevbound_subtree) {\ + if (nextbound) { \ + *nextbound_node = NULL; \ + *nextbound_subtree = NULL; \ + } \ + if (prevbound) { \ + *prevbound_node = NULL; \ + *prevbound_subtree = NULL; \ + } \ + a_type *tnode = rbtree->rbt_root; \ + while (tnode != NULL && filter_subtree(filter_ctx, tnode)) { \ + int cmp = a_cmp(key, tnode); \ + a_type *tleft = rbtn_left_get(a_type, a_field, tnode); \ + a_type *tright = rbtn_right_get(a_type, a_field, tnode); \ + if (cmp < 0) { \ + if (nextbound) { \ + if (filter_node(filter_ctx, tnode)) { \ + *nextbound_node = tnode; \ + *nextbound_subtree = NULL; \ + } else if (tright != NULL && filter_subtree( \ + filter_ctx, tright)) { \ + *nextbound_node = NULL; \ + *nextbound_subtree = tright; \ + } \ + } \ + tnode = tleft; \ + } else if (cmp > 0) { \ + if (prevbound) { \ + if (filter_node(filter_ctx, tnode)) { \ + *prevbound_node = tnode; \ + *prevbound_subtree = NULL; \ + } else if (tleft != NULL && filter_subtree( \ + filter_ctx, tleft)) { \ + *prevbound_node = NULL; \ + *prevbound_subtree = tleft; \ + } \ + } \ + tnode = tright; \ + } else { \ + if (filter_node(filter_ctx, tnode)) { \ + return tnode; \ + } \ + if (include_subtree) { \ + if (prevbound && tleft != NULL && filter_subtree( \ + filter_ctx, tleft)) { \ + *prevbound_node = NULL; \ + *prevbound_subtree = tleft; \ + } \ + if (nextbound && tright != NULL && filter_subtree( \ + filter_ctx, tright)) { \ + *nextbound_node = NULL; \ + *nextbound_subtree = tright; \ + } \ + } \ + return NULL; \ + } \ + } \ + return NULL; \ +} \ +a_attr a_type * \ +a_prefix##next_filtered(a_rbt_type *rbtree, a_type *node, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx) { \ + a_type *nright = rbtn_right_get(a_type, a_field, node); \ + if (nright != NULL && filter_subtree(filter_ctx, nright)) { \ + return a_prefix##first_filtered_from_node(nright, filter_node, \ + filter_subtree, filter_ctx); \ + } \ + a_type *node_candidate; \ + a_type *subtree_candidate; \ + a_type *search_result = a_prefix##search_with_filter_bounds( \ + rbtree, node, filter_node, filter_subtree, filter_ctx, \ + /* include_subtree */ false, \ + /* nextbound */ true, &node_candidate, &subtree_candidate, \ + /* prevbound */ false, NULL, NULL); \ + assert(node == search_result \ + || !filter_node(filter_ctx, node)); \ + if (node_candidate != NULL) { \ + return node_candidate; \ + } \ + if (subtree_candidate != NULL) { \ + return a_prefix##first_filtered_from_node( \ + subtree_candidate, filter_node, filter_subtree, \ + filter_ctx); \ + } \ + return NULL; \ +} \ +a_attr a_type * \ +a_prefix##prev_filtered(a_rbt_type *rbtree, a_type *node, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx) { \ + a_type *nleft = rbtn_left_get(a_type, a_field, node); \ + if (nleft != NULL && filter_subtree(filter_ctx, nleft)) { \ + return a_prefix##last_filtered_from_node(nleft, filter_node, \ + filter_subtree, filter_ctx); \ + } \ + a_type *node_candidate; \ + a_type *subtree_candidate; \ + a_type *search_result = a_prefix##search_with_filter_bounds( \ + rbtree, node, filter_node, 
filter_subtree, filter_ctx, \ + /* include_subtree */ false, \ + /* nextbound */ false, NULL, NULL, \ + /* prevbound */ true, &node_candidate, &subtree_candidate); \ + assert(node == search_result \ + || !filter_node(filter_ctx, node)); \ + if (node_candidate != NULL) { \ + return node_candidate; \ + } \ + if (subtree_candidate != NULL) { \ + return a_prefix##last_filtered_from_node( \ + subtree_candidate, filter_node, filter_subtree, \ + filter_ctx); \ + } \ + return NULL; \ +} \ +a_attr a_type * \ +a_prefix##search_filtered(a_rbt_type *rbtree, const a_type *key, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx) { \ + a_type *result = a_prefix##search_with_filter_bounds(rbtree, key, \ + filter_node, filter_subtree, filter_ctx, \ + /* include_subtree */ false, \ + /* nextbound */ false, NULL, NULL, \ + /* prevbound */ false, NULL, NULL); \ + return result; \ +} \ +a_attr a_type * \ +a_prefix##nsearch_filtered(a_rbt_type *rbtree, const a_type *key, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx) { \ + a_type *node_candidate; \ + a_type *subtree_candidate; \ + a_type *result = a_prefix##search_with_filter_bounds(rbtree, key, \ + filter_node, filter_subtree, filter_ctx, \ + /* include_subtree */ true, \ + /* nextbound */ true, &node_candidate, &subtree_candidate, \ + /* prevbound */ false, NULL, NULL); \ + if (result != NULL) { \ + return result; \ + } \ + if (node_candidate != NULL) { \ + return node_candidate; \ + } \ + if (subtree_candidate != NULL) { \ + return a_prefix##first_filtered_from_node( \ + subtree_candidate, filter_node, filter_subtree, \ + filter_ctx); \ + } \ + return NULL; \ +} \ +a_attr a_type * \ +a_prefix##psearch_filtered(a_rbt_type *rbtree, const a_type *key, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx) { \ + a_type *node_candidate; \ + a_type *subtree_candidate; \ + a_type *result = a_prefix##search_with_filter_bounds(rbtree, key, \ + filter_node, filter_subtree, filter_ctx, \ + /* include_subtree */ true, \ + /* nextbound */ false, NULL, NULL, \ + /* prevbound */ true, &node_candidate, &subtree_candidate); \ + if (result != NULL) { \ + return result; \ + } \ + if (node_candidate != NULL) { \ + return node_candidate; \ + } \ + if (subtree_candidate != NULL) { \ + return a_prefix##last_filtered_from_node( \ + subtree_candidate, filter_node, filter_subtree, \ + filter_ctx); \ + } \ + return NULL; \ +} \ +a_attr a_type * \ +a_prefix##iter_recurse_filtered(a_rbt_type *rbtree, a_type *node, \ + a_type *(*cb)(a_rbt_type *, a_type *, void *), void *arg, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx) { \ + if (node == NULL || !filter_subtree(filter_ctx, node)) { \ + return NULL; \ + } \ + a_type *ret; \ + a_type *left = rbtn_left_get(a_type, a_field, node); \ + a_type *right = rbtn_right_get(a_type, a_field, node); \ + ret = a_prefix##iter_recurse_filtered(rbtree, left, cb, arg, \ + filter_node, filter_subtree, filter_ctx); \ + if (ret != NULL) { \ + return ret; \ + } \ + if (filter_node(filter_ctx, node)) { \ + ret = cb(rbtree, node, arg); \ + } \ + if (ret != NULL) { \ + return ret; \ + } \ + return a_prefix##iter_recurse_filtered(rbtree, right, cb, arg, \ + filter_node, filter_subtree, filter_ctx); \ +} \ +a_attr a_type * \ +a_prefix##iter_start_filtered(a_rbt_type *rbtree, a_type *start, \ + a_type *node, a_type 
*(*cb)(a_rbt_type *, a_type *, void *), \ + void *arg, bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx) { \ + if (!filter_subtree(filter_ctx, node)) { \ + return NULL; \ + } \ + int cmp = a_cmp(start, node); \ + a_type *ret; \ + a_type *left = rbtn_left_get(a_type, a_field, node); \ + a_type *right = rbtn_right_get(a_type, a_field, node); \ + if (cmp < 0) { \ + ret = a_prefix##iter_start_filtered(rbtree, start, left, cb, \ + arg, filter_node, filter_subtree, filter_ctx); \ + if (ret != NULL) { \ + return ret; \ + } \ + if (filter_node(filter_ctx, node)) { \ + ret = cb(rbtree, node, arg); \ + if (ret != NULL) { \ + return ret; \ + } \ + } \ + return a_prefix##iter_recurse_filtered(rbtree, right, cb, arg, \ + filter_node, filter_subtree, filter_ctx); \ + } else if (cmp > 0) { \ + return a_prefix##iter_start_filtered(rbtree, start, right, \ + cb, arg, filter_node, filter_subtree, filter_ctx); \ + } else { \ + if (filter_node(filter_ctx, node)) { \ + ret = cb(rbtree, node, arg); \ + if (ret != NULL) { \ + return ret; \ + } \ + } \ + return a_prefix##iter_recurse_filtered(rbtree, right, cb, arg, \ + filter_node, filter_subtree, filter_ctx); \ + } \ +} \ +a_attr a_type * \ +a_prefix##iter_filtered(a_rbt_type *rbtree, a_type *start, \ + a_type *(*cb)(a_rbt_type *, a_type *, void *), void *arg, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx) { \ + a_type *ret; \ + if (start != NULL) { \ + ret = a_prefix##iter_start_filtered(rbtree, start, \ + rbtree->rbt_root, cb, arg, filter_node, filter_subtree, \ + filter_ctx); \ + } else { \ + ret = a_prefix##iter_recurse_filtered(rbtree, rbtree->rbt_root, \ + cb, arg, filter_node, filter_subtree, filter_ctx); \ + } \ + return ret; \ +} \ +a_attr a_type * \ +a_prefix##reverse_iter_recurse_filtered(a_rbt_type *rbtree, \ + a_type *node, a_type *(*cb)(a_rbt_type *, a_type *, void *), \ + void *arg, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx) { \ + if (node == NULL || !filter_subtree(filter_ctx, node)) { \ + return NULL; \ + } \ + a_type *ret; \ + a_type *left = rbtn_left_get(a_type, a_field, node); \ + a_type *right = rbtn_right_get(a_type, a_field, node); \ + ret = a_prefix##reverse_iter_recurse_filtered(rbtree, right, cb, \ + arg, filter_node, filter_subtree, filter_ctx); \ + if (ret != NULL) { \ + return ret; \ + } \ + if (filter_node(filter_ctx, node)) { \ + ret = cb(rbtree, node, arg); \ + } \ + if (ret != NULL) { \ + return ret; \ + } \ + return a_prefix##reverse_iter_recurse_filtered(rbtree, left, cb, \ + arg, filter_node, filter_subtree, filter_ctx); \ +} \ +a_attr a_type * \ +a_prefix##reverse_iter_start_filtered(a_rbt_type *rbtree, a_type *start,\ + a_type *node, a_type *(*cb)(a_rbt_type *, a_type *, void *), \ + void *arg, bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx) { \ + if (!filter_subtree(filter_ctx, node)) { \ + return NULL; \ + } \ + int cmp = a_cmp(start, node); \ + a_type *ret; \ + a_type *left = rbtn_left_get(a_type, a_field, node); \ + a_type *right = rbtn_right_get(a_type, a_field, node); \ + if (cmp > 0) { \ + ret = a_prefix##reverse_iter_start_filtered(rbtree, start, \ + right, cb, arg, filter_node, filter_subtree, filter_ctx); \ + if (ret != NULL) { \ + return ret; \ + } \ + if (filter_node(filter_ctx, node)) { \ + ret = cb(rbtree, node, arg); \ + if (ret != NULL) { \ + return ret; \ + } \ + } 
\
+        return a_prefix##reverse_iter_recurse_filtered(rbtree, left, cb,\
+            arg, filter_node, filter_subtree, filter_ctx); \
+    } else if (cmp < 0) { \
+        return a_prefix##reverse_iter_start_filtered(rbtree, start, \
+            left, cb, arg, filter_node, filter_subtree, filter_ctx); \
+    } else { \
+        if (filter_node(filter_ctx, node)) { \
+            ret = cb(rbtree, node, arg); \
+            if (ret != NULL) { \
+                return ret; \
+            } \
+        } \
+        return a_prefix##reverse_iter_recurse_filtered(rbtree, left, cb,\
+            arg, filter_node, filter_subtree, filter_ctx); \
+    } \
+} \
+a_attr a_type * \
+a_prefix##reverse_iter_filtered(a_rbt_type *rbtree, a_type *start, \
+    a_type *(*cb)(a_rbt_type *, a_type *, void *), void *arg, \
+    bool (*filter_node)(void *, a_type *), \
+    bool (*filter_subtree)(void *, a_type *), \
+    void *filter_ctx) { \
+    a_type *ret; \
+    if (start != NULL) { \
+        ret = a_prefix##reverse_iter_start_filtered(rbtree, start, \
+            rbtree->rbt_root, cb, arg, filter_node, filter_subtree, \
+            filter_ctx); \
+    } else { \
+        ret = a_prefix##reverse_iter_recurse_filtered(rbtree, \
+            rbtree->rbt_root, cb, arg, filter_node, filter_subtree, \
+            filter_ctx); \
+    } \
+    return ret; \
+} \
+) /* end rb_summarized_only */
+
+#endif /* JEMALLOC_INTERNAL_RB_H */
diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/rtree.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/rtree.h
new file mode 100644
index 000000000..f35368ae0
--- /dev/null
+++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/rtree.h
@@ -0,0 +1,565 @@
+#ifndef JEMALLOC_INTERNAL_RTREE_H
+#define JEMALLOC_INTERNAL_RTREE_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/base.h"
+#include "jemalloc/internal/atomic.h"
+#include "jemalloc/internal/edata.h"
+#include "jemalloc/internal/mutex.h"
+#include "jemalloc/internal/rtree_tsd.h"
+#include "jemalloc/internal/sc.h"
+#include "jemalloc/internal/tsd.h"
+
+/*
+ * This radix tree implementation is tailored to the singular purpose of
+ * associating metadata with extents that are currently owned by jemalloc.
+ *
+ *******************************************************************************
+ */
+
+/* Number of high insignificant bits. */
+#define RTREE_NHIB ((1U << (LG_SIZEOF_PTR+3)) - LG_VADDR)
+/* Number of low insignificant bits. */
+#define RTREE_NLIB LG_PAGE
+/* Number of significant bits. */
+#define RTREE_NSB (LG_VADDR - RTREE_NLIB)
+/* Number of levels in radix tree. */
+#if RTREE_NSB <= 10
+#  define RTREE_HEIGHT 1
+#elif RTREE_NSB <= 36
+#  define RTREE_HEIGHT 2
+#elif RTREE_NSB <= 52
+#  define RTREE_HEIGHT 3
+#else
+#  error Unsupported number of significant virtual address bits
+#endif
+/* Use compact leaf representation if virtual address encoding allows. */
+#if RTREE_NHIB >= LG_CEIL(SC_NSIZES)
+#  define RTREE_LEAF_COMPACT
+#endif
+
+typedef struct rtree_node_elm_s rtree_node_elm_t;
+struct rtree_node_elm_s {
+    atomic_p_t child; /* (rtree_{node,leaf}_elm_t *) */
+};
+
+typedef struct rtree_metadata_s rtree_metadata_t;
+struct rtree_metadata_s {
+    szind_t szind;
+    extent_state_t state; /* Mirrors edata->state. */
+    bool is_head; /* Mirrors edata->is_head.
*/
+    bool slab;
+};
+
+typedef struct rtree_contents_s rtree_contents_t;
+struct rtree_contents_s {
+    edata_t *edata;
+    rtree_metadata_t metadata;
+};
+
+#define RTREE_LEAF_STATE_WIDTH EDATA_BITS_STATE_WIDTH
+#define RTREE_LEAF_STATE_SHIFT 2
+#define RTREE_LEAF_STATE_MASK MASK(RTREE_LEAF_STATE_WIDTH, RTREE_LEAF_STATE_SHIFT)
+
+struct rtree_leaf_elm_s {
+#ifdef RTREE_LEAF_COMPACT
+    /*
+     * Single pointer-width field containing all of the leaf element fields.
+     * For example, on a 64-bit x64 system with 48 significant virtual
+     * memory address bits, the index, edata, state, is_head, and slab fields
+     * are packed as such:
+     *
+     * x: index
+     * e: edata
+     * s: state
+     * h: is_head
+     * b: slab
+     *
+     *   00000000 xxxxxxxx eeeeeeee [...] eeeeeeee e00ssshb
+     */
+    atomic_p_t le_bits;
+#else
+    atomic_p_t le_edata; /* (edata_t *) */
+    /*
+     * From high to low bits: szind (8 bits), state (4 bits), is_head, slab
+     */
+    atomic_u_t le_metadata;
+#endif
+};
+
+typedef struct rtree_level_s rtree_level_t;
+struct rtree_level_s {
+    /* Number of key bits distinguished by this level. */
+    unsigned bits;
+    /*
+     * Cumulative number of key bits distinguished by traversing to
+     * corresponding tree level.
+     */
+    unsigned cumbits;
+};
+
+typedef struct rtree_s rtree_t;
+struct rtree_s {
+    base_t *base;
+    malloc_mutex_t init_lock;
+    /* Number of elements based on rtree_levels[0].bits. */
+#if RTREE_HEIGHT > 1
+    rtree_node_elm_t root[1U << (RTREE_NSB/RTREE_HEIGHT)];
+#else
+    rtree_leaf_elm_t root[1U << (RTREE_NSB/RTREE_HEIGHT)];
+#endif
+};
+
+/*
+ * Split the bits into one to three partitions depending on number of
+ * significant bits.  If the number of bits does not divide evenly into the
+ * number of levels, place one remainder bit per level starting at the leaf
+ * level.
+ */
+static const rtree_level_t rtree_levels[] = {
+#if RTREE_HEIGHT == 1
+    {RTREE_NSB, RTREE_NHIB + RTREE_NSB}
+#elif RTREE_HEIGHT == 2
+    {RTREE_NSB/2, RTREE_NHIB + RTREE_NSB/2},
+    {RTREE_NSB/2 + RTREE_NSB%2, RTREE_NHIB + RTREE_NSB}
+#elif RTREE_HEIGHT == 3
+    {RTREE_NSB/3, RTREE_NHIB + RTREE_NSB/3},
+    {RTREE_NSB/3 + RTREE_NSB%3/2,
+        RTREE_NHIB + RTREE_NSB/3*2 + RTREE_NSB%3/2},
+    {RTREE_NSB/3 + RTREE_NSB%3 - RTREE_NSB%3/2, RTREE_NHIB + RTREE_NSB}
+#else
+#  error Unsupported rtree height
+#endif
+};
+
+bool rtree_new(rtree_t *rtree, base_t *base, bool zeroed);
+
+rtree_leaf_elm_t *rtree_leaf_elm_lookup_hard(tsdn_t *tsdn, rtree_t *rtree,
+    rtree_ctx_t *rtree_ctx, uintptr_t key, bool dependent, bool init_missing);
+
+JEMALLOC_ALWAYS_INLINE unsigned
+rtree_leaf_maskbits(void) {
+    unsigned ptrbits = ZU(1) << (LG_SIZEOF_PTR+3);
+    unsigned cumbits = (rtree_levels[RTREE_HEIGHT-1].cumbits -
+        rtree_levels[RTREE_HEIGHT-1].bits);
+    return ptrbits - cumbits;
+}
+
+JEMALLOC_ALWAYS_INLINE uintptr_t
+rtree_leafkey(uintptr_t key) {
+    uintptr_t mask = ~((ZU(1) << rtree_leaf_maskbits()) - 1);
+    return (key & mask);
+}
+
+JEMALLOC_ALWAYS_INLINE size_t
+rtree_cache_direct_map(uintptr_t key) {
+    return (size_t)((key >> rtree_leaf_maskbits()) &
+        (RTREE_CTX_NCACHE - 1));
+}
+
+JEMALLOC_ALWAYS_INLINE uintptr_t
+rtree_subkey(uintptr_t key, unsigned level) {
+    unsigned ptrbits = ZU(1) << (LG_SIZEOF_PTR+3);
+    unsigned cumbits = rtree_levels[level].cumbits;
+    unsigned shiftbits = ptrbits - cumbits;
+    unsigned maskbits = rtree_levels[level].bits;
+    uintptr_t mask = (ZU(1) << maskbits) - 1;
+    return ((key >> shiftbits) & mask);
+}
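+
+/*
+ * A worked illustration of the level-split arithmetic above (an editorial
+ * example, not from the upstream header): assuming LG_VADDR == 48 and
+ * LG_PAGE == 12 on a 64-bit system, RTREE_NHIB == 16, RTREE_NSB == 36, and
+ * RTREE_HEIGHT == 2, so each of the two levels distinguishes 18 bits.
+ * rtree_levels[0].cumbits == 34 and rtree_levels[1].cumbits == 52, giving:
+ *
+ *   rtree_subkey(key, 0) == (key >> 30) & 0x3ffff
+ *   rtree_subkey(key, 1) == (key >> 12) & 0x3ffff
+ *
+ * and rtree_leaf_maskbits() == 30, so rtree_leafkey() masks off the low 30
+ * bits, which do not distinguish leaves.
+ */
+
+/*
+ * Atomic getters.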
+ * + * dependent: Reading a value on behalf of a pointer to a valid allocation + * is guaranteed to be a clean read even without synchronization, + * because the rtree update became visible in memory before the + * pointer came into existence. + * !dependent: An arbitrary read, e.g. on behalf of ivsalloc(), may not be + * dependent on a previous rtree write, which means a stale read + * could result if synchronization were omitted here. + */ +# ifdef RTREE_LEAF_COMPACT +JEMALLOC_ALWAYS_INLINE uintptr_t +rtree_leaf_elm_bits_read(tsdn_t *tsdn, rtree_t *rtree, + rtree_leaf_elm_t *elm, bool dependent) { + return (uintptr_t)atomic_load_p(&elm->le_bits, dependent + ? ATOMIC_RELAXED : ATOMIC_ACQUIRE); +} + +JEMALLOC_ALWAYS_INLINE uintptr_t +rtree_leaf_elm_bits_encode(rtree_contents_t contents) { + assert((uintptr_t)contents.edata % (uintptr_t)EDATA_ALIGNMENT == 0); + uintptr_t edata_bits = (uintptr_t)contents.edata + & (((uintptr_t)1 << LG_VADDR) - 1); + + uintptr_t szind_bits = (uintptr_t)contents.metadata.szind << LG_VADDR; + uintptr_t slab_bits = (uintptr_t)contents.metadata.slab; + uintptr_t is_head_bits = (uintptr_t)contents.metadata.is_head << 1; + uintptr_t state_bits = (uintptr_t)contents.metadata.state << + RTREE_LEAF_STATE_SHIFT; + uintptr_t metadata_bits = szind_bits | state_bits | is_head_bits | + slab_bits; + assert((edata_bits & metadata_bits) == 0); + + return edata_bits | metadata_bits; +} + +JEMALLOC_ALWAYS_INLINE rtree_contents_t +rtree_leaf_elm_bits_decode(uintptr_t bits) { + rtree_contents_t contents; + /* Do the easy things first. */ + contents.metadata.szind = bits >> LG_VADDR; + contents.metadata.slab = (bool)(bits & 1); + contents.metadata.is_head = (bool)(bits & (1 << 1)); + + uintptr_t state_bits = (bits & RTREE_LEAF_STATE_MASK) >> + RTREE_LEAF_STATE_SHIFT; + assert(state_bits <= extent_state_max); + contents.metadata.state = (extent_state_t)state_bits; + + uintptr_t low_bit_mask = ~((uintptr_t)EDATA_ALIGNMENT - 1); +# ifdef __aarch64__ + /* + * aarch64 doesn't sign extend the highest virtual address bit to set + * the higher ones. Instead, the high bits get zeroed. + */ + uintptr_t high_bit_mask = ((uintptr_t)1 << LG_VADDR) - 1; + /* Mask off metadata. */ + uintptr_t mask = high_bit_mask & low_bit_mask; + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + contents.edata = (edata_t *)(bits & mask); +# else + /* Restore sign-extended high bits, mask metadata bits. */ + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + contents.edata = (edata_t *)((uintptr_t)((intptr_t)(bits << RTREE_NHIB) + >> RTREE_NHIB) & low_bit_mask); +# endif + assert((uintptr_t)contents.edata % (uintptr_t)EDATA_ALIGNMENT == 0); + return contents; +} + +# endif /* RTREE_LEAF_COMPACT */ + +JEMALLOC_ALWAYS_INLINE rtree_contents_t +rtree_leaf_elm_read(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm, + bool dependent) { +#ifdef RTREE_LEAF_COMPACT + uintptr_t bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm, dependent); + rtree_contents_t contents = rtree_leaf_elm_bits_decode(bits); + return contents; +#else + rtree_contents_t contents; + unsigned metadata_bits = atomic_load_u(&elm->le_metadata, dependent + ? 
ATOMIC_RELAXED : ATOMIC_ACQUIRE); + contents.metadata.slab = (bool)(metadata_bits & 1); + contents.metadata.is_head = (bool)(metadata_bits & (1 << 1)); + + uintptr_t state_bits = (metadata_bits & RTREE_LEAF_STATE_MASK) >> + RTREE_LEAF_STATE_SHIFT; + assert(state_bits <= extent_state_max); + contents.metadata.state = (extent_state_t)state_bits; + contents.metadata.szind = metadata_bits >> (RTREE_LEAF_STATE_SHIFT + + RTREE_LEAF_STATE_WIDTH); + + contents.edata = (edata_t *)atomic_load_p(&elm->le_edata, dependent + ? ATOMIC_RELAXED : ATOMIC_ACQUIRE); + + return contents; +#endif +} + +JEMALLOC_ALWAYS_INLINE void +rtree_contents_encode(rtree_contents_t contents, void **bits, + unsigned *additional) { +#ifdef RTREE_LEAF_COMPACT + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + *bits = (void *)rtree_leaf_elm_bits_encode(contents); + /* Suppress spurious warning from static analysis */ + if (config_debug) { + *additional = 0; + } +#else + *additional = (unsigned)contents.metadata.slab + | ((unsigned)contents.metadata.is_head << 1) + | ((unsigned)contents.metadata.state << RTREE_LEAF_STATE_SHIFT) + | ((unsigned)contents.metadata.szind << (RTREE_LEAF_STATE_SHIFT + + RTREE_LEAF_STATE_WIDTH)); + *bits = contents.edata; +#endif +} + +JEMALLOC_ALWAYS_INLINE void +rtree_leaf_elm_write_commit(tsdn_t *tsdn, rtree_t *rtree, + rtree_leaf_elm_t *elm, void *bits, unsigned additional) { +#ifdef RTREE_LEAF_COMPACT + atomic_store_p(&elm->le_bits, bits, ATOMIC_RELEASE); +#else + atomic_store_u(&elm->le_metadata, additional, ATOMIC_RELEASE); + /* + * Write edata last, since the element is atomically considered valid + * as soon as the edata field is non-NULL. + */ + atomic_store_p(&elm->le_edata, bits, ATOMIC_RELEASE); +#endif +} + +JEMALLOC_ALWAYS_INLINE void +rtree_leaf_elm_write(tsdn_t *tsdn, rtree_t *rtree, + rtree_leaf_elm_t *elm, rtree_contents_t contents) { + assert((uintptr_t)contents.edata % EDATA_ALIGNMENT == 0); + void *bits; + unsigned additional; + rtree_contents_encode(contents, &bits, &additional); + rtree_leaf_elm_write_commit(tsdn, rtree, elm, bits, additional); +} + +/* The state field can be updated independently (and more frequently). */ +JEMALLOC_ALWAYS_INLINE void +rtree_leaf_elm_state_update(tsdn_t *tsdn, rtree_t *rtree, + rtree_leaf_elm_t *elm1, rtree_leaf_elm_t *elm2, extent_state_t state) { + assert(elm1 != NULL); +#ifdef RTREE_LEAF_COMPACT + uintptr_t bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm1, + /* dependent */ true); + bits &= ~RTREE_LEAF_STATE_MASK; + bits |= state << RTREE_LEAF_STATE_SHIFT; + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + atomic_store_p(&elm1->le_bits, (void *)bits, ATOMIC_RELEASE); + if (elm2 != NULL) { + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + atomic_store_p(&elm2->le_bits, (void *)bits, ATOMIC_RELEASE); + } +#else + unsigned bits = atomic_load_u(&elm1->le_metadata, ATOMIC_RELAXED); + bits &= ~RTREE_LEAF_STATE_MASK; + bits |= state << RTREE_LEAF_STATE_SHIFT; + atomic_store_u(&elm1->le_metadata, bits, ATOMIC_RELEASE); + if (elm2 != NULL) { + atomic_store_u(&elm2->le_metadata, bits, ATOMIC_RELEASE); + } +#endif +} + +/* + * Tries to look up the key in the L1 cache, returning false if there's a hit, or + * true if there's a miss. + * Key is allowed to be NULL; returns true in this case. 
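+ *
+ * A sketch of the intended calling pattern (hypothetical caller; elm is only
+ * written on a hit, and the dependent == true read mirrors
+ * rtree_metadata_try_read_fast() below):
+ *
+ *   rtree_leaf_elm_t *elm;
+ *   if (!rtree_leaf_elm_lookup_fast(tsdn, rtree, rtree_ctx, key, &elm)) {
+ *       rtree_contents_t c = rtree_leaf_elm_read(tsdn, rtree, elm, true);
+ *   }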
+ */ +JEMALLOC_ALWAYS_INLINE bool +rtree_leaf_elm_lookup_fast(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, + uintptr_t key, rtree_leaf_elm_t **elm) { + size_t slot = rtree_cache_direct_map(key); + uintptr_t leafkey = rtree_leafkey(key); + assert(leafkey != RTREE_LEAFKEY_INVALID); + + if (unlikely(rtree_ctx->cache[slot].leafkey != leafkey)) { + return true; + } + + rtree_leaf_elm_t *leaf = rtree_ctx->cache[slot].leaf; + assert(leaf != NULL); + uintptr_t subkey = rtree_subkey(key, RTREE_HEIGHT-1); + *elm = &leaf[subkey]; + + return false; +} + +JEMALLOC_ALWAYS_INLINE rtree_leaf_elm_t * +rtree_leaf_elm_lookup(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, + uintptr_t key, bool dependent, bool init_missing) { + assert(key != 0); + assert(!dependent || !init_missing); + + size_t slot = rtree_cache_direct_map(key); + uintptr_t leafkey = rtree_leafkey(key); + assert(leafkey != RTREE_LEAFKEY_INVALID); + + /* Fast path: L1 direct mapped cache. */ + if (likely(rtree_ctx->cache[slot].leafkey == leafkey)) { + rtree_leaf_elm_t *leaf = rtree_ctx->cache[slot].leaf; + assert(leaf != NULL); + uintptr_t subkey = rtree_subkey(key, RTREE_HEIGHT-1); + return &leaf[subkey]; + } + /* + * Search the L2 LRU cache. On hit, swap the matching element into the + * slot in L1 cache, and move the position in L2 up by 1. + */ +#define RTREE_CACHE_CHECK_L2(i) do { \ + if (likely(rtree_ctx->l2_cache[i].leafkey == leafkey)) { \ + rtree_leaf_elm_t *leaf = rtree_ctx->l2_cache[i].leaf; \ + assert(leaf != NULL); \ + if (i > 0) { \ + /* Bubble up by one. */ \ + rtree_ctx->l2_cache[i].leafkey = \ + rtree_ctx->l2_cache[i - 1].leafkey; \ + rtree_ctx->l2_cache[i].leaf = \ + rtree_ctx->l2_cache[i - 1].leaf; \ + rtree_ctx->l2_cache[i - 1].leafkey = \ + rtree_ctx->cache[slot].leafkey; \ + rtree_ctx->l2_cache[i - 1].leaf = \ + rtree_ctx->cache[slot].leaf; \ + } else { \ + rtree_ctx->l2_cache[0].leafkey = \ + rtree_ctx->cache[slot].leafkey; \ + rtree_ctx->l2_cache[0].leaf = \ + rtree_ctx->cache[slot].leaf; \ + } \ + rtree_ctx->cache[slot].leafkey = leafkey; \ + rtree_ctx->cache[slot].leaf = leaf; \ + uintptr_t subkey = rtree_subkey(key, RTREE_HEIGHT-1); \ + return &leaf[subkey]; \ + } \ +} while (0) + /* Check the first cache entry. */ + RTREE_CACHE_CHECK_L2(0); + /* Search the remaining cache elements. */ + for (unsigned i = 1; i < RTREE_CTX_NCACHE_L2; i++) { + RTREE_CACHE_CHECK_L2(i); + } +#undef RTREE_CACHE_CHECK_L2 + + return rtree_leaf_elm_lookup_hard(tsdn, rtree, rtree_ctx, key, + dependent, init_missing); +} + +/* + * Returns true on lookup failure. 
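+ * Lookup failure here means the leaf spanning the key was never initialized:
+ * the lookup below passes init_missing == false, so nothing is created on a
+ * miss.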
+ */ +static inline bool +rtree_read_independent(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, + uintptr_t key, rtree_contents_t *r_contents) { + rtree_leaf_elm_t *elm = rtree_leaf_elm_lookup(tsdn, rtree, rtree_ctx, + key, /* dependent */ false, /* init_missing */ false); + if (elm == NULL) { + return true; + } + *r_contents = rtree_leaf_elm_read(tsdn, rtree, elm, + /* dependent */ false); + return false; +} + +static inline rtree_contents_t +rtree_read(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, + uintptr_t key) { + rtree_leaf_elm_t *elm = rtree_leaf_elm_lookup(tsdn, rtree, rtree_ctx, + key, /* dependent */ true, /* init_missing */ false); + assert(elm != NULL); + return rtree_leaf_elm_read(tsdn, rtree, elm, /* dependent */ true); +} + +static inline rtree_metadata_t +rtree_metadata_read(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, + uintptr_t key) { + rtree_leaf_elm_t *elm = rtree_leaf_elm_lookup(tsdn, rtree, rtree_ctx, + key, /* dependent */ true, /* init_missing */ false); + assert(elm != NULL); + return rtree_leaf_elm_read(tsdn, rtree, elm, + /* dependent */ true).metadata; +} + +/* + * Returns true when the request cannot be fulfilled by the fast path. + */ +static inline bool +rtree_metadata_try_read_fast(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, + uintptr_t key, rtree_metadata_t *r_rtree_metadata) { + rtree_leaf_elm_t *elm; + /* + * Check the bool return value (lookup success or not) rather than + * testing elm == NULL, which would cost an extra branch: a successful + * cache lookup never yields a NULL pointer, but the compiler has no + * way to know that. + */ + if (rtree_leaf_elm_lookup_fast(tsdn, rtree, rtree_ctx, key, &elm)) { + return true; + } + assert(elm != NULL); + *r_rtree_metadata = rtree_leaf_elm_read(tsdn, rtree, elm, + /* dependent */ true).metadata; + return false; +} + +JEMALLOC_ALWAYS_INLINE void +rtree_write_range_impl(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, + uintptr_t base, uintptr_t end, rtree_contents_t contents, bool clearing) { + assert((base & PAGE_MASK) == 0 && (end & PAGE_MASK) == 0); + /* + * Only used for emap_(de)register_interior, which implies the + * boundaries have been registered already. Therefore all the lookups + * are dependent w/o init_missing, assuming the range spans at most + * 2 rtree leaf nodes (each covering 1 GiB of vaddr). + */ + void *bits; + unsigned additional; + rtree_contents_encode(contents, &bits, &additional); + + rtree_leaf_elm_t *elm = NULL; /* Dead store.
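+ * The NULL assignment only quiets maybe-uninitialized warnings; the first
+ * iteration (addr == base) always reassigns elm before it is read.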
*/ + for (uintptr_t addr = base; addr <= end; addr += PAGE) { + if (addr == base || + (addr & ((ZU(1) << rtree_leaf_maskbits()) - 1)) == 0) { + elm = rtree_leaf_elm_lookup(tsdn, rtree, rtree_ctx, addr, + /* dependent */ true, /* init_missing */ false); + assert(elm != NULL); + } + assert(elm == rtree_leaf_elm_lookup(tsdn, rtree, rtree_ctx, addr, + /* dependent */ true, /* init_missing */ false)); + assert(!clearing || rtree_leaf_elm_read(tsdn, rtree, elm, + /* dependent */ true).edata != NULL); + rtree_leaf_elm_write_commit(tsdn, rtree, elm, bits, additional); + elm++; + } +} + +JEMALLOC_ALWAYS_INLINE void +rtree_write_range(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, + uintptr_t base, uintptr_t end, rtree_contents_t contents) { + rtree_write_range_impl(tsdn, rtree, rtree_ctx, base, end, contents, + /* clearing */ false); +} + +JEMALLOC_ALWAYS_INLINE bool +rtree_write(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, uintptr_t key, + rtree_contents_t contents) { + rtree_leaf_elm_t *elm = rtree_leaf_elm_lookup(tsdn, rtree, rtree_ctx, + key, /* dependent */ false, /* init_missing */ true); + if (elm == NULL) { + return true; + } + + rtree_leaf_elm_write(tsdn, rtree, elm, contents); + + return false; +} + +static inline void +rtree_clear(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, + uintptr_t key) { + rtree_leaf_elm_t *elm = rtree_leaf_elm_lookup(tsdn, rtree, rtree_ctx, + key, /* dependent */ true, /* init_missing */ false); + assert(elm != NULL); + assert(rtree_leaf_elm_read(tsdn, rtree, elm, + /* dependent */ true).edata != NULL); + rtree_contents_t contents; + contents.edata = NULL; + contents.metadata.szind = SC_NSIZES; + contents.metadata.slab = false; + contents.metadata.is_head = false; + contents.metadata.state = (extent_state_t)0; + rtree_leaf_elm_write(tsdn, rtree, elm, contents); +} + +static inline void +rtree_clear_range(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, + uintptr_t base, uintptr_t end) { + rtree_contents_t contents; + contents.edata = NULL; + contents.metadata.szind = SC_NSIZES; + contents.metadata.slab = false; + contents.metadata.is_head = false; + contents.metadata.state = (extent_state_t)0; + rtree_write_range_impl(tsdn, rtree, rtree_ctx, base, end, contents, + /* clearing */ true); +} + +#endif /* JEMALLOC_INTERNAL_RTREE_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/rtree_tsd.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/rtree_tsd.h new file mode 100644 index 000000000..59f185700 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/rtree_tsd.h @@ -0,0 +1,64 @@ +#ifndef JEMALLOC_INTERNAL_RTREE_CTX_H +#define JEMALLOC_INTERNAL_RTREE_CTX_H + +#include "jemalloc/internal/jemalloc_preamble.h" + +/* + * Number of leafkey/leaf pairs to cache in L1 and L2 level respectively. Each + * entry supports an entire leaf, so the cache hit rate is typically high even + * with a small number of entries. In rare cases extent activity will straddle + * the boundary between two leaf nodes. Furthermore, an arena may use a + * combination of dss and mmap. Note that as memory usage grows past the amount + * that this cache can directly cover, the cache will become less effective if + * locality of reference is low, but the consequence is merely cache misses + * while traversing the tree nodes. + * + * The L1 direct mapped cache offers consistent and low cost on cache hit. + * However collision could affect hit rate negatively. 
This is resolved by + combining it with an L2 LRU cache, which requires linear search and re-ordering + on access but suffers no collision. Note that the cache will itself suffer + cache misses if made overly large, plus the cost of linear search in the LRU + cache. + */ +#define RTREE_CTX_NCACHE 16 +#define RTREE_CTX_NCACHE_L2 8 + +/* Needed for initialization only. */ +#define RTREE_LEAFKEY_INVALID ((uintptr_t)1) +#define RTREE_CTX_CACHE_ELM_INVALID {RTREE_LEAFKEY_INVALID, NULL} + +#define RTREE_CTX_INIT_ELM_1 RTREE_CTX_CACHE_ELM_INVALID +#define RTREE_CTX_INIT_ELM_2 RTREE_CTX_INIT_ELM_1, RTREE_CTX_INIT_ELM_1 +#define RTREE_CTX_INIT_ELM_4 RTREE_CTX_INIT_ELM_2, RTREE_CTX_INIT_ELM_2 +#define RTREE_CTX_INIT_ELM_8 RTREE_CTX_INIT_ELM_4, RTREE_CTX_INIT_ELM_4 +#define RTREE_CTX_INIT_ELM_16 RTREE_CTX_INIT_ELM_8, RTREE_CTX_INIT_ELM_8 + +#define _RTREE_CTX_INIT_ELM_DATA(n) RTREE_CTX_INIT_ELM_##n +#define RTREE_CTX_INIT_ELM_DATA(n) _RTREE_CTX_INIT_ELM_DATA(n) + +/* + * Static initializer (to invalidate the cache entries) is required because the + * free fastpath may access the rtree cache before a full tsd initialization. + */ +#define RTREE_CTX_INITIALIZER {{RTREE_CTX_INIT_ELM_DATA(RTREE_CTX_NCACHE)}, \ + {RTREE_CTX_INIT_ELM_DATA(RTREE_CTX_NCACHE_L2)}} + +typedef struct rtree_leaf_elm_s rtree_leaf_elm_t; + +typedef struct rtree_ctx_cache_elm_s rtree_ctx_cache_elm_t; +struct rtree_ctx_cache_elm_s { + uintptr_t leafkey; + rtree_leaf_elm_t *leaf; +}; + +typedef struct rtree_ctx_s rtree_ctx_t; +struct rtree_ctx_s { + /* Direct mapped cache. */ + rtree_ctx_cache_elm_t cache[RTREE_CTX_NCACHE]; + /* L2 LRU cache. */ + rtree_ctx_cache_elm_t l2_cache[RTREE_CTX_NCACHE_L2]; +}; + +void rtree_ctx_data_init(rtree_ctx_t *ctx); + +#endif /* JEMALLOC_INTERNAL_RTREE_CTX_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/safety_check.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/safety_check.h new file mode 100644 index 000000000..194b7744e --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/safety_check.h @@ -0,0 +1,65 @@ +#ifndef JEMALLOC_INTERNAL_SAFETY_CHECK_H +#define JEMALLOC_INTERNAL_SAFETY_CHECK_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/pages.h" + +#define SAFETY_CHECK_DOUBLE_FREE_MAX_SCAN_DEFAULT 32 + +void safety_check_fail_sized_dealloc(bool current_dealloc, const void *ptr, + size_t true_size, size_t input_size); +void safety_check_fail(const char *format, ...); + +typedef void (*safety_check_abort_hook_t)(const char *message); + +/* Can set to NULL for a default. */ +void safety_check_set_abort(safety_check_abort_hook_t abort_fn); + +#define REDZONE_SIZE ((size_t) 32) +#define REDZONE_FILL_VALUE 0xBC + +/* + * Normally the redzone extends `REDZONE_SIZE` bytes beyond the end of + * the allocation. However, we don't let the redzone extend onto another + * OS page because this would impose additional overhead if that page was + * not already resident in memory. + */
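As a concrete illustration of that clamping, here is a standalone sketch (not part of the vendored header; local stand-ins only, since jemalloc reads os_page at runtime and uses the REDZONE_SIZE constant above):

#include <stdint.h>
#include <stdio.h>

#define OS_PAGE ((uintptr_t)4096)  /* illustrative; queried at runtime */
#define RZ_SIZE ((uintptr_t)32)    /* mirrors REDZONE_SIZE */

/* Mirror of the clamping performed by compute_redzone_end(). */
static uintptr_t
redzone_end(uintptr_t ptr, uintptr_t usize, uintptr_t bumped) {
	uintptr_t end = (usize + RZ_SIZE < bumped) ? ptr + usize + RZ_SIZE
	                                           : ptr + bumped;
	/* Next OS page boundary at or after the end of the allocation. */
	uintptr_t page = (ptr + usize + OS_PAGE - 1) & ~(OS_PAGE - 1);
	return end < page ? end : page;
}

int
main(void) {
	/* usize ends 8 bytes short of a page: only 8 redzone bytes survive. */
	uintptr_t ptr = 0x10000;
	printf("%lu redzone bytes\n",
	    (unsigned long)(redzone_end(ptr, 4088, 8192) - (ptr + 4088)));
	return 0;
}

+JEMALLOC_ALWAYS_INLINE const unsigned char * +compute_redzone_end(const void *_ptr, size_t usize, size_t bumped_usize) { + const unsigned char *ptr = (const unsigned char *) _ptr; + const unsigned char *redzone_end = usize + REDZONE_SIZE < bumped_usize ? + &ptr[usize + REDZONE_SIZE] : &ptr[bumped_usize]; + const unsigned char *page_end = (const unsigned char *) + ALIGNMENT_ADDR2CEILING(&ptr[usize], os_page); + return redzone_end < page_end ?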
redzone_end : page_end; +} + +JEMALLOC_ALWAYS_INLINE void +safety_check_set_redzone(void *ptr, size_t usize, size_t bumped_usize) { + assert(usize <= bumped_usize); + const unsigned char *redzone_end = + compute_redzone_end(ptr, usize, bumped_usize); + for (unsigned char *curr = &((unsigned char *)ptr)[usize]; + curr < redzone_end; curr++) { + *curr = REDZONE_FILL_VALUE; + } +} + +JEMALLOC_ALWAYS_INLINE void +safety_check_verify_redzone(const void *ptr, size_t usize, size_t bumped_usize) +{ + const unsigned char *redzone_end = + compute_redzone_end(ptr, usize, bumped_usize); + for (const unsigned char *curr= &((const unsigned char *)ptr)[usize]; + curr < redzone_end; curr++) { + if (unlikely(*curr != REDZONE_FILL_VALUE)) { + safety_check_fail("Use after free error\n"); + } + } +} + +#undef REDZONE_SIZE +#undef REDZONE_FILL_VALUE + +#endif /*JEMALLOC_INTERNAL_SAFETY_CHECK_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/san.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/san.h new file mode 100644 index 000000000..669f99dda --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/san.h @@ -0,0 +1,194 @@ +#ifndef JEMALLOC_INTERNAL_GUARD_H +#define JEMALLOC_INTERNAL_GUARD_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/ehooks.h" +#include "jemalloc/internal/emap.h" +#include "jemalloc/internal/jemalloc_internal_externs.h" +#include "jemalloc/internal/tsd.h" + +#define SAN_PAGE_GUARD PAGE +#define SAN_PAGE_GUARDS_SIZE (SAN_PAGE_GUARD * 2) + +#define SAN_GUARD_LARGE_EVERY_N_EXTENTS_DEFAULT 0 +#define SAN_GUARD_SMALL_EVERY_N_EXTENTS_DEFAULT 0 + +#define SAN_LG_UAF_ALIGN_DEFAULT (-1) +#define SAN_CACHE_BIN_NONFAST_MASK_DEFAULT (uintptr_t)(-1) + +static const uintptr_t uaf_detect_junk = (uintptr_t)0x5b5b5b5b5b5b5b5bULL; + +/* 0 means disabled, i.e. never guarded. */ +extern size_t opt_san_guard_large; +extern size_t opt_san_guard_small; +/* -1 means disabled, i.e. never check for use-after-free. */ +extern ssize_t opt_lg_san_uaf_align; + +void san_guard_pages(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + emap_t *emap, bool left, bool right, bool remap); +void san_unguard_pages(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + emap_t *emap, bool left, bool right); +/* + * Unguard the extent, but don't modify emap boundaries. Must be called on an + * extent that has been erased from emap and shouldn't be placed back. 
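+ * (Hence the pre_destroy suffix: this runs on the arena destruction path,
+ * just before the extent itself is destroyed.)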
+ */ +void san_unguard_pages_pre_destroy(tsdn_t *tsdn, ehooks_t *ehooks, + edata_t *edata, emap_t *emap); +void san_check_stashed_ptrs(void **ptrs, size_t nstashed, size_t usize); + +void tsd_san_init(tsd_t *tsd); +void san_init(ssize_t lg_san_uaf_align); + +static inline void +san_guard_pages_two_sided(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + emap_t *emap, bool remap) { + san_guard_pages(tsdn, ehooks, edata, emap, true, true, remap); +} + +static inline void +san_unguard_pages_two_sided(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + emap_t *emap) { + san_unguard_pages(tsdn, ehooks, edata, emap, true, true); +} + +static inline size_t +san_two_side_unguarded_sz(size_t size) { + assert(size % PAGE == 0); + assert(size >= SAN_PAGE_GUARDS_SIZE); + return size - SAN_PAGE_GUARDS_SIZE; +} + +static inline size_t +san_two_side_guarded_sz(size_t size) { + assert(size % PAGE == 0); + return size + SAN_PAGE_GUARDS_SIZE; +} + +static inline size_t +san_one_side_unguarded_sz(size_t size) { + assert(size % PAGE == 0); + assert(size >= SAN_PAGE_GUARD); + return size - SAN_PAGE_GUARD; +} + +static inline size_t +san_one_side_guarded_sz(size_t size) { + assert(size % PAGE == 0); + return size + SAN_PAGE_GUARD; +} + +static inline bool +san_guard_enabled(void) { + return (opt_san_guard_large != 0 || opt_san_guard_small != 0); +} + +static inline bool +san_large_extent_decide_guard(tsdn_t *tsdn, ehooks_t *ehooks, size_t size, + size_t alignment) { + if (opt_san_guard_large == 0 || ehooks_guard_will_fail(ehooks) || + tsdn_null(tsdn)) { + return false; + } + + tsd_t *tsd = tsdn_tsd(tsdn); + uint64_t n = tsd_san_extents_until_guard_large_get(tsd); + assert(n >= 1); + if (n > 1) { + /* + * Subtract conditionally because the guard may not happen due + * to alignment or size restriction below. + */ + *tsd_san_extents_until_guard_largep_get(tsd) = n - 1; + } + + if (n == 1 && (alignment <= PAGE) && + (san_two_side_guarded_sz(size) <= SC_LARGE_MAXCLASS)) { + *tsd_san_extents_until_guard_largep_get(tsd) = + opt_san_guard_large; + return true; + } else { + assert(tsd_san_extents_until_guard_large_get(tsd) >= 1); + return false; + } +} + +static inline bool +san_slab_extent_decide_guard(tsdn_t *tsdn, ehooks_t *ehooks) { + if (opt_san_guard_small == 0 || ehooks_guard_will_fail(ehooks) || + tsdn_null(tsdn)) { + return false; + } + + tsd_t *tsd = tsdn_tsd(tsdn); + uint64_t n = tsd_san_extents_until_guard_small_get(tsd); + assert(n >= 1); + if (n == 1) { + *tsd_san_extents_until_guard_smallp_get(tsd) = + opt_san_guard_small; + return true; + } else { + *tsd_san_extents_until_guard_smallp_get(tsd) = n - 1; + assert(tsd_san_extents_until_guard_small_get(tsd) >= 1); + return false; + } +} + +static inline void +san_junk_ptr_locations(void *ptr, size_t usize, void **first, void **mid, + void **last) { + size_t ptr_sz = sizeof(void *); + + *first = ptr; + + *mid = (void *)((byte_t *)ptr + ((usize >> 1) & ~(ptr_sz - 1))); + assert(*first != *mid || usize == ptr_sz); + assert((uintptr_t)*first <= (uintptr_t)*mid); + + /* + * When usize > 32K, the gap between requested_size and usize might be + * greater than 4K -- this means the last write may access a + * likely-untouched page (default settings w/ 4K pages). However, by + * default the tcache only goes up to the 32K size class, and is usually + * tuned lower instead of higher, which makes it less of a concern.
+ */ + *last = (void *)((byte_t *)ptr + usize - sizeof(uaf_detect_junk)); + assert(*first != *last || usize == ptr_sz); + assert(*mid != *last || usize <= ptr_sz * 2); + assert((uintptr_t)*mid <= (uintptr_t)*last); +} + +static inline bool +san_junk_ptr_should_slow(void) { + /* + * The latter condition (pointer size greater than the min size class) + * is not expected -- fall back to the slow path for simplicity. + */ + return config_debug || (LG_SIZEOF_PTR > SC_LG_TINY_MIN); +} + +static inline void +san_junk_ptr(void *ptr, size_t usize) { + if (san_junk_ptr_should_slow()) { + memset(ptr, (char)uaf_detect_junk, usize); + return; + } + + void *first, *mid, *last; + san_junk_ptr_locations(ptr, usize, &first, &mid, &last); + *(uintptr_t *)first = uaf_detect_junk; + *(uintptr_t *)mid = uaf_detect_junk; + *(uintptr_t *)last = uaf_detect_junk; +} + +static inline bool +san_uaf_detection_enabled(void) { + bool ret = config_uaf_detection && (opt_lg_san_uaf_align != -1); + if (config_uaf_detection && ret) { + assert(san_cache_bin_nonfast_mask == ((uintptr_t)1 << + opt_lg_san_uaf_align) - 1); + } + + return ret; +} + +#endif /* JEMALLOC_INTERNAL_GUARD_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/san_bump.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/san_bump.h new file mode 100644 index 000000000..d6e9cfc5a --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/san_bump.h @@ -0,0 +1,54 @@ +#ifndef JEMALLOC_INTERNAL_SAN_BUMP_H +#define JEMALLOC_INTERNAL_SAN_BUMP_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/edata.h" +#include "jemalloc/internal/exp_grow.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/witness.h" + +#define SBA_RETAINED_ALLOC_SIZE ((size_t)4 << 20) + +extern bool opt_retain; + +typedef struct ehooks_s ehooks_t; +typedef struct pac_s pac_t; + +typedef struct san_bump_alloc_s san_bump_alloc_t; +struct san_bump_alloc_s { + malloc_mutex_t mtx; + + edata_t *curr_reg; +}; + +static inline bool +san_bump_enabled(void) { + /* + * We enable san_bump allocator only when it's possible to break up a + * mapping and unmap a part of it (maps_coalesce). This is needed to + * ensure the arena destruction process can destroy all retained guarded + * extents one by one and to unmap a trailing part of a retained guarded + * region when it's too small to fit a pending allocation. + * opt_retain is required, because this allocator retains a large + * virtual memory mapping and returns smaller parts of it. 
+ */ + return maps_coalesce && opt_retain; +} + +static inline bool +san_bump_alloc_init(san_bump_alloc_t* sba) { + bool err = malloc_mutex_init(&sba->mtx, "sanitizer_bump_allocator", + WITNESS_RANK_SAN_BUMP_ALLOC, malloc_mutex_rank_exclusive); + if (err) { + return true; + } + sba->curr_reg = NULL; + + return false; +} + +edata_t * +san_bump_alloc(tsdn_t *tsdn, san_bump_alloc_t* sba, pac_t *pac, ehooks_t *ehooks, + size_t size, bool zero); + +#endif /* JEMALLOC_INTERNAL_SAN_BUMP_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/sc.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/sc.h new file mode 100644 index 000000000..770835ccd --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/sc.h @@ -0,0 +1,358 @@ +#ifndef JEMALLOC_INTERNAL_SC_H +#define JEMALLOC_INTERNAL_SC_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_types.h" + +/* + * Size class computations: + * + * These are a little tricky; we'll first start by describing how things + * generally work, and then describe some of the details. + * + * Ignore the first few size classes for a moment. We can then split all the + * remaining size classes into groups. The size classes in a group are spaced + * such that they cover allocation request sizes in a power-of-2 range. The + * power of two is called the base of the group, and the size classes in it + * satisfy allocations in the half-open range (base, base * 2]. There are + * SC_NGROUP size classes in each group, equally spaced in the range, so that + * each one covers allocations for base / SC_NGROUP possible allocation sizes. + * We call that value (base / SC_NGROUP) the delta of the group. Each size class + * is delta larger than the one before it (including the initial size class in a + * group, which is delta larger than base, the largest size class in the + * previous group). + * To make the math all work out nicely, we require that SC_NGROUP is a power of + * two, and define it in terms of SC_LG_NGROUP. We'll often talk in terms of + * lg_base and lg_delta. For each of these groups then, we have that + * lg_delta == lg_base - SC_LG_NGROUP. + * The size classes in a group with a given lg_base and lg_delta (which, recall, + * can be computed from lg_base for these groups) are therefore: + * base + 1 * delta + * which covers allocations in (base, base + 1 * delta] + * base + 2 * delta + * which covers allocations in (base + 1 * delta, base + 2 * delta]. + * base + 3 * delta + * which covers allocations in (base + 2 * delta, base + 3 * delta]. + * ... + * base + SC_NGROUP * delta ( == 2 * base) + * which covers allocations in (base + (SC_NGROUP - 1) * delta, 2 * base]. + * (Note that currently SC_NGROUP is always 4, so the "..." is empty in + * practice.) + * Note that the last size class in the group is the next power of two (after + * base), so that we've set up the induction correctly for the next group's + * selection of delta. + * + * Now, let's start considering the first few size classes. Two extra constants + * come into play here: LG_QUANTUM and SC_LG_TINY_MIN. LG_QUANTUM ensures + * correct platform alignment; all objects of size (1 << LG_QUANTUM) or larger + * are at least (1 << LG_QUANTUM) aligned; this can be used to ensure that we + * never return improperly aligned memory, by making (1 << LG_QUANTUM) equal the + * highest required alignment of a platform. 
For allocation sizes smaller than + * (1 << LG_QUANTUM) though, we can be more relaxed (since we don't support + * platforms with types with alignment larger than their size). To allow such + * allocations (without wasting space unnecessarily), we introduce tiny size + * classes; one per power of two, up until we hit the quantum size. There are + * therefore LG_QUANTUM - SC_LG_TINY_MIN such size classes. + * + * Next, we have a size class of size (1 << LG_QUANTUM). This can't be the + * start of a group in the sense we described above (covering a power of two + * range) since, if we divided into it to pick a value of delta, we'd get a + * delta smaller than (1 << LG_QUANTUM) for sizes >= (1 << LG_QUANTUM), which + * is against the rules. + * + * The first base we can divide by SC_NGROUP while still being at least + * (1 << LG_QUANTUM) is SC_NGROUP * (1 << LG_QUANTUM). We can get there by + * having SC_NGROUP size classes, spaced (1 << LG_QUANTUM) apart. These size + * classes are: + * 1 * (1 << LG_QUANTUM) + * 2 * (1 << LG_QUANTUM) + * 3 * (1 << LG_QUANTUM) + * ... (although, as above, this "..." is empty in practice) + * SC_NGROUP * (1 << LG_QUANTUM). + * + * There are SC_NGROUP of these size classes, so we can regard it as a sort of + * pseudo-group, even though it spans multiple powers of 2, is divided + * differently, and both starts and ends on a power of 2 (as opposed to just + * ending). SC_NGROUP is itself a power of two, so the first group after the + * pseudo-group has the power-of-two base SC_NGROUP * (1 << LG_QUANTUM), for a + * lg_base of LG_QUANTUM + SC_LG_NGROUP. We can divide this base into SC_NGROUP + * sizes without violating our LG_QUANTUM requirements, so we can safely set + * lg_delta = lg_base - SC_LG_NGROUP (== LG_QUANTUM). + * + * So, in order, the size classes are: + * + * Tiny size classes: + * - Count: LG_QUANTUM - SC_LG_TINY_MIN. + * - Sizes: + * 1 << SC_LG_TINY_MIN + * 1 << (SC_LG_TINY_MIN + 1) + * 1 << (SC_LG_TINY_MIN + 2) + * ... + * 1 << (LG_QUANTUM - 1) + * + * Initial pseudo-group: + * - Count: SC_NGROUP + * - Sizes: + * 1 * (1 << LG_QUANTUM) + * 2 * (1 << LG_QUANTUM) + * 3 * (1 << LG_QUANTUM) + * ... + * SC_NGROUP * (1 << LG_QUANTUM) + * + * Regular group 0: + * - Count: SC_NGROUP + * - Sizes: + * (relative to lg_base of LG_QUANTUM + SC_LG_NGROUP and lg_delta of + * lg_base - SC_LG_NGROUP) + * (1 << lg_base) + 1 * (1 << lg_delta) + * (1 << lg_base) + 2 * (1 << lg_delta) + * (1 << lg_base) + 3 * (1 << lg_delta) + * ... + * (1 << lg_base) + SC_NGROUP * (1 << lg_delta) [ == (1 << (lg_base + 1)) ] + * + * Regular group 1: + * - Count: SC_NGROUP + * - Sizes: + * (relative to lg_base of LG_QUANTUM + SC_LG_NGROUP + 1 and lg_delta of + * lg_base - SC_LG_NGROUP) + * (1 << lg_base) + 1 * (1 << lg_delta) + * (1 << lg_base) + 2 * (1 << lg_delta) + * (1 << lg_base) + 3 * (1 << lg_delta) + * ... + * (1 << lg_base) + SC_NGROUP * (1 << lg_delta) [ == (1 << (lg_base + 1)) ] + * + * ... + * + * Regular group N: + * - Count: SC_NGROUP + * - Sizes: + * (relative to lg_base of LG_QUANTUM + SC_LG_NGROUP + N and lg_delta of + * lg_base - SC_LG_NGROUP) + * (1 << lg_base) + 1 * (1 << lg_delta) + * (1 << lg_base) + 2 * (1 << lg_delta) + * (1 << lg_base) + 3 * (1 << lg_delta) + * ... + * (1 << lg_base) + SC_NGROUP * (1 << lg_delta) [ == (1 << (lg_base + 1)) ] + *
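+ * As a concrete example, with SC_LG_NGROUP == 2 (groups of 4) and
+ * SC_LG_TINY_MIN == 3 as defined below, on a platform where LG_QUANTUM == 4
+ * (16-byte quantum), the first few size classes work out to:
+ *   Tiny:          8
+ *   Pseudo-group:  16, 32, 48, 64
+ *   Regular 0:     80, 96, 112, 128    (lg_base 6, delta 16)
+ *   Regular 1:     160, 192, 224, 256  (lg_base 7, delta 32)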
+ * + * Representation of metadata: + * To make the math easy, we'll mostly work in lg quantities. We record lg_base, + * lg_delta, and ndelta (i.e. the number of deltas above the base) on a + * per-size-class basis, and maintain the invariant that, across all size + * classes, size == (1 << lg_base) + ndelta * (1 << lg_delta). + * + * For regular groups (i.e. those with lg_base >= LG_QUANTUM + SC_LG_NGROUP), + * lg_delta is lg_base - SC_LG_NGROUP, and ndelta goes from 1 to SC_NGROUP. + * + * For the initial tiny size classes (if any), lg_base is lg(size class size). + * lg_delta is lg_base for the first size class, and lg_base - 1 for all + * subsequent ones. ndelta is always 0. + * + * For the pseudo-group, if there are no tiny size classes, then we set + * lg_base == LG_QUANTUM, lg_delta == LG_QUANTUM, and have ndelta range from 0 + * to SC_NGROUP - 1. (Note that delta == base, so base + (SC_NGROUP - 1) * delta + * is just SC_NGROUP * base, or (1 << (SC_LG_NGROUP + LG_QUANTUM)), so we do + * indeed get a power of two that way). If there *are* tiny size classes, then + * the first size class needs to have lg_delta relative to the largest tiny size + * class. We therefore set lg_base == LG_QUANTUM - 1, + * lg_delta == LG_QUANTUM - 1, and ndelta == 1, keeping the rest of the + * pseudo-group the same. + * + * + * Other terminology: + * "Small" size classes mean those that are allocated out of bins, which is the + * same as those that are slab allocated. + * "Large" size classes are those that are not small. The cutoff for counting as + * large is page size * group size. + */ + +/* + * Size class N + (1 << SC_LG_NGROUP) is twice the size of size class N. + */ +#define SC_LG_NGROUP 2 +#define SC_LG_TINY_MIN 3 + +#if SC_LG_TINY_MIN == 0 +/* The div module doesn't support division by 1, which this would require. */ +#error "Unsupported LG_TINY_MIN" +#endif + +/* + * The definitions below are all determined by the above settings and system + * characteristics. + */ +#define SC_NGROUP (1ULL << SC_LG_NGROUP) +#define SC_PTR_BITS ((1ULL << LG_SIZEOF_PTR) * 8) +#define SC_NTINY (LG_QUANTUM - SC_LG_TINY_MIN) +#define SC_LG_TINY_MAXCLASS (LG_QUANTUM > SC_LG_TINY_MIN ? LG_QUANTUM - 1 : -1) +#define SC_NPSEUDO SC_NGROUP +#define SC_LG_FIRST_REGULAR_BASE (LG_QUANTUM + SC_LG_NGROUP) +/* + * We cap allocations to be less than 2 ** (ptr_bits - 1), so the highest base + * we need is 2 ** (ptr_bits - 2). (This also means that the last group is 1 + * size class shorter than the others). + * We could probably save some space in arenas by capping this at LG_VADDR size. + */ +#define SC_LG_BASE_MAX (SC_PTR_BITS - 2) +#define SC_NREGULAR (SC_NGROUP * \ + (SC_LG_BASE_MAX - SC_LG_FIRST_REGULAR_BASE + 1) - 1) +#define SC_NSIZES (SC_NTINY + SC_NPSEUDO + SC_NREGULAR) + +/* + * The number of size classes that are a multiple of the page size. + * + * Here are the first few bases that have a page-sized SC. + * + * lg(base) | base | highest SC | page-multiple SCs + * --------------|------------------------------------------ + * LG_PAGE - 1 | PAGE / 2 | PAGE | 1 + * LG_PAGE | PAGE | 2 * PAGE | 1 + * LG_PAGE + 1 | 2 * PAGE | 4 * PAGE | 2 + * LG_PAGE + 2 | 4 * PAGE | 8 * PAGE | 4 + * + * The number of page-multiple SCs continues to grow in powers of two, up until + * lg_delta == lg_page, which corresponds to setting lg_base to lg_page + + * SC_LG_NGROUP. So, then, the number of size classes that are multiples of the + * page size whose lg_delta is less than the page size + * is 1 + (2**0 + 2**1 + ... + 2**(lg_ngroup - 1)) == 2**lg_ngroup. + * + * For each base with lg_base in [lg_page + lg_ngroup, lg_base_max), there are + * NGROUP page-sized size classes, and when lg_base == lg_base_max, there are + * NGROUP - 1. + * + * This gives us the quantity we seek. + */ +#define SC_NPSIZES ( \ + SC_NGROUP \ + + (SC_LG_BASE_MAX - (LG_PAGE + SC_LG_NGROUP)) * SC_NGROUP \ + + SC_NGROUP - 1)
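To sanity-check these counts, here is a small throwaway program (not part of the header) that evaluates the same formulas for one illustrative configuration: 64-bit pointers, LG_QUANTUM == 4 and LG_PAGE == 12, all assumptions of this sketch. The binnable count mirrors SC_NBINS, defined just below.

#include <stdio.h>

#define LG_NGROUP   2   /* SC_LG_NGROUP */
#define NGROUP      (1 << LG_NGROUP)
#define LG_TINY_MIN 3   /* SC_LG_TINY_MIN */
#define LG_QUANTUM  4   /* assumed: 16-byte quantum */
#define LG_PAGE     12  /* assumed: 4 KiB pages */
#define PTR_BITS    64  /* assumed: 64-bit pointers */

int
main(void) {
	int ntiny = LG_QUANTUM - LG_TINY_MIN;           /* 1 */
	int npseudo = NGROUP;                           /* 4 */
	int lg_first_regular = LG_QUANTUM + LG_NGROUP;  /* 6 */
	int lg_base_max = PTR_BITS - 2;                 /* 62 */
	int nregular = NGROUP * (lg_base_max - lg_first_regular + 1) - 1;
	int nbins = ntiny + npseudo
	    + NGROUP * (LG_PAGE + LG_NGROUP - lg_first_regular) - 1;
	/* Prints "NSIZES = 232, NBINS = 36" for these parameters. */
	printf("NSIZES = %d, NBINS = %d\n", ntiny + npseudo + nregular, nbins);
	return 0;
}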
+ +/* + * We declare a size class binnable if size < page size * group. Or, in other + * words, lg(size) < lg(page size) + lg(group size). + */ +#define SC_NBINS ( \ + /* Sub-regular size classes. */ \ + SC_NTINY + SC_NPSEUDO \ + /* Groups with lg_regular_min_base <= lg_base <= lg_base_max */ \ + + SC_NGROUP * (LG_PAGE + SC_LG_NGROUP - SC_LG_FIRST_REGULAR_BASE) \ + /* Last SC of the last group hits the bound exactly; exclude it. */ \ + - 1) + +/* + * The size2index_tab lookup table uses uint8_t to encode each bin index, so we + * cannot support more than 256 small size classes. + */ +#if (SC_NBINS > 256) +# error "Too many small size classes" +#endif + +/* The largest size class in the lookup table, and its binary log. */ +#define SC_LG_MAX_LOOKUP 12 +#define SC_LOOKUP_MAXCLASS (1 << SC_LG_MAX_LOOKUP) + +/* Internal, only used for the definition of SC_SMALL_MAXCLASS. */ +#define SC_SMALL_MAX_BASE (1 << (LG_PAGE + SC_LG_NGROUP - 1)) +#define SC_SMALL_MAX_DELTA (1 << (LG_PAGE - 1)) + +/* The largest size class allocated out of a slab. */ +#define SC_SMALL_MAXCLASS (SC_SMALL_MAX_BASE \ + + (SC_NGROUP - 1) * SC_SMALL_MAX_DELTA) + +/* The fastpath assumes all lookup-able sizes are small. */ +#if (SC_SMALL_MAXCLASS < SC_LOOKUP_MAXCLASS) +# error "Lookup table sizes must be small" +#endif + +/* The smallest size class not allocated out of a slab. */ +#define SC_LARGE_MINCLASS ((size_t)1ULL << (LG_PAGE + SC_LG_NGROUP)) +#define SC_LG_LARGE_MINCLASS (LG_PAGE + SC_LG_NGROUP) + +/* Internal; only used for the definition of SC_LARGE_MAXCLASS. */ +#define SC_MAX_BASE ((size_t)1 << (SC_PTR_BITS - 2)) +#define SC_MAX_DELTA ((size_t)1 << (SC_PTR_BITS - 2 - SC_LG_NGROUP)) + +/* The largest size class supported. */ +#define SC_LARGE_MAXCLASS (SC_MAX_BASE + (SC_NGROUP - 1) * SC_MAX_DELTA) + +/* Maximum number of regions in one slab. */ +#ifndef CONFIG_LG_SLAB_MAXREGS +# define SC_LG_SLAB_MAXREGS (LG_PAGE - SC_LG_TINY_MIN) +#else +# if CONFIG_LG_SLAB_MAXREGS < (LG_PAGE - SC_LG_TINY_MIN) +# error "Unsupported SC_LG_SLAB_MAXREGS" +# else +# define SC_LG_SLAB_MAXREGS CONFIG_LG_SLAB_MAXREGS +# endif +#endif + +#define SC_SLAB_MAXREGS (1U << SC_LG_SLAB_MAXREGS) + +typedef struct sc_s sc_t; +struct sc_s { + /* Size class index, or -1 if not a valid size class. */ + int index; + /* Lg group base size (no deltas added). */ + int lg_base; + /* Lg delta to previous size class. */ + int lg_delta; + /* Delta multiplier. size == (1 << lg_base) + ndelta * (1 << lg_delta) */ + int ndelta; + /* + * True if the size class is a multiple of the page size, false + * otherwise. + */ + bool psz; + /* + * True if the size class is a small, bin, size class. False otherwise. + */ + bool bin; + /* The slab page count if a small bin size class, 0 otherwise. */ + int pgs; + /* Same as psz, but with lookup table indexing in mind. */ + bool lg_delta_lookup; +}; + +#endif /* JEMALLOC_INTERNAL_SC_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/sec.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/sec.h new file mode 100644 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/sec.h +#ifndef JEMALLOC_INTERNAL_SEC_H +#define JEMALLOC_INTERNAL_SEC_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/pai.h" + +/* + * Small extent cache. + * + * This includes some utilities to cache the most recently freed extents, so + * that they can be reused quickly. + */ + +typedef struct sec_stats_s sec_stats_t; +struct sec_stats_s { + /* Sum of bytes in all bins. */ + size_t bytes; +}; + +static inline void +sec_stats_accum(sec_stats_t *dst, sec_stats_t *src) { + dst->bytes += src->bytes; +} + +/* A collection of free extents, all of the same size. */ +typedef struct sec_bin_s sec_bin_t; +struct sec_bin_s { + /* + * When we fail to fulfill an allocation, we do a batch-alloc on the + * underlying allocator to fill extra items, as well. We drop the SEC + * lock while doing so, to allow operations on other bins to succeed. + * That introduces the possibility of other threads also trying to + * allocate out of this bin, failing, and also going to the backing + * allocator. To avoid a thundering herd problem in which lots of + * threads do batch allocs and overfill this bin as a result, we only + * allow one batch allocation at a time for a bin.
This bool tracks + whether or not some thread is already batch allocating. + * + * Eventually, the right answer may be a smarter sharding policy for the + * bins (e.g. a mutex per bin, which would also be more scalable + * generally; the batch-allocating thread could hold it while + * batch-allocating). + */ + bool being_batch_filled; + + /* + * Number of bytes in this particular bin (as opposed to the + * sec_shard_t's bytes_cur). This isn't user visible or reported in + * stats; rather, it allows us to quickly determine the change in the + * centralized counter when flushing. + */ + size_t bytes_cur; + edata_list_active_t freelist; +}; + +typedef struct sec_shard_s sec_shard_t; +struct sec_shard_s { + /* + * We don't keep per-bin mutexes, even though that would allow more + * sharding; this allows global cache-eviction, which in turn allows for + * better balancing across free lists. + */ + malloc_mutex_t mtx; + /* + * A SEC may need to be shut down (i.e. flushed of its contents and + * prevented from further caching). To avoid tricky synchronization + * issues, we just track enabled-status in each shard, guarded by a + * mutex. In practice, this is only ever checked during brief races, + * since the arena-level atomic boolean tracking HPA enabled-ness means + * that we won't go down these pathways very often after custom extent + * hooks are installed. + */ + bool enabled; + sec_bin_t *bins; + /* Number of bytes in all bins in the shard. */ + size_t bytes_cur; + /* The next pszind to flush in the flush-some pathways. */ + pszind_t to_flush_next; +}; + +typedef struct sec_s sec_t; +struct sec_s { + pai_t pai; + pai_t *fallback; + + sec_opts_t opts; + sec_shard_t *shards; + pszind_t npsizes; +}; + +bool sec_init(tsdn_t *tsdn, sec_t *sec, base_t *base, pai_t *fallback, + const sec_opts_t *opts); +void sec_flush(tsdn_t *tsdn, sec_t *sec); +void sec_disable(tsdn_t *tsdn, sec_t *sec); + +/* + * Morally, these two stats methods probably ought to be a single one (and the + * mutex_prof_data ought to live in the sec_stats_t). But splitting them apart + * lets them fit easily into the pa_shard stats framework (which also has this + * split), which simplifies the stats management. + */ +void sec_stats_merge(tsdn_t *tsdn, sec_t *sec, sec_stats_t *stats); +void sec_mutex_stats_read(tsdn_t *tsdn, sec_t *sec, + mutex_prof_data_t *mutex_prof_data); + +/* + * We use the arena lock ordering; these are acquired in phase 2 of forking, but + * should be acquired before the underlying allocator mutexes. + */ +void sec_prefork2(tsdn_t *tsdn, sec_t *sec); +void sec_postfork_parent(tsdn_t *tsdn, sec_t *sec); +void sec_postfork_child(tsdn_t *tsdn, sec_t *sec); + +#endif /* JEMALLOC_INTERNAL_SEC_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/sec_opts.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/sec_opts.h new file mode 100644 index 000000000..19ed14921 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/sec_opts.h @@ -0,0 +1,61 @@ +#ifndef JEMALLOC_INTERNAL_SEC_OPTS_H +#define JEMALLOC_INTERNAL_SEC_OPTS_H + +#include "jemalloc/internal/jemalloc_preamble.h" + +/* + * The configuration settings used by an sec_t. Morally, this is part of the + * SEC interface, but we put it here for header-ordering reasons. + */ + +typedef struct sec_opts_s sec_opts_t; +struct sec_opts_s { + /* + * We don't necessarily always use all the shards; requests are + * distributed across shards [0, nshards - 1).
+ */ + size_t nshards; + /* + * We'll automatically refuse to cache any objects in this sec if + * they're larger than max_alloc bytes, instead forwarding such objects + * directly to the fallback. + */ + size_t max_alloc; + /* + * Exceeding this amount of cached extents in a shard causes us to start + * flushing bins in that shard until we fall below bytes_after_flush. + */ + size_t max_bytes; + /* + * The number of bytes (in all bins) we flush down to when we exceed + * max_bytes. We want this to be less than max_bytes, because + * otherwise we could get into situations where a shard undergoing + * net-deallocation keeps bytes_cur very near to max_bytes, so that + * most deallocations get immediately forwarded to the underlying PAI + * implementation, defeating the point of the SEC. + */ + size_t bytes_after_flush; + /* + * When we can't satisfy an allocation out of the SEC because there are + * no available ones cached, we allocate multiple of that size out of + * the fallback allocator. Eventually we might want to do something + * cleverer, but for now we just grab a fixed number. + */ + size_t batch_fill_extra; +}; + +#define SEC_OPTS_DEFAULT { \ + /* nshards */ \ + 4, \ + /* max_alloc */ \ + (32 * 1024) < PAGE ? PAGE : (32 * 1024), \ + /* max_bytes */ \ + 256 * 1024, \ + /* bytes_after_flush */ \ + 128 * 1024, \ + /* batch_fill_extra */ \ + 0 \ +} + + +#endif /* JEMALLOC_INTERNAL_SEC_OPTS_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/seq.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/seq.h new file mode 100644 index 000000000..9bb6b235d --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/seq.h @@ -0,0 +1,56 @@ +#ifndef JEMALLOC_INTERNAL_SEQ_H +#define JEMALLOC_INTERNAL_SEQ_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/atomic.h" + +/* + * A simple seqlock implementation. + */
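Before the generator itself, a sketch of how its output is meant to be used (hypothetical counters_t type and call sites; this assumes the jemalloc-internal atomics environment that the header already includes, and writers must be serialized externally, as the comment inside the macro notes):

typedef struct {
	uint64_t hits;
	uint64_t misses;
} counters_t;

seq_define(counters_t, counters)  /* emits seq_counters_t + the two helpers */

static seq_counters_t g_counters;

/* Single writer: publish a consistent snapshot. */
static void
counters_publish(counters_t *src) {
	seq_store_counters(&g_counters, src);
}

/* Readers retry until they observe an untorn snapshot. */
static counters_t
counters_snapshot(void) {
	counters_t out;
	while (!seq_try_load_counters(&out, &g_counters)) {
		/* A write was in flight; try again. */
	}
	return out;
}

+#define seq_define(type, short_type) \ +typedef struct { \ + atomic_zu_t seq; \ + atomic_zu_t data[ \ + (sizeof(type) + sizeof(size_t) - 1) / sizeof(size_t)]; \ +} seq_##short_type##_t; \ + \ +/* \ + * No internal synchronization -- the caller must ensure that there's \ + * only a single writer at a time. \ + */ \ +static inline void \ +seq_store_##short_type(seq_##short_type##_t *dst, type *src) { \ + size_t buf[sizeof(dst->data) / sizeof(size_t)]; \ + buf[sizeof(buf) / sizeof(size_t) - 1] = 0; \ + memcpy(buf, src, sizeof(type)); \ + size_t old_seq = atomic_load_zu(&dst->seq, ATOMIC_RELAXED); \ + atomic_store_zu(&dst->seq, old_seq + 1, ATOMIC_RELAXED); \ + atomic_fence(ATOMIC_RELEASE); \ + for (size_t i = 0; i < sizeof(buf) / sizeof(size_t); i++) { \ + atomic_store_zu(&dst->data[i], buf[i], ATOMIC_RELAXED); \ + } \ + atomic_store_zu(&dst->seq, old_seq + 2, ATOMIC_RELEASE); \ +} \ + \ +/* Returns whether or not the read was consistent.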
*/ \ +static inline bool \ +seq_try_load_##short_type(type *dst, seq_##short_type##_t *src) { \ + size_t buf[sizeof(src->data) / sizeof(size_t)]; \ + size_t seq1 = atomic_load_zu(&src->seq, ATOMIC_ACQUIRE); \ + if (seq1 % 2 != 0) { \ + return false; \ + } \ + for (size_t i = 0; i < sizeof(buf) / sizeof(size_t); i++) { \ + buf[i] = atomic_load_zu(&src->data[i], ATOMIC_RELAXED); \ + } \ + atomic_fence(ATOMIC_ACQUIRE); \ + size_t seq2 = atomic_load_zu(&src->seq, ATOMIC_RELAXED); \ + if (seq1 != seq2) { \ + return false; \ + } \ + memcpy(dst, buf, sizeof(type)); \ + return true; \ +} + +#endif /* JEMALLOC_INTERNAL_SEQ_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/slab_data.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/slab_data.h new file mode 100644 index 000000000..724c71e36 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/slab_data.h @@ -0,0 +1,13 @@ +#ifndef JEMALLOC_INTERNAL_SLAB_DATA_H +#define JEMALLOC_INTERNAL_SLAB_DATA_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/bitmap.h" + +typedef struct slab_data_s slab_data_t; +struct slab_data_s { + /* Per region allocated/deallocated bitmap. */ + bitmap_t bitmap[BITMAP_GROUPS_MAX]; +}; + +#endif /* JEMALLOC_INTERNAL_SLAB_DATA_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/smoothstep.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/smoothstep.h new file mode 100644 index 000000000..2e14430f5 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/smoothstep.h @@ -0,0 +1,232 @@ +#ifndef JEMALLOC_INTERNAL_SMOOTHSTEP_H +#define JEMALLOC_INTERNAL_SMOOTHSTEP_H + +/* + * This file was generated by the following command: + * sh smoothstep.sh smoother 200 24 3 15 + */ +/******************************************************************************/ + +/* + * This header defines a precomputed table based on the smoothstep family of + * sigmoidal curves (https://en.wikipedia.org/wiki/Smoothstep) that grow from 0 + * to 1 in 0 <= x <= 1. The table is stored as integer fixed point values so + * that floating point math can be avoided. 
+ * + * smoothstep(x) = -2x^3 + 3x^2 + * + * smootherstep(x) = 6x^5 - 15x^4 + 10x^3 + * + * smootheststep(x) = -20x^7 + 70x^6 - 84x^5 + 35x^4 + */ + +#define SMOOTHSTEP_VARIANT "smoother" +#define SMOOTHSTEP_NSTEPS 200 +#define SMOOTHSTEP_BFP 24 +#define SMOOTHSTEP \ + /* STEP(step, h, x, y) */ \ + STEP( 1, UINT64_C(0x0000000000000014), 0.005, 0.000001240643750) \ + STEP( 2, UINT64_C(0x00000000000000a5), 0.010, 0.000009850600000) \ + STEP( 3, UINT64_C(0x0000000000000229), 0.015, 0.000032995181250) \ + STEP( 4, UINT64_C(0x0000000000000516), 0.020, 0.000077619200000) \ + STEP( 5, UINT64_C(0x00000000000009dc), 0.025, 0.000150449218750) \ + STEP( 6, UINT64_C(0x00000000000010e8), 0.030, 0.000257995800000) \ + STEP( 7, UINT64_C(0x0000000000001aa4), 0.035, 0.000406555756250) \ + STEP( 8, UINT64_C(0x0000000000002777), 0.040, 0.000602214400000) \ + STEP( 9, UINT64_C(0x00000000000037c2), 0.045, 0.000850847793750) \ + STEP( 10, UINT64_C(0x0000000000004be6), 0.050, 0.001158125000000) \ + STEP( 11, UINT64_C(0x000000000000643c), 0.055, 0.001529510331250) \ + STEP( 12, UINT64_C(0x000000000000811f), 0.060, 0.001970265600000) \ + STEP( 13, UINT64_C(0x000000000000a2e2), 0.065, 0.002485452368750) \ + STEP( 14, UINT64_C(0x000000000000c9d8), 0.070, 0.003079934200000) \ + STEP( 15, UINT64_C(0x000000000000f64f), 0.075, 0.003758378906250) \ + STEP( 16, UINT64_C(0x0000000000012891), 0.080, 0.004525260800000) \ + STEP( 17, UINT64_C(0x00000000000160e7), 0.085, 0.005384862943750) \ + STEP( 18, UINT64_C(0x0000000000019f95), 0.090, 0.006341279400000) \ + STEP( 19, UINT64_C(0x000000000001e4dc), 0.095, 0.007398417481250) \ + STEP( 20, UINT64_C(0x00000000000230fc), 0.100, 0.008560000000000) \ + STEP( 21, UINT64_C(0x0000000000028430), 0.105, 0.009829567518750) \ + STEP( 22, UINT64_C(0x000000000002deb0), 0.110, 0.011210480600000) \ + STEP( 23, UINT64_C(0x00000000000340b1), 0.115, 0.012705922056250) \ + STEP( 24, UINT64_C(0x000000000003aa67), 0.120, 0.014318899200000) \ + STEP( 25, UINT64_C(0x0000000000041c00), 0.125, 0.016052246093750) \ + STEP( 26, UINT64_C(0x00000000000495a8), 0.130, 0.017908625800000) \ + STEP( 27, UINT64_C(0x000000000005178b), 0.135, 0.019890532631250) \ + STEP( 28, UINT64_C(0x000000000005a1cf), 0.140, 0.022000294400000) \ + STEP( 29, UINT64_C(0x0000000000063498), 0.145, 0.024240074668750) \ + STEP( 30, UINT64_C(0x000000000006d009), 0.150, 0.026611875000000) \ + STEP( 31, UINT64_C(0x000000000007743f), 0.155, 0.029117537206250) \ + STEP( 32, UINT64_C(0x0000000000082157), 0.160, 0.031758745600000) \ + STEP( 33, UINT64_C(0x000000000008d76b), 0.165, 0.034537029243750) \ + STEP( 34, UINT64_C(0x0000000000099691), 0.170, 0.037453764200000) \ + STEP( 35, UINT64_C(0x00000000000a5edf), 0.175, 0.040510175781250) \ + STEP( 36, UINT64_C(0x00000000000b3067), 0.180, 0.043707340800000) \ + STEP( 37, UINT64_C(0x00000000000c0b38), 0.185, 0.047046189818750) \ + STEP( 38, UINT64_C(0x00000000000cef5e), 0.190, 0.050527509400000) \ + STEP( 39, UINT64_C(0x00000000000ddce6), 0.195, 0.054151944356250) \ + STEP( 40, UINT64_C(0x00000000000ed3d8), 0.200, 0.057920000000000) \ + STEP( 41, UINT64_C(0x00000000000fd439), 0.205, 0.061832044393750) \ + STEP( 42, UINT64_C(0x000000000010de0e), 0.210, 0.065888310600000) \ + STEP( 43, UINT64_C(0x000000000011f158), 0.215, 0.070088898931250) \ + STEP( 44, UINT64_C(0x0000000000130e17), 0.220, 0.074433779200000) \ + STEP( 45, UINT64_C(0x0000000000143448), 0.225, 0.078922792968750) \ + STEP( 46, UINT64_C(0x00000000001563e7), 0.230, 0.083555655800000) \ + STEP( 47,
UINT64_C(0x0000000000169cec), 0.235, 0.088331959506250) \ + STEP( 48, UINT64_C(0x000000000017df4f), 0.240, 0.093251174400000) \ + STEP( 49, UINT64_C(0x0000000000192b04), 0.245, 0.098312651543750) \ + STEP( 50, UINT64_C(0x00000000001a8000), 0.250, 0.103515625000000) \ + STEP( 51, UINT64_C(0x00000000001bde32), 0.255, 0.108859214081250) \ + STEP( 52, UINT64_C(0x00000000001d458b), 0.260, 0.114342425600000) \ + STEP( 53, UINT64_C(0x00000000001eb5f8), 0.265, 0.119964156118750) \ + STEP( 54, UINT64_C(0x0000000000202f65), 0.270, 0.125723194200000) \ + STEP( 55, UINT64_C(0x000000000021b1bb), 0.275, 0.131618222656250) \ + STEP( 56, UINT64_C(0x0000000000233ce3), 0.280, 0.137647820800000) \ + STEP( 57, UINT64_C(0x000000000024d0c3), 0.285, 0.143810466693750) \ + STEP( 58, UINT64_C(0x0000000000266d40), 0.290, 0.150104539400000) \ + STEP( 59, UINT64_C(0x000000000028123d), 0.295, 0.156528321231250) \ + STEP( 60, UINT64_C(0x000000000029bf9c), 0.300, 0.163080000000000) \ + STEP( 61, UINT64_C(0x00000000002b753d), 0.305, 0.169757671268750) \ + STEP( 62, UINT64_C(0x00000000002d32fe), 0.310, 0.176559340600000) \ + STEP( 63, UINT64_C(0x00000000002ef8bc), 0.315, 0.183482925806250) \ + STEP( 64, UINT64_C(0x000000000030c654), 0.320, 0.190526259200000) \ + STEP( 65, UINT64_C(0x0000000000329b9f), 0.325, 0.197687089843750) \ + STEP( 66, UINT64_C(0x0000000000347875), 0.330, 0.204963085800000) \ + STEP( 67, UINT64_C(0x0000000000365cb0), 0.335, 0.212351836381250) \ + STEP( 68, UINT64_C(0x0000000000384825), 0.340, 0.219850854400000) \ + STEP( 69, UINT64_C(0x00000000003a3aa8), 0.345, 0.227457578418750) \ + STEP( 70, UINT64_C(0x00000000003c340f), 0.350, 0.235169375000000) \ + STEP( 71, UINT64_C(0x00000000003e342b), 0.355, 0.242983540956250) \ + STEP( 72, UINT64_C(0x0000000000403ace), 0.360, 0.250897305600000) \ + STEP( 73, UINT64_C(0x00000000004247c8), 0.365, 0.258907832993750) \ + STEP( 74, UINT64_C(0x0000000000445ae9), 0.370, 0.267012224200000) \ + STEP( 75, UINT64_C(0x0000000000467400), 0.375, 0.275207519531250) \ + STEP( 76, UINT64_C(0x00000000004892d8), 0.380, 0.283490700800000) \ + STEP( 77, UINT64_C(0x00000000004ab740), 0.385, 0.291858693568750) \ + STEP( 78, UINT64_C(0x00000000004ce102), 0.390, 0.300308369400000) \ + STEP( 79, UINT64_C(0x00000000004f0fe9), 0.395, 0.308836548106250) \ + STEP( 80, UINT64_C(0x00000000005143bf), 0.400, 0.317440000000000) \ + STEP( 81, UINT64_C(0x0000000000537c4d), 0.405, 0.326115448143750) \ + STEP( 82, UINT64_C(0x000000000055b95b), 0.410, 0.334859570600000) \ + STEP( 83, UINT64_C(0x000000000057fab1), 0.415, 0.343669002681250) \ + STEP( 84, UINT64_C(0x00000000005a4015), 0.420, 0.352540339200000) \ + STEP( 85, UINT64_C(0x00000000005c894e), 0.425, 0.361470136718750) \ + STEP( 86, UINT64_C(0x00000000005ed622), 0.430, 0.370454915800000) \ + STEP( 87, UINT64_C(0x0000000000612655), 0.435, 0.379491163256250) \ + STEP( 88, UINT64_C(0x00000000006379ac), 0.440, 0.388575334400000) \ + STEP( 89, UINT64_C(0x000000000065cfeb), 0.445, 0.397703855293750) \ + STEP( 90, UINT64_C(0x00000000006828d6), 0.450, 0.406873125000000) \ + STEP( 91, UINT64_C(0x00000000006a842f), 0.455, 0.416079517831250) \ + STEP( 92, UINT64_C(0x00000000006ce1bb), 0.460, 0.425319385600000) \ + STEP( 93, UINT64_C(0x00000000006f413a), 0.465, 0.434589059868750) \ + STEP( 94, UINT64_C(0x000000000071a270), 0.470, 0.443884854200000) \ + STEP( 95, UINT64_C(0x000000000074051d), 0.475, 0.453203066406250) \ + STEP( 96, UINT64_C(0x0000000000766905), 0.480, 0.462539980800000) \ + STEP( 97, UINT64_C(0x000000000078cde7), 0.485, 
0.471891870443750) \ + STEP( 98, UINT64_C(0x00000000007b3387), 0.490, 0.481254999400000) \ + STEP( 99, UINT64_C(0x00000000007d99a4), 0.495, 0.490625624981250) \ + STEP( 100, UINT64_C(0x0000000000800000), 0.500, 0.500000000000000) \ + STEP( 101, UINT64_C(0x000000000082665b), 0.505, 0.509374375018750) \ + STEP( 102, UINT64_C(0x000000000084cc78), 0.510, 0.518745000600000) \ + STEP( 103, UINT64_C(0x0000000000873218), 0.515, 0.528108129556250) \ + STEP( 104, UINT64_C(0x00000000008996fa), 0.520, 0.537460019200000) \ + STEP( 105, UINT64_C(0x00000000008bfae2), 0.525, 0.546796933593750) \ + STEP( 106, UINT64_C(0x00000000008e5d8f), 0.530, 0.556115145800000) \ + STEP( 107, UINT64_C(0x000000000090bec5), 0.535, 0.565410940131250) \ + STEP( 108, UINT64_C(0x0000000000931e44), 0.540, 0.574680614400000) \ + STEP( 109, UINT64_C(0x0000000000957bd0), 0.545, 0.583920482168750) \ + STEP( 110, UINT64_C(0x000000000097d729), 0.550, 0.593126875000000) \ + STEP( 111, UINT64_C(0x00000000009a3014), 0.555, 0.602296144706250) \ + STEP( 112, UINT64_C(0x00000000009c8653), 0.560, 0.611424665600000) \ + STEP( 113, UINT64_C(0x00000000009ed9aa), 0.565, 0.620508836743750) \ + STEP( 114, UINT64_C(0x0000000000a129dd), 0.570, 0.629545084200000) \ + STEP( 115, UINT64_C(0x0000000000a376b1), 0.575, 0.638529863281250) \ + STEP( 116, UINT64_C(0x0000000000a5bfea), 0.580, 0.647459660800000) \ + STEP( 117, UINT64_C(0x0000000000a8054e), 0.585, 0.656330997318750) \ + STEP( 118, UINT64_C(0x0000000000aa46a4), 0.590, 0.665140429400000) \ + STEP( 119, UINT64_C(0x0000000000ac83b2), 0.595, 0.673884551856250) \ + STEP( 120, UINT64_C(0x0000000000aebc40), 0.600, 0.682560000000000) \ + STEP( 121, UINT64_C(0x0000000000b0f016), 0.605, 0.691163451893750) \ + STEP( 122, UINT64_C(0x0000000000b31efd), 0.610, 0.699691630600000) \ + STEP( 123, UINT64_C(0x0000000000b548bf), 0.615, 0.708141306431250) \ + STEP( 124, UINT64_C(0x0000000000b76d27), 0.620, 0.716509299200000) \ + STEP( 125, UINT64_C(0x0000000000b98c00), 0.625, 0.724792480468750) \ + STEP( 126, UINT64_C(0x0000000000bba516), 0.630, 0.732987775800000) \ + STEP( 127, UINT64_C(0x0000000000bdb837), 0.635, 0.741092167006250) \ + STEP( 128, UINT64_C(0x0000000000bfc531), 0.640, 0.749102694400000) \ + STEP( 129, UINT64_C(0x0000000000c1cbd4), 0.645, 0.757016459043750) \ + STEP( 130, UINT64_C(0x0000000000c3cbf0), 0.650, 0.764830625000000) \ + STEP( 131, UINT64_C(0x0000000000c5c557), 0.655, 0.772542421581250) \ + STEP( 132, UINT64_C(0x0000000000c7b7da), 0.660, 0.780149145600000) \ + STEP( 133, UINT64_C(0x0000000000c9a34f), 0.665, 0.787648163618750) \ + STEP( 134, UINT64_C(0x0000000000cb878a), 0.670, 0.795036914200000) \ + STEP( 135, UINT64_C(0x0000000000cd6460), 0.675, 0.802312910156250) \ + STEP( 136, UINT64_C(0x0000000000cf39ab), 0.680, 0.809473740800000) \ + STEP( 137, UINT64_C(0x0000000000d10743), 0.685, 0.816517074193750) \ + STEP( 138, UINT64_C(0x0000000000d2cd01), 0.690, 0.823440659400000) \ + STEP( 139, UINT64_C(0x0000000000d48ac2), 0.695, 0.830242328731250) \ + STEP( 140, UINT64_C(0x0000000000d64063), 0.700, 0.836920000000000) \ + STEP( 141, UINT64_C(0x0000000000d7edc2), 0.705, 0.843471678768750) \ + STEP( 142, UINT64_C(0x0000000000d992bf), 0.710, 0.849895460600000) \ + STEP( 143, UINT64_C(0x0000000000db2f3c), 0.715, 0.856189533306250) \ + STEP( 144, UINT64_C(0x0000000000dcc31c), 0.720, 0.862352179200000) \ + STEP( 145, UINT64_C(0x0000000000de4e44), 0.725, 0.868381777343750) \ + STEP( 146, UINT64_C(0x0000000000dfd09a), 0.730, 0.874276805800000) \ + STEP( 147, UINT64_C(0x0000000000e14a07), 0.735, 
0.880035843881250) \ + STEP( 148, UINT64_C(0x0000000000e2ba74), 0.740, 0.885657574400000) \ + STEP( 149, UINT64_C(0x0000000000e421cd), 0.745, 0.891140785918750) \ + STEP( 150, UINT64_C(0x0000000000e58000), 0.750, 0.896484375000000) \ + STEP( 151, UINT64_C(0x0000000000e6d4fb), 0.755, 0.901687348456250) \ + STEP( 152, UINT64_C(0x0000000000e820b0), 0.760, 0.906748825600000) \ + STEP( 153, UINT64_C(0x0000000000e96313), 0.765, 0.911668040493750) \ + STEP( 154, UINT64_C(0x0000000000ea9c18), 0.770, 0.916444344200000) \ + STEP( 155, UINT64_C(0x0000000000ebcbb7), 0.775, 0.921077207031250) \ + STEP( 156, UINT64_C(0x0000000000ecf1e8), 0.780, 0.925566220800000) \ + STEP( 157, UINT64_C(0x0000000000ee0ea7), 0.785, 0.929911101068750) \ + STEP( 158, UINT64_C(0x0000000000ef21f1), 0.790, 0.934111689400000) \ + STEP( 159, UINT64_C(0x0000000000f02bc6), 0.795, 0.938167955606250) \ + STEP( 160, UINT64_C(0x0000000000f12c27), 0.800, 0.942080000000000) \ + STEP( 161, UINT64_C(0x0000000000f22319), 0.805, 0.945848055643750) \ + STEP( 162, UINT64_C(0x0000000000f310a1), 0.810, 0.949472490600000) \ + STEP( 163, UINT64_C(0x0000000000f3f4c7), 0.815, 0.952953810181250) \ + STEP( 164, UINT64_C(0x0000000000f4cf98), 0.820, 0.956292659200000) \ + STEP( 165, UINT64_C(0x0000000000f5a120), 0.825, 0.959489824218750) \ + STEP( 166, UINT64_C(0x0000000000f6696e), 0.830, 0.962546235800000) \ + STEP( 167, UINT64_C(0x0000000000f72894), 0.835, 0.965462970756250) \ + STEP( 168, UINT64_C(0x0000000000f7dea8), 0.840, 0.968241254400000) \ + STEP( 169, UINT64_C(0x0000000000f88bc0), 0.845, 0.970882462793750) \ + STEP( 170, UINT64_C(0x0000000000f92ff6), 0.850, 0.973388125000000) \ + STEP( 171, UINT64_C(0x0000000000f9cb67), 0.855, 0.975759925331250) \ + STEP( 172, UINT64_C(0x0000000000fa5e30), 0.860, 0.977999705600000) \ + STEP( 173, UINT64_C(0x0000000000fae874), 0.865, 0.980109467368750) \ + STEP( 174, UINT64_C(0x0000000000fb6a57), 0.870, 0.982091374200000) \ + STEP( 175, UINT64_C(0x0000000000fbe400), 0.875, 0.983947753906250) \ + STEP( 176, UINT64_C(0x0000000000fc5598), 0.880, 0.985681100800000) \ + STEP( 177, UINT64_C(0x0000000000fcbf4e), 0.885, 0.987294077943750) \ + STEP( 178, UINT64_C(0x0000000000fd214f), 0.890, 0.988789519400000) \ + STEP( 179, UINT64_C(0x0000000000fd7bcf), 0.895, 0.990170432481250) \ + STEP( 180, UINT64_C(0x0000000000fdcf03), 0.900, 0.991440000000000) \ + STEP( 181, UINT64_C(0x0000000000fe1b23), 0.905, 0.992601582518750) \ + STEP( 182, UINT64_C(0x0000000000fe606a), 0.910, 0.993658720600000) \ + STEP( 183, UINT64_C(0x0000000000fe9f18), 0.915, 0.994615137056250) \ + STEP( 184, UINT64_C(0x0000000000fed76e), 0.920, 0.995474739200000) \ + STEP( 185, UINT64_C(0x0000000000ff09b0), 0.925, 0.996241621093750) \ + STEP( 186, UINT64_C(0x0000000000ff3627), 0.930, 0.996920065800000) \ + STEP( 187, UINT64_C(0x0000000000ff5d1d), 0.935, 0.997514547631250) \ + STEP( 188, UINT64_C(0x0000000000ff7ee0), 0.940, 0.998029734400000) \ + STEP( 189, UINT64_C(0x0000000000ff9bc3), 0.945, 0.998470489668750) \ + STEP( 190, UINT64_C(0x0000000000ffb419), 0.950, 0.998841875000000) \ + STEP( 191, UINT64_C(0x0000000000ffc83d), 0.955, 0.999149152206250) \ + STEP( 192, UINT64_C(0x0000000000ffd888), 0.960, 0.999397785600000) \ + STEP( 193, UINT64_C(0x0000000000ffe55b), 0.965, 0.999593444243750) \ + STEP( 194, UINT64_C(0x0000000000ffef17), 0.970, 0.999742004200000) \ + STEP( 195, UINT64_C(0x0000000000fff623), 0.975, 0.999849550781250) \ + STEP( 196, UINT64_C(0x0000000000fffae9), 0.980, 0.999922380800000) \ + STEP( 197, UINT64_C(0x0000000000fffdd6), 0.985, 
0.999967004818750) \ + STEP( 198, UINT64_C(0x0000000000ffff5a), 0.990, 0.999990149400000) \ + STEP( 199, UINT64_C(0x0000000000ffffeb), 0.995, 0.999998759356250) \ + STEP( 200, UINT64_C(0x0000000001000000), 1.000, 1.000000000000000) \ + +#endif /* JEMALLOC_INTERNAL_SMOOTHSTEP_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/spin.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/spin.h new file mode 100644 index 000000000..87c400d5a --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/spin.h @@ -0,0 +1,42 @@ +#ifndef JEMALLOC_INTERNAL_SPIN_H +#define JEMALLOC_INTERNAL_SPIN_H + +#include "jemalloc/internal/jemalloc_preamble.h" + +#define SPIN_INITIALIZER {0U} + +typedef struct { + unsigned iteration; +} spin_t; + +static inline void +spin_cpu_spinwait(void) { +# if HAVE_CPU_SPINWAIT + CPU_SPINWAIT; +# else + volatile int x = 0; + x = x; +# endif +} + +static inline void +spin_adaptive(spin_t *spin) { + volatile uint32_t i; + + if (spin->iteration < 5) { + for (i = 0; i < (1U << spin->iteration); i++) { + spin_cpu_spinwait(); + } + spin->iteration++; + } else { +#ifdef _WIN32 + SwitchToThread(); +#else + sched_yield(); +#endif + } +} + +#undef SPIN_INLINE + +#endif /* JEMALLOC_INTERNAL_SPIN_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/stats.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/stats.h new file mode 100644 index 000000000..310178eaa --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/stats.h @@ -0,0 +1,58 @@ +#ifndef JEMALLOC_INTERNAL_STATS_H +#define JEMALLOC_INTERNAL_STATS_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_types.h" +#include "jemalloc/internal/tsd_types.h" + +/* OPTION(opt, var_name, default, set_value_to) */ +#define STATS_PRINT_OPTIONS \ + OPTION('J', json, false, true) \ + OPTION('g', general, true, false) \ + OPTION('m', merged, config_stats, false) \ + OPTION('d', destroyed, config_stats, false) \ + OPTION('a', unmerged, config_stats, false) \ + OPTION('b', bins, true, false) \ + OPTION('l', large, true, false) \ + OPTION('x', mutex, true, false) \ + OPTION('e', extents, true, false) \ + OPTION('h', hpa, config_stats, false) + +enum { +#define OPTION(o, v, d, s) stats_print_option_num_##v, + STATS_PRINT_OPTIONS +#undef OPTION + stats_print_tot_num_options +}; + +/* Options for stats_print. */ +extern bool opt_stats_print; +extern char opt_stats_print_opts[stats_print_tot_num_options+1]; + +/* Utilities for stats_interval. */ +extern int64_t opt_stats_interval; +extern char opt_stats_interval_opts[stats_print_tot_num_options+1]; + +#define STATS_INTERVAL_DEFAULT -1 +/* + * Batch-increment the counter to reduce synchronization overhead. Each thread + * merges after (interval >> LG_BATCH_SIZE) bytes of allocations; also limit the + * BATCH_MAX for accuracy when the interval is huge (which is expected). + */ +#define STATS_INTERVAL_ACCUM_LG_BATCH_SIZE 6 +#define STATS_INTERVAL_ACCUM_BATCH_MAX (4 << 20) + +/* Only accessed by thread event. */ +uint64_t stats_interval_new_event_wait(tsd_t *tsd); +uint64_t stats_interval_postponed_event_wait(tsd_t *tsd); +void stats_interval_event_handler(tsd_t *tsd, uint64_t elapsed); + +/* Implements je_malloc_stats_print. 
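+ * Each character in opts flips the matching OPTION flag above to its + * set_value_to column; for example (illustrative), opts = "J" switches the + * output to JSON, while opts = "g" omits the general section.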
*/ +void stats_print(write_cb_t *write_cb, void *cbopaque, const char *opts); + +bool stats_boot(void); +void stats_prefork(tsdn_t *tsdn); +void stats_postfork_parent(tsdn_t *tsdn); +void stats_postfork_child(tsdn_t *tsdn); + +#endif /* JEMALLOC_INTERNAL_STATS_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/sz.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/sz.h new file mode 100644 index 000000000..955d8ec09 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/sz.h @@ -0,0 +1,387 @@ +#ifndef JEMALLOC_INTERNAL_SIZE_H +#define JEMALLOC_INTERNAL_SIZE_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/bit_util.h" +#include "jemalloc/internal/pages.h" +#include "jemalloc/internal/sc.h" +#include "jemalloc/internal/util.h" + +/* + * sz module: Size computations. + * + * Some abbreviations used here: + * p: Page + * ind: Index + * s, sz: Size + * u: Usable size + * a: Aligned + * + * These are not always used completely consistently, but should be enough to + * interpret function names. E.g. sz_psz2ind converts page size to page size + * index; sz_sa2u converts a (size, alignment) allocation request to the usable + * size that would result from such an allocation. + */ + +/* Page size index type. */ +typedef unsigned pszind_t; + +/* Size class index type. */ +typedef unsigned szind_t; + +/* + * sz_pind2sz_tab encodes the same information as could be computed by + * sz_pind2sz_compute(). + */ +extern size_t sz_pind2sz_tab[SC_NPSIZES + 1]; +/* + * sz_index2size_tab encodes the same information as could be computed (at + * unacceptable cost in some code paths) by sz_index2size_compute(). + */ +extern size_t sz_index2size_tab[SC_NSIZES]; +/* + * sz_size2index_tab is a compact lookup table that rounds request sizes up to + * size classes. In order to reduce cache footprint, the table is compressed, + * and all accesses are via sz_size2index(). + */ +extern uint8_t sz_size2index_tab[]; + +/* + * Padding for large allocations: PAGE when opt_cache_oblivious == true (to + * enable cache index randomization); 0 otherwise. + */ +extern size_t sz_large_pad; + +extern void sz_boot(const sc_data_t *sc_data, bool cache_oblivious); + +JEMALLOC_ALWAYS_INLINE pszind_t +sz_psz2ind(size_t psz) { + assert(psz > 0); + if (unlikely(psz > SC_LARGE_MAXCLASS)) { + return SC_NPSIZES; + } + /* x is the lg of the first base >= psz. */ + pszind_t x = lg_ceil(psz); + /* + * sc.h introduces a lot of size classes. These size classes are divided + * into different size class groups. There is a very special size class + * group, each size class in or after it is an integer multiple of PAGE. + * We call it first_ps_rg. It means first page size regular group. The + * range of first_ps_rg is (base, base * 2], and base == PAGE * + * SC_NGROUP. off_to_first_ps_rg begins from 1, instead of 0. e.g. + * off_to_first_ps_rg is 1 when psz is (PAGE * SC_NGROUP + 1). + */ + pszind_t off_to_first_ps_rg = (x < SC_LG_NGROUP + LG_PAGE) ? + 0 : x - (SC_LG_NGROUP + LG_PAGE); + + /* + * Same as sc_s::lg_delta. + * Delta for off_to_first_ps_rg == 1 is PAGE, + * for each increase in offset, it's multiplied by two. + * Therefore, lg_delta = LG_PAGE + (off_to_first_ps_rg - 1). + */ + pszind_t lg_delta = (off_to_first_ps_rg == 0) ? + LG_PAGE : LG_PAGE + (off_to_first_ps_rg - 1); + + /* + * Let's write psz in binary, e.g. 0011 for 0x3, 0111 for 0x7. + * The leftmost bits whose len is lg_base decide the base of psz. 
+ * The rightmost bits whose len is lg_delta decide (psz % PAGE). + * The middle bits whose len is SC_LG_NGROUP decide ndelta. + * ndelta is offset to the first size class in the size class group, + * starts from 1. + * If you don't know lg_base, ndelta or lg_delta, see sc.h. + * |xxxxxxxxxxxxxxxxxxxx|------------------------|yyyyyyyyyyyyyyyyyyyyy| + * |<-- len: lg_base -->|<-- len: SC_LG_NGROUP-->|<-- len: lg_delta -->| + * |<-- ndelta -->| + * rg_inner_off = ndelta - 1 + * Why use (psz - 1)? + * To handle case: psz % (1 << lg_delta) == 0. + */ + pszind_t rg_inner_off = (((psz - 1)) >> lg_delta) & (SC_NGROUP - 1); + + pszind_t base_ind = off_to_first_ps_rg << SC_LG_NGROUP; + pszind_t ind = base_ind + rg_inner_off; + return ind; +} + +static inline size_t +sz_pind2sz_compute(pszind_t pind) { + if (unlikely(pind == SC_NPSIZES)) { + return SC_LARGE_MAXCLASS + PAGE; + } + size_t grp = pind >> SC_LG_NGROUP; + size_t mod = pind & ((ZU(1) << SC_LG_NGROUP) - 1); + + size_t grp_size_mask = ~((!!grp)-1); + size_t grp_size = ((ZU(1) << (LG_PAGE + (SC_LG_NGROUP-1))) << grp) + & grp_size_mask; + + size_t shift = (grp == 0) ? 1 : grp; + size_t lg_delta = shift + (LG_PAGE-1); + size_t mod_size = (mod+1) << lg_delta; + + size_t sz = grp_size + mod_size; + return sz; +} + +static inline size_t +sz_pind2sz_lookup(pszind_t pind) { + size_t ret = (size_t)sz_pind2sz_tab[pind]; + assert(ret == sz_pind2sz_compute(pind)); + return ret; +} + +static inline size_t +sz_pind2sz(pszind_t pind) { + assert(pind < SC_NPSIZES + 1); + return sz_pind2sz_lookup(pind); +} + +static inline size_t +sz_psz2u(size_t psz) { + if (unlikely(psz > SC_LARGE_MAXCLASS)) { + return SC_LARGE_MAXCLASS + PAGE; + } + size_t x = lg_floor((psz<<1)-1); + size_t lg_delta = (x < SC_LG_NGROUP + LG_PAGE + 1) ? + LG_PAGE : x - SC_LG_NGROUP - 1; + size_t delta = ZU(1) << lg_delta; + size_t delta_mask = delta - 1; + size_t usize = (psz + delta_mask) & ~delta_mask; + return usize; +} + +static inline szind_t +sz_size2index_compute(size_t size) { + if (unlikely(size > SC_LARGE_MAXCLASS)) { + return SC_NSIZES; + } + + if (size == 0) { + return 0; + } +#if (SC_NTINY != 0) + if (size <= (ZU(1) << SC_LG_TINY_MAXCLASS)) { + szind_t lg_tmin = SC_LG_TINY_MAXCLASS - SC_NTINY + 1; + szind_t lg_ceil = lg_floor(pow2_ceil_zu(size)); + return (lg_ceil < lg_tmin ? 0 : lg_ceil - lg_tmin); + } +#endif + { + szind_t x = lg_floor((size<<1)-1); + szind_t shift = (x < SC_LG_NGROUP + LG_QUANTUM) ? 0 : + x - (SC_LG_NGROUP + LG_QUANTUM); + szind_t grp = shift << SC_LG_NGROUP; + + szind_t lg_delta = (x < SC_LG_NGROUP + LG_QUANTUM + 1) + ?
LG_QUANTUM : x - SC_LG_NGROUP - 1; + + size_t delta_inverse_mask = ZU(-1) << lg_delta; + szind_t mod = ((((size-1) & delta_inverse_mask) >> lg_delta)) & + ((ZU(1) << SC_LG_NGROUP) - 1); + + szind_t index = SC_NTINY + grp + mod; + return index; + } +} + +JEMALLOC_ALWAYS_INLINE szind_t +sz_size2index_lookup_impl(size_t size) { + assert(size <= SC_LOOKUP_MAXCLASS); + return sz_size2index_tab[(size + (ZU(1) << SC_LG_TINY_MIN) - 1) + >> SC_LG_TINY_MIN]; +} + +JEMALLOC_ALWAYS_INLINE szind_t +sz_size2index_lookup(size_t size) { + szind_t ret = sz_size2index_lookup_impl(size); + assert(ret == sz_size2index_compute(size)); + return ret; +} + +JEMALLOC_ALWAYS_INLINE szind_t +sz_size2index(size_t size) { + if (likely(size <= SC_LOOKUP_MAXCLASS)) { + return sz_size2index_lookup(size); + } + return sz_size2index_compute(size); +} + +static inline size_t +sz_index2size_compute(szind_t index) { +#if (SC_NTINY > 0) + if (index < SC_NTINY) { + return (ZU(1) << (SC_LG_TINY_MAXCLASS - SC_NTINY + 1 + index)); + } +#endif + { + size_t reduced_index = index - SC_NTINY; + size_t grp = reduced_index >> SC_LG_NGROUP; + size_t mod = reduced_index & ((ZU(1) << SC_LG_NGROUP) - + 1); + + size_t grp_size_mask = ~((!!grp)-1); + size_t grp_size = ((ZU(1) << (LG_QUANTUM + + (SC_LG_NGROUP-1))) << grp) & grp_size_mask; + + size_t shift = (grp == 0) ? 1 : grp; + size_t lg_delta = shift + (LG_QUANTUM-1); + size_t mod_size = (mod+1) << lg_delta; + + size_t usize = grp_size + mod_size; + return usize; + } +} + +JEMALLOC_ALWAYS_INLINE size_t +sz_index2size_lookup_impl(szind_t index) { + return sz_index2size_tab[index]; +} + +JEMALLOC_ALWAYS_INLINE size_t +sz_index2size_lookup(szind_t index) { + size_t ret = sz_index2size_lookup_impl(index); + assert(ret == sz_index2size_compute(index)); + return ret; +} + +JEMALLOC_ALWAYS_INLINE size_t +sz_index2size(szind_t index) { + assert(index < SC_NSIZES); + return sz_index2size_lookup(index); +} + +JEMALLOC_ALWAYS_INLINE void +sz_size2index_usize_fastpath(size_t size, szind_t *ind, size_t *usize) { + *ind = sz_size2index_lookup_impl(size); + *usize = sz_index2size_lookup_impl(*ind); +} + +JEMALLOC_ALWAYS_INLINE size_t +sz_s2u_compute(size_t size) { + if (unlikely(size > SC_LARGE_MAXCLASS)) { + return 0; + } + + if (size == 0) { + size++; + } +#if (SC_NTINY > 0) + if (size <= (ZU(1) << SC_LG_TINY_MAXCLASS)) { + size_t lg_tmin = SC_LG_TINY_MAXCLASS - SC_NTINY + 1; + size_t lg_ceil = lg_floor(pow2_ceil_zu(size)); + return (lg_ceil < lg_tmin ? (ZU(1) << lg_tmin) : + (ZU(1) << lg_ceil)); + } +#endif + { + size_t x = lg_floor((size<<1)-1); + size_t lg_delta = (x < SC_LG_NGROUP + LG_QUANTUM + 1) + ? LG_QUANTUM : x - SC_LG_NGROUP - 1; + size_t delta = ZU(1) << lg_delta; + size_t delta_mask = delta - 1; + size_t usize = (size + delta_mask) & ~delta_mask; + return usize; + } +} + +JEMALLOC_ALWAYS_INLINE size_t +sz_s2u_lookup(size_t size) { + size_t ret = sz_index2size_lookup(sz_size2index_lookup(size)); + + assert(ret == sz_s2u_compute(size)); + return ret; +} + +/* + * Compute usable size that would result from allocating an object with the + * specified size. + */ +JEMALLOC_ALWAYS_INLINE size_t +sz_s2u(size_t size) { + if (likely(size <= SC_LOOKUP_MAXCLASS)) { + return sz_s2u_lookup(size); + } + return sz_s2u_compute(size); +} + +/* + * Compute usable size that would result from allocating an object with the + * specified size and alignment. 
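+ * Worked example (assuming the common 4 KiB page, 16-byte quantum + * configuration): sz_sa2u(100, 64) rounds 100 up to + * ALIGNMENT_CEILING(100, 64) == 128; 128 is itself a small size class whose + * natural alignment of 128 satisfies the request, so the usable size is 128.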
+ */ +JEMALLOC_ALWAYS_INLINE size_t +sz_sa2u(size_t size, size_t alignment) { + size_t usize; + + assert(alignment != 0 && ((alignment - 1) & alignment) == 0); + + /* Try for a small size class. */ + if (size <= SC_SMALL_MAXCLASS && alignment <= PAGE) { + /* + * Round size up to the nearest multiple of alignment. + * + * This done, we can take advantage of the fact that for each + * small size class, every object is aligned at the smallest + * power of two that is non-zero in the base two representation + * of the size. For example: + * + * Size | Base 2 | Minimum alignment + * -----+----------+------------------ + * 96 | 1100000 | 32 + * 144 | 10100000 | 32 + * 192 | 11000000 | 64 + */ + usize = sz_s2u(ALIGNMENT_CEILING(size, alignment)); + if (usize < SC_LARGE_MINCLASS) { + return usize; + } + } + + /* Large size class. Beware of overflow. */ + + if (unlikely(alignment > SC_LARGE_MAXCLASS)) { + return 0; + } + + /* Make sure result is a large size class. */ + if (size <= SC_LARGE_MINCLASS) { + usize = SC_LARGE_MINCLASS; + } else { + usize = sz_s2u(size); + if (usize < size) { + /* size_t overflow. */ + return 0; + } + } + + /* + * Calculate the multi-page mapping that large_palloc() would need in + * order to guarantee the alignment. + */ + if (usize + sz_large_pad + PAGE_CEILING(alignment) - PAGE < usize) { + /* size_t overflow. */ + return 0; + } + return usize; +} + +/* + * Under normal circumstances, whether or not to use a slab + * to satisfy an allocation depends solely on the allocation's + * effective size. However, this is *not* the case when an allocation + * is sampled for profiling, in which case you *must not* use a slab + * regardless of the effective size. Thus `sz_can_use_slab` is called + * on the common path, but there exist `*_explicit_slab` variants of + * several functions for handling the aforementioned case of + * sampled allocations. + */ +JEMALLOC_ALWAYS_INLINE bool +sz_can_use_slab(size_t size) { + return size <= SC_SMALL_MAXCLASS; +} + +size_t sz_psz_quantize_floor(size_t size); +size_t sz_psz_quantize_ceil(size_t size); + +#endif /* JEMALLOC_INTERNAL_SIZE_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/tcache_externs.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/tcache_externs.h new file mode 100644 index 000000000..732adacb3 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/tcache_externs.h @@ -0,0 +1,92 @@ +#ifndef JEMALLOC_INTERNAL_TCACHE_EXTERNS_H +#define JEMALLOC_INTERNAL_TCACHE_EXTERNS_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/base.h" +#include "jemalloc/internal/cache_bin.h" +#include "jemalloc/internal/sz.h" +#include "jemalloc/internal/tcache_types.h" + +extern bool opt_tcache; +extern size_t opt_tcache_max; +extern ssize_t opt_lg_tcache_nslots_mul; +extern unsigned opt_tcache_nslots_small_min; +extern unsigned opt_tcache_nslots_small_max; +extern unsigned opt_tcache_nslots_large; +extern ssize_t opt_lg_tcache_shift; +extern size_t opt_tcache_gc_incr_bytes; +extern size_t opt_tcache_gc_delay_bytes; +extern unsigned opt_lg_tcache_flush_small_div; +extern unsigned opt_lg_tcache_flush_large_div; + +/* + * Number of tcache bins. There are SC_NBINS small-object bins, plus 0 or more + * large-object bins. This is only used during threads initialization and + * changing it will not reflect on initialized threads as expected. Thus, + * it should not be changed on the fly. 
To change the number of tcache bins + * in use, refer to tcache_nbins of each tcache. + */ +extern unsigned global_do_not_change_tcache_nbins; + +/* + * Maximum cached size class. Same as above, this is only used during threads + * initialization and should not be changed. To change the maximum cached size + * class, refer to tcache_max of each tcache. + */ +extern size_t global_do_not_change_tcache_maxclass; + +/* + * Explicit tcaches, managed via the tcache.{create,flush,destroy} mallctls and + * usable via the MALLOCX_TCACHE() flag. The automatic per thread tcaches are + * completely disjoint from this data structure. tcaches starts off as a sparse + * array, so it has no physical memory footprint until individual pages are + * touched. This allows the entire array to be allocated the first time an + * explicit tcache is created without a disproportionate impact on memory usage. + */ +extern tcaches_t *tcaches; + +size_t tcache_salloc(tsdn_t *tsdn, const void *ptr); +void *tcache_alloc_small_hard(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache, + cache_bin_t *cache_bin, szind_t binind, bool *tcache_success); + +void tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, + cache_bin_t *cache_bin, szind_t binind, unsigned rem); +void tcache_bin_flush_large(tsd_t *tsd, tcache_t *tcache, + cache_bin_t *cache_bin, szind_t binind, unsigned rem); +void tcache_bin_flush_stashed(tsd_t *tsd, tcache_t *tcache, + cache_bin_t *cache_bin, szind_t binind, bool is_small); +bool tcache_bin_info_default_init(const char *bin_settings_segment_cur, + size_t len_left); +bool tcache_bins_ncached_max_write(tsd_t *tsd, char *settings, size_t len); +bool tcache_bin_ncached_max_read(tsd_t *tsd, size_t bin_size, + cache_bin_sz_t *ncached_max); +void tcache_arena_reassociate(tsdn_t *tsdn, tcache_slow_t *tcache_slow, + tcache_t *tcache, arena_t *arena); +tcache_t *tcache_create_explicit(tsd_t *tsd); +void thread_tcache_max_set(tsd_t *tsd, size_t tcache_max); +void tcache_cleanup(tsd_t *tsd); +void tcache_stats_merge(tsdn_t *tsdn, tcache_t *tcache, arena_t *arena); +bool tcaches_create(tsd_t *tsd, base_t *base, unsigned *r_ind); +void tcaches_flush(tsd_t *tsd, unsigned ind); +void tcaches_destroy(tsd_t *tsd, unsigned ind); +bool tcache_boot(tsdn_t *tsdn, base_t *base); +void tcache_arena_associate(tsdn_t *tsdn, tcache_slow_t *tcache_slow, + tcache_t *tcache, arena_t *arena); +void tcache_prefork(tsdn_t *tsdn); +void tcache_postfork_parent(tsdn_t *tsdn); +void tcache_postfork_child(tsdn_t *tsdn); +void tcache_flush(tsd_t *tsd); +bool tsd_tcache_enabled_data_init(tsd_t *tsd); +void tcache_enabled_set(tsd_t *tsd, bool enabled); + +void tcache_assert_initialized(tcache_t *tcache); + +/* Only accessed by thread event. 
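+ * These wait/handler hooks are driven by the thread_event machinery in + * thread_event.h, which lists tcache_gc and tcache_gc_dalloc in its + * ITERATE_OVER_ALL_EVENTS table.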
*/ +uint64_t tcache_gc_new_event_wait(tsd_t *tsd); +uint64_t tcache_gc_postponed_event_wait(tsd_t *tsd); +void tcache_gc_event_handler(tsd_t *tsd, uint64_t elapsed); +uint64_t tcache_gc_dalloc_new_event_wait(tsd_t *tsd); +uint64_t tcache_gc_dalloc_postponed_event_wait(tsd_t *tsd); +void tcache_gc_dalloc_event_handler(tsd_t *tsd, uint64_t elapsed); + +#endif /* JEMALLOC_INTERNAL_TCACHE_EXTERNS_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/tcache_inlines.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/tcache_inlines.h new file mode 100644 index 000000000..e8e3b41f8 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/tcache_inlines.h @@ -0,0 +1,243 @@ +#ifndef JEMALLOC_INTERNAL_TCACHE_INLINES_H +#define JEMALLOC_INTERNAL_TCACHE_INLINES_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/arena_externs.h" +#include "jemalloc/internal/bin.h" +#include "jemalloc/internal/jemalloc_internal_inlines_b.h" +#include "jemalloc/internal/jemalloc_internal_types.h" +#include "jemalloc/internal/large_externs.h" +#include "jemalloc/internal/san.h" +#include "jemalloc/internal/sc.h" +#include "jemalloc/internal/sz.h" +#include "jemalloc/internal/tcache_externs.h" +#include "jemalloc/internal/util.h" + +static inline bool +tcache_enabled_get(tsd_t *tsd) { + return tsd_tcache_enabled_get(tsd); +} + +static inline unsigned +tcache_nbins_get(tcache_slow_t *tcache_slow) { + assert(tcache_slow != NULL); + unsigned nbins = tcache_slow->tcache_nbins; + assert(nbins <= TCACHE_NBINS_MAX); + return nbins; +} + +static inline size_t +tcache_max_get(tcache_slow_t *tcache_slow) { + assert(tcache_slow != NULL); + size_t tcache_max = sz_index2size(tcache_nbins_get(tcache_slow) - 1); + assert(tcache_max <= TCACHE_MAXCLASS_LIMIT); + return tcache_max; +} + +static inline void +tcache_max_set(tcache_slow_t *tcache_slow, size_t tcache_max) { + assert(tcache_slow != NULL); + assert(tcache_max <= TCACHE_MAXCLASS_LIMIT); + tcache_slow->tcache_nbins = sz_size2index(tcache_max) + 1; +} + +static inline void +tcache_bin_settings_backup(tcache_t *tcache, + cache_bin_info_t tcache_bin_info[TCACHE_NBINS_MAX]) { + for (unsigned i = 0; i < TCACHE_NBINS_MAX; i++) { + cache_bin_info_init(&tcache_bin_info[i], + cache_bin_ncached_max_get_unsafe(&tcache->bins[i])); + } +} + +JEMALLOC_ALWAYS_INLINE bool +tcache_bin_disabled(szind_t ind, cache_bin_t *bin, + tcache_slow_t *tcache_slow) { + assert(bin != NULL); + assert(ind < TCACHE_NBINS_MAX); + bool disabled = cache_bin_disabled(bin); + + /* + * If a bin's ind >= nbins or ncached_max == 0, it must be disabled. + * However, when ind < nbins, it could be either enabled + * (ncached_max > 0) or disabled (ncached_max == 0). Similarly, when + * ncached_max > 0, it could be either enabled (ind < nbins) or + * disabled (ind >= nbins). Thus, if a bin is disabled, it has either + * ind >= nbins or ncached_max == 0. If a bin is enabled, it has + * ind < nbins and ncached_max > 0. 
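+ * Equivalently, disabled == (ind >= nbins || ncached_max == 0); the + * assertions below check exactly this equivalence in both directions.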
+ */ + unsigned nbins = tcache_nbins_get(tcache_slow); + cache_bin_sz_t ncached_max = cache_bin_ncached_max_get_unsafe(bin); + if (ind >= nbins) { + assert(disabled); + } else { + assert(!disabled || ncached_max == 0); + } + if (ncached_max == 0) { + assert(disabled); + } else { + assert(!disabled || ind >= nbins); + } + if (disabled) { + assert(ind >= nbins || ncached_max == 0); + } else { + assert(ind < nbins && ncached_max > 0); + } + + return disabled; +} + +JEMALLOC_ALWAYS_INLINE void * +tcache_alloc_small(tsd_t *tsd, arena_t *arena, tcache_t *tcache, + size_t size, szind_t binind, bool zero, bool slow_path) { + void *ret; + bool tcache_success; + + assert(binind < SC_NBINS); + cache_bin_t *bin = &tcache->bins[binind]; + ret = cache_bin_alloc(bin, &tcache_success); + assert(tcache_success == (ret != NULL)); + if (unlikely(!tcache_success)) { + bool tcache_hard_success; + arena = arena_choose(tsd, arena); + if (unlikely(arena == NULL)) { + return NULL; + } + if (unlikely(tcache_bin_disabled(binind, bin, + tcache->tcache_slow))) { + /* stats and zero are handled directly by the arena. */ + return arena_malloc_hard(tsd_tsdn(tsd), arena, size, + binind, zero, /* slab */ true); + } + tcache_bin_flush_stashed(tsd, tcache, bin, binind, + /* is_small */ true); + + ret = tcache_alloc_small_hard(tsd_tsdn(tsd), arena, tcache, + bin, binind, &tcache_hard_success); + if (tcache_hard_success == false) { + return NULL; + } + } + + assert(ret); + if (unlikely(zero)) { + size_t usize = sz_index2size(binind); + assert(tcache_salloc(tsd_tsdn(tsd), ret) == usize); + memset(ret, 0, usize); + } + if (config_stats) { + bin->tstats.nrequests++; + } + return ret; +} + +JEMALLOC_ALWAYS_INLINE void * +tcache_alloc_large(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size, + szind_t binind, bool zero, bool slow_path) { + void *ret; + bool tcache_success; + + cache_bin_t *bin = &tcache->bins[binind]; + assert(binind >= SC_NBINS && + !tcache_bin_disabled(binind, bin, tcache->tcache_slow)); + ret = cache_bin_alloc(bin, &tcache_success); + assert(tcache_success == (ret != NULL)); + if (unlikely(!tcache_success)) { + /* + * Only allocate one large object at a time, because it's quite + * expensive to create one and not use it. + */ + arena = arena_choose(tsd, arena); + if (unlikely(arena == NULL)) { + return NULL; + } + tcache_bin_flush_stashed(tsd, tcache, bin, binind, + /* is_small */ false); + + ret = large_malloc(tsd_tsdn(tsd), arena, sz_s2u(size), zero); + if (ret == NULL) { + return NULL; + } + } else { + if (unlikely(zero)) { + size_t usize = sz_index2size(binind); + assert(usize <= tcache_max_get(tcache->tcache_slow)); + memset(ret, 0, usize); + } + + if (config_stats) { + bin->tstats.nrequests++; + } + } + + return ret; +} + +JEMALLOC_ALWAYS_INLINE void +tcache_dalloc_small(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind, + bool slow_path) { + assert(tcache_salloc(tsd_tsdn(tsd), ptr) <= SC_SMALL_MAXCLASS); + + cache_bin_t *bin = &tcache->bins[binind]; + /* + * Not marking the branch unlikely because this is past free_fastpath() + * (which handles the most common cases), i.e. at this point it's often + * uncommon cases. + */ + if (cache_bin_nonfast_aligned(ptr)) { + /* Junk unconditionally, even if bin is full. */ + san_junk_ptr(ptr, sz_index2size(binind)); + if (cache_bin_stash(bin, ptr)) { + return; + } + assert(cache_bin_full(bin)); + /* Bin full; fall through into the flush branch. 
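+ * Assuming opt_lg_tcache_flush_small_div keeps its documented default of 1 + * (it is tunable), the flush below drains the bin to half of ncached_max + * before the pointer is re-inserted.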
*/ + } + + if (unlikely(!cache_bin_dalloc_easy(bin, ptr))) { + if (unlikely(tcache_bin_disabled(binind, bin, + tcache->tcache_slow))) { + arena_dalloc_small(tsd_tsdn(tsd), ptr); + return; + } + cache_bin_sz_t max = cache_bin_ncached_max_get(bin); + unsigned remain = max >> opt_lg_tcache_flush_small_div; + tcache_bin_flush_small(tsd, tcache, bin, binind, remain); + bool ret = cache_bin_dalloc_easy(bin, ptr); + assert(ret); + } +} + +JEMALLOC_ALWAYS_INLINE void +tcache_dalloc_large(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind, + bool slow_path) { + + assert(tcache_salloc(tsd_tsdn(tsd), ptr) > SC_SMALL_MAXCLASS); + assert(tcache_salloc(tsd_tsdn(tsd), ptr) <= + tcache_max_get(tcache->tcache_slow)); + assert(!tcache_bin_disabled(binind, &tcache->bins[binind], + tcache->tcache_slow)); + + cache_bin_t *bin = &tcache->bins[binind]; + if (unlikely(!cache_bin_dalloc_easy(bin, ptr))) { + unsigned remain = cache_bin_ncached_max_get(bin) >> + opt_lg_tcache_flush_large_div; + tcache_bin_flush_large(tsd, tcache, bin, binind, remain); + bool ret = cache_bin_dalloc_easy(bin, ptr); + assert(ret); + } +} + +JEMALLOC_ALWAYS_INLINE tcache_t * +tcaches_get(tsd_t *tsd, unsigned ind) { + tcaches_t *elm = &tcaches[ind]; + if (unlikely(elm->tcache == NULL)) { + malloc_printf(": invalid tcache id (%u).\n", ind); + abort(); + } else if (unlikely(elm->tcache == TCACHES_ELM_NEED_REINIT)) { + elm->tcache = tcache_create_explicit(tsd); + } + return elm->tcache; +} + +#endif /* JEMALLOC_INTERNAL_TCACHE_INLINES_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/tcache_structs.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/tcache_structs.h new file mode 100644 index 000000000..d94099b00 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/tcache_structs.h @@ -0,0 +1,71 @@ +#ifndef JEMALLOC_INTERNAL_TCACHE_STRUCTS_H +#define JEMALLOC_INTERNAL_TCACHE_STRUCTS_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/cache_bin.h" +#include "jemalloc/internal/ql.h" +#include "jemalloc/internal/sc.h" +#include "jemalloc/internal/tcache_types.h" +#include "jemalloc/internal/ticker.h" + +/* + * The tcache state is split into the slow and hot path data. Each has a + * pointer to the other, and the data always comes in pairs. The layout of each + * of them varies in practice; tcache_slow lives in the TSD for the automatic + * tcache, and as part of a dynamic allocation for manual allocations. Keeping + * a pointer to tcache_slow lets us treat these cases uniformly, rather than + * splitting up the tcache [de]allocation code into those paths called with the + * TSD tcache and those called with a manual tcache. + */ + +struct tcache_slow_s { + /* Lets us track all the tcaches in an arena. */ + ql_elm(tcache_slow_t) link; + + /* + * The descriptor lets the arena find our cache bins without seeing the + * tcache definition. This enables arenas to aggregate stats across + * tcaches without having a tcache dependency. + */ + cache_bin_array_descriptor_t cache_bin_array_descriptor; + + /* The arena this tcache is associated with. */ + arena_t *arena; + /* The number of bins activated in the tcache. */ + unsigned tcache_nbins; + /* Next bin to GC. */ + szind_t next_gc_bin; + /* For small bins, fill (ncached_max >> lg_fill_div). */ + uint8_t lg_fill_div[SC_NBINS]; + /* For small bins, whether has been refilled since last GC. 
*/ + bool bin_refilled[SC_NBINS]; + /* + * For small bins, the number of items we can pretend to flush before + * actually flushing. + */ + uint8_t bin_flush_delay_items[SC_NBINS]; + /* + * The start of the allocation containing the dynamic allocation for + * either the cache bins alone, or the cache bin memory as well as this + * tcache_slow_t and its associated tcache_t. + */ + void *dyn_alloc; + + /* The associated bins. */ + tcache_t *tcache; +}; + +struct tcache_s { + tcache_slow_t *tcache_slow; + cache_bin_t bins[TCACHE_NBINS_MAX]; +}; + +/* Linkage for list of available (previously used) explicit tcache IDs. */ +struct tcaches_s { + union { + tcache_t *tcache; + tcaches_t *next; + }; +}; + +#endif /* JEMALLOC_INTERNAL_TCACHE_STRUCTS_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/tcache_types.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/tcache_types.h new file mode 100644 index 000000000..578a199ea --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/tcache_types.h @@ -0,0 +1,27 @@ +#ifndef JEMALLOC_INTERNAL_TCACHE_TYPES_H +#define JEMALLOC_INTERNAL_TCACHE_TYPES_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/sc.h" + +typedef struct tcache_slow_s tcache_slow_t; +typedef struct tcache_s tcache_t; +typedef struct tcaches_s tcaches_t; + +/* Used in TSD static initializer only. Real init in tsd_tcache_data_init(). */ +#define TCACHE_ZERO_INITIALIZER {0} +#define TCACHE_SLOW_ZERO_INITIALIZER {{0}} + +/* Used in TSD static initializer only. Will be initialized to opt_tcache. */ +#define TCACHE_ENABLED_ZERO_INITIALIZER false + +/* Used for explicit tcache only. Means flushed but not destroyed. */ +/* NOLINTNEXTLINE(performance-no-int-to-ptr) */ +#define TCACHES_ELM_NEED_REINIT ((tcache_t *)(uintptr_t)1) + +#define TCACHE_LG_MAXCLASS_LIMIT 23 /* tcache_max = 8M */ +#define TCACHE_MAXCLASS_LIMIT ((size_t)1 << TCACHE_LG_MAXCLASS_LIMIT) +#define TCACHE_NBINS_MAX (SC_NBINS + SC_NGROUP * \ + (TCACHE_LG_MAXCLASS_LIMIT - SC_LG_LARGE_MINCLASS) + 1) + +#endif /* JEMALLOC_INTERNAL_TCACHE_TYPES_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/test_hooks.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/test_hooks.h new file mode 100644 index 000000000..af3f2755a --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/test_hooks.h @@ -0,0 +1,26 @@ +#ifndef JEMALLOC_INTERNAL_TEST_HOOKS_H +#define JEMALLOC_INTERNAL_TEST_HOOKS_H + +#include "jemalloc/internal/jemalloc_preamble.h" + +extern JEMALLOC_EXPORT void (*test_hooks_arena_new_hook)(void); +extern JEMALLOC_EXPORT void (*test_hooks_libc_hook)(void); + +#if defined(JEMALLOC_JET) || defined(JEMALLOC_UNIT_TEST) +# define JEMALLOC_TEST_HOOK(fn, hook) ((void)(hook != NULL && (hook(), 0)), fn) + +# define open JEMALLOC_TEST_HOOK(open, test_hooks_libc_hook) +# define read JEMALLOC_TEST_HOOK(read, test_hooks_libc_hook) +# define write JEMALLOC_TEST_HOOK(write, test_hooks_libc_hook) +# define readlink JEMALLOC_TEST_HOOK(readlink, test_hooks_libc_hook) +# define close JEMALLOC_TEST_HOOK(close, test_hooks_libc_hook) +# define creat JEMALLOC_TEST_HOOK(creat, test_hooks_libc_hook) +# define secure_getenv JEMALLOC_TEST_HOOK(secure_getenv, test_hooks_libc_hook) +/* Note that this is undef'd and re-define'd in src/prof.c. 
*/ +# define _Unwind_Backtrace JEMALLOC_TEST_HOOK(_Unwind_Backtrace, test_hooks_libc_hook) +#else +# define JEMALLOC_TEST_HOOK(fn, hook) fn +#endif + + +#endif /* JEMALLOC_INTERNAL_TEST_HOOKS_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/thread_event.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/thread_event.h new file mode 100644 index 000000000..46c57ed56 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/thread_event.h @@ -0,0 +1,302 @@ +#ifndef JEMALLOC_INTERNAL_THREAD_EVENT_H +#define JEMALLOC_INTERNAL_THREAD_EVENT_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/tsd.h" + +/* "te" is short for "thread_event" */ + +/* + * TE_MIN_START_WAIT should not exceed the minimal allocation usize. + */ +#define TE_MIN_START_WAIT ((uint64_t)1U) +#define TE_MAX_START_WAIT UINT64_MAX + +/* + * Maximum threshold on thread_(de)allocated_next_event_fast, so that there is + * no need to check overflow in malloc fast path. (The allocation size in malloc + * fast path never exceeds SC_LOOKUP_MAXCLASS.) + */ +#define TE_NEXT_EVENT_FAST_MAX (UINT64_MAX - SC_LOOKUP_MAXCLASS + 1U) + +/* + * The max interval helps make sure that malloc stays on the fast path in the + * common case, i.e. thread_allocated < thread_allocated_next_event_fast. When + * thread_allocated is within an event's distance to TE_NEXT_EVENT_FAST_MAX + * above, thread_allocated_next_event_fast is wrapped around and we fall back to + * the medium-fast path. The max interval makes sure that we're not staying on + * the fallback case for too long, even if there's no active event or if all + * active events have long wait times. + */ +#define TE_MAX_INTERVAL ((uint64_t)(4U << 20)) + +/* + * Invalid elapsed time, for situations where elapsed time is not needed. See + * comments in thread_event.c for more info. + */ +#define TE_INVALID_ELAPSED UINT64_MAX + +typedef struct te_ctx_s { + bool is_alloc; + uint64_t *current; + uint64_t *last_event; + uint64_t *next_event; + uint64_t *next_event_fast; +} te_ctx_t; + +void te_assert_invariants_debug(tsd_t *tsd); +void te_event_trigger(tsd_t *tsd, te_ctx_t *ctx); +void te_recompute_fast_threshold(tsd_t *tsd); +void tsd_te_init(tsd_t *tsd); + +/* + * List of all events, in the following format: + * E(event, (condition), is_alloc_event) + */ +#define ITERATE_OVER_ALL_EVENTS \ + E(tcache_gc, (opt_tcache_gc_incr_bytes > 0), true) \ + E(prof_sample, (config_prof && opt_prof), true) \ + E(stats_interval, (opt_stats_interval >= 0), true) \ + E(tcache_gc_dalloc, (opt_tcache_gc_incr_bytes > 0), false) \ + E(peak_alloc, config_stats, true) \ + E(peak_dalloc, config_stats, false) + +#define E(event, condition_unused, is_alloc_event_unused) \ + C(event##_event_wait) + +/* List of all thread event counters. */ +#define ITERATE_OVER_ALL_COUNTERS \ + C(thread_allocated) \ + C(thread_allocated_last_event) \ + ITERATE_OVER_ALL_EVENTS \ + C(prof_sample_last_event) \ + C(stats_interval_last_event) + +/* Getters directly wrap TSD getters. 
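+ * Expansion sketch: C(thread_allocated) produces + * JEMALLOC_ALWAYS_INLINE uint64_t + * thread_allocated_get(tsd_t *tsd) { + * return tsd_thread_allocated_get(tsd); + * }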
*/ +#define C(counter) \ +JEMALLOC_ALWAYS_INLINE uint64_t \ +counter##_get(tsd_t *tsd) { \ + return tsd_##counter##_get(tsd); \ +} + +ITERATE_OVER_ALL_COUNTERS +#undef C + +/* + * Setters call the TSD pointer getters rather than the TSD setters, so that + * the counters can be modified even when TSD state is reincarnated or + * minimal_initialized: if an event is triggered in such cases, we will + * temporarily delay the event and let it be immediately triggered at the next + * allocation call. + */ +#define C(counter) \ +JEMALLOC_ALWAYS_INLINE void \ +counter##_set(tsd_t *tsd, uint64_t v) { \ + *tsd_##counter##p_get(tsd) = v; \ +} + +ITERATE_OVER_ALL_COUNTERS +#undef C + +/* + * For generating _event_wait getter / setter functions for each individual + * event. + */ +#undef E + +/* + * The malloc and free fastpath getters -- use the unsafe getters since tsd may + * be non-nominal, in which case the fast_threshold will be set to 0. This + * allows checking for events and tsd non-nominal in a single branch. + * + * Note that these can only be used on the fastpath. + */ +JEMALLOC_ALWAYS_INLINE void +te_malloc_fastpath_ctx(tsd_t *tsd, uint64_t *allocated, uint64_t *threshold) { + *allocated = *tsd_thread_allocatedp_get_unsafe(tsd); + *threshold = *tsd_thread_allocated_next_event_fastp_get_unsafe(tsd); + assert(*threshold <= TE_NEXT_EVENT_FAST_MAX); +} + +JEMALLOC_ALWAYS_INLINE void +te_free_fastpath_ctx(tsd_t *tsd, uint64_t *deallocated, uint64_t *threshold) { + /* Unsafe getters since this may happen before tsd_init. */ + *deallocated = *tsd_thread_deallocatedp_get_unsafe(tsd); + *threshold = *tsd_thread_deallocated_next_event_fastp_get_unsafe(tsd); + assert(*threshold <= TE_NEXT_EVENT_FAST_MAX); +} + +JEMALLOC_ALWAYS_INLINE bool +te_ctx_is_alloc(te_ctx_t *ctx) { + return ctx->is_alloc; +} + +JEMALLOC_ALWAYS_INLINE uint64_t +te_ctx_current_bytes_get(te_ctx_t *ctx) { + return *ctx->current; +} + +JEMALLOC_ALWAYS_INLINE void +te_ctx_current_bytes_set(te_ctx_t *ctx, uint64_t v) { + *ctx->current = v; +} + +JEMALLOC_ALWAYS_INLINE uint64_t +te_ctx_last_event_get(te_ctx_t *ctx) { + return *ctx->last_event; +} + +JEMALLOC_ALWAYS_INLINE void +te_ctx_last_event_set(te_ctx_t *ctx, uint64_t v) { + *ctx->last_event = v; +} + +/* Below 3 for next_event_fast. */ +JEMALLOC_ALWAYS_INLINE uint64_t +te_ctx_next_event_fast_get(te_ctx_t *ctx) { + uint64_t v = *ctx->next_event_fast; + assert(v <= TE_NEXT_EVENT_FAST_MAX); + return v; +} + +JEMALLOC_ALWAYS_INLINE void +te_ctx_next_event_fast_set(te_ctx_t *ctx, uint64_t v) { + assert(v <= TE_NEXT_EVENT_FAST_MAX); + *ctx->next_event_fast = v; +} + +JEMALLOC_ALWAYS_INLINE void +te_next_event_fast_set_non_nominal(tsd_t *tsd) { + /* + * Set the fast thresholds to zero when tsd is non-nominal. Use the + * unsafe getter as this may get called during tsd init and clean up. + */ + *tsd_thread_allocated_next_event_fastp_get_unsafe(tsd) = 0; + *tsd_thread_deallocated_next_event_fastp_get_unsafe(tsd) = 0; +} + +/* For next_event. Setter also updates the fast threshold. */ +JEMALLOC_ALWAYS_INLINE uint64_t +te_ctx_next_event_get(te_ctx_t *ctx) { + return *ctx->next_event; +} + +JEMALLOC_ALWAYS_INLINE void +te_ctx_next_event_set(tsd_t *tsd, te_ctx_t *ctx, uint64_t v) { + *ctx->next_event = v; + te_recompute_fast_threshold(tsd); +} + +/* + * The function checks in debug mode whether the thread event counters are in + * a consistent state, which forms the invariants before and after each round + * of thread event handling that we can rely on and need to promise. 
+ * The invariants are only temporarily violated in the middle of + * te_event_advance() if an event is triggered (the te_event_trigger() call at + * the end will restore the invariants). + */ +JEMALLOC_ALWAYS_INLINE void +te_assert_invariants(tsd_t *tsd) { + if (config_debug) { + te_assert_invariants_debug(tsd); + } +} + +JEMALLOC_ALWAYS_INLINE void +te_ctx_get(tsd_t *tsd, te_ctx_t *ctx, bool is_alloc) { + ctx->is_alloc = is_alloc; + if (is_alloc) { + ctx->current = tsd_thread_allocatedp_get(tsd); + ctx->last_event = tsd_thread_allocated_last_eventp_get(tsd); + ctx->next_event = tsd_thread_allocated_next_eventp_get(tsd); + ctx->next_event_fast = + tsd_thread_allocated_next_event_fastp_get(tsd); + } else { + ctx->current = tsd_thread_deallocatedp_get(tsd); + ctx->last_event = tsd_thread_deallocated_last_eventp_get(tsd); + ctx->next_event = tsd_thread_deallocated_next_eventp_get(tsd); + ctx->next_event_fast = + tsd_thread_deallocated_next_event_fastp_get(tsd); + } +} + +/* + * The lookahead functionality facilitates events to be able to lookahead, i.e. + * without touching the event counters, to determine whether an event would be + * triggered. The event counters are not advanced until the end of the + * allocation / deallocation calls, so the lookahead can be useful if some + * preparation work for some event must be done early in the allocation / + * deallocation calls. + * + * Currently only the profiling sampling event needs the lookahead + * functionality, so we don't yet define general purpose lookahead functions. + * + * Surplus is a terminology referring to the amount of bytes beyond what's + * needed for triggering an event, which can be a useful quantity to have in + * general when lookahead is being called. + */ + +JEMALLOC_ALWAYS_INLINE bool +te_prof_sample_event_lookahead_surplus(tsd_t *tsd, size_t usize, + size_t *surplus) { + if (surplus != NULL) { + /* + * This is a dead store: the surplus will be overwritten before + * any read. The initialization suppresses compiler warnings. + * Meanwhile, using SIZE_MAX to initialize is good for + * debugging purpose, because a valid surplus value is strictly + * less than usize, which is at most SIZE_MAX. + */ + *surplus = SIZE_MAX; + } + if (unlikely(!tsd_nominal(tsd) || tsd_reentrancy_level_get(tsd) > 0)) { + return false; + } + /* The subtraction is intentionally susceptible to underflow. */ + uint64_t accumbytes = tsd_thread_allocated_get(tsd) + usize - + tsd_thread_allocated_last_event_get(tsd); + uint64_t sample_wait = tsd_prof_sample_event_wait_get(tsd); + if (accumbytes < sample_wait) { + return false; + } + assert(accumbytes - sample_wait < (uint64_t)usize); + if (surplus != NULL) { + *surplus = (size_t)(accumbytes - sample_wait); + } + return true; +} + +JEMALLOC_ALWAYS_INLINE bool +te_prof_sample_event_lookahead(tsd_t *tsd, size_t usize) { + return te_prof_sample_event_lookahead_surplus(tsd, usize, NULL); +} + +JEMALLOC_ALWAYS_INLINE void +te_event_advance(tsd_t *tsd, size_t usize, bool is_alloc) { + te_assert_invariants(tsd); + + te_ctx_t ctx; + te_ctx_get(tsd, &ctx, is_alloc); + + uint64_t bytes_before = te_ctx_current_bytes_get(&ctx); + te_ctx_current_bytes_set(&ctx, bytes_before + usize); + + /* The subtraction is intentionally susceptible to underflow. 
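+ * The counters are uint64_t and may wrap; comparing usize against the + * modular distance next_event - bytes_before keeps the check below correct + * across wraparound.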
*/ + if (likely(usize < te_ctx_next_event_get(&ctx) - bytes_before)) { + te_assert_invariants(tsd); + } else { + te_event_trigger(tsd, &ctx); + } +} + +JEMALLOC_ALWAYS_INLINE void +thread_dalloc_event(tsd_t *tsd, size_t usize) { + te_event_advance(tsd, usize, false); +} + +JEMALLOC_ALWAYS_INLINE void +thread_alloc_event(tsd_t *tsd, size_t usize) { + te_event_advance(tsd, usize, true); +} + +#endif /* JEMALLOC_INTERNAL_THREAD_EVENT_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/ticker.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/ticker.h new file mode 100644 index 000000000..dca9bd105 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/ticker.h @@ -0,0 +1,189 @@ +#ifndef JEMALLOC_INTERNAL_TICKER_H +#define JEMALLOC_INTERNAL_TICKER_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/prng.h" +#include "jemalloc/internal/util.h" + +/** + * A ticker makes it easy to count-down events until some limit. You + * ticker_init the ticker to trigger every nticks events. You then notify it + * that an event has occurred with calls to ticker_tick (or that nticks events + * have occurred with a call to ticker_ticks), which will return true (and reset + * the counter) if the countdown hit zero. + */ +typedef struct ticker_s ticker_t; +struct ticker_s { + int32_t tick; + int32_t nticks; +}; + +static inline void +ticker_init(ticker_t *ticker, int32_t nticks) { + ticker->tick = nticks; + ticker->nticks = nticks; +} + +static inline void +ticker_copy(ticker_t *ticker, const ticker_t *other) { + *ticker = *other; +} + +static inline int32_t +ticker_read(const ticker_t *ticker) { + return ticker->tick; +} + +/* + * Not intended to be a public API. Unfortunately, on x86, neither gcc nor + * clang seems smart enough to turn + * ticker->tick -= nticks; + * if (unlikely(ticker->tick < 0)) { + * fixup ticker + * return true; + * } + * return false; + * into + * subq %nticks_reg, (%ticker_reg) + * js fixup ticker + * + * unless we force "fixup ticker" out of line. In that case, gcc gets it right, + * but clang now does worse than before. So, on x86 with gcc, we force it out + * of line, but otherwise let the inlining occur. Ordinarily this wouldn't be + * worth the hassle, but this is on the fast path of both malloc and free (via + * tcache_event). + */ +#if defined(__GNUC__) && !defined(__clang__) \ + && (defined(__x86_64__) || defined(__i386__)) +JEMALLOC_NOINLINE +#endif +static bool +ticker_fixup(ticker_t *ticker, bool delay_trigger) { + if (delay_trigger) { + ticker->tick = 0; + return false; + } + ticker->tick = ticker->nticks; + return true; +} + +static inline bool +ticker_ticks(ticker_t *ticker, int32_t nticks, bool delay_trigger) { + ticker->tick -= nticks; + if (unlikely(ticker->tick < 0)) { + return ticker_fixup(ticker, delay_trigger); + } + return false; +} + +static inline bool +ticker_tick(ticker_t *ticker, bool delay_trigger) { + return ticker_ticks(ticker, 1, delay_trigger); +} + +/* + * Try to tick. If ticker would fire, return true, but rely on + * slowpath to reset ticker. + */ +static inline bool +ticker_trytick(ticker_t *ticker) { + --ticker->tick; + if (unlikely(ticker->tick < 0)) { + return true; + } + return false; +} + +/* + * The ticker_geom_t is much like the ticker_t, except that instead of ticker + * having a constant countdown, it has an approximate one; each tick has + * approximately a 1/nticks chance of triggering the count. 
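+ * (The reset value drawn in ticker_geom_fixup() approximates a geometric + * distribution, so the expected number of ticks between firings remains + * nticks.)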
+ * + * The motivation is in triggering arena decay. With a naive strategy, each + * thread would maintain a ticker per arena, and check if decay is necessary + * each time that the arena's ticker fires. This has two costs: + * - Since under reasonable assumptions both threads and arenas can scale + * linearly with the number of CPUs, maintaining per-arena data in each thread + * scales quadratically with the number of CPUs. + * - These tickers are often a cache miss down tcache flush pathways. + * + * By giving each tick a 1/nticks chance of firing, we still maintain the same + * average number of ticks-until-firing per arena, with only a single ticker's + * worth of metadata. + */ + +/* See ticker.c for an explanation of these constants. */ +#define TICKER_GEOM_NBITS 6 +#define TICKER_GEOM_MUL 61 +extern const uint8_t ticker_geom_table[1 << TICKER_GEOM_NBITS]; + +/* Not actually any different from ticker_t; just for type safety. */ +typedef struct ticker_geom_s ticker_geom_t; +struct ticker_geom_s { + int32_t tick; + int32_t nticks; +}; + +/* + * Just pick the average delay for the first counter. We're more concerned with + * the behavior over long periods of time rather than the exact timing of the + * initial ticks. + */ +#define TICKER_GEOM_INIT(nticks) {nticks, nticks} + +static inline void +ticker_geom_init(ticker_geom_t *ticker, int32_t nticks) { + /* + * Make sure there's no overflow possible. This shouldn't really be a + * problem for reasonable nticks choices, which are all static and + * relatively small. + */ + assert((uint64_t)nticks * (uint64_t)255 / (uint64_t)TICKER_GEOM_MUL + <= (uint64_t)INT32_MAX); + ticker->tick = nticks; + ticker->nticks = nticks; +} + +static inline int32_t +ticker_geom_read(const ticker_geom_t *ticker) { + return ticker->tick; +} + +/* Same deal as above. */ +#if defined(__GNUC__) && !defined(__clang__) \ + && (defined(__x86_64__) || defined(__i386__)) +JEMALLOC_NOINLINE +#endif +static bool +ticker_geom_fixup(ticker_geom_t *ticker, uint64_t *prng_state, + bool delay_trigger) { + if (delay_trigger) { + ticker->tick = 0; + return false; + } + + uint64_t idx = prng_lg_range_u64(prng_state, TICKER_GEOM_NBITS); + ticker->tick = (uint32_t)( + (uint64_t)ticker->nticks * (uint64_t)ticker_geom_table[idx] + / (uint64_t)TICKER_GEOM_MUL); + + return true; +} + +static inline bool +ticker_geom_ticks(ticker_geom_t *ticker, uint64_t *prng_state, int32_t nticks, + bool delay_trigger) { + ticker->tick -= nticks; + if (unlikely(ticker->tick < 0)) { + return ticker_geom_fixup(ticker, prng_state, delay_trigger); + } + return false; +} + +static inline bool +ticker_geom_tick(ticker_geom_t *ticker, uint64_t *prng_state, + bool delay_trigger) { + return ticker_geom_ticks(ticker, prng_state, 1, delay_trigger); +} + +#endif /* JEMALLOC_INTERNAL_TICKER_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/tsd.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/tsd.h new file mode 100644 index 000000000..4f22dcff1 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/tsd.h @@ -0,0 +1,225 @@ +#ifndef JEMALLOC_INTERNAL_TSD_H +#define JEMALLOC_INTERNAL_TSD_H + +/* + * We put the platform-specific data declarations and inlines into their own + * header files to avoid cluttering this file. They define tsd_boot0, + * tsd_boot1, tsd_boot, tsd_booted_get, tsd_get_allocates, tsd_get, and tsd_set. 
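+ * Exactly one of the four headers below is selected: explicit + * malloc-thread-cleanup, native TLS, Windows TLS, or the generic pthreads + * fallback in tsd_generic.h.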
+ */ +#ifdef JEMALLOC_MALLOC_THREAD_CLEANUP +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/tsd_malloc_thread_cleanup.h" +#elif (defined(JEMALLOC_TLS)) +#include "jemalloc/internal/tsd_tls.h" +#elif (defined(_WIN32)) +#include "jemalloc/internal/tsd_win.h" +#else +#include "jemalloc/internal/tsd_generic.h" +#endif + +/* + * tsd_foop_get_unsafe(tsd) returns a pointer to the thread-local instance of + * foo. This omits some safety checks, and so can be used during tsd + * initialization and cleanup. + */ +#define O(n, t, nt) \ +JEMALLOC_ALWAYS_INLINE t * \ +tsd_##n##p_get_unsafe(tsd_t *tsd) { \ + return &tsd->TSD_MANGLE(n); \ +} +TSD_DATA_SLOW +TSD_DATA_FAST +TSD_DATA_SLOWER +#undef O + +/* tsd_foop_get(tsd) returns a pointer to the thread-local instance of foo. */ +#define O(n, t, nt) \ +JEMALLOC_ALWAYS_INLINE t * \ +tsd_##n##p_get(tsd_t *tsd) { \ + /* \ + * Because the state might change asynchronously if it's \ + * nominal, we need to make sure that we only read it once. \ + */ \ + uint8_t state = tsd_state_get(tsd); \ + assert(state == tsd_state_nominal || \ + state == tsd_state_nominal_slow || \ + state == tsd_state_nominal_recompute || \ + state == tsd_state_reincarnated || \ + state == tsd_state_minimal_initialized); \ + return tsd_##n##p_get_unsafe(tsd); \ +} +TSD_DATA_SLOW +TSD_DATA_FAST +TSD_DATA_SLOWER +#undef O + +/* + * tsdn_foop_get(tsdn) returns either the thread-local instance of foo (if tsdn + * isn't NULL), or NULL (if tsdn is NULL), cast to the nullable pointer type. + */ +#define O(n, t, nt) \ +JEMALLOC_ALWAYS_INLINE nt * \ +tsdn_##n##p_get(tsdn_t *tsdn) { \ + if (tsdn_null(tsdn)) { \ + return NULL; \ + } \ + tsd_t *tsd = tsdn_tsd(tsdn); \ + return (nt *)tsd_##n##p_get(tsd); \ +} +TSD_DATA_SLOW +TSD_DATA_FAST +TSD_DATA_SLOWER +#undef O + +/* tsd_foo_get(tsd) returns the value of the thread-local instance of foo. */ +#define O(n, t, nt) \ +JEMALLOC_ALWAYS_INLINE t \ +tsd_##n##_get(tsd_t *tsd) { \ + return *tsd_##n##p_get(tsd); \ +} +TSD_DATA_SLOW +TSD_DATA_FAST +TSD_DATA_SLOWER +#undef O + +/* tsd_foo_set(tsd, val) updates the thread-local instance of foo to be val. */ +#define O(n, t, nt) \ +JEMALLOC_ALWAYS_INLINE void \ +tsd_##n##_set(tsd_t *tsd, t val) { \ + assert(tsd_state_get(tsd) != tsd_state_reincarnated && \ + tsd_state_get(tsd) != tsd_state_minimal_initialized); \ + *tsd_##n##p_get(tsd) = val; \ +} +TSD_DATA_SLOW +TSD_DATA_FAST +TSD_DATA_SLOWER +#undef O + +JEMALLOC_ALWAYS_INLINE void +tsd_assert_fast(tsd_t *tsd) { + /* + * Note that our fastness assertion does *not* include global slowness + * counters; it's not in general possible to ensure that they won't + * change asynchronously from underneath us. + */ + assert(!malloc_slow && tsd_tcache_enabled_get(tsd) && + tsd_reentrancy_level_get(tsd) == 0); +} + +JEMALLOC_ALWAYS_INLINE bool +tsd_fast(tsd_t *tsd) { + bool fast = (tsd_state_get(tsd) == tsd_state_nominal); + if (fast) { + tsd_assert_fast(tsd); + } + + return fast; +} + +JEMALLOC_ALWAYS_INLINE tsd_t * +tsd_fetch_impl(bool init, bool minimal) { + tsd_t *tsd = tsd_get(init); + + if (!init && tsd_get_allocates() && tsd == NULL) { + return NULL; + } + assert(tsd != NULL); + + if (unlikely(tsd_state_get(tsd) != tsd_state_nominal)) { + return tsd_fetch_slow(tsd, minimal); + } + assert(tsd_fast(tsd)); + tsd_assert_fast(tsd); + + return tsd; +} + +/* Get a minimal TSD that requires no cleanup. See comments in free(). 
*/ +JEMALLOC_ALWAYS_INLINE tsd_t * +tsd_fetch_min(void) { + return tsd_fetch_impl(true, true); +} + +/* For internal background threads use only. */ +JEMALLOC_ALWAYS_INLINE tsd_t * +tsd_internal_fetch(void) { + tsd_t *tsd = tsd_fetch_min(); + /* Use reincarnated state to prevent full initialization. */ + tsd_state_set(tsd, tsd_state_reincarnated); + + return tsd; +} + +JEMALLOC_ALWAYS_INLINE tsd_t * +tsd_fetch(void) { + return tsd_fetch_impl(true, false); +} + +static inline bool +tsd_nominal(tsd_t *tsd) { + bool nominal = tsd_state_get(tsd) <= tsd_state_nominal_max; + assert(nominal || tsd_reentrancy_level_get(tsd) > 0); + + return nominal; +} + +JEMALLOC_ALWAYS_INLINE tsdn_t * +tsdn_fetch(void) { + if (!tsd_booted_get()) { + return NULL; + } + + return tsd_tsdn(tsd_fetch_impl(false, false)); +} + +JEMALLOC_ALWAYS_INLINE rtree_ctx_t * +tsd_rtree_ctx(tsd_t *tsd) { + return tsd_rtree_ctxp_get(tsd); +} + +JEMALLOC_ALWAYS_INLINE rtree_ctx_t * +tsdn_rtree_ctx(tsdn_t *tsdn, rtree_ctx_t *fallback) { + /* + * If tsd cannot be accessed, initialize the fallback rtree_ctx and + * return a pointer to it. + */ + if (unlikely(tsdn_null(tsdn))) { + rtree_ctx_data_init(fallback); + return fallback; + } + return tsd_rtree_ctx(tsdn_tsd(tsdn)); +} + +static inline bool +tsd_state_nocleanup(tsd_t *tsd) { + return tsd_state_get(tsd) == tsd_state_reincarnated || + tsd_state_get(tsd) == tsd_state_minimal_initialized; +} + +/* + * These "raw" tsd reentrancy functions don't have any debug checking to make + * sure that we're not touching arena 0. Better is to call pre_reentrancy and + * post_reentrancy if this is possible. + */ +static inline void +tsd_pre_reentrancy_raw(tsd_t *tsd) { + bool fast = tsd_fast(tsd); + assert(tsd_reentrancy_level_get(tsd) < INT8_MAX); + ++*tsd_reentrancy_levelp_get(tsd); + if (fast) { + /* Prepare slow path for reentrancy. */ + tsd_slow_update(tsd); + assert(tsd_state_get(tsd) == tsd_state_nominal_slow); + } +} + +static inline void +tsd_post_reentrancy_raw(tsd_t *tsd) { + int8_t *reentrancy_level = tsd_reentrancy_levelp_get(tsd); + assert(*reentrancy_level > 0); + if (--*reentrancy_level == 0) { + tsd_slow_update(tsd); + } +} + +#endif /* JEMALLOC_INTERNAL_TSD_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/tsd_generic.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/tsd_generic.h new file mode 100644 index 000000000..aa8042a4e --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/tsd_generic.h @@ -0,0 +1,187 @@ +#ifdef JEMALLOC_INTERNAL_TSD_GENERIC_H +#error This file should be included only once, by tsd.h. +#endif +#define JEMALLOC_INTERNAL_TSD_GENERIC_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/ql.h" +#include "jemalloc/internal/tsd_internals.h" +#include "jemalloc/internal/tsd_types.h" + +typedef struct tsd_init_block_s tsd_init_block_t; +struct tsd_init_block_s { + ql_elm(tsd_init_block_t) link; + pthread_t thread; + void *data; +}; + +/* Defined in tsd.c, to allow the mutex headers to have tsd dependencies. */ +typedef struct tsd_init_head_s tsd_init_head_t; + +typedef struct { + bool initialized; + tsd_t val; +} tsd_wrapper_t; + +void *tsd_init_check_recursion(tsd_init_head_t *head, + tsd_init_block_t *block); +void tsd_init_finish(tsd_init_head_t *head, tsd_init_block_t *block); + +extern pthread_key_t tsd_tsd; +extern tsd_init_head_t tsd_init_head; +extern tsd_wrapper_t tsd_boot_wrapper; +extern bool tsd_booted; + +/* Initialization/cleanup. 
*/ +JEMALLOC_ALWAYS_INLINE void +tsd_cleanup_wrapper(void *arg) { + tsd_wrapper_t *wrapper = (tsd_wrapper_t *)arg; + + if (wrapper->initialized) { + wrapper->initialized = false; + tsd_cleanup(&wrapper->val); + if (wrapper->initialized) { + /* Trigger another cleanup round. */ + if (pthread_setspecific(tsd_tsd, (void *)wrapper) != 0) + { + malloc_write(": Error setting TSD\n"); + if (opt_abort) { + abort(); + } + } + return; + } + } + malloc_tsd_dalloc(wrapper); +} + +JEMALLOC_ALWAYS_INLINE void +tsd_wrapper_set(tsd_wrapper_t *wrapper) { + if (unlikely(!tsd_booted)) { + return; + } + if (pthread_setspecific(tsd_tsd, (void *)wrapper) != 0) { + malloc_write(": Error setting TSD\n"); + abort(); + } +} + +JEMALLOC_ALWAYS_INLINE tsd_wrapper_t * +tsd_wrapper_get(bool init) { + tsd_wrapper_t *wrapper; + + if (unlikely(!tsd_booted)) { + return &tsd_boot_wrapper; + } + + wrapper = (tsd_wrapper_t *)pthread_getspecific(tsd_tsd); + + if (init && unlikely(wrapper == NULL)) { + tsd_init_block_t block; + wrapper = (tsd_wrapper_t *) + tsd_init_check_recursion(&tsd_init_head, &block); + if (wrapper) { + return wrapper; + } + wrapper = (tsd_wrapper_t *) + malloc_tsd_malloc(sizeof(tsd_wrapper_t)); + block.data = (void *)wrapper; + if (wrapper == NULL) { + malloc_write(": Error allocating TSD\n"); + abort(); + } else { + wrapper->initialized = false; + JEMALLOC_DIAGNOSTIC_PUSH + JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS + tsd_t initializer = TSD_INITIALIZER; + JEMALLOC_DIAGNOSTIC_POP + wrapper->val = initializer; + } + tsd_wrapper_set(wrapper); + tsd_init_finish(&tsd_init_head, &block); + } + return wrapper; +} + +JEMALLOC_ALWAYS_INLINE bool +tsd_boot0(void) { + tsd_wrapper_t *wrapper; + tsd_init_block_t block; + + wrapper = (tsd_wrapper_t *) + tsd_init_check_recursion(&tsd_init_head, &block); + if (wrapper) { + return false; + } + block.data = &tsd_boot_wrapper; + if (pthread_key_create(&tsd_tsd, tsd_cleanup_wrapper) != 0) { + return true; + } + tsd_booted = true; + tsd_wrapper_set(&tsd_boot_wrapper); + tsd_init_finish(&tsd_init_head, &block); + return false; +} + +JEMALLOC_ALWAYS_INLINE void +tsd_boot1(void) { + tsd_wrapper_t *wrapper; + wrapper = (tsd_wrapper_t *)malloc_tsd_malloc(sizeof(tsd_wrapper_t)); + if (wrapper == NULL) { + malloc_write(": Error allocating TSD\n"); + abort(); + } + tsd_boot_wrapper.initialized = false; + tsd_cleanup(&tsd_boot_wrapper.val); + wrapper->initialized = false; + JEMALLOC_DIAGNOSTIC_PUSH + JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS + tsd_t initializer = TSD_INITIALIZER; + JEMALLOC_DIAGNOSTIC_POP + wrapper->val = initializer; + tsd_wrapper_set(wrapper); +} + +JEMALLOC_ALWAYS_INLINE bool +tsd_boot(void) { + if (tsd_boot0()) { + return true; + } + tsd_boot1(); + return false; +} + +JEMALLOC_ALWAYS_INLINE bool +tsd_booted_get(void) { + return tsd_booted; +} + +JEMALLOC_ALWAYS_INLINE bool +tsd_get_allocates(void) { + return true; +} + +/* Get/set. 
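+ *
+ * (Added note: this backend allocates its wrapper lazily, so
+ * tsd_get_allocates() above returns true and tsd_get(false) below may
+ * return NULL; callers such as tsd_fetch_impl() in tsd.h must tolerate
+ * that.)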
*/ +JEMALLOC_ALWAYS_INLINE tsd_t * +tsd_get(bool init) { + tsd_wrapper_t *wrapper; + + assert(tsd_booted); + wrapper = tsd_wrapper_get(init); + if (tsd_get_allocates() && !init && wrapper == NULL) { + return NULL; + } + return &wrapper->val; +} + +JEMALLOC_ALWAYS_INLINE void +tsd_set(tsd_t *val) { + tsd_wrapper_t *wrapper; + + assert(tsd_booted); + wrapper = tsd_wrapper_get(true); + if (likely(&wrapper->val != val)) { + wrapper->val = *(val); + } + wrapper->initialized = true; +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/tsd_internals.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/tsd_internals.h new file mode 100644 index 000000000..439f1d10c --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/tsd_internals.h @@ -0,0 +1,304 @@ +#ifdef JEMALLOC_INTERNAL_TSD_INTERNALS_H +#error This file should be included only once, by one of tsd_malloc_thread_cleanup.h, tsd_tls.h, tsd_generic.h, or tsd_win.h +#endif +#define JEMALLOC_INTERNAL_TSD_INTERNALS_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/activity_callback.h" +#include "jemalloc/internal/arena_types.h" +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/bin_types.h" +#include "jemalloc/internal/jemalloc_internal_externs.h" +#include "jemalloc/internal/peak.h" +#include "jemalloc/internal/prof_types.h" +#include "jemalloc/internal/ql.h" +#include "jemalloc/internal/rtree_tsd.h" +#include "jemalloc/internal/tcache_structs.h" +#include "jemalloc/internal/tcache_types.h" +#include "jemalloc/internal/tsd_types.h" +#include "jemalloc/internal/util.h" +#include "jemalloc/internal/witness.h" + +/* + * Thread-Specific-Data layout + * + * At least some thread-local data gets touched on the fast-path of almost all + * malloc operations. But much of it is only necessary down slow-paths, or + * testing. We want to colocate the fast-path data so that it can live on the + * same cacheline if possible. So we define three tiers of hotness: + * TSD_DATA_FAST: Touched on the alloc/dalloc fast paths. + * TSD_DATA_SLOW: Touched down slow paths. "Slow" here is sort of general; + * there are "semi-slow" paths like "not a sized deallocation, but can still + * live in the tcache". We'll want to keep these closer to the fast-path + * data. + * TSD_DATA_SLOWER: Only touched in test or debug modes, or not touched at all. + * + * An additional concern is that the larger tcache bins won't be used (we have a + * bin per size class, but by default only cache relatively small objects). So + * the earlier bins are in the TSD_DATA_FAST tier, but the later ones are in the + * TSD_DATA_SLOWER tier. + * + * As a result of all this, we put the slow data first, then the fast data, then + * the slower data, while keeping the tcache as the last element of the fast + * data (so that the fast -> slower transition happens midway through the + * tcache). While we don't yet play alignment tricks to guarantee it, this + * increases our odds of getting some cache/page locality on fast paths. 
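+ *
+ * Sketch of the resulting ordering (an added illustration; struct tsd_s in
+ * this header is the authoritative definition):
+ *
+ *     struct tsd_s {
+ *         ...TSD_DATA_SLOW fields...     <- slow/semi-slow path data
+ *         tsd_state_t state;             <- the single hot state byte
+ *         ...TSD_DATA_FAST fields...     <- fast-path data, tcache last
+ *         ...TSD_DATA_SLOWER fields...   <- test/debug-only data
+ *     };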
+ */ + +#ifdef JEMALLOC_JET +typedef void (*test_callback_t)(int *); +# define MALLOC_TSD_TEST_DATA_INIT 0x72b65c10 +# define MALLOC_TEST_TSD \ + O(test_data, int, int) \ + O(test_callback, test_callback_t, int) +# define MALLOC_TEST_TSD_INITIALIZER , MALLOC_TSD_TEST_DATA_INIT, NULL +#else +# define MALLOC_TEST_TSD +# define MALLOC_TEST_TSD_INITIALIZER +#endif + +typedef ql_elm(tsd_t) tsd_link_t; + +/* O(name, type, nullable type) */ +#define TSD_DATA_SLOW \ + O(tcache_enabled, bool, bool) \ + O(reentrancy_level, int8_t, int8_t) \ + O(min_init_state_nfetched, uint8_t, uint8_t) \ + O(thread_allocated_last_event, uint64_t, uint64_t) \ + O(thread_allocated_next_event, uint64_t, uint64_t) \ + O(thread_deallocated_last_event, uint64_t, uint64_t) \ + O(thread_deallocated_next_event, uint64_t, uint64_t) \ + O(tcache_gc_event_wait, uint64_t, uint64_t) \ + O(tcache_gc_dalloc_event_wait, uint64_t, uint64_t) \ + O(prof_sample_event_wait, uint64_t, uint64_t) \ + O(prof_sample_last_event, uint64_t, uint64_t) \ + O(stats_interval_event_wait, uint64_t, uint64_t) \ + O(stats_interval_last_event, uint64_t, uint64_t) \ + O(peak_alloc_event_wait, uint64_t, uint64_t) \ + O(peak_dalloc_event_wait, uint64_t, uint64_t) \ + O(prof_tdata, prof_tdata_t *, prof_tdata_t *) \ + O(prng_state, uint64_t, uint64_t) \ + O(san_extents_until_guard_small, uint64_t, uint64_t) \ + O(san_extents_until_guard_large, uint64_t, uint64_t) \ + O(iarena, arena_t *, arena_t *) \ + O(arena, arena_t *, arena_t *) \ + O(arena_decay_ticker, ticker_geom_t, ticker_geom_t) \ + O(sec_shard, uint8_t, uint8_t) \ + O(binshards, tsd_binshards_t, tsd_binshards_t)\ + O(tsd_link, tsd_link_t, tsd_link_t) \ + O(in_hook, bool, bool) \ + O(peak, peak_t, peak_t) \ + O(activity_callback_thunk, activity_callback_thunk_t, \ + activity_callback_thunk_t) \ + O(tcache_slow, tcache_slow_t, tcache_slow_t) \ + O(rtree_ctx, rtree_ctx_t, rtree_ctx_t) + +#define TSD_DATA_SLOW_INITIALIZER \ + /* tcache_enabled */ TCACHE_ENABLED_ZERO_INITIALIZER, \ + /* reentrancy_level */ 0, \ + /* min_init_state_nfetched */ 0, \ + /* thread_allocated_last_event */ 0, \ + /* thread_allocated_next_event */ 0, \ + /* thread_deallocated_last_event */ 0, \ + /* thread_deallocated_next_event */ 0, \ + /* tcache_gc_event_wait */ 0, \ + /* tcache_gc_dalloc_event_wait */ 0, \ + /* prof_sample_event_wait */ 0, \ + /* prof_sample_last_event */ 0, \ + /* stats_interval_event_wait */ 0, \ + /* stats_interval_last_event */ 0, \ + /* peak_alloc_event_wait */ 0, \ + /* peak_dalloc_event_wait */ 0, \ + /* prof_tdata */ NULL, \ + /* prng_state */ 0, \ + /* san_extents_until_guard_small */ 0, \ + /* san_extents_until_guard_large */ 0, \ + /* iarena */ NULL, \ + /* arena */ NULL, \ + /* arena_decay_ticker */ \ + TICKER_GEOM_INIT(ARENA_DECAY_NTICKS_PER_UPDATE), \ + /* sec_shard */ (uint8_t)-1, \ + /* binshards */ TSD_BINSHARDS_ZERO_INITIALIZER, \ + /* tsd_link */ {NULL}, \ + /* in_hook */ false, \ + /* peak */ PEAK_INITIALIZER, \ + /* activity_callback_thunk */ \ + ACTIVITY_CALLBACK_THUNK_INITIALIZER, \ + /* tcache_slow */ TCACHE_SLOW_ZERO_INITIALIZER, \ + /* rtree_ctx */ RTREE_CTX_INITIALIZER, + +/* O(name, type, nullable type) */ +#define TSD_DATA_FAST \ + O(thread_allocated, uint64_t, uint64_t) \ + O(thread_allocated_next_event_fast, uint64_t, uint64_t) \ + O(thread_deallocated, uint64_t, uint64_t) \ + O(thread_deallocated_next_event_fast, uint64_t, uint64_t) \ + O(tcache, tcache_t, tcache_t) + +#define TSD_DATA_FAST_INITIALIZER \ + /* thread_allocated */ 0, \ + /* thread_allocated_next_event_fast */ 
0, \ + /* thread_deallocated */ 0, \ + /* thread_deallocated_next_event_fast */ 0, \ + /* tcache */ TCACHE_ZERO_INITIALIZER, + +/* O(name, type, nullable type) */ +#define TSD_DATA_SLOWER \ + O(witness_tsd, witness_tsd_t, witness_tsdn_t) \ + MALLOC_TEST_TSD + +#define TSD_DATA_SLOWER_INITIALIZER \ + /* witness */ WITNESS_TSD_INITIALIZER \ + /* test data */ MALLOC_TEST_TSD_INITIALIZER + + +#define TSD_INITIALIZER { \ + TSD_DATA_SLOW_INITIALIZER \ + /* state */ ATOMIC_INIT(tsd_state_uninitialized), \ + TSD_DATA_FAST_INITIALIZER \ + TSD_DATA_SLOWER_INITIALIZER \ +} + +#if defined(JEMALLOC_MALLOC_THREAD_CLEANUP) || defined(_WIN32) +void _malloc_tsd_cleanup_register(bool (*f)(void)); +#endif + +void *malloc_tsd_malloc(size_t size); +void malloc_tsd_dalloc(void *wrapper); +tsd_t *malloc_tsd_boot0(void); +void malloc_tsd_boot1(void); +void tsd_cleanup(void *arg); +tsd_t *tsd_fetch_slow(tsd_t *tsd, bool minimal); +void tsd_state_set(tsd_t *tsd, uint8_t new_state); +void tsd_slow_update(tsd_t *tsd); +void tsd_prefork(tsd_t *tsd); +void tsd_postfork_parent(tsd_t *tsd); +void tsd_postfork_child(tsd_t *tsd); + +/* + * Call ..._inc when your module wants to take all threads down the slow paths, + * and ..._dec when it no longer needs to. + */ +void tsd_global_slow_inc(tsdn_t *tsdn); +void tsd_global_slow_dec(tsdn_t *tsdn); +bool tsd_global_slow(void); + +#define TSD_MIN_INIT_STATE_MAX_FETCHED (128) + +enum { + /* Common case --> jnz. */ + tsd_state_nominal = 0, + /* Initialized but on slow path. */ + tsd_state_nominal_slow = 1, + /* + * Some thread has changed global state in such a way that all nominal + * threads need to recompute their fast / slow status the next time they + * get a chance. + * + * Any thread can change another thread's status *to* recompute, but + * threads are the only ones who can change their status *from* + * recompute. + */ + tsd_state_nominal_recompute = 2, + /* + * The above nominal states should be lower values. We use + * tsd_nominal_max to separate nominal states from threads in the + * process of being born / dying. + */ + tsd_state_nominal_max = 2, + + /* + * A thread might free() during its death as its only allocator action; + * in such scenarios, we need tsd, but set up in such a way that no + * cleanup is necessary. + */ + tsd_state_minimal_initialized = 3, + /* States during which we know we're in thread death. */ + tsd_state_purgatory = 4, + tsd_state_reincarnated = 5, + /* + * What it says on the tin; tsd that hasn't been initialized. Note + * that even when the tsd struct lives in TLS, when need to keep track + * of stuff like whether or not our pthread destructors have been + * scheduled, so this really truly is different than the nominal state. + */ + tsd_state_uninitialized = 6 +}; + +/* + * Some TSD accesses can only be done in a nominal state. To enforce this, we + * wrap TSD member access in a function that asserts on TSD state, and mangle + * field names to prevent touching them accidentally. + */ +#define TSD_MANGLE(n) cant_access_tsd_items_directly_use_a_getter_or_setter_##n + +#ifdef JEMALLOC_U8_ATOMICS +# define tsd_state_t atomic_u8_t +# define tsd_atomic_load atomic_load_u8 +# define tsd_atomic_store atomic_store_u8 +# define tsd_atomic_exchange atomic_exchange_u8 +#else +# define tsd_state_t atomic_u32_t +# define tsd_atomic_load atomic_load_u32 +# define tsd_atomic_store atomic_store_u32 +# define tsd_atomic_exchange atomic_exchange_u32 +#endif + +/* The actual tsd. 
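+ *
+ * For illustration (added here, not upstream text): a slot declared above
+ * as
+ *
+ *     O(reentrancy_level, int8_t, int8_t)
+ *
+ * becomes the mangled field
+ *
+ *     int8_t TSD_MANGLE(reentrancy_level);
+ *
+ * i.e. cant_access_tsd_items_directly_use_a_getter_or_setter_reentrancy_level,
+ * and the O() expansions in tsd.h generate the accessors
+ *
+ *     int8_t *tsd_reentrancy_levelp_get(tsd_t *tsd);
+ *     int8_t tsd_reentrancy_level_get(tsd_t *tsd);
+ *     void tsd_reentrancy_level_set(tsd_t *tsd, int8_t val);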
*/ +struct tsd_s { + /* + * The contents should be treated as totally opaque outside the tsd + * module. Access any thread-local state through the getters and + * setters below. + */ + +#define O(n, t, nt) \ + t TSD_MANGLE(n); + + TSD_DATA_SLOW + /* + * We manually limit the state to just a single byte. Unless the 8-bit + * atomics are unavailable (which is rare). + */ + tsd_state_t state; + TSD_DATA_FAST + TSD_DATA_SLOWER +#undef O +}; + +JEMALLOC_ALWAYS_INLINE uint8_t +tsd_state_get(tsd_t *tsd) { + /* + * This should be atomic. Unfortunately, compilers right now can't tell + * that this can be done as a memory comparison, and forces a load into + * a register that hurts fast-path performance. + */ + /* return atomic_load_u8(&tsd->state, ATOMIC_RELAXED); */ + return *(uint8_t *)&tsd->state; +} + +/* + * Wrapper around tsd_t that makes it possible to avoid implicit conversion + * between tsd_t and tsdn_t, where tsdn_t is "nullable" and has to be + * explicitly converted to tsd_t, which is non-nullable. + */ +struct tsdn_s { + tsd_t tsd; +}; +#define TSDN_NULL ((tsdn_t *)0) +JEMALLOC_ALWAYS_INLINE tsdn_t * +tsd_tsdn(tsd_t *tsd) { + return (tsdn_t *)tsd; +} + +JEMALLOC_ALWAYS_INLINE bool +tsdn_null(const tsdn_t *tsdn) { + return tsdn == NULL; +} + +JEMALLOC_ALWAYS_INLINE tsd_t * +tsdn_tsd(tsdn_t *tsdn) { + assert(!tsdn_null(tsdn)); + + return &tsdn->tsd; +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/tsd_malloc_thread_cleanup.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/tsd_malloc_thread_cleanup.h new file mode 100644 index 000000000..fb9ea1b45 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/tsd_malloc_thread_cleanup.h @@ -0,0 +1,65 @@ +#ifdef JEMALLOC_INTERNAL_TSD_MALLOC_THREAD_CLEANUP_H +#error This file should be included only once, by tsd.h. +#endif +#define JEMALLOC_INTERNAL_TSD_MALLOC_THREAD_CLEANUP_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/tsd_internals.h" +#include "jemalloc/internal/tsd_types.h" + +#define JEMALLOC_TSD_TYPE_ATTR(type) __thread type JEMALLOC_TLS_MODEL + +extern JEMALLOC_TSD_TYPE_ATTR(tsd_t) tsd_tls; +extern JEMALLOC_TSD_TYPE_ATTR(bool) tsd_initialized; +extern bool tsd_booted; + +/* Initialization/cleanup. */ +JEMALLOC_ALWAYS_INLINE bool +tsd_cleanup_wrapper(void) { + if (tsd_initialized) { + tsd_initialized = false; + tsd_cleanup(&tsd_tls); + } + return tsd_initialized; +} + +JEMALLOC_ALWAYS_INLINE bool +tsd_boot0(void) { + _malloc_tsd_cleanup_register(&tsd_cleanup_wrapper); + tsd_booted = true; + return false; +} + +JEMALLOC_ALWAYS_INLINE void +tsd_boot1(void) { + /* Do nothing. */ +} + +JEMALLOC_ALWAYS_INLINE bool +tsd_boot(void) { + return tsd_boot0(); +} + +JEMALLOC_ALWAYS_INLINE bool +tsd_booted_get(void) { + return tsd_booted; +} + +JEMALLOC_ALWAYS_INLINE bool +tsd_get_allocates(void) { + return false; +} + +/* Get/set. 
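+ *
+ * (Added note: with this backend the tsd lives directly in a __thread
+ * variable, so tsd_get() simply returns &tsd_tls, never allocates, and
+ * tsd_get_allocates() above returns false.)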
*/ +JEMALLOC_ALWAYS_INLINE tsd_t * +tsd_get(bool init) { + return &tsd_tls; +} +JEMALLOC_ALWAYS_INLINE void +tsd_set(tsd_t *val) { + assert(tsd_booted); + if (likely(&tsd_tls != val)) { + tsd_tls = (*val); + } + tsd_initialized = true; +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/tsd_tls.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/tsd_tls.h new file mode 100644 index 000000000..5e5a6e5e3 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/tsd_tls.h @@ -0,0 +1,64 @@ +#ifdef JEMALLOC_INTERNAL_TSD_TLS_H +#error This file should be included only once, by tsd.h. +#endif +#define JEMALLOC_INTERNAL_TSD_TLS_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/tsd_internals.h" +#include "jemalloc/internal/tsd_types.h" + +#define JEMALLOC_TSD_TYPE_ATTR(type) __thread type JEMALLOC_TLS_MODEL + +extern JEMALLOC_TSD_TYPE_ATTR(tsd_t) tsd_tls; +extern pthread_key_t tsd_tsd; +extern bool tsd_booted; + +/* Initialization/cleanup. */ +JEMALLOC_ALWAYS_INLINE bool +tsd_boot0(void) { + if (pthread_key_create(&tsd_tsd, &tsd_cleanup) != 0) { + return true; + } + tsd_booted = true; + return false; +} + +JEMALLOC_ALWAYS_INLINE void +tsd_boot1(void) { + /* Do nothing. */ +} + +JEMALLOC_ALWAYS_INLINE bool +tsd_boot(void) { + return tsd_boot0(); +} + +JEMALLOC_ALWAYS_INLINE bool +tsd_booted_get(void) { + return tsd_booted; +} + +JEMALLOC_ALWAYS_INLINE bool +tsd_get_allocates(void) { + return false; +} + +/* Get/set. */ +JEMALLOC_ALWAYS_INLINE tsd_t * +tsd_get(bool init) { + return &tsd_tls; +} + +JEMALLOC_ALWAYS_INLINE void +tsd_set(tsd_t *val) { + assert(tsd_booted); + if (likely(&tsd_tls != val)) { + tsd_tls = (*val); + } + if (pthread_setspecific(tsd_tsd, (void *)(&tsd_tls)) != 0) { + malloc_write(": Error setting tsd.\n"); + if (opt_abort) { + abort(); + } + } +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/tsd_types.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/tsd_types.h new file mode 100644 index 000000000..73bbe486e --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/tsd_types.h @@ -0,0 +1,12 @@ +#ifndef JEMALLOC_INTERNAL_TSD_TYPES_H +#define JEMALLOC_INTERNAL_TSD_TYPES_H + +#define MALLOC_TSD_CLEANUPS_MAX 4 + +#include "jemalloc/internal/jemalloc_preamble.h" + +typedef struct tsd_s tsd_t; +typedef struct tsdn_s tsdn_t; +typedef bool (*malloc_tsd_cleanup_t)(void); + +#endif /* JEMALLOC_INTERNAL_TSD_TYPES_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/tsd_win.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/tsd_win.h new file mode 100644 index 000000000..8ec7eda72 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/tsd_win.h @@ -0,0 +1,143 @@ +#ifdef JEMALLOC_INTERNAL_TSD_WIN_H +#error This file should be included only once, by tsd.h. +#endif +#define JEMALLOC_INTERNAL_TSD_WIN_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/tsd_internals.h" +#include "jemalloc/internal/tsd_types.h" + +typedef struct { + bool initialized; + tsd_t val; +} tsd_wrapper_t; + +extern DWORD tsd_tsd; +extern tsd_wrapper_t tsd_boot_wrapper; +extern bool tsd_booted; + +/* Initialization/cleanup. 
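+ *
+ * (Added note: TlsGetValue() overwrites the calling thread's last-error
+ * code even on success, which is why the helpers below save it with
+ * GetLastError() and restore it with SetLastError() around the lookup.)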
*/ +JEMALLOC_ALWAYS_INLINE bool +tsd_cleanup_wrapper(void) { + DWORD error = GetLastError(); + tsd_wrapper_t *wrapper = (tsd_wrapper_t *)TlsGetValue(tsd_tsd); + SetLastError(error); + + if (wrapper == NULL) { + return false; + } + + if (wrapper->initialized) { + wrapper->initialized = false; + tsd_cleanup(&wrapper->val); + if (wrapper->initialized) { + /* Trigger another cleanup round. */ + return true; + } + } + malloc_tsd_dalloc(wrapper); + return false; +} + +JEMALLOC_ALWAYS_INLINE void +tsd_wrapper_set(tsd_wrapper_t *wrapper) { + if (!TlsSetValue(tsd_tsd, (void *)wrapper)) { + malloc_write(": Error setting TSD\n"); + abort(); + } +} + +JEMALLOC_ALWAYS_INLINE tsd_wrapper_t * +tsd_wrapper_get(bool init) { + DWORD error = GetLastError(); + tsd_wrapper_t *wrapper = (tsd_wrapper_t *) TlsGetValue(tsd_tsd); + SetLastError(error); + + if (init && unlikely(wrapper == NULL)) { + wrapper = (tsd_wrapper_t *) + malloc_tsd_malloc(sizeof(tsd_wrapper_t)); + if (wrapper == NULL) { + malloc_write(": Error allocating TSD\n"); + abort(); + } else { + wrapper->initialized = false; + /* MSVC is finicky about aggregate initialization. */ + tsd_t tsd_initializer = TSD_INITIALIZER; + wrapper->val = tsd_initializer; + } + tsd_wrapper_set(wrapper); + } + return wrapper; +} + +JEMALLOC_ALWAYS_INLINE bool +tsd_boot0(void) { + tsd_tsd = TlsAlloc(); + if (tsd_tsd == TLS_OUT_OF_INDEXES) { + return true; + } + _malloc_tsd_cleanup_register(&tsd_cleanup_wrapper); + tsd_wrapper_set(&tsd_boot_wrapper); + tsd_booted = true; + return false; +} + +JEMALLOC_ALWAYS_INLINE void +tsd_boot1(void) { + tsd_wrapper_t *wrapper; + wrapper = (tsd_wrapper_t *) + malloc_tsd_malloc(sizeof(tsd_wrapper_t)); + if (wrapper == NULL) { + malloc_write(": Error allocating TSD\n"); + abort(); + } + tsd_boot_wrapper.initialized = false; + tsd_cleanup(&tsd_boot_wrapper.val); + wrapper->initialized = false; + tsd_t initializer = TSD_INITIALIZER; + wrapper->val = initializer; + tsd_wrapper_set(wrapper); +} +JEMALLOC_ALWAYS_INLINE bool +tsd_boot(void) { + if (tsd_boot0()) { + return true; + } + tsd_boot1(); + return false; +} + +JEMALLOC_ALWAYS_INLINE bool +tsd_booted_get(void) { + return tsd_booted; +} + +JEMALLOC_ALWAYS_INLINE bool +tsd_get_allocates(void) { + return true; +} + +/* Get/set. */ +JEMALLOC_ALWAYS_INLINE tsd_t * +tsd_get(bool init) { + tsd_wrapper_t *wrapper; + + assert(tsd_booted); + wrapper = tsd_wrapper_get(init); + if (tsd_get_allocates() && !init && wrapper == NULL) { + return NULL; + } + return &wrapper->val; +} + +JEMALLOC_ALWAYS_INLINE void +tsd_set(tsd_t *val) { + tsd_wrapper_t *wrapper; + + assert(tsd_booted); + wrapper = tsd_wrapper_get(true); + if (likely(&wrapper->val != val)) { + wrapper->val = *(val); + } + wrapper->initialized = true; +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/typed_list.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/typed_list.h new file mode 100644 index 000000000..6535055a1 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/typed_list.h @@ -0,0 +1,55 @@ +#ifndef JEMALLOC_INTERNAL_TYPED_LIST_H +#define JEMALLOC_INTERNAL_TYPED_LIST_H + +/* + * This wraps the ql module to implement a list class in a way that's a little + * bit easier to use; it handles ql_elm_new calls and provides type safety. 
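+ *
+ * Illustrative usage with a hypothetical element type (an added sketch,
+ * not upstream code):
+ *
+ *     typedef struct node_s node_t;
+ *     struct node_s {
+ *         int payload;
+ *         ql_elm(node_t) link;
+ *     };
+ *     TYPED_LIST(node_list, node_t, link)
+ *
+ *     node_list_t list;
+ *     node_list_init(&list);
+ *     node_list_append(&list, &n1);   <- calls ql_elm_new internally
+ *     node_t *head = node_list_first(&list);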
+ */ + +#define TYPED_LIST(list_type, el_type, linkage) \ +typedef struct { \ + ql_head(el_type) head; \ +} list_type##_t; \ +static inline void \ +list_type##_init(list_type##_t *list) { \ + ql_new(&list->head); \ +} \ +static inline el_type * \ +list_type##_first(const list_type##_t *list) { \ + return ql_first(&list->head); \ +} \ +static inline el_type * \ +list_type##_last(const list_type##_t *list) { \ + return ql_last(&list->head, linkage); \ +} \ +static inline void \ +list_type##_append(list_type##_t *list, el_type *item) { \ + ql_elm_new(item, linkage); \ + ql_tail_insert(&list->head, item, linkage); \ +} \ +static inline void \ +list_type##_prepend(list_type##_t *list, el_type *item) { \ + ql_elm_new(item, linkage); \ + ql_head_insert(&list->head, item, linkage); \ +} \ +static inline void \ +list_type##_replace(list_type##_t *list, el_type *to_remove, \ + el_type *to_insert) { \ + ql_elm_new(to_insert, linkage); \ + ql_after_insert(to_remove, to_insert, linkage); \ + ql_remove(&list->head, to_remove, linkage); \ +} \ +static inline void \ +list_type##_remove(list_type##_t *list, el_type *item) { \ + ql_remove(&list->head, item, linkage); \ +} \ +static inline bool \ +list_type##_empty(list_type##_t *list) { \ + return ql_empty(&list->head); \ +} \ +static inline void \ +list_type##_concat(list_type##_t *list_a, list_type##_t *list_b) { \ + ql_concat(&list_a->head, &list_b->head, linkage); \ +} + +#endif /* JEMALLOC_INTERNAL_TYPED_LIST_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/util.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/util.h new file mode 100644 index 000000000..f40350951 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/util.h @@ -0,0 +1,141 @@ +#ifndef JEMALLOC_INTERNAL_UTIL_H +#define JEMALLOC_INTERNAL_UTIL_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_types.h" + +#define UTIL_INLINE static inline + +/* Junk fill patterns. */ +#ifndef JEMALLOC_ALLOC_JUNK +# define JEMALLOC_ALLOC_JUNK ((uint8_t)0xa5) +#endif +#ifndef JEMALLOC_FREE_JUNK +# define JEMALLOC_FREE_JUNK ((uint8_t)0x5a) +#endif + +/* + * Wrap a cpp argument that contains commas such that it isn't broken up into + * multiple arguments. + */ +#define JEMALLOC_ARG_CONCAT(...) __VA_ARGS__ + +/* cpp macro definition stringification. */ +#define STRINGIFY_HELPER(x) #x +#define STRINGIFY(x) STRINGIFY_HELPER(x) + +/* + * Silence compiler warnings due to uninitialized values. This is used + * wherever the compiler fails to recognize that the variable is never used + * uninitialized. + */ +#define JEMALLOC_CC_SILENCE_INIT(v) = v + +#ifdef __GNUC__ +# define likely(x) __builtin_expect(!!(x), 1) +# define unlikely(x) __builtin_expect(!!(x), 0) +#else +# define likely(x) !!(x) +# define unlikely(x) !!(x) +#endif + +#if !defined(JEMALLOC_INTERNAL_UNREACHABLE) +# error JEMALLOC_INTERNAL_UNREACHABLE should have been defined by configure +#endif + +#define unreachable() JEMALLOC_INTERNAL_UNREACHABLE() + +/* Set error code. */ +UTIL_INLINE void +set_errno(int errnum) { +#ifdef _WIN32 + SetLastError(errnum); +#else + errno = errnum; +#endif +} + +/* Get last error code. 
*/ +UTIL_INLINE int +get_errno(void) { +#ifdef _WIN32 + return GetLastError(); +#else + return errno; +#endif +} + +#ifdef _MSC_VER +#define util_assume __assume +#elif defined(__clang__) && (__clang_major__ > 3 || \ + (__clang_major__ == 3 && __clang_minor__ >= 6)) +#define util_assume __builtin_assume +#else +#define util_assume(expr) \ + do { \ + if (!(expr)) { \ + unreachable(); \ + } \ + } while(0) +#endif + +/* ptr should be valid. */ +JEMALLOC_ALWAYS_INLINE void +util_prefetch_read(void *ptr) { + /* + * This should arguably be a config check; but any version of GCC so old + * that it doesn't support __builtin_prefetch is also too old to build + * jemalloc. + */ +#ifdef __GNUC__ + if (config_debug) { + /* Enforce the "valid ptr" requirement. */ + *(volatile char *)ptr; + } + __builtin_prefetch(ptr, /* read or write */ 0, /* locality hint */ 3); +#else + *(volatile char *)ptr; +#endif +} + +JEMALLOC_ALWAYS_INLINE void +util_prefetch_write(void *ptr) { +#ifdef __GNUC__ + if (config_debug) { + *(volatile char *)ptr; + } + /* + * The only difference from the read variant is that this has a 1 as the + * second argument (the write hint). + */ + __builtin_prefetch(ptr, 1, 3); +#else + *(volatile char *)ptr; +#endif +} + +JEMALLOC_ALWAYS_INLINE void +util_prefetch_read_range(void *ptr, size_t sz) { + for (size_t i = 0; i < sz; i += CACHELINE) { + util_prefetch_read((void *)((byte_t *)ptr + i)); + } +} + +JEMALLOC_ALWAYS_INLINE void +util_prefetch_write_range(void *ptr, size_t sz) { + for (size_t i = 0; i < sz; i += CACHELINE) { + util_prefetch_write((void *)((byte_t *)ptr + i)); + } +} + +#undef UTIL_INLINE + +/* + * Reads the settings in the following format: + * key1-key2:value|key3-key4:value|... + * Note it does not handle the ending '\0'. + */ +bool +multi_setting_parse_next(const char **setting_segment_cur, size_t *len_left, + size_t *key_start, size_t *key_end, size_t *value); +#endif /* JEMALLOC_INTERNAL_UTIL_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/witness.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/witness.h new file mode 100644 index 000000000..afee1246a --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/internal/witness.h @@ -0,0 +1,384 @@ +#ifndef JEMALLOC_INTERNAL_WITNESS_H +#define JEMALLOC_INTERNAL_WITNESS_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/ql.h" + +/******************************************************************************/ +/* LOCK RANKS */ +/******************************************************************************/ + +enum witness_rank_e { + /* + * Order matters within this enum listing -- higher valued locks can + * only be acquired after lower-valued ones. We use the + * auto-incrementing-ness of enum values to enforce this. + */ + + /* + * Witnesses with rank WITNESS_RANK_OMIT are completely ignored by the + * witness machinery. + */ + WITNESS_RANK_OMIT, + WITNESS_RANK_MIN, + WITNESS_RANK_INIT = WITNESS_RANK_MIN, + WITNESS_RANK_CTL, + WITNESS_RANK_TCACHES, + WITNESS_RANK_ARENAS, + WITNESS_RANK_BACKGROUND_THREAD_GLOBAL, + WITNESS_RANK_PROF_DUMP, + WITNESS_RANK_PROF_BT2GCTX, + WITNESS_RANK_PROF_TDATAS, + WITNESS_RANK_PROF_TDATA, + WITNESS_RANK_PROF_LOG, + WITNESS_RANK_PROF_GCTX, + WITNESS_RANK_PROF_RECENT_DUMP, + WITNESS_RANK_BACKGROUND_THREAD, + /* + * Used as an argument to witness_assert_depth_to_rank() in order to + * validate depth excluding non-core locks with lower ranks. 
Since the + * rank argument to witness_assert_depth_to_rank() is inclusive rather + * than exclusive, this definition can have the same value as the + * minimally ranked core lock. + */ + WITNESS_RANK_CORE, + WITNESS_RANK_DECAY = WITNESS_RANK_CORE, + WITNESS_RANK_TCACHE_QL, + + WITNESS_RANK_SEC_SHARD, + + WITNESS_RANK_EXTENT_GROW, + WITNESS_RANK_HPA_SHARD_GROW = WITNESS_RANK_EXTENT_GROW, + WITNESS_RANK_SAN_BUMP_ALLOC = WITNESS_RANK_EXTENT_GROW, + + WITNESS_RANK_EXTENTS, + WITNESS_RANK_HPA_SHARD = WITNESS_RANK_EXTENTS, + + WITNESS_RANK_HPA_CENTRAL_GROW, + WITNESS_RANK_HPA_CENTRAL, + + WITNESS_RANK_EDATA_CACHE, + + WITNESS_RANK_RTREE, + WITNESS_RANK_BASE, + WITNESS_RANK_ARENA_LARGE, + WITNESS_RANK_HOOK, + WITNESS_RANK_BIN, + + WITNESS_RANK_LEAF=0x1000, + WITNESS_RANK_BATCHER=WITNESS_RANK_LEAF, + WITNESS_RANK_ARENA_STATS = WITNESS_RANK_LEAF, + WITNESS_RANK_COUNTER_ACCUM = WITNESS_RANK_LEAF, + WITNESS_RANK_DSS = WITNESS_RANK_LEAF, + WITNESS_RANK_PROF_ACTIVE = WITNESS_RANK_LEAF, + WITNESS_RANK_PROF_DUMP_FILENAME = WITNESS_RANK_LEAF, + WITNESS_RANK_PROF_GDUMP = WITNESS_RANK_LEAF, + WITNESS_RANK_PROF_NEXT_THR_UID = WITNESS_RANK_LEAF, + WITNESS_RANK_PROF_RECENT_ALLOC = WITNESS_RANK_LEAF, + WITNESS_RANK_PROF_STATS = WITNESS_RANK_LEAF, + WITNESS_RANK_PROF_THREAD_ACTIVE_INIT = WITNESS_RANK_LEAF, +}; +typedef enum witness_rank_e witness_rank_t; + +/******************************************************************************/ +/* PER-WITNESS DATA */ +/******************************************************************************/ +#if defined(JEMALLOC_DEBUG) +# define WITNESS_INITIALIZER(name, rank) {name, rank, NULL, NULL, {NULL, NULL}} +#else +# define WITNESS_INITIALIZER(name, rank) +#endif + +typedef struct witness_s witness_t; +typedef ql_head(witness_t) witness_list_t; +typedef int witness_comp_t (const witness_t *, void *, const witness_t *, + void *); + +struct witness_s { + /* Name, used for printing lock order reversal messages. */ + const char *name; + + /* + * Witness rank, where 0 is lowest and WITNESS_RANK_LEAF is highest. + * Witnesses must be acquired in order of increasing rank. + */ + witness_rank_t rank; + + /* + * If two witnesses are of equal rank and they have the samp comp + * function pointer, it is called as a last attempt to differentiate + * between witnesses of equal rank. + */ + witness_comp_t *comp; + + /* Opaque data, passed to comp(). */ + void *opaque; + + /* Linkage for thread's currently owned locks. 
*/ + ql_elm(witness_t) link; +}; + +/******************************************************************************/ +/* PER-THREAD DATA */ +/******************************************************************************/ +typedef struct witness_tsd_s witness_tsd_t; +struct witness_tsd_s { + witness_list_t witnesses; + bool forking; +}; + +#define WITNESS_TSD_INITIALIZER { ql_head_initializer(witnesses), false } +#define WITNESS_TSDN_NULL ((witness_tsdn_t *)0) + +/******************************************************************************/ +/* (PER-THREAD) NULLABILITY HELPERS */ +/******************************************************************************/ +typedef struct witness_tsdn_s witness_tsdn_t; +struct witness_tsdn_s { + witness_tsd_t witness_tsd; +}; + +JEMALLOC_ALWAYS_INLINE witness_tsdn_t * +witness_tsd_tsdn(witness_tsd_t *witness_tsd) { + return (witness_tsdn_t *)witness_tsd; +} + +JEMALLOC_ALWAYS_INLINE bool +witness_tsdn_null(witness_tsdn_t *witness_tsdn) { + return witness_tsdn == NULL; +} + +JEMALLOC_ALWAYS_INLINE witness_tsd_t * +witness_tsdn_tsd(witness_tsdn_t *witness_tsdn) { + assert(!witness_tsdn_null(witness_tsdn)); + return &witness_tsdn->witness_tsd; +} + +/******************************************************************************/ +/* API */ +/******************************************************************************/ +void witness_init(witness_t *witness, const char *name, witness_rank_t rank, + witness_comp_t *comp, void *opaque); + +typedef void (witness_lock_error_t)(const witness_list_t *, const witness_t *); +extern witness_lock_error_t *JET_MUTABLE witness_lock_error; + +typedef void (witness_owner_error_t)(const witness_t *); +extern witness_owner_error_t *JET_MUTABLE witness_owner_error; + +typedef void (witness_not_owner_error_t)(const witness_t *); +extern witness_not_owner_error_t *JET_MUTABLE witness_not_owner_error; + +typedef void (witness_depth_error_t)(const witness_list_t *, + witness_rank_t rank_inclusive, unsigned depth); +extern witness_depth_error_t *JET_MUTABLE witness_depth_error; + +void witnesses_cleanup(witness_tsd_t *witness_tsd); +void witness_prefork(witness_tsd_t *witness_tsd); +void witness_postfork_parent(witness_tsd_t *witness_tsd); +void witness_postfork_child(witness_tsd_t *witness_tsd); + +/* Helper, not intended for direct use. 
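+ *
+ * Callers are expected to go through witness_lock()/witness_unlock()
+ * below; a typical pattern is roughly the following (an added sketch, not
+ * upstream code; "my_lock" is a hypothetical name, and the witness_tsdn_t
+ * would come from the tsd accessors, e.g. tsdn_witness_tsdp_get(tsdn)):
+ *
+ *     witness_t w;
+ *     witness_init(&w, "my_lock", WITNESS_RANK_LEAF, NULL, NULL);
+ *     ...
+ *     witness_lock(witness_tsdn, &w);    <- asserts rank ordering
+ *     ... critical section ...
+ *     witness_unlock(witness_tsdn, &w);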
*/ +static inline bool +witness_owner(witness_tsd_t *witness_tsd, const witness_t *witness) { + witness_list_t *witnesses; + witness_t *w; + + cassert(config_debug); + + witnesses = &witness_tsd->witnesses; + ql_foreach(w, witnesses, link) { + if (w == witness) { + return true; + } + } + + return false; +} + +static inline void +witness_assert_owner(witness_tsdn_t *witness_tsdn, const witness_t *witness) { + witness_tsd_t *witness_tsd; + + if (!config_debug) { + return; + } + + if (witness_tsdn_null(witness_tsdn)) { + return; + } + witness_tsd = witness_tsdn_tsd(witness_tsdn); + if (witness->rank == WITNESS_RANK_OMIT) { + return; + } + + if (witness_owner(witness_tsd, witness)) { + return; + } + witness_owner_error(witness); +} + +static inline void +witness_assert_not_owner(witness_tsdn_t *witness_tsdn, + const witness_t *witness) { + witness_tsd_t *witness_tsd; + witness_list_t *witnesses; + witness_t *w; + + if (!config_debug) { + return; + } + + if (witness_tsdn_null(witness_tsdn)) { + return; + } + witness_tsd = witness_tsdn_tsd(witness_tsdn); + if (witness->rank == WITNESS_RANK_OMIT) { + return; + } + + witnesses = &witness_tsd->witnesses; + ql_foreach(w, witnesses, link) { + if (w == witness) { + witness_not_owner_error(witness); + } + } +} + +/* Returns depth. Not intended for direct use. */ +static inline unsigned +witness_depth_to_rank(witness_list_t *witnesses, witness_rank_t rank_inclusive) +{ + unsigned d = 0; + witness_t *w = ql_last(witnesses, link); + + if (w != NULL) { + ql_reverse_foreach(w, witnesses, link) { + if (w->rank < rank_inclusive) { + break; + } + d++; + } + } + + return d; +} + +static inline void +witness_assert_depth_to_rank(witness_tsdn_t *witness_tsdn, + witness_rank_t rank_inclusive, unsigned depth) { + if (!config_debug || witness_tsdn_null(witness_tsdn)) { + return; + } + + witness_list_t *witnesses = &witness_tsdn_tsd(witness_tsdn)->witnesses; + unsigned d = witness_depth_to_rank(witnesses, rank_inclusive); + + if (d != depth) { + witness_depth_error(witnesses, rank_inclusive, depth); + } +} + +static inline void +witness_assert_depth(witness_tsdn_t *witness_tsdn, unsigned depth) { + witness_assert_depth_to_rank(witness_tsdn, WITNESS_RANK_MIN, depth); +} + +static inline void +witness_assert_lockless(witness_tsdn_t *witness_tsdn) { + witness_assert_depth(witness_tsdn, 0); +} + +static inline void +witness_assert_positive_depth_to_rank(witness_tsdn_t *witness_tsdn, + witness_rank_t rank_inclusive) { + if (!config_debug || witness_tsdn_null(witness_tsdn)) { + return; + } + + witness_list_t *witnesses = &witness_tsdn_tsd(witness_tsdn)->witnesses; + unsigned d = witness_depth_to_rank(witnesses, rank_inclusive); + + if (d == 0) { + witness_depth_error(witnesses, rank_inclusive, 1); + } +} + +static inline void +witness_lock(witness_tsdn_t *witness_tsdn, witness_t *witness) { + witness_tsd_t *witness_tsd; + witness_list_t *witnesses; + witness_t *w; + + if (!config_debug) { + return; + } + + if (witness_tsdn_null(witness_tsdn)) { + return; + } + witness_tsd = witness_tsdn_tsd(witness_tsdn); + if (witness->rank == WITNESS_RANK_OMIT) { + return; + } + + witness_assert_not_owner(witness_tsdn, witness); + + witnesses = &witness_tsd->witnesses; + w = ql_last(witnesses, link); + if (w == NULL) { + /* No other locks; do nothing. */ + } else if (witness_tsd->forking && w->rank <= witness->rank) { + /* Forking, and relaxed ranking satisfied. */ + } else if (w->rank > witness->rank) { + /* Not forking, rank order reversal. 
*/ + witness_lock_error(witnesses, witness); + } else if (w->rank == witness->rank && (w->comp == NULL || w->comp != + witness->comp || w->comp(w, w->opaque, witness, witness->opaque) > + 0)) { + /* + * Missing/incompatible comparison function, or comparison + * function indicates rank order reversal. + */ + witness_lock_error(witnesses, witness); + } + + /* Suppress spurious warning from static analysis */ + assert(ql_empty(witnesses) || + qr_prev(ql_first(witnesses), link) != NULL); + ql_elm_new(witness, link); + ql_tail_insert(witnesses, witness, link); +} + +static inline void +witness_unlock(witness_tsdn_t *witness_tsdn, witness_t *witness) { + witness_tsd_t *witness_tsd; + witness_list_t *witnesses; + + if (!config_debug) { + return; + } + + if (witness_tsdn_null(witness_tsdn)) { + return; + } + witness_tsd = witness_tsdn_tsd(witness_tsdn); + if (witness->rank == WITNESS_RANK_OMIT) { + return; + } + + /* + * Check whether owner before removal, rather than relying on + * witness_assert_owner() to abort, so that unit tests can test this + * function's failure mode without causing undefined behavior. + */ + if (witness_owner(witness_tsd, witness)) { + witnesses = &witness_tsd->witnesses; + ql_remove(witnesses, witness, link); + } else { + witness_assert_owner(witness_tsdn, witness); + } +} + +#endif /* JEMALLOC_INTERNAL_WITNESS_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/jemalloc.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/jemalloc.h new file mode 100644 index 000000000..8d4c1b13f --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/jemalloc.h @@ -0,0 +1,506 @@ +#ifndef JEMALLOC_H_ +#define JEMALLOC_H_ +#pragma GCC system_header +#ifdef __cplusplus +extern "C" { +#endif + +// DuckDB uses a 5s decay +#define DUCKDB_JEMALLOC_DECAY 5 + +/* Defined if __attribute__((...)) syntax is supported. */ +#define JEMALLOC_HAVE_ATTR + +/* Defined if alloc_size attribute is supported. */ +#define JEMALLOC_HAVE_ATTR_ALLOC_SIZE + +/* Defined if format_arg(...) attribute is supported. */ +#define JEMALLOC_HAVE_ATTR_FORMAT_ARG + +/* Defined if format(gnu_printf, ...) attribute is supported. */ +/* #undef JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF */ + +/* Defined if format(printf, ...) attribute is supported. */ +#define JEMALLOC_HAVE_ATTR_FORMAT_PRINTF + +/* Defined if fallthrough attribute is supported. */ +#define JEMALLOC_HAVE_ATTR_FALLTHROUGH + +/* Defined if cold attribute is supported. */ +#define JEMALLOC_HAVE_ATTR_COLD + +/* Defined if deprecated attribute is supported. */ +#define JEMALLOC_HAVE_ATTR_DEPRECATED + +/* + * Define overrides for non-standard allocator-related functions if they are + * present on the system. + */ +/* #undef JEMALLOC_OVERRIDE_MEMALIGN */ +#define JEMALLOC_OVERRIDE_VALLOC +/* #undef JEMALLOC_OVERRIDE_PVALLOC */ + +/* + * At least Linux omits the "const" in: + * + * size_t malloc_usable_size(const void *ptr); + * + * Match the operating system's prototype. + */ +#define JEMALLOC_USABLE_SIZE_CONST const + +/* + * If defined, specify throw() for the public function prototypes when compiling + * with C++. The only justification for this is to match the prototypes that + * glibc defines. + */ +/* #undef JEMALLOC_USE_CXX_THROW */ + +#ifdef _MSC_VER +# ifdef _WIN64 +# define LG_SIZEOF_PTR_WIN 3 +# else +# define LG_SIZEOF_PTR_WIN 2 +# endif +#endif + + /* sizeof(void *) == 2^LG_SIZEOF_PTR. 
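+ * (E.g. on a 64-bit target sizeof(void *) == 8 == 2^3, so LG_SIZEOF_PTR is
+ * 3; on a 32-bit target it is 2.)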
*/ +#include +#ifdef _MSC_VER +# define LG_SIZEOF_PTR LG_SIZEOF_PTR_WIN +#elif INTPTR_MAX == INT64_MAX +# define LG_SIZEOF_PTR 3 +#else +# define LG_SIZEOF_PTR 2 +#endif + +/* + * Name mangling for public symbols is controlled by --with-mangling and + * --with-jemalloc-prefix. With default settings the je_ prefix is stripped by + * these macro definitions. + */ +#ifndef JEMALLOC_NO_RENAME +# define je_aligned_alloc duckdb_je_aligned_alloc +# define je_calloc duckdb_je_calloc +# define je_dallocx duckdb_je_dallocx +# define je_free duckdb_je_free +# define je_free_sized duckdb_je_free_sized +# define je_free_aligned_sized duckdb_je_free_aligned_sized +# define je_mallctl duckdb_je_mallctl +# define je_mallctlbymib duckdb_je_mallctlbymib +# define je_mallctlnametomib duckdb_je_mallctlnametomib +# define je_malloc duckdb_je_malloc +# define je_malloc_conf duckdb_je_malloc_conf +# define je_malloc_conf_2_conf_harder duckdb_je_malloc_conf_2_conf_harder +# define je_malloc_message duckdb_je_malloc_message +# define je_malloc_stats_print duckdb_je_malloc_stats_print +# define je_malloc_usable_size duckdb_je_malloc_usable_size +# define je_mallocx duckdb_je_mallocx +# define je_smallocx_a25b9b8ba91881964be3083db349991bbbbf1661 duckdb_je_smallocx_a25b9b8ba91881964be3083db349991bbbbf1661 +# define je_nallocx duckdb_je_nallocx +# define je_posix_memalign duckdb_je_posix_memalign +# define je_rallocx duckdb_je_rallocx +# define je_realloc duckdb_je_realloc +# define je_sallocx duckdb_je_sallocx +# define je_sdallocx duckdb_je_sdallocx +# define je_xallocx duckdb_je_xallocx +# define je_valloc duckdb_je_valloc +# define je_malloc_size duckdb_je_malloc_size +#endif + +#include +#include +#include +#include +#include + +#define JEMALLOC_VERSION "5.3.0-196-ga25b9b8ba91881964be3083db349991bbbbf1661" +#define JEMALLOC_VERSION_MAJOR 5 +#define JEMALLOC_VERSION_MINOR 3 +#define JEMALLOC_VERSION_BUGFIX 0 +#define JEMALLOC_VERSION_NREV 196 +#define JEMALLOC_VERSION_GID "a25b9b8ba91881964be3083db349991bbbbf1661" +#define JEMALLOC_VERSION_GID_IDENT a25b9b8ba91881964be3083db349991bbbbf1661 + +#define MALLOCX_LG_ALIGN(la) ((int)(la)) +#if LG_SIZEOF_PTR == 2 +# define MALLOCX_ALIGN(a) ((int)(ffs((int)(a))-1)) +#else +# define MALLOCX_ALIGN(a) \ + ((int)(((size_t)(a) < (size_t)INT_MAX) ? ffs((int)(a))-1 : \ + ffs((int)(((size_t)(a))>>32))+31)) +#endif +#define MALLOCX_ZERO ((int)0x40) +/* + * Bias tcache index bits so that 0 encodes "automatic tcache management", and 1 + * encodes MALLOCX_TCACHE_NONE. + */ +#define MALLOCX_TCACHE(tc) ((int)(((tc)+2) << 8)) +#define MALLOCX_TCACHE_NONE MALLOCX_TCACHE(-1) +/* + * Bias arena index bits so that 0 encodes "use an automatically chosen arena". + */ +#define MALLOCX_ARENA(a) ((((int)(a))+1) << 20) + +/* + * Use as arena index in "arena..{purge,decay,dss}" and + * "stats.arenas..*" mallctl interfaces to select all arenas. This + * definition is intentionally specified in raw decimal format to support + * cpp-based string concatenation, e.g. + * + * #define STRINGIFY_HELPER(x) #x + * #define STRINGIFY(x) STRINGIFY_HELPER(x) + * + * mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".purge", NULL, NULL, NULL, + * 0); + */ +#define MALLCTL_ARENAS_ALL 4096 +/* + * Use as arena index in "stats.arenas..*" mallctl interfaces to select + * destroyed arenas. 
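+ * For example (an added illustration; the particular stat name is an
+ * assumption, not taken from this header):
+ *
+ *     size_t resident, sz = sizeof(resident);
+ *
+ *     mallctl("stats.arenas." STRINGIFY(MALLCTL_ARENAS_DESTROYED)
+ *         ".resident", &resident, &sz, NULL, 0);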
+ */ +#define MALLCTL_ARENAS_DESTROYED 4097 + +#if defined(__cplusplus) && defined(JEMALLOC_USE_CXX_THROW) +# define JEMALLOC_CXX_THROW noexcept (true) +#else +# define JEMALLOC_CXX_THROW +#endif + +#if defined(_MSC_VER) +# define JEMALLOC_ATTR(s) +# define JEMALLOC_ALIGNED(s) __declspec(align(s)) +# define JEMALLOC_ALLOC_SIZE(s) +# define JEMALLOC_ALLOC_SIZE2(s1, s2) +# ifndef JEMALLOC_EXPORT +# ifdef DLLEXPORT +# define JEMALLOC_EXPORT __declspec(dllexport) +# else +# define JEMALLOC_EXPORT __declspec(dllimport) +# endif +# endif +# define JEMALLOC_FORMAT_ARG(i) +# define JEMALLOC_FORMAT_PRINTF(s, i) +# define JEMALLOC_FALLTHROUGH +# define JEMALLOC_NOINLINE __declspec(noinline) +# ifdef __cplusplus +# define JEMALLOC_NOTHROW __declspec(nothrow) +# else +# define JEMALLOC_NOTHROW +# endif +# define JEMALLOC_SECTION(s) __declspec(allocate(s)) +# define JEMALLOC_RESTRICT_RETURN __declspec(restrict) +# if _MSC_VER >= 1900 && !defined(__EDG__) +# define JEMALLOC_ALLOCATOR __declspec(allocator) +# else +# define JEMALLOC_ALLOCATOR +# endif +# define JEMALLOC_COLD +# define JEMALLOC_WARN_ON_USAGE(warning_message) +#elif defined(JEMALLOC_HAVE_ATTR) +# define JEMALLOC_ATTR(s) __attribute__((s)) +# define JEMALLOC_ALIGNED(s) JEMALLOC_ATTR(aligned(s)) +# ifdef JEMALLOC_HAVE_ATTR_ALLOC_SIZE +# define JEMALLOC_ALLOC_SIZE(s) JEMALLOC_ATTR(alloc_size(s)) +# define JEMALLOC_ALLOC_SIZE2(s1, s2) JEMALLOC_ATTR(alloc_size(s1, s2)) +# else +# define JEMALLOC_ALLOC_SIZE(s) +# define JEMALLOC_ALLOC_SIZE2(s1, s2) +# endif +# ifndef JEMALLOC_EXPORT +# define JEMALLOC_EXPORT JEMALLOC_ATTR(visibility("default")) +# endif +# ifdef JEMALLOC_HAVE_ATTR_FORMAT_ARG +# define JEMALLOC_FORMAT_ARG(i) JEMALLOC_ATTR(__format_arg__(3)) +# else +# define JEMALLOC_FORMAT_ARG(i) +# endif +# ifdef JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF +# define JEMALLOC_FORMAT_PRINTF(s, i) JEMALLOC_ATTR(format(gnu_printf, s, i)) +# elif defined(JEMALLOC_HAVE_ATTR_FORMAT_PRINTF) +# define JEMALLOC_FORMAT_PRINTF(s, i) JEMALLOC_ATTR(format(printf, s, i)) +# else +# define JEMALLOC_FORMAT_PRINTF(s, i) +# endif +# ifdef JEMALLOC_HAVE_ATTR_FALLTHROUGH +# define JEMALLOC_FALLTHROUGH JEMALLOC_ATTR(fallthrough) +# else +# define JEMALLOC_FALLTHROUGH +# endif +# define JEMALLOC_NOINLINE JEMALLOC_ATTR(noinline) +# define JEMALLOC_NOTHROW JEMALLOC_ATTR(nothrow) +# define JEMALLOC_SECTION(s) JEMALLOC_ATTR(section(s)) +# define JEMALLOC_RESTRICT_RETURN +# define JEMALLOC_ALLOCATOR +# ifdef JEMALLOC_HAVE_ATTR_COLD +# define JEMALLOC_COLD JEMALLOC_ATTR(__cold__) +# else +# define JEMALLOC_COLD +# endif +# ifdef JEMALLOC_HAVE_ATTR_DEPRECATED +# define JEMALLOC_WARN_ON_USAGE(warning_message) JEMALLOC_ATTR(deprecated(warning_message)) +# else +# define JEMALLOC_WARN_ON_USAGE(warning_message) +# endif +#else +# define JEMALLOC_ATTR(s) +# define JEMALLOC_ALIGNED(s) +# define JEMALLOC_ALLOC_SIZE(s) +# define JEMALLOC_ALLOC_SIZE2(s1, s2) +# define JEMALLOC_EXPORT +# define JEMALLOC_FORMAT_PRINTF(s, i) +# define JEMALLOC_FALLTHROUGH +# define JEMALLOC_NOINLINE +# define JEMALLOC_NOTHROW +# define JEMALLOC_SECTION(s) +# define JEMALLOC_RESTRICT_RETURN +# define JEMALLOC_ALLOCATOR +# define JEMALLOC_COLD +# define JEMALLOC_WARN_ON_USAGE(warning_message) +#endif + +#if (defined(__APPLE__) || defined(__FreeBSD__) || defined(__OpenBSD__) || (defined(__linux__) && !defined(__GLIBC__))) && !defined(JEMALLOC_NO_RENAME) +# define JEMALLOC_SYS_NOTHROW +#else +# define JEMALLOC_SYS_NOTHROW JEMALLOC_NOTHROW +#endif + +/* + * The je_ prefix on the following public symbol 
declarations is an artifact + * of namespace management, and should be omitted in application code unless + * JEMALLOC_NO_DEMANGLE is defined (see jemalloc_mangle.h). + */ +extern JEMALLOC_EXPORT const char *je_malloc_conf; +extern JEMALLOC_EXPORT const char *je_malloc_conf_2_conf_harder; +extern JEMALLOC_EXPORT void (*je_malloc_message)(void *cbopaque, + const char *s); + +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_SYS_NOTHROW *je_malloc(size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1); +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_SYS_NOTHROW *je_calloc(size_t num, size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE2(1, 2); +JEMALLOC_EXPORT int JEMALLOC_SYS_NOTHROW je_posix_memalign( + void **memptr, size_t alignment, size_t size) JEMALLOC_CXX_THROW + JEMALLOC_ATTR(nonnull(1)); +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_SYS_NOTHROW *je_aligned_alloc(size_t alignment, + size_t size) JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) + JEMALLOC_ALLOC_SIZE(2); +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_SYS_NOTHROW *je_realloc(void *ptr, size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ALLOC_SIZE(2); +JEMALLOC_EXPORT void JEMALLOC_SYS_NOTHROW je_free(void *ptr) + JEMALLOC_CXX_THROW; +JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_free_sized(void *ptr, size_t size); +JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_free_aligned_sized( + void *ptr, size_t alignment, size_t size); + +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_NOTHROW *je_mallocx(size_t size, int flags) + JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1); +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_NOTHROW *je_rallocx(void *ptr, size_t size, + int flags) JEMALLOC_ALLOC_SIZE(2); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_xallocx(void *ptr, size_t size, + size_t extra, int flags); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_sallocx(const void *ptr, + int flags) JEMALLOC_ATTR(pure); +JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_dallocx(void *ptr, int flags); +JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_sdallocx(void *ptr, size_t size, + int flags); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_nallocx(size_t size, int flags) + JEMALLOC_ATTR(pure); + +JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_mallctl(const char *name, + void *oldp, size_t *oldlenp, void *newp, size_t newlen); +JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_mallctlnametomib(const char *name, + size_t *mibp, size_t *miblenp); +JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_mallctlbymib(const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen); +JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_malloc_stats_print( + void (*write_cb)(void *, const char *), void *je_cbopaque, + const char *opts); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_malloc_usable_size( + JEMALLOC_USABLE_SIZE_CONST void *ptr) JEMALLOC_CXX_THROW; +#ifdef JEMALLOC_HAVE_MALLOC_SIZE +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_malloc_size( + const void *ptr); +#endif + +#ifdef JEMALLOC_OVERRIDE_MEMALIGN +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_SYS_NOTHROW *je_memalign(size_t alignment, size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc); +#endif + +#ifdef JEMALLOC_OVERRIDE_VALLOC +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_SYS_NOTHROW *je_valloc(size_t size) JEMALLOC_CXX_THROW + JEMALLOC_ATTR(malloc); +#endif + 
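+/*
+ * Illustrative use of the MALLOCX_* flag encodings defined above (this
+ * comment block is an editorial addition, not upstream text): allocate
+ * 1 KiB, 64-byte aligned and zeroed, bypassing the thread cache, then free
+ * it with matching flags:
+ *
+ *     void *p = je_mallocx(1024,
+ *         MALLOCX_ALIGN(64) | MALLOCX_ZERO | MALLOCX_TCACHE_NONE);
+ *     if (p != NULL) {
+ *         je_dallocx(p, MALLOCX_TCACHE_NONE);
+ *     }
+ */
+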
+#ifdef JEMALLOC_OVERRIDE_PVALLOC +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_SYS_NOTHROW *je_pvalloc(size_t size) JEMALLOC_CXX_THROW + JEMALLOC_ATTR(malloc); +#endif + +typedef struct extent_hooks_s extent_hooks_t; + +/* + * void * + * extent_alloc(extent_hooks_t *extent_hooks, void *new_addr, size_t size, + * size_t alignment, bool *zero, bool *commit, unsigned arena_ind); + */ +typedef void *(extent_alloc_t)(extent_hooks_t *, void *, size_t, size_t, bool *, + bool *, unsigned); + +/* + * bool + * extent_dalloc(extent_hooks_t *extent_hooks, void *addr, size_t size, + * bool committed, unsigned arena_ind); + */ +typedef bool (extent_dalloc_t)(extent_hooks_t *, void *, size_t, bool, + unsigned); + +/* + * void + * extent_destroy(extent_hooks_t *extent_hooks, void *addr, size_t size, + * bool committed, unsigned arena_ind); + */ +typedef void (extent_destroy_t)(extent_hooks_t *, void *, size_t, bool, + unsigned); + +/* + * bool + * extent_commit(extent_hooks_t *extent_hooks, void *addr, size_t size, + * size_t offset, size_t length, unsigned arena_ind); + */ +typedef bool (extent_commit_t)(extent_hooks_t *, void *, size_t, size_t, size_t, + unsigned); + +/* + * bool + * extent_decommit(extent_hooks_t *extent_hooks, void *addr, size_t size, + * size_t offset, size_t length, unsigned arena_ind); + */ +typedef bool (extent_decommit_t)(extent_hooks_t *, void *, size_t, size_t, + size_t, unsigned); + +/* + * bool + * extent_purge(extent_hooks_t *extent_hooks, void *addr, size_t size, + * size_t offset, size_t length, unsigned arena_ind); + */ +typedef bool (extent_purge_t)(extent_hooks_t *, void *, size_t, size_t, size_t, + unsigned); + +/* + * bool + * extent_split(extent_hooks_t *extent_hooks, void *addr, size_t size, + * size_t size_a, size_t size_b, bool committed, unsigned arena_ind); + */ +typedef bool (extent_split_t)(extent_hooks_t *, void *, size_t, size_t, size_t, + bool, unsigned); + +/* + * bool + * extent_merge(extent_hooks_t *extent_hooks, void *addr_a, size_t size_a, + * void *addr_b, size_t size_b, bool committed, unsigned arena_ind); + */ +typedef bool (extent_merge_t)(extent_hooks_t *, void *, size_t, void *, size_t, + bool, unsigned); + +struct extent_hooks_s { + extent_alloc_t *alloc; + extent_dalloc_t *dalloc; + extent_destroy_t *destroy; + extent_commit_t *commit; + extent_decommit_t *decommit; + extent_purge_t *purge_lazy; + extent_purge_t *purge_forced; + extent_split_t *split; + extent_merge_t *merge; +}; + +/* + * By default application code must explicitly refer to mangled symbol names, + * so that it is possible to use jemalloc in conjunction with another allocator + * in the same application. Define JEMALLOC_MANGLE in order to cause automatic + * name mangling that matches the API prefixing that happened as a result of + * --with-mangling and/or --with-jemalloc-prefix configuration settings. 
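+ *
+ * For example (an added illustration, not upstream text): this build is
+ * configured with the duckdb_je_ prefix, so without JEMALLOC_MANGLE
+ * application code calls the prefixed names directly:
+ *
+ *     void *p = duckdb_je_malloc(42);
+ *     duckdb_je_free(p);
+ *
+ * whereas with JEMALLOC_MANGLE defined, plain malloc()/free() in the
+ * including translation unit expand to those same prefixed symbols.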
+ */ +#ifdef JEMALLOC_MANGLE +# ifndef JEMALLOC_NO_DEMANGLE +# define JEMALLOC_NO_DEMANGLE +# endif +# define aligned_alloc je_aligned_alloc +# define calloc je_calloc +# define dallocx je_dallocx +# define free je_free +# define free_sized je_free_sized +# define free_aligned_sized je_free_aligned_sized +# define mallctl je_mallctl +# define mallctlbymib je_mallctlbymib +# define mallctlnametomib je_mallctlnametomib +# define malloc je_malloc +# define malloc_conf je_malloc_conf +# define malloc_conf_2_conf_harder je_malloc_conf_2_conf_harder +# define malloc_message je_malloc_message +# define malloc_stats_print je_malloc_stats_print +# define malloc_usable_size je_malloc_usable_size +# define mallocx je_mallocx +# define smallocx_a25b9b8ba91881964be3083db349991bbbbf1661 je_smallocx_a25b9b8ba91881964be3083db349991bbbbf1661 +# define nallocx je_nallocx +# define posix_memalign je_posix_memalign +# define rallocx je_rallocx +# define realloc je_realloc +# define sallocx je_sallocx +# define sdallocx je_sdallocx +# define xallocx je_xallocx +# define valloc je_valloc +# define malloc_size je_malloc_size +#endif + +/* + * The je_* macros can be used as stable alternative names for the + * public jemalloc API if JEMALLOC_NO_DEMANGLE is defined. This is primarily + * meant for use in jemalloc itself, but it can be used by application code to + * provide isolation from the name mangling specified via --with-mangling + * and/or --with-jemalloc-prefix. + */ +#ifndef JEMALLOC_NO_DEMANGLE +# undef je_aligned_alloc +# undef je_calloc +# undef je_dallocx +# undef je_free +# undef je_free_sized +# undef je_free_aligned_sized +# undef je_mallctl +# undef je_mallctlbymib +# undef je_mallctlnametomib +# undef je_malloc +# undef je_malloc_conf +# undef je_malloc_conf_2_conf_harder +# undef je_malloc_message +# undef je_malloc_stats_print +# undef je_malloc_usable_size +# undef je_mallocx +# undef je_smallocx_a25b9b8ba91881964be3083db349991bbbbf1661 +# undef je_nallocx +# undef je_posix_memalign +# undef je_rallocx +# undef je_realloc +# undef je_sallocx +# undef je_sdallocx +# undef je_xallocx +# undef je_valloc +# undef je_malloc_size +#endif + +#ifdef __cplusplus +} +#endif +#endif /* JEMALLOC_H_ */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/jemalloc_defs.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/jemalloc_defs.h new file mode 100644 index 000000000..96d42c086 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/jemalloc_defs.h @@ -0,0 +1,66 @@ +/* include/jemalloc/jemalloc_defs.h. Generated from jemalloc_defs.h.in by configure. */ +/* Defined if __attribute__((...)) syntax is supported. */ +#define JEMALLOC_HAVE_ATTR + +/* Defined if alloc_size attribute is supported. */ +#define JEMALLOC_HAVE_ATTR_ALLOC_SIZE + +/* Defined if format_arg(...) attribute is supported. */ +#define JEMALLOC_HAVE_ATTR_FORMAT_ARG + +/* Defined if format(gnu_printf, ...) attribute is supported. */ +/* #undef JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF */ + +/* Defined if format(printf, ...) attribute is supported. */ +#define JEMALLOC_HAVE_ATTR_FORMAT_PRINTF + +/* Defined if fallthrough attribute is supported. */ +#define JEMALLOC_HAVE_ATTR_FALLTHROUGH + +/* Defined if cold attribute is supported. */ +#define JEMALLOC_HAVE_ATTR_COLD + +/* Defined if deprecated attribute is supported. */ +// #define JEMALLOC_HAVE_ATTR_DEPRECATED + +/* + * Define overrides for non-standard allocator-related functions if they are + * present on the system. 
+ */ +/* #undef JEMALLOC_OVERRIDE_MEMALIGN */ +#define JEMALLOC_OVERRIDE_VALLOC +/* #undef JEMALLOC_OVERRIDE_PVALLOC */ + +/* + * At least Linux omits the "const" in: + * + * size_t malloc_usable_size(const void *ptr); + * + * Match the operating system's prototype. + */ +#define JEMALLOC_USABLE_SIZE_CONST const + +/* + * If defined, specify throw() for the public function prototypes when compiling + * with C++. The only justification for this is to match the prototypes that + * glibc defines. + */ +/* #undef JEMALLOC_USE_CXX_THROW */ + +#ifdef _MSC_VER +# ifdef _WIN64 +# define LG_SIZEOF_PTR_WIN 3 +# else +# define LG_SIZEOF_PTR_WIN 2 +# endif +#endif + +/* sizeof(void *) == 2^LG_SIZEOF_PTR. */ +#include <stdint.h> +#ifdef _MSC_VER +# define LG_SIZEOF_PTR LG_SIZEOF_PTR_WIN +#elif INTPTR_MAX == INT64_MAX +# define LG_SIZEOF_PTR 3 +#else +# define LG_SIZEOF_PTR 2 +#endif diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/jemalloc_macros.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/jemalloc_macros.h new file mode 100644 index 000000000..44ab5a3fa --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/jemalloc_macros.h @@ -0,0 +1,156 @@ +#include <stdlib.h> +#include <stdbool.h> +#include <stdint.h> +#include <limits.h> +#include <strings.h> + +#define JEMALLOC_VERSION "5.3.0-196-ga25b9b8ba91881964be3083db349991bbbbf1661" +#define JEMALLOC_VERSION_MAJOR 5 +#define JEMALLOC_VERSION_MINOR 3 +#define JEMALLOC_VERSION_BUGFIX 0 +#define JEMALLOC_VERSION_NREV 196 +#define JEMALLOC_VERSION_GID "a25b9b8ba91881964be3083db349991bbbbf1661" +#define JEMALLOC_VERSION_GID_IDENT a25b9b8ba91881964be3083db349991bbbbf1661 + +#define MALLOCX_LG_ALIGN(la) ((int)(la)) +#if LG_SIZEOF_PTR == 2 +# define MALLOCX_ALIGN(a) ((int)(ffs((int)(a))-1)) +#else +# define MALLOCX_ALIGN(a) \ + ((int)(((size_t)(a) < (size_t)INT_MAX) ? ffs((int)(a))-1 : \ + ffs((int)(((size_t)(a))>>32))+31)) +#endif +#define MALLOCX_ZERO ((int)0x40) +/* + * Bias tcache index bits so that 0 encodes "automatic tcache management", and 1 + * encodes MALLOCX_TCACHE_NONE. + */ +#define MALLOCX_TCACHE(tc) ((int)(((tc)+2) << 8)) +#define MALLOCX_TCACHE_NONE MALLOCX_TCACHE(-1) +/* + * Bias arena index bits so that 0 encodes "use an automatically chosen arena". + */ +#define MALLOCX_ARENA(a) ((((int)(a))+1) << 20) + +/* + * Use as arena index in "arena.<i>.{purge,decay,dss}" and + * "stats.arenas.<i>.*" mallctl interfaces to select all arenas. This + * definition is intentionally specified in raw decimal format to support + * cpp-based string concatenation, e.g. + * + * #define STRINGIFY_HELPER(x) #x + * #define STRINGIFY(x) STRINGIFY_HELPER(x) + * + * mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".purge", NULL, NULL, NULL, + * 0); + */ +#define MALLCTL_ARENAS_ALL 4096 +/* + * Use as arena index in "stats.arenas.<i>.*" mallctl interfaces to select + * destroyed arenas.
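+ * For example (a sketch, following the MALLCTL_ARENAS_ALL pattern above): + * + * size_t pdirty, sz = sizeof(pdirty); + * mallctl("stats.arenas." STRINGIFY(MALLCTL_ARENAS_DESTROYED) ".pdirty", + * &pdirty, &sz, NULL, 0);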
+ */ +#define MALLCTL_ARENAS_DESTROYED 4097 + +#if defined(__cplusplus) && defined(JEMALLOC_USE_CXX_THROW) +# define JEMALLOC_CXX_THROW noexcept (true) +#else +# define JEMALLOC_CXX_THROW +#endif + +#if defined(_MSC_VER) +# define JEMALLOC_ATTR(s) +# define JEMALLOC_ALIGNED(s) __declspec(align(s)) +# define JEMALLOC_ALLOC_SIZE(s) +# define JEMALLOC_ALLOC_SIZE2(s1, s2) +# ifndef JEMALLOC_EXPORT +# ifdef DLLEXPORT +# define JEMALLOC_EXPORT __declspec(dllexport) +# else +# define JEMALLOC_EXPORT __declspec(dllimport) +# endif +# endif +# define JEMALLOC_FORMAT_ARG(i) +# define JEMALLOC_FORMAT_PRINTF(s, i) +# define JEMALLOC_FALLTHROUGH +# define JEMALLOC_NOINLINE __declspec(noinline) +# ifdef __cplusplus +# define JEMALLOC_NOTHROW __declspec(nothrow) +# else +# define JEMALLOC_NOTHROW +# endif +# define JEMALLOC_SECTION(s) __declspec(allocate(s)) +# define JEMALLOC_RESTRICT_RETURN __declspec(restrict) +# if _MSC_VER >= 1900 && !defined(__EDG__) +# define JEMALLOC_ALLOCATOR __declspec(allocator) +# else +# define JEMALLOC_ALLOCATOR +# endif +# define JEMALLOC_COLD +# define JEMALLOC_WARN_ON_USAGE(warning_message) +#elif defined(JEMALLOC_HAVE_ATTR) +# define JEMALLOC_ATTR(s) __attribute__((s)) +# define JEMALLOC_ALIGNED(s) JEMALLOC_ATTR(aligned(s)) +# ifdef JEMALLOC_HAVE_ATTR_ALLOC_SIZE +# define JEMALLOC_ALLOC_SIZE(s) JEMALLOC_ATTR(alloc_size(s)) +# define JEMALLOC_ALLOC_SIZE2(s1, s2) JEMALLOC_ATTR(alloc_size(s1, s2)) +# else +# define JEMALLOC_ALLOC_SIZE(s) +# define JEMALLOC_ALLOC_SIZE2(s1, s2) +# endif +# ifndef JEMALLOC_EXPORT +# define JEMALLOC_EXPORT JEMALLOC_ATTR(visibility("default")) +# endif +# ifdef JEMALLOC_HAVE_ATTR_FORMAT_ARG +# define JEMALLOC_FORMAT_ARG(i) JEMALLOC_ATTR(__format_arg__(3)) +# else +# define JEMALLOC_FORMAT_ARG(i) +# endif +# ifdef JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF +# define JEMALLOC_FORMAT_PRINTF(s, i) JEMALLOC_ATTR(format(gnu_printf, s, i)) +# elif defined(JEMALLOC_HAVE_ATTR_FORMAT_PRINTF) +# define JEMALLOC_FORMAT_PRINTF(s, i) JEMALLOC_ATTR(format(printf, s, i)) +# else +# define JEMALLOC_FORMAT_PRINTF(s, i) +# endif +# ifdef JEMALLOC_HAVE_ATTR_FALLTHROUGH +# define JEMALLOC_FALLTHROUGH JEMALLOC_ATTR(fallthrough) +# else +# define JEMALLOC_FALLTHROUGH +# endif +# define JEMALLOC_NOINLINE JEMALLOC_ATTR(noinline) +# define JEMALLOC_NOTHROW JEMALLOC_ATTR(nothrow) +# define JEMALLOC_SECTION(s) JEMALLOC_ATTR(section(s)) +# define JEMALLOC_RESTRICT_RETURN +# define JEMALLOC_ALLOCATOR +# ifdef JEMALLOC_HAVE_ATTR_COLD +# define JEMALLOC_COLD JEMALLOC_ATTR(__cold__) +# else +# define JEMALLOC_COLD +# endif +# ifdef JEMALLOC_HAVE_ATTR_DEPRECATED +# define JEMALLOC_WARN_ON_USAGE(warning_message) JEMALLOC_ATTR(deprecated(warning_message)) +# else +# define JEMALLOC_WARN_ON_USAGE(warning_message) +# endif +#else +# define JEMALLOC_ATTR(s) +# define JEMALLOC_ALIGNED(s) +# define JEMALLOC_ALLOC_SIZE(s) +# define JEMALLOC_ALLOC_SIZE2(s1, s2) +# define JEMALLOC_EXPORT +# define JEMALLOC_FORMAT_PRINTF(s, i) +# define JEMALLOC_FALLTHROUGH +# define JEMALLOC_NOINLINE +# define JEMALLOC_NOTHROW +# define JEMALLOC_SECTION(s) +# define JEMALLOC_RESTRICT_RETURN +# define JEMALLOC_ALLOCATOR +# define JEMALLOC_COLD +# define JEMALLOC_WARN_ON_USAGE(warning_message) +#endif + +#if (defined(__APPLE__) || defined(__FreeBSD__) || defined(__OpenBSD__) || (defined(__linux__) && !defined(__GLIBC__))) && !defined(JEMALLOC_NO_RENAME) +# define JEMALLOC_SYS_NOTHROW +#else +# define JEMALLOC_SYS_NOTHROW JEMALLOC_NOTHROW +#endif diff --git 
a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/jemalloc_mangle.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/jemalloc_mangle.h new file mode 100644 index 000000000..cb4379aab --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/jemalloc_mangle.h @@ -0,0 +1,74 @@ +/* + * By default application code must explicitly refer to mangled symbol names, + * so that it is possible to use jemalloc in conjunction with another allocator + * in the same application. Define JEMALLOC_MANGLE in order to cause automatic + * name mangling that matches the API prefixing that happened as a result of + * --with-mangling and/or --with-jemalloc-prefix configuration settings. + */ +#ifdef JEMALLOC_MANGLE +# ifndef JEMALLOC_NO_DEMANGLE +# define JEMALLOC_NO_DEMANGLE +# endif +# define aligned_alloc je_aligned_alloc +# define calloc je_calloc +# define dallocx je_dallocx +# define free je_free +# define free_sized je_free_sized +# define free_aligned_sized je_free_aligned_sized +# define mallctl je_mallctl +# define mallctlbymib je_mallctlbymib +# define mallctlnametomib je_mallctlnametomib +# define malloc je_malloc +# define malloc_conf je_malloc_conf +# define malloc_conf_2_conf_harder je_malloc_conf_2_conf_harder +# define malloc_message je_malloc_message +# define malloc_stats_print je_malloc_stats_print +# define malloc_usable_size je_malloc_usable_size +# define mallocx je_mallocx +# define smallocx_a25b9b8ba91881964be3083db349991bbbbf1661 je_smallocx_a25b9b8ba91881964be3083db349991bbbbf1661 +# define nallocx je_nallocx +# define posix_memalign je_posix_memalign +# define rallocx je_rallocx +# define realloc je_realloc +# define sallocx je_sallocx +# define sdallocx je_sdallocx +# define xallocx je_xallocx +# define valloc je_valloc +# define malloc_size je_malloc_size +#endif + +/* + * The je_* macros can be used as stable alternative names for the + * public jemalloc API if JEMALLOC_NO_DEMANGLE is defined. This is primarily + * meant for use in jemalloc itself, but it can be used by application code to + * provide isolation from the name mangling specified via --with-mangling + * and/or --with-jemalloc-prefix. + */ +#ifndef JEMALLOC_NO_DEMANGLE +# undef je_aligned_alloc +# undef je_calloc +# undef je_dallocx +# undef je_free +# undef je_free_sized +# undef je_free_aligned_sized +# undef je_mallctl +# undef je_mallctlbymib +# undef je_mallctlnametomib +# undef je_malloc +# undef je_malloc_conf +# undef je_malloc_conf_2_conf_harder +# undef je_malloc_message +# undef je_malloc_stats_print +# undef je_malloc_usable_size +# undef je_mallocx +# undef je_smallocx_a25b9b8ba91881964be3083db349991bbbbf1661 +# undef je_nallocx +# undef je_posix_memalign +# undef je_rallocx +# undef je_realloc +# undef je_sallocx +# undef je_sdallocx +# undef je_xallocx +# undef je_valloc +# undef je_malloc_size +#endif diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/jemalloc_mangle_jet.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/jemalloc_mangle_jet.h new file mode 100644 index 000000000..02c97ee1a --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/jemalloc_mangle_jet.h @@ -0,0 +1,74 @@ +/* + * By default application code must explicitly refer to mangled symbol names, + * so that it is possible to use jemalloc in conjunction with another allocator + * in the same application. 
Define JEMALLOC_MANGLE in order to cause automatic + * name mangling that matches the API prefixing that happened as a result of + * --with-mangling and/or --with-jemalloc-prefix configuration settings. + */ +#ifdef JEMALLOC_MANGLE +# ifndef JEMALLOC_NO_DEMANGLE +# define JEMALLOC_NO_DEMANGLE +# endif +# define aligned_alloc jet_aligned_alloc +# define calloc jet_calloc +# define dallocx jet_dallocx +# define free jet_free +# define free_sized jet_free_sized +# define free_aligned_sized jet_free_aligned_sized +# define mallctl jet_mallctl +# define mallctlbymib jet_mallctlbymib +# define mallctlnametomib jet_mallctlnametomib +# define malloc jet_malloc +# define malloc_conf jet_malloc_conf +# define malloc_conf_2_conf_harder jet_malloc_conf_2_conf_harder +# define malloc_message jet_malloc_message +# define malloc_stats_print jet_malloc_stats_print +# define malloc_usable_size jet_malloc_usable_size +# define mallocx jet_mallocx +# define smallocx_a25b9b8ba91881964be3083db349991bbbbf1661 jet_smallocx_a25b9b8ba91881964be3083db349991bbbbf1661 +# define nallocx jet_nallocx +# define posix_memalign jet_posix_memalign +# define rallocx jet_rallocx +# define realloc jet_realloc +# define sallocx jet_sallocx +# define sdallocx jet_sdallocx +# define xallocx jet_xallocx +# define valloc jet_valloc +# define malloc_size jet_malloc_size +#endif + +/* + * The jet_* macros can be used as stable alternative names for the + * public jemalloc API if JEMALLOC_NO_DEMANGLE is defined. This is primarily + * meant for use in jemalloc itself, but it can be used by application code to + * provide isolation from the name mangling specified via --with-mangling + * and/or --with-jemalloc-prefix. + */ +#ifndef JEMALLOC_NO_DEMANGLE +# undef jet_aligned_alloc +# undef jet_calloc +# undef jet_dallocx +# undef jet_free +# undef jet_free_sized +# undef jet_free_aligned_sized +# undef jet_mallctl +# undef jet_mallctlbymib +# undef jet_mallctlnametomib +# undef jet_malloc +# undef jet_malloc_conf +# undef jet_malloc_conf_2_conf_harder +# undef jet_malloc_message +# undef jet_malloc_stats_print +# undef jet_malloc_usable_size +# undef jet_mallocx +# undef jet_smallocx_a25b9b8ba91881964be3083db349991bbbbf1661 +# undef jet_nallocx +# undef jet_posix_memalign +# undef jet_rallocx +# undef jet_realloc +# undef jet_sallocx +# undef jet_sdallocx +# undef jet_xallocx +# undef jet_valloc +# undef jet_malloc_size +#endif diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/jemalloc_protos.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/jemalloc_protos.h new file mode 100644 index 000000000..12c388e2c --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/jemalloc_protos.h @@ -0,0 +1,81 @@ +/* + * The je_ prefix on the following public symbol declarations is an artifact + * of namespace management, and should be omitted in application code unless + * JEMALLOC_NO_DEMANGLE is defined (see jemalloc_mangle.h). 
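+ * (In this vendored copy, jemalloc_rename.h additionally prefixes each je_* + * symbol, so e.g. je_malloc is ultimately exported as duckdb_je_malloc.)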
+ */ +extern JEMALLOC_EXPORT const char *je_malloc_conf; +extern JEMALLOC_EXPORT const char *je_malloc_conf_2_conf_harder; +extern JEMALLOC_EXPORT void (*je_malloc_message)(void *cbopaque, + const char *s); + +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_SYS_NOTHROW *je_malloc(size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1); +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_SYS_NOTHROW *je_calloc(size_t num, size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE2(1, 2); +JEMALLOC_EXPORT int JEMALLOC_SYS_NOTHROW je_posix_memalign( + void **memptr, size_t alignment, size_t size) JEMALLOC_CXX_THROW + JEMALLOC_ATTR(nonnull(1)); +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_SYS_NOTHROW *je_aligned_alloc(size_t alignment, + size_t size) JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) + JEMALLOC_ALLOC_SIZE(2); +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_SYS_NOTHROW *je_realloc(void *ptr, size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ALLOC_SIZE(2); +JEMALLOC_EXPORT void JEMALLOC_SYS_NOTHROW je_free(void *ptr) + JEMALLOC_CXX_THROW; +JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_free_sized(void *ptr, size_t size); +JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_free_aligned_sized( + void *ptr, size_t alignment, size_t size); + +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_NOTHROW *je_mallocx(size_t size, int flags) + JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1); +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_NOTHROW *je_rallocx(void *ptr, size_t size, + int flags) JEMALLOC_ALLOC_SIZE(2); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_xallocx(void *ptr, size_t size, + size_t extra, int flags); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_sallocx(const void *ptr, + int flags) JEMALLOC_ATTR(pure); +JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_dallocx(void *ptr, int flags); +JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_sdallocx(void *ptr, size_t size, + int flags); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_nallocx(size_t size, int flags) + JEMALLOC_ATTR(pure); + +JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_mallctl(const char *name, + void *oldp, size_t *oldlenp, void *newp, size_t newlen); +JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_mallctlnametomib(const char *name, + size_t *mibp, size_t *miblenp); +JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_mallctlbymib(const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen); +JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_malloc_stats_print( + void (*write_cb)(void *, const char *), void *je_cbopaque, + const char *opts); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_malloc_usable_size( + JEMALLOC_USABLE_SIZE_CONST void *ptr) JEMALLOC_CXX_THROW; +#ifdef JEMALLOC_HAVE_MALLOC_SIZE +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_malloc_size( + const void *ptr); +#endif + +#ifdef JEMALLOC_OVERRIDE_MEMALIGN +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_SYS_NOTHROW *je_memalign(size_t alignment, size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc); +#endif + +#ifdef JEMALLOC_OVERRIDE_VALLOC +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_SYS_NOTHROW *je_valloc(size_t size) JEMALLOC_CXX_THROW + JEMALLOC_ATTR(malloc); +#endif + +#ifdef JEMALLOC_OVERRIDE_PVALLOC +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_SYS_NOTHROW *je_pvalloc(size_t size) JEMALLOC_CXX_THROW + 
JEMALLOC_ATTR(malloc); +#endif diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/jemalloc_protos_jet.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/jemalloc_protos_jet.h new file mode 100644 index 000000000..fb51e1b7f --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/jemalloc_protos_jet.h @@ -0,0 +1,81 @@ +/* + * The jet_ prefix on the following public symbol declarations is an artifact + * of namespace management, and should be omitted in application code unless + * JEMALLOC_NO_DEMANGLE is defined (see jemalloc_mangle@install_suffix@.h). + */ +extern JEMALLOC_EXPORT const char *jet_malloc_conf; +extern JEMALLOC_EXPORT const char *jet_malloc_conf_2_conf_harder; +extern JEMALLOC_EXPORT void (*jet_malloc_message)(void *cbopaque, + const char *s); + +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_SYS_NOTHROW *jet_malloc(size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1); +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_SYS_NOTHROW *jet_calloc(size_t num, size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE2(1, 2); +JEMALLOC_EXPORT int JEMALLOC_SYS_NOTHROW jet_posix_memalign( + void **memptr, size_t alignment, size_t size) JEMALLOC_CXX_THROW + JEMALLOC_ATTR(nonnull(1)); +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_SYS_NOTHROW *jet_aligned_alloc(size_t alignment, + size_t size) JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) + JEMALLOC_ALLOC_SIZE(2); +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_SYS_NOTHROW *jet_realloc(void *ptr, size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ALLOC_SIZE(2); +JEMALLOC_EXPORT void JEMALLOC_SYS_NOTHROW jet_free(void *ptr) + JEMALLOC_CXX_THROW; +JEMALLOC_EXPORT void JEMALLOC_NOTHROW jet_free_sized(void *ptr, size_t size); +JEMALLOC_EXPORT void JEMALLOC_NOTHROW jet_free_aligned_sized( + void *ptr, size_t alignment, size_t size); + +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_NOTHROW *jet_mallocx(size_t size, int flags) + JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1); +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_NOTHROW *jet_rallocx(void *ptr, size_t size, + int flags) JEMALLOC_ALLOC_SIZE(2); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW jet_xallocx(void *ptr, size_t size, + size_t extra, int flags); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW jet_sallocx(const void *ptr, + int flags) JEMALLOC_ATTR(pure); +JEMALLOC_EXPORT void JEMALLOC_NOTHROW jet_dallocx(void *ptr, int flags); +JEMALLOC_EXPORT void JEMALLOC_NOTHROW jet_sdallocx(void *ptr, size_t size, + int flags); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW jet_nallocx(size_t size, int flags) + JEMALLOC_ATTR(pure); + +JEMALLOC_EXPORT int JEMALLOC_NOTHROW jet_mallctl(const char *name, + void *oldp, size_t *oldlenp, void *newp, size_t newlen); +JEMALLOC_EXPORT int JEMALLOC_NOTHROW jet_mallctlnametomib(const char *name, + size_t *mibp, size_t *miblenp); +JEMALLOC_EXPORT int JEMALLOC_NOTHROW jet_mallctlbymib(const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen); +JEMALLOC_EXPORT void JEMALLOC_NOTHROW jet_malloc_stats_print( + void (*write_cb)(void *, const char *), void *jet_cbopaque, + const char *opts); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW jet_malloc_usable_size( + JEMALLOC_USABLE_SIZE_CONST void *ptr) JEMALLOC_CXX_THROW; +#ifdef JEMALLOC_HAVE_MALLOC_SIZE +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW 
jet_malloc_size( + const void *ptr); +#endif + +#ifdef JEMALLOC_OVERRIDE_MEMALIGN +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_SYS_NOTHROW *jet_memalign(size_t alignment, size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc); +#endif + +#ifdef JEMALLOC_OVERRIDE_VALLOC +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_SYS_NOTHROW *jet_valloc(size_t size) JEMALLOC_CXX_THROW + JEMALLOC_ATTR(malloc); +#endif + +#ifdef JEMALLOC_OVERRIDE_PVALLOC +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_SYS_NOTHROW *jet_pvalloc(size_t size) JEMALLOC_CXX_THROW + JEMALLOC_ATTR(malloc); +#endif diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/jemalloc_rename.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/jemalloc_rename.h new file mode 100644 index 000000000..ac0d2dffa --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/jemalloc_rename.h @@ -0,0 +1,33 @@ +/* + * Name mangling for public symbols is controlled by --with-mangling and + * --with-jemalloc-prefix. With default settings the je_ prefix is stripped by + * these macro definitions. + */ +#ifndef JEMALLOC_NO_RENAME +# define je_aligned_alloc duckdb_je_aligned_alloc +# define je_calloc duckdb_je_calloc +# define je_dallocx duckdb_je_dallocx +# define je_free duckdb_je_free +# define je_free_sized duckdb_je_free_sized +# define je_free_aligned_sized duckdb_je_free_aligned_sized +# define je_mallctl duckdb_je_mallctl +# define je_mallctlbymib duckdb_je_mallctlbymib +# define je_mallctlnametomib duckdb_je_mallctlnametomib +# define je_malloc duckdb_je_malloc +# define je_malloc_conf duckdb_je_malloc_conf +# define je_malloc_conf_2_conf_harder duckdb_je_malloc_conf_2_conf_harder +# define je_malloc_message duckdb_je_malloc_message +# define je_malloc_stats_print duckdb_je_malloc_stats_print +# define je_malloc_usable_size duckdb_je_malloc_usable_size +# define je_mallocx duckdb_je_mallocx +# define je_smallocx_a25b9b8ba91881964be3083db349991bbbbf1661 duckdb_je_smallocx_a25b9b8ba91881964be3083db349991bbbbf1661 +# define je_nallocx duckdb_je_nallocx +# define je_posix_memalign duckdb_je_posix_memalign +# define je_rallocx duckdb_je_rallocx +# define je_realloc duckdb_je_realloc +# define je_sallocx duckdb_je_sallocx +# define je_sdallocx duckdb_je_sdallocx +# define je_xallocx duckdb_je_xallocx +# define je_valloc duckdb_je_valloc +# define je_malloc_size duckdb_je_malloc_size +#endif diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/jemalloc_typedefs.h b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/jemalloc_typedefs.h new file mode 100644 index 000000000..1a5887430 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/jemalloc/jemalloc_typedefs.h @@ -0,0 +1,77 @@ +typedef struct extent_hooks_s extent_hooks_t; + +/* + * void * + * extent_alloc(extent_hooks_t *extent_hooks, void *new_addr, size_t size, + * size_t alignment, bool *zero, bool *commit, unsigned arena_ind); + */ +typedef void *(extent_alloc_t)(extent_hooks_t *, void *, size_t, size_t, bool *, + bool *, unsigned); + +/* + * bool + * extent_dalloc(extent_hooks_t *extent_hooks, void *addr, size_t size, + * bool committed, unsigned arena_ind); + */ +typedef bool (extent_dalloc_t)(extent_hooks_t *, void *, size_t, bool, + unsigned); + +/* + * void + * extent_destroy(extent_hooks_t *extent_hooks, void *addr, size_t size, + * bool committed, unsigned arena_ind); + */ +typedef void (extent_destroy_t)(extent_hooks_t *, 
void *, size_t, bool, + unsigned); + +/* + * bool + * extent_commit(extent_hooks_t *extent_hooks, void *addr, size_t size, + * size_t offset, size_t length, unsigned arena_ind); + */ +typedef bool (extent_commit_t)(extent_hooks_t *, void *, size_t, size_t, size_t, + unsigned); + +/* + * bool + * extent_decommit(extent_hooks_t *extent_hooks, void *addr, size_t size, + * size_t offset, size_t length, unsigned arena_ind); + */ +typedef bool (extent_decommit_t)(extent_hooks_t *, void *, size_t, size_t, + size_t, unsigned); + +/* + * bool + * extent_purge(extent_hooks_t *extent_hooks, void *addr, size_t size, + * size_t offset, size_t length, unsigned arena_ind); + */ +typedef bool (extent_purge_t)(extent_hooks_t *, void *, size_t, size_t, size_t, + unsigned); + +/* + * bool + * extent_split(extent_hooks_t *extent_hooks, void *addr, size_t size, + * size_t size_a, size_t size_b, bool committed, unsigned arena_ind); + */ +typedef bool (extent_split_t)(extent_hooks_t *, void *, size_t, size_t, size_t, + bool, unsigned); + +/* + * bool + * extent_merge(extent_hooks_t *extent_hooks, void *addr_a, size_t size_a, + * void *addr_b, size_t size_b, bool committed, unsigned arena_ind); + */ +typedef bool (extent_merge_t)(extent_hooks_t *, void *, size_t, void *, size_t, + bool, unsigned); + +struct extent_hooks_s { + extent_alloc_t *alloc; + extent_dalloc_t *dalloc; + extent_destroy_t *destroy; + extent_commit_t *commit; + extent_decommit_t *decommit; + extent_purge_t *purge_lazy; + extent_purge_t *purge_forced; + extent_split_t *split; + extent_merge_t *merge; +}; diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/msvc_compat/C99/stdbool.h b/src/duckdb/extension/jemalloc/jemalloc/include/msvc_compat/C99/stdbool.h new file mode 100644 index 000000000..d92160ebc --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/msvc_compat/C99/stdbool.h @@ -0,0 +1,20 @@ +#ifndef stdbool_h +#define stdbool_h + +#include <wtypes.h> + +/* MSVC doesn't define _Bool or bool in C, but does have BOOL */ +/* Note this doesn't pass autoconf's test because (bool) 0.5 != true */ +/* Clang-cl uses MSVC headers, so needs msvc_compat, but has _Bool as + * a built-in type. */ +#ifndef __clang__ +typedef BOOL _Bool; +#endif + +#define bool _Bool +#define true 1 +#define false 0 + +#define __bool_true_false_are_defined 1 + +#endif /* stdbool_h */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/msvc_compat/C99/stdint.h b/src/duckdb/extension/jemalloc/jemalloc/include/msvc_compat/C99/stdint.h new file mode 100644 index 000000000..c66fbb817 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/msvc_compat/C99/stdint.h @@ -0,0 +1,247 @@ +// ISO C9x compliant stdint.h for Microsoft Visual Studio +// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 +// +// Copyright (c) 2006-2008 Alexander Chemeris +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. The name of the author may be used to endorse or promote products +// derived from this software without specific prior written permission.
+// +// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +/////////////////////////////////////////////////////////////////////////////// + +#ifndef _MSC_VER // [ +#error "Use this header only with Microsoft Visual C++ compilers!" +#endif // _MSC_VER ] + +#ifndef _MSC_STDINT_H_ // [ +#define _MSC_STDINT_H_ + +#if _MSC_VER > 1000 +#pragma once +#endif + +#include <limits.h> + +// For Visual Studio 6 in C++ mode and for many Visual Studio versions when +// compiling for ARM we should wrap <wchar.h> include with 'extern "C++" {}' +// or compiler give many errors like this: +// error C2733: second C linkage of overloaded function 'wmemchr' not allowed +#ifdef __cplusplus +extern "C" { +#endif +# include <wchar.h> +#ifdef __cplusplus +} +#endif + +// Define _W64 macros to mark types changing their size, like intptr_t. +#ifndef _W64 +# if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300 +# define _W64 __w64 +# else +# define _W64 +# endif +#endif + + +// 7.18.1 Integer types + +// 7.18.1.1 Exact-width integer types + +// Visual Studio 6 and Embedded Visual C++ 4 doesn't +// realize that, e.g. char has the same size as __int8 +// so we give up on __intX for them.
+#if (_MSC_VER < 1300) + typedef signed char int8_t; + typedef signed short int16_t; + typedef signed int int32_t; + typedef unsigned char uint8_t; + typedef unsigned short uint16_t; + typedef unsigned int uint32_t; +#else + typedef signed __int8 int8_t; + typedef signed __int16 int16_t; + typedef signed __int32 int32_t; + typedef unsigned __int8 uint8_t; + typedef unsigned __int16 uint16_t; + typedef unsigned __int32 uint32_t; +#endif +typedef signed __int64 int64_t; +typedef unsigned __int64 uint64_t; + + +// 7.18.1.2 Minimum-width integer types +typedef int8_t int_least8_t; +typedef int16_t int_least16_t; +typedef int32_t int_least32_t; +typedef int64_t int_least64_t; +typedef uint8_t uint_least8_t; +typedef uint16_t uint_least16_t; +typedef uint32_t uint_least32_t; +typedef uint64_t uint_least64_t; + +// 7.18.1.3 Fastest minimum-width integer types +typedef int8_t int_fast8_t; +typedef int16_t int_fast16_t; +typedef int32_t int_fast32_t; +typedef int64_t int_fast64_t; +typedef uint8_t uint_fast8_t; +typedef uint16_t uint_fast16_t; +typedef uint32_t uint_fast32_t; +typedef uint64_t uint_fast64_t; + +// 7.18.1.4 Integer types capable of holding object pointers +#ifdef _WIN64 // [ + typedef signed __int64 intptr_t; + typedef unsigned __int64 uintptr_t; +#else // _WIN64 ][ + typedef _W64 signed int intptr_t; + typedef _W64 unsigned int uintptr_t; +#endif // _WIN64 ] + +// 7.18.1.5 Greatest-width integer types +typedef int64_t intmax_t; +typedef uint64_t uintmax_t; + + +// 7.18.2 Limits of specified-width integer types + +#if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [ See footnote 220 at page 257 and footnote 221 at page 259 + +// 7.18.2.1 Limits of exact-width integer types +#define INT8_MIN ((int8_t)_I8_MIN) +#define INT8_MAX _I8_MAX +#define INT16_MIN ((int16_t)_I16_MIN) +#define INT16_MAX _I16_MAX +#define INT32_MIN ((int32_t)_I32_MIN) +#define INT32_MAX _I32_MAX +#define INT64_MIN ((int64_t)_I64_MIN) +#define INT64_MAX _I64_MAX +#define UINT8_MAX _UI8_MAX +#define UINT16_MAX _UI16_MAX +#define UINT32_MAX _UI32_MAX +#define UINT64_MAX _UI64_MAX + +// 7.18.2.2 Limits of minimum-width integer types +#define INT_LEAST8_MIN INT8_MIN +#define INT_LEAST8_MAX INT8_MAX +#define INT_LEAST16_MIN INT16_MIN +#define INT_LEAST16_MAX INT16_MAX +#define INT_LEAST32_MIN INT32_MIN +#define INT_LEAST32_MAX INT32_MAX +#define INT_LEAST64_MIN INT64_MIN +#define INT_LEAST64_MAX INT64_MAX +#define UINT_LEAST8_MAX UINT8_MAX +#define UINT_LEAST16_MAX UINT16_MAX +#define UINT_LEAST32_MAX UINT32_MAX +#define UINT_LEAST64_MAX UINT64_MAX + +// 7.18.2.3 Limits of fastest minimum-width integer types +#define INT_FAST8_MIN INT8_MIN +#define INT_FAST8_MAX INT8_MAX +#define INT_FAST16_MIN INT16_MIN +#define INT_FAST16_MAX INT16_MAX +#define INT_FAST32_MIN INT32_MIN +#define INT_FAST32_MAX INT32_MAX +#define INT_FAST64_MIN INT64_MIN +#define INT_FAST64_MAX INT64_MAX +#define UINT_FAST8_MAX UINT8_MAX +#define UINT_FAST16_MAX UINT16_MAX +#define UINT_FAST32_MAX UINT32_MAX +#define UINT_FAST64_MAX UINT64_MAX + +// 7.18.2.4 Limits of integer types capable of holding object pointers +#ifdef _WIN64 // [ +# define INTPTR_MIN INT64_MIN +# define INTPTR_MAX INT64_MAX +# define UINTPTR_MAX UINT64_MAX +#else // _WIN64 ][ +# define INTPTR_MIN INT32_MIN +# define INTPTR_MAX INT32_MAX +# define UINTPTR_MAX UINT32_MAX +#endif // _WIN64 ] + +// 7.18.2.5 Limits of greatest-width integer types +#define INTMAX_MIN INT64_MIN +#define INTMAX_MAX INT64_MAX +#define UINTMAX_MAX UINT64_MAX + +// 7.18.3 Limits of other integer 
types + +#ifdef _WIN64 // [ +# define PTRDIFF_MIN _I64_MIN +# define PTRDIFF_MAX _I64_MAX +#else // _WIN64 ][ +# define PTRDIFF_MIN _I32_MIN +# define PTRDIFF_MAX _I32_MAX +#endif // _WIN64 ] + +#define SIG_ATOMIC_MIN INT_MIN +#define SIG_ATOMIC_MAX INT_MAX + +#ifndef SIZE_MAX // [ +# ifdef _WIN64 // [ +# define SIZE_MAX _UI64_MAX +# else // _WIN64 ][ +# define SIZE_MAX _UI32_MAX +# endif // _WIN64 ] +#endif // SIZE_MAX ] + +// WCHAR_MIN and WCHAR_MAX are also defined in <wchar.h> +#ifndef WCHAR_MIN // [ +# define WCHAR_MIN 0 +#endif // WCHAR_MIN ] +#ifndef WCHAR_MAX // [ +# define WCHAR_MAX _UI16_MAX +#endif // WCHAR_MAX ] + +#define WINT_MIN 0 +#define WINT_MAX _UI16_MAX + +#endif // __STDC_LIMIT_MACROS ] + + +// 7.18.4 Limits of other integer types + +#if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260 + +// 7.18.4.1 Macros for minimum-width integer constants + +#define INT8_C(val) val##i8 +#define INT16_C(val) val##i16 +#define INT32_C(val) val##i32 +#define INT64_C(val) val##i64 + +#define UINT8_C(val) val##ui8 +#define UINT16_C(val) val##ui16 +#define UINT32_C(val) val##ui32 +#define UINT64_C(val) val##ui64 + +// 7.18.4.2 Macros for greatest-width integer constants +#define INTMAX_C INT64_C +#define UINTMAX_C UINT64_C + +#endif // __STDC_CONSTANT_MACROS ] + + +#endif // _MSC_STDINT_H_ ] diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/msvc_compat/strings.h b/src/duckdb/extension/jemalloc/jemalloc/include/msvc_compat/strings.h new file mode 100644 index 000000000..996f256ce --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/msvc_compat/strings.h @@ -0,0 +1,58 @@ +#ifndef strings_h +#define strings_h + +/* MSVC doesn't define ffs/ffsl. This dummy strings.h header is provided + * for both */ +#ifdef _MSC_VER +# include <intrin.h> +# pragma intrinsic(_BitScanForward) +static __forceinline int ffsl(long x) { + unsigned long i; + + if (_BitScanForward(&i, x)) { + return i + 1; + } + return 0; +} + +static __forceinline int ffs(int x) { + return ffsl(x); +} + +# ifdef _M_X64 +# pragma intrinsic(_BitScanForward64) +# endif + +static __forceinline int ffsll(unsigned __int64 x) { + unsigned long i; +#ifdef _M_X64 + if (_BitScanForward64(&i, x)) { + return i + 1; + } + return 0; +#else +// Fallback for 32-bit build where 64-bit version not available +// assuming little endian + union { + unsigned __int64 ll; + unsigned long l[2]; + } s; + + s.ll = x; + + if (_BitScanForward(&i, s.l[0])) { + return i + 1; + } else if(_BitScanForward(&i, s.l[1])) { + return i + 33; + } + return 0; +#endif +} + +#else +# define ffsll(x) __builtin_ffsll(x) +# define ffsl(x) __builtin_ffsl(x) +# define ffs(x) __builtin_ffs(x) +#endif + +#endif /* strings_h */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/include/msvc_compat/windows_extra.h b/src/duckdb/extension/jemalloc/jemalloc/include/msvc_compat/windows_extra.h new file mode 100644 index 000000000..a6ebb9306 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/include/msvc_compat/windows_extra.h @@ -0,0 +1,6 @@ +#ifndef MSVC_COMPAT_WINDOWS_EXTRA_H +#define MSVC_COMPAT_WINDOWS_EXTRA_H + +#include <errno.h> + +#endif /* MSVC_COMPAT_WINDOWS_EXTRA_H */ diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/arena.c b/src/duckdb/extension/jemalloc/jemalloc/src/arena.c new file mode 100644 index 000000000..21010279d --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/arena.c @@ -0,0 +1,2023 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" +
+#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/decay.h" +#include "jemalloc/internal/ehooks.h" +#include "jemalloc/internal/extent_dss.h" +#include "jemalloc/internal/extent_mmap.h" +#include "jemalloc/internal/san.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/rtree.h" +#include "jemalloc/internal/safety_check.h" +#include "jemalloc/internal/util.h" + +JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS + +/******************************************************************************/ +/* Data. */ + +/* + * Define names for both unininitialized and initialized phases, so that + * options and mallctl processing are straightforward. + */ +const char *const percpu_arena_mode_names[] = { + "percpu", + "phycpu", + "disabled", + "percpu", + "phycpu" +}; +percpu_arena_mode_t opt_percpu_arena = PERCPU_ARENA_DEFAULT; + +ssize_t opt_dirty_decay_ms = DIRTY_DECAY_MS_DEFAULT; +ssize_t opt_muzzy_decay_ms = MUZZY_DECAY_MS_DEFAULT; + +static atomic_zd_t dirty_decay_ms_default; +static atomic_zd_t muzzy_decay_ms_default; + +emap_t arena_emap_global; +static pa_central_t arena_pa_central_global; + +div_info_t arena_binind_div_info[SC_NBINS]; + +size_t opt_oversize_threshold = OVERSIZE_THRESHOLD_DEFAULT; +size_t oversize_threshold = OVERSIZE_THRESHOLD_DEFAULT; + +uint32_t arena_bin_offsets[SC_NBINS]; + +static unsigned huge_arena_ind; + +const arena_config_t arena_config_default = { + /* .extent_hooks = */ (extent_hooks_t *)&ehooks_default_extent_hooks, + /* .metadata_use_hooks = */ true, +}; + +/******************************************************************************/ +/* + * Function prototypes for static functions that are referenced prior to + * definition. + */ + +static bool arena_decay_dirty(tsdn_t *tsdn, arena_t *arena, + bool is_background_thread, bool all); +static void arena_bin_lower_slab(tsdn_t *tsdn, arena_t *arena, edata_t *slab, + bin_t *bin); +static void +arena_maybe_do_deferred_work(tsdn_t *tsdn, arena_t *arena, decay_t *decay, + size_t npages_new); + +/******************************************************************************/ + +void +arena_basic_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads, + const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms, + size_t *nactive, size_t *ndirty, size_t *nmuzzy) { + *nthreads += arena_nthreads_get(arena, false); + *dss = dss_prec_names[arena_dss_prec_get(arena)]; + *dirty_decay_ms = arena_decay_ms_get(arena, extent_state_dirty); + *muzzy_decay_ms = arena_decay_ms_get(arena, extent_state_muzzy); + pa_shard_basic_stats_merge(&arena->pa_shard, nactive, ndirty, nmuzzy); +} + +void +arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads, + const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms, + size_t *nactive, size_t *ndirty, size_t *nmuzzy, arena_stats_t *astats, + bin_stats_data_t *bstats, arena_stats_large_t *lstats, + pac_estats_t *estats, hpa_shard_stats_t *hpastats, sec_stats_t *secstats) { + cassert(config_stats); + + arena_basic_stats_merge(tsdn, arena, nthreads, dss, dirty_decay_ms, + muzzy_decay_ms, nactive, ndirty, nmuzzy); + + size_t base_allocated, base_edata_allocated, base_rtree_allocated, + base_resident, base_mapped, metadata_thp; + base_stats_get(tsdn, arena->base, &base_allocated, + &base_edata_allocated, &base_rtree_allocated, &base_resident, + &base_mapped, &metadata_thp); + size_t pac_mapped_sz = pac_mapped(&arena->pa_shard.pac); + astats->mapped += base_mapped + pac_mapped_sz; + astats->resident += base_resident; + + 
LOCKEDINT_MTX_LOCK(tsdn, arena->stats.mtx); + + astats->base += base_allocated; + astats->metadata_edata += base_edata_allocated; + astats->metadata_rtree += base_rtree_allocated; + atomic_load_add_store_zu(&astats->internal, arena_internal_get(arena)); + astats->metadata_thp += metadata_thp; + + for (szind_t i = 0; i < SC_NSIZES - SC_NBINS; i++) { + /* ndalloc should be read before nmalloc, + * since otherwise it is possible for ndalloc to be incremented, + * and the following can become true: ndalloc > nmalloc */ + uint64_t ndalloc = locked_read_u64(tsdn, + LOCKEDINT_MTX(arena->stats.mtx), + &arena->stats.lstats[i].ndalloc); + locked_inc_u64_unsynchronized(&lstats[i].ndalloc, ndalloc); + astats->ndalloc_large += ndalloc; + + uint64_t nmalloc = locked_read_u64(tsdn, + LOCKEDINT_MTX(arena->stats.mtx), + &arena->stats.lstats[i].nmalloc); + locked_inc_u64_unsynchronized(&lstats[i].nmalloc, nmalloc); + astats->nmalloc_large += nmalloc; + + uint64_t nrequests = locked_read_u64(tsdn, + LOCKEDINT_MTX(arena->stats.mtx), + &arena->stats.lstats[i].nrequests); + locked_inc_u64_unsynchronized(&lstats[i].nrequests, + nmalloc + nrequests); + astats->nrequests_large += nmalloc + nrequests; + + /* nfill == nmalloc for large currently. */ + locked_inc_u64_unsynchronized(&lstats[i].nfills, nmalloc); + astats->nfills_large += nmalloc; + + uint64_t nflush = locked_read_u64(tsdn, + LOCKEDINT_MTX(arena->stats.mtx), + &arena->stats.lstats[i].nflushes); + locked_inc_u64_unsynchronized(&lstats[i].nflushes, nflush); + astats->nflushes_large += nflush; + + assert(nmalloc >= ndalloc); + assert(nmalloc - ndalloc <= SIZE_T_MAX); + size_t curlextents = (size_t)(nmalloc - ndalloc); + lstats[i].curlextents += curlextents; + astats->allocated_large += + curlextents * sz_index2size(SC_NBINS + i); + } + + pa_shard_stats_merge(tsdn, &arena->pa_shard, &astats->pa_shard_stats, + estats, hpastats, secstats, &astats->resident); + + LOCKEDINT_MTX_UNLOCK(tsdn, arena->stats.mtx); + + /* Currently cached bytes and sanitizer-stashed bytes in tcache. */ + astats->tcache_bytes = 0; + astats->tcache_stashed_bytes = 0; + malloc_mutex_lock(tsdn, &arena->tcache_ql_mtx); + cache_bin_array_descriptor_t *descriptor; + ql_foreach(descriptor, &arena->cache_bin_array_descriptor_ql, link) { + for (szind_t i = 0; i < TCACHE_NBINS_MAX; i++) { + cache_bin_t *cache_bin = &descriptor->bins[i]; + if (cache_bin_disabled(cache_bin)) { + continue; + } + + cache_bin_sz_t ncached, nstashed; + cache_bin_nitems_get_remote(cache_bin, &ncached, &nstashed); + astats->tcache_bytes += ncached * sz_index2size(i); + astats->tcache_stashed_bytes += nstashed * + sz_index2size(i); + } + } + malloc_mutex_prof_read(tsdn, + &astats->mutex_prof_data[arena_prof_mutex_tcache_list], + &arena->tcache_ql_mtx); + malloc_mutex_unlock(tsdn, &arena->tcache_ql_mtx); + +#define READ_ARENA_MUTEX_PROF_DATA(mtx, ind) \ + malloc_mutex_lock(tsdn, &arena->mtx); \ + malloc_mutex_prof_read(tsdn, &astats->mutex_prof_data[ind], \ + &arena->mtx); \ + malloc_mutex_unlock(tsdn, &arena->mtx); + + /* Gather per arena mutex profiling data. 
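+ * (READ_ARENA_MUTEX_PROF_DATA locks each mutex, samples its prof data, then + * unlocks it.)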
*/ + READ_ARENA_MUTEX_PROF_DATA(large_mtx, arena_prof_mutex_large); + READ_ARENA_MUTEX_PROF_DATA(base->mtx, + arena_prof_mutex_base); +#undef READ_ARENA_MUTEX_PROF_DATA + pa_shard_mtx_stats_read(tsdn, &arena->pa_shard, + astats->mutex_prof_data); + + nstime_copy(&astats->uptime, &arena->create_time); + nstime_update(&astats->uptime); + nstime_subtract(&astats->uptime, &arena->create_time); + + for (szind_t i = 0; i < SC_NBINS; i++) { + for (unsigned j = 0; j < bin_infos[i].n_shards; j++) { + bin_stats_merge(tsdn, &bstats[i], + arena_get_bin(arena, i, j)); + } + } +} + +static void +arena_background_thread_inactivity_check(tsdn_t *tsdn, arena_t *arena, + bool is_background_thread) { + if (!background_thread_enabled() || is_background_thread) { + return; + } + background_thread_info_t *info = + arena_background_thread_info_get(arena); + if (background_thread_indefinite_sleep(info)) { + arena_maybe_do_deferred_work(tsdn, arena, + &arena->pa_shard.pac.decay_dirty, 0); + } +} + +/* + * React to deferred work generated by a PAI function. + */ +void arena_handle_deferred_work(tsdn_t *tsdn, arena_t *arena) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + + if (decay_immediately(&arena->pa_shard.pac.decay_dirty)) { + arena_decay_dirty(tsdn, arena, false, true); + } + arena_background_thread_inactivity_check(tsdn, arena, false); +} + +static void * +arena_slab_reg_alloc(edata_t *slab, const bin_info_t *bin_info) { + void *ret; + slab_data_t *slab_data = edata_slab_data_get(slab); + size_t regind; + + assert(edata_nfree_get(slab) > 0); + assert(!bitmap_full(slab_data->bitmap, &bin_info->bitmap_info)); + + regind = bitmap_sfu(slab_data->bitmap, &bin_info->bitmap_info); + ret = (void *)((byte_t *)edata_addr_get(slab) + + (uintptr_t)(bin_info->reg_size * regind)); + edata_nfree_dec(slab); + return ret; +} + +static void +arena_slab_reg_alloc_batch(edata_t *slab, const bin_info_t *bin_info, + unsigned cnt, void** ptrs) { + slab_data_t *slab_data = edata_slab_data_get(slab); + + assert(edata_nfree_get(slab) >= cnt); + assert(!bitmap_full(slab_data->bitmap, &bin_info->bitmap_info)); + +#if (! defined JEMALLOC_INTERNAL_POPCOUNTL) || (defined BITMAP_USE_TREE) + for (unsigned i = 0; i < cnt; i++) { + size_t regind = bitmap_sfu(slab_data->bitmap, + &bin_info->bitmap_info); + *(ptrs + i) = (void *)((uintptr_t)edata_addr_get(slab) + + (uintptr_t)(bin_info->reg_size * regind)); + } +#else + unsigned group = 0; + bitmap_t g = slab_data->bitmap[group]; + unsigned i = 0; + while (i < cnt) { + while (g == 0) { + g = slab_data->bitmap[++group]; + } + size_t shift = group << LG_BITMAP_GROUP_NBITS; + size_t pop = popcount_lu(g); + if (pop > (cnt - i)) { + pop = cnt - i; + } + + /* + * Load from memory locations only once, outside the + * hot loop below. 
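+ * (edata_addr_get(slab) and bin_info->reg_size are cached in the locals + * base and regsize; the loop then computes base + regsize * regind.)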
+ */ + uintptr_t base = (uintptr_t)edata_addr_get(slab); + uintptr_t regsize = (uintptr_t)bin_info->reg_size; + while (pop--) { + size_t bit = cfs_lu(&g); + size_t regind = shift + bit; + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + *(ptrs + i) = (void *)(base + regsize * regind); + + i++; + } + slab_data->bitmap[group] = g; + } +#endif + edata_nfree_sub(slab, cnt); +} + +static void +arena_large_malloc_stats_update(tsdn_t *tsdn, arena_t *arena, size_t usize) { + cassert(config_stats); + + szind_t index = sz_size2index(usize); + /* This only occurs when we have a sampled small allocation */ + if (usize < SC_LARGE_MINCLASS) { + assert(index < SC_NBINS); + assert(usize >= PAGE && usize % PAGE == 0); + bin_t *bin = arena_get_bin(arena, index, /* binshard */ 0); + malloc_mutex_lock(tsdn, &bin->lock); + bin->stats.nmalloc++; + malloc_mutex_unlock(tsdn, &bin->lock); + } else { + assert(index >= SC_NBINS); + szind_t hindex = index - SC_NBINS; + LOCKEDINT_MTX_LOCK(tsdn, arena->stats.mtx); + locked_inc_u64(tsdn, LOCKEDINT_MTX(arena->stats.mtx), + &arena->stats.lstats[hindex].nmalloc, 1); + LOCKEDINT_MTX_UNLOCK(tsdn, arena->stats.mtx); + } +} + +static void +arena_large_dalloc_stats_update(tsdn_t *tsdn, arena_t *arena, size_t usize) { + cassert(config_stats); + + szind_t index = sz_size2index(usize); + /* This only occurs when we have a sampled small allocation */ + if (usize < SC_LARGE_MINCLASS) { + assert(index < SC_NBINS); + assert(usize >= PAGE && usize % PAGE == 0); + bin_t *bin = arena_get_bin(arena, index, /* binshard */ 0); + malloc_mutex_lock(tsdn, &bin->lock); + bin->stats.ndalloc++; + malloc_mutex_unlock(tsdn, &bin->lock); + } else { + assert(index >= SC_NBINS); + szind_t hindex = index - SC_NBINS; + LOCKEDINT_MTX_LOCK(tsdn, arena->stats.mtx); + locked_inc_u64(tsdn, LOCKEDINT_MTX(arena->stats.mtx), + &arena->stats.lstats[hindex].ndalloc, 1); + LOCKEDINT_MTX_UNLOCK(tsdn, arena->stats.mtx); + } +} + +static void +arena_large_ralloc_stats_update(tsdn_t *tsdn, arena_t *arena, size_t oldusize, + size_t usize) { + arena_large_malloc_stats_update(tsdn, arena, usize); + arena_large_dalloc_stats_update(tsdn, arena, oldusize); +} + +edata_t * +arena_extent_alloc_large(tsdn_t *tsdn, arena_t *arena, size_t usize, + size_t alignment, bool zero) { + bool deferred_work_generated = false; + szind_t szind = sz_size2index(usize); + size_t esize = usize + sz_large_pad; + + bool guarded = san_large_extent_decide_guard(tsdn, + arena_get_ehooks(arena), esize, alignment); + + /* + * - if usize >= opt_calloc_madvise_threshold, + * - pa_alloc(..., zero_override = zero, ...) + * - otherwise, + * - pa_alloc(..., zero_override = false, ...) + * - use memset() to zero out memory if zero == true. + */ + bool zero_override = zero && (usize >= opt_calloc_madvise_threshold); + edata_t *edata = pa_alloc(tsdn, &arena->pa_shard, esize, alignment, + /* slab */ false, szind, zero_override, guarded, + &deferred_work_generated); + + if (edata == NULL) { + return NULL; + } + + if (config_stats) { + arena_large_malloc_stats_update(tsdn, arena, usize); + } + if (sz_large_pad != 0) { + arena_cache_oblivious_randomize(tsdn, arena, edata, alignment); + } + /* + * This branch should be put after the randomization so that the addr + * returned by edata_addr_get() has already be randomized, + * if cache_oblivious is enabled. 
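+ * (Otherwise the memset() below could zero the pre-randomization address + * range.)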
+ */ + if (zero && !zero_override && !edata_zeroed_get(edata)) { + void *addr = edata_addr_get(edata); + size_t usize = edata_usize_get(edata); + memset(addr, 0, usize); + } + + return edata; +} + +void +arena_extent_dalloc_large_prep(tsdn_t *tsdn, arena_t *arena, edata_t *edata) { + if (config_stats) { + arena_large_dalloc_stats_update(tsdn, arena, + edata_usize_get(edata)); + } +} + +void +arena_extent_ralloc_large_shrink(tsdn_t *tsdn, arena_t *arena, edata_t *edata, + size_t oldusize) { + size_t usize = edata_usize_get(edata); + + if (config_stats) { + arena_large_ralloc_stats_update(tsdn, arena, oldusize, usize); + } +} + +void +arena_extent_ralloc_large_expand(tsdn_t *tsdn, arena_t *arena, edata_t *edata, + size_t oldusize) { + size_t usize = edata_usize_get(edata); + + if (config_stats) { + arena_large_ralloc_stats_update(tsdn, arena, oldusize, usize); + } +} + +/* + * In situations where we're not forcing a decay (i.e. because the user + * specifically requested it), should we purge ourselves, or wait for the + * background thread to get to it. + */ +static pac_purge_eagerness_t +arena_decide_unforced_purge_eagerness(bool is_background_thread) { + if (is_background_thread) { + return PAC_PURGE_ALWAYS; + } else if (!is_background_thread && background_thread_enabled()) { + return PAC_PURGE_NEVER; + } else { + return PAC_PURGE_ON_EPOCH_ADVANCE; + } +} + +bool +arena_decay_ms_set(tsdn_t *tsdn, arena_t *arena, extent_state_t state, + ssize_t decay_ms) { + pac_purge_eagerness_t eagerness = arena_decide_unforced_purge_eagerness( + /* is_background_thread */ false); + return pa_decay_ms_set(tsdn, &arena->pa_shard, state, decay_ms, + eagerness); +} + +ssize_t +arena_decay_ms_get(arena_t *arena, extent_state_t state) { + return pa_decay_ms_get(&arena->pa_shard, state); +} + +static bool +arena_decay_impl(tsdn_t *tsdn, arena_t *arena, decay_t *decay, + pac_decay_stats_t *decay_stats, ecache_t *ecache, + bool is_background_thread, bool all) { + if (all) { + malloc_mutex_lock(tsdn, &decay->mtx); + pac_decay_all(tsdn, &arena->pa_shard.pac, decay, decay_stats, + ecache, /* fully_decay */ all); + malloc_mutex_unlock(tsdn, &decay->mtx); + return false; + } + + if (malloc_mutex_trylock(tsdn, &decay->mtx)) { + /* No need to wait if another thread is in progress. */ + return true; + } + pac_purge_eagerness_t eagerness = + arena_decide_unforced_purge_eagerness(is_background_thread); + bool epoch_advanced = pac_maybe_decay_purge(tsdn, &arena->pa_shard.pac, + decay, decay_stats, ecache, eagerness); + size_t npages_new; + if (epoch_advanced) { + /* Backlog is updated on epoch advance. 
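+ * npages_new is therefore only meaningful when epoch_advanced is true.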
*/ + npages_new = decay_epoch_npages_delta(decay); + } + malloc_mutex_unlock(tsdn, &decay->mtx); + + if (have_background_thread && background_thread_enabled() && + epoch_advanced && !is_background_thread) { + arena_maybe_do_deferred_work(tsdn, arena, decay, npages_new); + } + + return false; +} + +static bool +arena_decay_dirty(tsdn_t *tsdn, arena_t *arena, bool is_background_thread, + bool all) { + return arena_decay_impl(tsdn, arena, &arena->pa_shard.pac.decay_dirty, + &arena->pa_shard.pac.stats->decay_dirty, + &arena->pa_shard.pac.ecache_dirty, is_background_thread, all); +} + +static bool +arena_decay_muzzy(tsdn_t *tsdn, arena_t *arena, bool is_background_thread, + bool all) { + if (pa_shard_dont_decay_muzzy(&arena->pa_shard)) { + return false; + } + return arena_decay_impl(tsdn, arena, &arena->pa_shard.pac.decay_muzzy, + &arena->pa_shard.pac.stats->decay_muzzy, + &arena->pa_shard.pac.ecache_muzzy, is_background_thread, all); +} + +void +arena_decay(tsdn_t *tsdn, arena_t *arena, bool is_background_thread, bool all) { + if (all) { + /* + * We should take a purge of "all" to mean "save as much memory + * as possible", including flushing any caches (for situations + * like thread death, or manual purge calls). + */ + sec_flush(tsdn, &arena->pa_shard.hpa_sec); + } + if (arena_decay_dirty(tsdn, arena, is_background_thread, all)) { + return; + } + arena_decay_muzzy(tsdn, arena, is_background_thread, all); +} + +static bool +arena_should_decay_early(tsdn_t *tsdn, arena_t *arena, decay_t *decay, + background_thread_info_t *info, nstime_t *remaining_sleep, + size_t npages_new) { + malloc_mutex_assert_owner(tsdn, &info->mtx); + + if (malloc_mutex_trylock(tsdn, &decay->mtx)) { + return false; + } + + if (!decay_gradually(decay)) { + malloc_mutex_unlock(tsdn, &decay->mtx); + return false; + } + + nstime_init(remaining_sleep, background_thread_wakeup_time_get(info)); + if (nstime_compare(remaining_sleep, &decay->epoch) <= 0) { + malloc_mutex_unlock(tsdn, &decay->mtx); + return false; + } + nstime_subtract(remaining_sleep, &decay->epoch); + if (npages_new > 0) { + uint64_t npurge_new = decay_npages_purge_in(decay, + remaining_sleep, npages_new); + info->npages_to_purge_new += npurge_new; + } + malloc_mutex_unlock(tsdn, &decay->mtx); + return info->npages_to_purge_new > + ARENA_DEFERRED_PURGE_NPAGES_THRESHOLD; +} + +/* + * Check if deferred work needs to be done sooner than planned. + * For decay we might want to wake up earlier because of an influx of dirty + * pages. Rather than waiting for previously estimated time, we proactively + * purge those pages. + * If background thread sleeps indefinitely, always wake up because some + * deferred work has been generated. + */ +static void +arena_maybe_do_deferred_work(tsdn_t *tsdn, arena_t *arena, decay_t *decay, + size_t npages_new) { + background_thread_info_t *info = arena_background_thread_info_get( + arena); + if (malloc_mutex_trylock(tsdn, &info->mtx)) { + /* + * Background thread may hold the mutex for a long period of + * time. We'd like to avoid the variance on application + * threads. So keep this non-blocking, and leave the work to a + * future epoch. 
+ */ + return; + } + if (!background_thread_is_started(info)) { + goto label_done; + } + + nstime_t remaining_sleep; + if (background_thread_indefinite_sleep(info)) { + background_thread_wakeup_early(info, NULL); + } else if (arena_should_decay_early(tsdn, arena, decay, info, + &remaining_sleep, npages_new)) { + info->npages_to_purge_new = 0; + background_thread_wakeup_early(info, &remaining_sleep); + } +label_done: + malloc_mutex_unlock(tsdn, &info->mtx); +} + +/* Called from background threads. */ +void +arena_do_deferred_work(tsdn_t *tsdn, arena_t *arena) { + arena_decay(tsdn, arena, true, false); + pa_shard_do_deferred_work(tsdn, &arena->pa_shard); +} + +void +arena_slab_dalloc(tsdn_t *tsdn, arena_t *arena, edata_t *slab) { + bool deferred_work_generated = false; + pa_dalloc(tsdn, &arena->pa_shard, slab, &deferred_work_generated); + if (deferred_work_generated) { + arena_handle_deferred_work(tsdn, arena); + } +} + +static void +arena_bin_slabs_nonfull_insert(bin_t *bin, edata_t *slab) { + assert(edata_nfree_get(slab) > 0); + edata_heap_insert(&bin->slabs_nonfull, slab); + if (config_stats) { + bin->stats.nonfull_slabs++; + } +} + +static void +arena_bin_slabs_nonfull_remove(bin_t *bin, edata_t *slab) { + edata_heap_remove(&bin->slabs_nonfull, slab); + if (config_stats) { + bin->stats.nonfull_slabs--; + } +} + +static edata_t * +arena_bin_slabs_nonfull_tryget(bin_t *bin) { + edata_t *slab = edata_heap_remove_first(&bin->slabs_nonfull); + if (slab == NULL) { + return NULL; + } + if (config_stats) { + bin->stats.reslabs++; + bin->stats.nonfull_slabs--; + } + return slab; +} + +static void +arena_bin_slabs_full_insert(arena_t *arena, bin_t *bin, edata_t *slab) { + assert(edata_nfree_get(slab) == 0); + /* + * Tracking extents is required by arena_reset, which is not allowed + * for auto arenas. Bypass this step to avoid touching the edata + * linkage (often results in cache misses) for auto arenas. 
+ */ + if (arena_is_auto(arena)) { + return; + } + edata_list_active_append(&bin->slabs_full, slab); +} + +static void +arena_bin_slabs_full_remove(arena_t *arena, bin_t *bin, edata_t *slab) { + if (arena_is_auto(arena)) { + return; + } + edata_list_active_remove(&bin->slabs_full, slab); +} + +static void +arena_bin_reset(tsd_t *tsd, arena_t *arena, bin_t *bin, unsigned binind) { + edata_t *slab; + + malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock); + + if (arena_bin_has_batch(binind)) { + bin_with_batch_t *batched_bin = (bin_with_batch_t *)bin; + batcher_init(&batched_bin->remote_frees, + BIN_REMOTE_FREE_ELEMS_MAX); + } + + if (bin->slabcur != NULL) { + slab = bin->slabcur; + bin->slabcur = NULL; + malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock); + arena_slab_dalloc(tsd_tsdn(tsd), arena, slab); + malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock); + } + while ((slab = edata_heap_remove_first(&bin->slabs_nonfull)) != NULL) { + malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock); + arena_slab_dalloc(tsd_tsdn(tsd), arena, slab); + malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock); + } + for (slab = edata_list_active_first(&bin->slabs_full); slab != NULL; + slab = edata_list_active_first(&bin->slabs_full)) { + arena_bin_slabs_full_remove(arena, bin, slab); + malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock); + arena_slab_dalloc(tsd_tsdn(tsd), arena, slab); + malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock); + } + if (config_stats) { + bin->stats.curregs = 0; + bin->stats.curslabs = 0; + } + malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock); +} + +void +arena_prof_promote(tsdn_t *tsdn, void *ptr, size_t usize, size_t bumped_usize) { + cassert(config_prof); + assert(ptr != NULL); + assert(isalloc(tsdn, ptr) == bumped_usize); + assert(sz_can_use_slab(usize)); + + if (config_opt_safety_checks) { + safety_check_set_redzone(ptr, usize, bumped_usize); + } + + edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global, ptr); + + szind_t szind = sz_size2index(usize); + edata_szind_set(edata, szind); + emap_remap(tsdn, &arena_emap_global, edata, szind, /* slab */ false); + + assert(isalloc(tsdn, ptr) == usize); +} + +static size_t +arena_prof_demote(tsdn_t *tsdn, edata_t *edata, const void *ptr) { + cassert(config_prof); + assert(ptr != NULL); + size_t usize = isalloc(tsdn, ptr); + size_t bumped_usize = sz_sa2u(usize, PROF_SAMPLE_ALIGNMENT); + assert(bumped_usize <= SC_LARGE_MINCLASS && + PAGE_CEILING(bumped_usize) == bumped_usize); + assert(edata_size_get(edata) - bumped_usize <= sz_large_pad); + szind_t szind = sz_size2index(bumped_usize); + + edata_szind_set(edata, szind); + emap_remap(tsdn, &arena_emap_global, edata, szind, /* slab */ false); + + assert(isalloc(tsdn, ptr) == bumped_usize); + + return bumped_usize; +} + +static void +arena_dalloc_promoted_impl(tsdn_t *tsdn, void *ptr, tcache_t *tcache, + bool slow_path, edata_t *edata) { + cassert(config_prof); + assert(opt_prof); + + size_t usize = edata_usize_get(edata); + size_t bumped_usize = arena_prof_demote(tsdn, edata, ptr); + if (config_opt_safety_checks && usize < SC_LARGE_MINCLASS) { + /* + * Currently, we only do redzoning for small sampled + * allocations. 
+ */ + safety_check_verify_redzone(ptr, usize, bumped_usize); + } + szind_t bumped_ind = sz_size2index(bumped_usize); + if (bumped_usize >= SC_LARGE_MINCLASS && + tcache != NULL && bumped_ind < TCACHE_NBINS_MAX && + !tcache_bin_disabled(bumped_ind, &tcache->bins[bumped_ind], + tcache->tcache_slow)) { + tcache_dalloc_large(tsdn_tsd(tsdn), tcache, ptr, bumped_ind, + slow_path); + } else { + large_dalloc(tsdn, edata); + } +} + +void +arena_dalloc_promoted(tsdn_t *tsdn, void *ptr, tcache_t *tcache, + bool slow_path) { + edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global, ptr); + arena_dalloc_promoted_impl(tsdn, ptr, tcache, slow_path, edata); +} + +void +arena_reset(tsd_t *tsd, arena_t *arena) { + /* + * Locking in this function is unintuitive. The caller guarantees that + * no concurrent operations are happening in this arena, but there are + * still reasons that some locking is necessary: + * + * - Some of the functions in the transitive closure of calls assume + * appropriate locks are held, and in some cases these locks are + * temporarily dropped to avoid lock order reversal or deadlock due to + * reentry. + * - mallctl("epoch", ...) may concurrently refresh stats. While + * strictly speaking this is a "concurrent operation", disallowing + * stats refreshes would impose an inconvenient burden. + */ + + /* Large allocations. */ + malloc_mutex_lock(tsd_tsdn(tsd), &arena->large_mtx); + + for (edata_t *edata = edata_list_active_first(&arena->large); + edata != NULL; edata = edata_list_active_first(&arena->large)) { + void *ptr = edata_base_get(edata); + size_t usize; + + malloc_mutex_unlock(tsd_tsdn(tsd), &arena->large_mtx); + emap_alloc_ctx_t alloc_ctx; + emap_alloc_ctx_lookup(tsd_tsdn(tsd), &arena_emap_global, ptr, + &alloc_ctx); + assert(alloc_ctx.szind != SC_NSIZES); + + if (config_stats || (config_prof && opt_prof)) { + usize = sz_index2size(alloc_ctx.szind); + assert(usize == isalloc(tsd_tsdn(tsd), ptr)); + } + /* Remove large allocation from prof sample set. */ + if (config_prof && opt_prof) { + prof_free(tsd, ptr, usize, &alloc_ctx); + } + if (config_prof && opt_prof && alloc_ctx.szind < SC_NBINS) { + arena_dalloc_promoted_impl(tsd_tsdn(tsd), ptr, + /* tcache */ NULL, /* slow_path */ true, edata); + } else { + large_dalloc(tsd_tsdn(tsd), edata); + } + malloc_mutex_lock(tsd_tsdn(tsd), &arena->large_mtx); + } + malloc_mutex_unlock(tsd_tsdn(tsd), &arena->large_mtx); + + /* Bins. */ + for (unsigned i = 0; i < SC_NBINS; i++) { + for (unsigned j = 0; j < bin_infos[i].n_shards; j++) { + arena_bin_reset(tsd, arena, arena_get_bin(arena, i, j), + i); + } + } + pa_shard_reset(tsd_tsdn(tsd), &arena->pa_shard); +} + +static void +arena_prepare_base_deletion_sync_finish(tsd_t *tsd, malloc_mutex_t **mutexes, + unsigned n_mtx) { + for (unsigned i = 0; i < n_mtx; i++) { + malloc_mutex_lock(tsd_tsdn(tsd), mutexes[i]); + malloc_mutex_unlock(tsd_tsdn(tsd), mutexes[i]); + } +} + +#define ARENA_DESTROY_MAX_DELAYED_MTX 32 +static void +arena_prepare_base_deletion_sync(tsd_t *tsd, malloc_mutex_t *mtx, + malloc_mutex_t **delayed_mtx, unsigned *n_delayed) { + if (!malloc_mutex_trylock(tsd_tsdn(tsd), mtx)) { + /* No contention. */ + malloc_mutex_unlock(tsd_tsdn(tsd), mtx); + return; + } + unsigned n = *n_delayed; + assert(n < ARENA_DESTROY_MAX_DELAYED_MTX); + /* Add another to the batch. 
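+ * The trylock above failed, i.e. the lock is contended; queue it so + * that the lock/unlock pass happens in a later batch instead of + * blocking here.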
*/ + delayed_mtx[n++] = mtx; + + if (n == ARENA_DESTROY_MAX_DELAYED_MTX) { + arena_prepare_base_deletion_sync_finish(tsd, delayed_mtx, n); + n = 0; + } + *n_delayed = n; +} + +static void +arena_prepare_base_deletion(tsd_t *tsd, base_t *base_to_destroy) { + /* + * In order to coalesce, emap_try_acquire_edata_neighbor will attempt to + * check neighbor edata's state to determine eligibility. This means + * under certain conditions, the metadata from an arena can be accessed + * w/o holding any locks from that arena. In order to guarantee safe + * memory access, the metadata and the underlying base allocator needs + * to be kept alive, until all pending accesses are done. + * + * 1) with opt_retain, the arena boundary implies the is_head state + * (tracked in the rtree leaf), and the coalesce flow will stop at the + * head state branch. Therefore no cross arena metadata access + * possible. + * + * 2) w/o opt_retain, the arena id needs to be read from the edata_t, + * meaning read only cross-arena metadata access is possible. The + * coalesce attempt will stop at the arena_id mismatch, and is always + * under one of the ecache locks. To allow safe passthrough of such + * metadata accesses, the loop below will iterate through all manual + * arenas' ecache locks. As all the metadata from this base allocator + * have been unlinked from the rtree, after going through all the + * relevant ecache locks, it's safe to say that a) pending accesses are + * all finished, and b) no new access will be generated. + */ + if (opt_retain) { + return; + } + unsigned destroy_ind = base_ind_get(base_to_destroy); + assert(destroy_ind >= manual_arena_base); + + tsdn_t *tsdn = tsd_tsdn(tsd); + malloc_mutex_t *delayed_mtx[ARENA_DESTROY_MAX_DELAYED_MTX]; + unsigned n_delayed = 0, total = narenas_total_get(); + for (unsigned i = 0; i < total; i++) { + if (i == destroy_ind) { + continue; + } + arena_t *arena = arena_get(tsdn, i, false); + if (arena == NULL) { + continue; + } + pac_t *pac = &arena->pa_shard.pac; + arena_prepare_base_deletion_sync(tsd, &pac->ecache_dirty.mtx, + delayed_mtx, &n_delayed); + arena_prepare_base_deletion_sync(tsd, &pac->ecache_muzzy.mtx, + delayed_mtx, &n_delayed); + arena_prepare_base_deletion_sync(tsd, &pac->ecache_retained.mtx, + delayed_mtx, &n_delayed); + } + arena_prepare_base_deletion_sync_finish(tsd, delayed_mtx, n_delayed); +} +#undef ARENA_DESTROY_MAX_DELAYED_MTX + +void +arena_destroy(tsd_t *tsd, arena_t *arena) { + assert(base_ind_get(arena->base) >= narenas_auto); + assert(arena_nthreads_get(arena, false) == 0); + assert(arena_nthreads_get(arena, true) == 0); + + /* + * No allocations have occurred since arena_reset() was called. + * Furthermore, the caller (arena_i_destroy_ctl()) purged all cached + * extents, so only retained extents may remain and it's safe to call + * pa_shard_destroy_retained. + */ + pa_shard_destroy(tsd_tsdn(tsd), &arena->pa_shard); + + /* + * Remove the arena pointer from the arenas array. We rely on the fact + * that there is no way for the application to get a dirty read from the + * arenas array unless there is an inherent race in the application + * involving access of an arena being concurrently destroyed. The + * application must synchronize knowledge of the arena's validity, so as + * long as we use an atomic write to update the arenas array, the + * application will get a clean read any time after it synchronizes + * knowledge that the arena is no longer valid. 
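+ * Once the store below lands, arena_get() on this index returns NULL.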
+ */ + arena_set(base_ind_get(arena->base), NULL); + + /* + * Destroy the base allocator, which manages all metadata ever mapped by + * this arena. The prepare function will make sure no pending access to + * the metadata in this base anymore. + */ + arena_prepare_base_deletion(tsd, arena->base); + base_delete(tsd_tsdn(tsd), arena->base); +} + +static edata_t * +arena_slab_alloc(tsdn_t *tsdn, arena_t *arena, szind_t binind, unsigned binshard, + const bin_info_t *bin_info) { + bool deferred_work_generated = false; + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + + bool guarded = san_slab_extent_decide_guard(tsdn, + arena_get_ehooks(arena)); + edata_t *slab = pa_alloc(tsdn, &arena->pa_shard, bin_info->slab_size, + /* alignment */ PAGE, /* slab */ true, /* szind */ binind, + /* zero */ false, guarded, &deferred_work_generated); + + if (deferred_work_generated) { + arena_handle_deferred_work(tsdn, arena); + } + + if (slab == NULL) { + return NULL; + } + assert(edata_slab_get(slab)); + + /* Initialize slab internals. */ + slab_data_t *slab_data = edata_slab_data_get(slab); + edata_nfree_binshard_set(slab, bin_info->nregs, binshard); + bitmap_init(slab_data->bitmap, &bin_info->bitmap_info, false); + + return slab; +} + +/* + * Before attempting the _with_fresh_slab approaches below, the _no_fresh_slab + * variants (i.e. through slabcur and nonfull) must be tried first. + */ +static void +arena_bin_refill_slabcur_with_fresh_slab(tsdn_t *tsdn, arena_t *arena, + bin_t *bin, szind_t binind, edata_t *fresh_slab) { + malloc_mutex_assert_owner(tsdn, &bin->lock); + /* Only called after slabcur and nonfull both failed. */ + assert(bin->slabcur == NULL); + assert(edata_heap_first(&bin->slabs_nonfull) == NULL); + assert(fresh_slab != NULL); + + /* A new slab from arena_slab_alloc() */ + assert(edata_nfree_get(fresh_slab) == bin_infos[binind].nregs); + if (config_stats) { + bin->stats.nslabs++; + bin->stats.curslabs++; + } + bin->slabcur = fresh_slab; +} + +/* Refill slabcur and then alloc using the fresh slab */ +static void * +arena_bin_malloc_with_fresh_slab(tsdn_t *tsdn, arena_t *arena, bin_t *bin, + szind_t binind, edata_t *fresh_slab) { + malloc_mutex_assert_owner(tsdn, &bin->lock); + arena_bin_refill_slabcur_with_fresh_slab(tsdn, arena, bin, binind, + fresh_slab); + + return arena_slab_reg_alloc(bin->slabcur, &bin_infos[binind]); +} + +static bool +arena_bin_refill_slabcur_no_fresh_slab(tsdn_t *tsdn, arena_t *arena, + bin_t *bin) { + malloc_mutex_assert_owner(tsdn, &bin->lock); + /* Only called after arena_slab_reg_alloc[_batch] failed. */ + assert(bin->slabcur == NULL || edata_nfree_get(bin->slabcur) == 0); + + if (bin->slabcur != NULL) { + arena_bin_slabs_full_insert(arena, bin, bin->slabcur); + } + + /* Look for a usable slab. 
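+ * The heap hands back the oldest / lowest nonfull slab first, matching + * the slabcur policy described in arena_bin_lower_slab().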
*/ + bin->slabcur = arena_bin_slabs_nonfull_tryget(bin); + assert(bin->slabcur == NULL || edata_nfree_get(bin->slabcur) > 0); + + return (bin->slabcur == NULL); +} + +bin_t * +arena_bin_choose(tsdn_t *tsdn, arena_t *arena, szind_t binind, + unsigned *binshard_p) { + unsigned binshard; + if (tsdn_null(tsdn) || tsd_arena_get(tsdn_tsd(tsdn)) == NULL) { + binshard = 0; + } else { + binshard = tsd_binshardsp_get(tsdn_tsd(tsdn))->binshard[binind]; + } + assert(binshard < bin_infos[binind].n_shards); + if (binshard_p != NULL) { + *binshard_p = binshard; + } + return arena_get_bin(arena, binind, binshard); +} + +void +arena_cache_bin_fill_small(tsdn_t *tsdn, arena_t *arena, + cache_bin_t *cache_bin, szind_t binind, const cache_bin_sz_t nfill) { + assert(cache_bin_ncached_get_local(cache_bin) == 0); + assert(nfill != 0); + + const bin_info_t *bin_info = &bin_infos[binind]; + + CACHE_BIN_PTR_ARRAY_DECLARE(ptrs, nfill); + cache_bin_init_ptr_array_for_fill(cache_bin, &ptrs, nfill); + /* + * Bin-local resources are used first: 1) bin->slabcur, and 2) nonfull + * slabs. After both are exhausted, new slabs will be allocated through + * arena_slab_alloc(). + * + * Bin lock is only taken / released right before / after the while(...) + * refill loop, with new slab allocation (which has its own locking) + * kept outside of the loop. This setup facilitates flat combining, at + * the cost of the nested loop (through goto label_refill). + * + * To optimize for cases with contention and limited resources + * (e.g. hugepage-backed or non-overcommit arenas), each fill-iteration + * gets one chance of slab_alloc, and a retry of bin local resources + * after the slab allocation (regardless if slab_alloc failed, because + * the bin lock is dropped during the slab allocation). + * + * In other words, new slab allocation is allowed, as long as there was + * progress since the previous slab_alloc. This is tracked with + * made_progress below, initialized to true to jump start the first + * iteration. + * + * In other words (again), the loop will only terminate early (i.e. stop + * with filled < nfill) after going through the three steps: a) bin + * local exhausted, b) unlock and slab_alloc returns null, c) re-lock + * and bin local fails again. + */ + bool made_progress = true; + edata_t *fresh_slab = NULL; + bool alloc_and_retry = false; + cache_bin_sz_t filled = 0; + unsigned binshard; + bin_t *bin = arena_bin_choose(tsdn, arena, binind, &binshard); + + /* + * This has some fields that are conditionally initialized down batch + * flush pathways. This can trigger static analysis warnings deeper + * down in the stack. The accesses are guarded by the same checks as + * the initialization, but the analysis isn't able to track that across + * multiple stack frames. + */ + arena_bin_flush_batch_state_t batch_flush_state + JEMALLOC_CLANG_ANALYZER_SILENCE_INIT({0}); +label_refill: + malloc_mutex_lock(tsdn, &bin->lock); + arena_bin_flush_batch_after_lock(tsdn, arena, bin, binind, &batch_flush_state); + + while (filled < nfill) { + /* Try batch-fill from slabcur first. */ + edata_t *slabcur = bin->slabcur; + if (slabcur != NULL && edata_nfree_get(slabcur) > 0) { + unsigned tofill = nfill - filled; + unsigned nfree = edata_nfree_get(slabcur); + unsigned cnt = tofill < nfree ? tofill : nfree; + + arena_slab_reg_alloc_batch(slabcur, bin_info, cnt, + &ptrs.ptr[filled]); + made_progress = true; + filled += cnt; + continue; + } + /* Next try refilling slabcur from nonfull slabs. 
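+ * (A false return means a nonfull slab was installed as slabcur.)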
*/ + if (!arena_bin_refill_slabcur_no_fresh_slab(tsdn, arena, bin)) { + assert(bin->slabcur != NULL); + continue; + } + + /* Then see if a new slab was reserved already. */ + if (fresh_slab != NULL) { + arena_bin_refill_slabcur_with_fresh_slab(tsdn, arena, + bin, binind, fresh_slab); + assert(bin->slabcur != NULL); + fresh_slab = NULL; + continue; + } + + /* Try slab_alloc if made progress (or never did slab_alloc). */ + if (made_progress) { + assert(bin->slabcur == NULL); + assert(fresh_slab == NULL); + alloc_and_retry = true; + /* Alloc a new slab then come back. */ + break; + } + + /* OOM. */ + + assert(fresh_slab == NULL); + assert(!alloc_and_retry); + break; + } /* while (filled < nfill) loop. */ + + if (config_stats && !alloc_and_retry) { + bin->stats.nmalloc += filled; + bin->stats.nrequests += cache_bin->tstats.nrequests; + bin->stats.curregs += filled; + bin->stats.nfills++; + cache_bin->tstats.nrequests = 0; + } + + arena_bin_flush_batch_before_unlock(tsdn, arena, bin, binind, + &batch_flush_state); + malloc_mutex_unlock(tsdn, &bin->lock); + arena_bin_flush_batch_after_unlock(tsdn, arena, bin, binind, + &batch_flush_state); + + if (alloc_and_retry) { + assert(fresh_slab == NULL); + assert(filled < nfill); + assert(made_progress); + + fresh_slab = arena_slab_alloc(tsdn, arena, binind, binshard, + bin_info); + /* fresh_slab NULL case handled in the for loop. */ + + alloc_and_retry = false; + made_progress = false; + goto label_refill; + } + assert(filled == nfill || (fresh_slab == NULL && !made_progress)); + + /* Release if allocated but not used. */ + if (fresh_slab != NULL) { + assert(edata_nfree_get(fresh_slab) == bin_info->nregs); + arena_slab_dalloc(tsdn, arena, fresh_slab); + fresh_slab = NULL; + } + + cache_bin_finish_fill(cache_bin, &ptrs, filled); + arena_decay_tick(tsdn, arena); +} + +size_t +arena_fill_small_fresh(tsdn_t *tsdn, arena_t *arena, szind_t binind, + void **ptrs, size_t nfill, bool zero) { + assert(binind < SC_NBINS); + const bin_info_t *bin_info = &bin_infos[binind]; + const size_t nregs = bin_info->nregs; + assert(nregs > 0); + const size_t usize = bin_info->reg_size; + + const bool manual_arena = !arena_is_auto(arena); + unsigned binshard; + bin_t *bin = arena_bin_choose(tsdn, arena, binind, &binshard); + + size_t nslab = 0; + size_t filled = 0; + edata_t *slab = NULL; + edata_list_active_t fulls; + edata_list_active_init(&fulls); + + while (filled < nfill && (slab = arena_slab_alloc(tsdn, arena, binind, + binshard, bin_info)) != NULL) { + assert((size_t)edata_nfree_get(slab) == nregs); + ++nslab; + size_t batch = nfill - filled; + if (batch > nregs) { + batch = nregs; + } + assert(batch > 0); + arena_slab_reg_alloc_batch(slab, bin_info, (unsigned)batch, + &ptrs[filled]); + assert(edata_addr_get(slab) == ptrs[filled]); + if (zero) { + memset(ptrs[filled], 0, batch * usize); + } + filled += batch; + if (batch == nregs) { + if (manual_arena) { + edata_list_active_append(&fulls, slab); + } + slab = NULL; + } + } + + malloc_mutex_lock(tsdn, &bin->lock); + /* + * Only the last slab can be non-empty, and the last slab is non-empty + * iff slab != NULL. 
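+ * Every fully consumed slab was appended to "fulls" (for manual arenas) + * and had its local pointer cleared, so only a partially consumed final + * slab can survive the loop.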
+ */ + if (slab != NULL) { + arena_bin_lower_slab(tsdn, arena, slab, bin); + } + if (manual_arena) { + edata_list_active_concat(&bin->slabs_full, &fulls); + } + assert(edata_list_active_empty(&fulls)); + if (config_stats) { + bin->stats.nslabs += nslab; + bin->stats.curslabs += nslab; + bin->stats.nmalloc += filled; + bin->stats.nrequests += filled; + bin->stats.curregs += filled; + } + malloc_mutex_unlock(tsdn, &bin->lock); + + arena_decay_tick(tsdn, arena); + return filled; +} + +/* + * Without allocating a new slab, try arena_slab_reg_alloc() and re-fill + * bin->slabcur if necessary. + */ +static void * +arena_bin_malloc_no_fresh_slab(tsdn_t *tsdn, arena_t *arena, bin_t *bin, + szind_t binind) { + malloc_mutex_assert_owner(tsdn, &bin->lock); + if (bin->slabcur == NULL || edata_nfree_get(bin->slabcur) == 0) { + if (arena_bin_refill_slabcur_no_fresh_slab(tsdn, arena, bin)) { + return NULL; + } + } + + assert(bin->slabcur != NULL && edata_nfree_get(bin->slabcur) > 0); + return arena_slab_reg_alloc(bin->slabcur, &bin_infos[binind]); +} + +static void * +arena_malloc_small(tsdn_t *tsdn, arena_t *arena, szind_t binind, bool zero) { + assert(binind < SC_NBINS); + const bin_info_t *bin_info = &bin_infos[binind]; + size_t usize = sz_index2size(binind); + unsigned binshard; + bin_t *bin = arena_bin_choose(tsdn, arena, binind, &binshard); + + malloc_mutex_lock(tsdn, &bin->lock); + edata_t *fresh_slab = NULL; + void *ret = arena_bin_malloc_no_fresh_slab(tsdn, arena, bin, binind); + if (ret == NULL) { + malloc_mutex_unlock(tsdn, &bin->lock); + /******************************/ + fresh_slab = arena_slab_alloc(tsdn, arena, binind, binshard, + bin_info); + /********************************/ + malloc_mutex_lock(tsdn, &bin->lock); + /* Retry since the lock was dropped. */ + ret = arena_bin_malloc_no_fresh_slab(tsdn, arena, bin, binind); + if (ret == NULL) { + if (fresh_slab == NULL) { + /* OOM */ + malloc_mutex_unlock(tsdn, &bin->lock); + return NULL; + } + ret = arena_bin_malloc_with_fresh_slab(tsdn, arena, bin, + binind, fresh_slab); + fresh_slab = NULL; + } + } + if (config_stats) { + bin->stats.nmalloc++; + bin->stats.nrequests++; + bin->stats.curregs++; + } + malloc_mutex_unlock(tsdn, &bin->lock); + + if (fresh_slab != NULL) { + arena_slab_dalloc(tsdn, arena, fresh_slab); + } + if (zero) { + memset(ret, 0, usize); + } + arena_decay_tick(tsdn, arena); + + return ret; +} + +void * +arena_malloc_hard(tsdn_t *tsdn, arena_t *arena, size_t size, szind_t ind, + bool zero, bool slab) { + assert(!tsdn_null(tsdn) || arena != NULL); + + if (likely(!tsdn_null(tsdn))) { + arena = arena_choose_maybe_huge(tsdn_tsd(tsdn), arena, size); + } + if (unlikely(arena == NULL)) { + return NULL; + } + + if (likely(slab)) { + assert(sz_can_use_slab(size)); + return arena_malloc_small(tsdn, arena, ind, zero); + } else { + return large_malloc(tsdn, arena, sz_index2size(ind), zero); + } +} + +void * +arena_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment, + bool zero, bool slab, tcache_t *tcache) { + if (slab) { + assert(sz_can_use_slab(usize)); + /* Small; alignment doesn't require special slab placement. */ + + /* usize should be a result of sz_sa2u() */ + assert((usize & (alignment - 1)) == 0); + + /* + * Small usize can't come from an alignment larger than a page. 
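+ * (sz_sa2u() bumps any request with alignment above a page up to a + * large size class, hence the assert below.)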
+ */ + assert(alignment <= PAGE); + + return arena_malloc(tsdn, arena, usize, sz_size2index(usize), + zero, slab, tcache, true); + } else { + if (likely(alignment <= CACHELINE)) { + return large_malloc(tsdn, arena, usize, zero); + } else { + return large_palloc(tsdn, arena, usize, alignment, zero); + } + } +} + +static void +arena_dissociate_bin_slab(arena_t *arena, edata_t *slab, bin_t *bin) { + /* Dissociate slab from bin. */ + if (slab == bin->slabcur) { + bin->slabcur = NULL; + } else { + szind_t binind = edata_szind_get(slab); + const bin_info_t *bin_info = &bin_infos[binind]; + + /* + * The following block's conditional is necessary because if the + * slab only contains one region, then it never gets inserted + * into the non-full slabs heap. + */ + if (bin_info->nregs == 1) { + arena_bin_slabs_full_remove(arena, bin, slab); + } else { + arena_bin_slabs_nonfull_remove(bin, slab); + } + } +} + +static void +arena_bin_lower_slab(tsdn_t *tsdn, arena_t *arena, edata_t *slab, + bin_t *bin) { + assert(edata_nfree_get(slab) > 0); + + /* + * Make sure that if bin->slabcur is non-NULL, it refers to the + * oldest/lowest non-full slab. It is okay to NULL slabcur out rather + * than proactively keeping it pointing at the oldest/lowest non-full + * slab. + */ + if (bin->slabcur != NULL && edata_snad_comp(bin->slabcur, slab) > 0) { + /* Switch slabcur. */ + if (edata_nfree_get(bin->slabcur) > 0) { + arena_bin_slabs_nonfull_insert(bin, bin->slabcur); + } else { + arena_bin_slabs_full_insert(arena, bin, bin->slabcur); + } + bin->slabcur = slab; + if (config_stats) { + bin->stats.reslabs++; + } + } else { + arena_bin_slabs_nonfull_insert(bin, slab); + } +} + +static void +arena_dalloc_bin_slab_prepare(tsdn_t *tsdn, edata_t *slab, bin_t *bin) { + malloc_mutex_assert_owner(tsdn, &bin->lock); + + assert(slab != bin->slabcur); + if (config_stats) { + bin->stats.curslabs--; + } +} + +void +arena_dalloc_bin_locked_handle_newly_empty(tsdn_t *tsdn, arena_t *arena, + edata_t *slab, bin_t *bin) { + arena_dissociate_bin_slab(arena, slab, bin); + arena_dalloc_bin_slab_prepare(tsdn, slab, bin); +} + +void +arena_dalloc_bin_locked_handle_newly_nonempty(tsdn_t *tsdn, arena_t *arena, + edata_t *slab, bin_t *bin) { + arena_bin_slabs_full_remove(arena, bin, slab); + arena_bin_lower_slab(tsdn, arena, slab, bin); +} + +static void +arena_dalloc_bin(tsdn_t *tsdn, arena_t *arena, edata_t *edata, void *ptr) { + szind_t binind = edata_szind_get(edata); + unsigned binshard = edata_binshard_get(edata); + bin_t *bin = arena_get_bin(arena, binind, binshard); + + malloc_mutex_lock(tsdn, &bin->lock); + arena_dalloc_bin_locked_info_t info; + arena_dalloc_bin_locked_begin(&info, binind); + edata_t *dalloc_slabs[1]; + unsigned dalloc_slabs_count = 0; + arena_dalloc_bin_locked_step(tsdn, arena, bin, &info, binind, edata, + ptr, dalloc_slabs, /* ndalloc_slabs */ 1, &dalloc_slabs_count, + /* dalloc_slabs_extra */ NULL); + arena_dalloc_bin_locked_finish(tsdn, arena, bin, &info); + malloc_mutex_unlock(tsdn, &bin->lock); + + if (dalloc_slabs_count != 0) { + assert(dalloc_slabs[0] == edata); + arena_slab_dalloc(tsdn, arena, edata); + } +} + +void +arena_dalloc_small(tsdn_t *tsdn, void *ptr) { + edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global, ptr); + arena_t *arena = arena_get_from_edata(edata); + + arena_dalloc_bin(tsdn, arena, edata, ptr); + arena_decay_tick(tsdn, arena); +} + +bool +arena_ralloc_no_move(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, + size_t extra, bool zero, size_t *newsize) { + bool ret; + /* Calls 
with non-zero extra had to clamp extra. */ + assert(extra == 0 || size + extra <= SC_LARGE_MAXCLASS); + + edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global, ptr); + if (unlikely(size > SC_LARGE_MAXCLASS)) { + ret = true; + goto done; + } + + size_t usize_min = sz_s2u(size); + size_t usize_max = sz_s2u(size + extra); + if (likely(oldsize <= SC_SMALL_MAXCLASS && usize_min + <= SC_SMALL_MAXCLASS)) { + /* + * Avoid moving the allocation if the size class can be left the + * same. + */ + assert(bin_infos[sz_size2index(oldsize)].reg_size == + oldsize); + if ((usize_max > SC_SMALL_MAXCLASS + || sz_size2index(usize_max) != sz_size2index(oldsize)) + && (size > oldsize || usize_max < oldsize)) { + ret = true; + goto done; + } + + arena_t *arena = arena_get_from_edata(edata); + arena_decay_tick(tsdn, arena); + ret = false; + } else if (oldsize >= SC_LARGE_MINCLASS + && usize_max >= SC_LARGE_MINCLASS) { + ret = large_ralloc_no_move(tsdn, edata, usize_min, usize_max, + zero); + } else { + ret = true; + } +done: + assert(edata == emap_edata_lookup(tsdn, &arena_emap_global, ptr)); + *newsize = edata_usize_get(edata); + + return ret; +} + +static void * +arena_ralloc_move_helper(tsdn_t *tsdn, arena_t *arena, size_t usize, + size_t alignment, bool zero, bool slab, tcache_t *tcache) { + if (alignment == 0) { + return arena_malloc(tsdn, arena, usize, sz_size2index(usize), + zero, slab, tcache, true); + } + usize = sz_sa2u(usize, alignment); + if (unlikely(usize == 0 || usize > SC_LARGE_MAXCLASS)) { + return NULL; + } + return ipalloct_explicit_slab(tsdn, usize, alignment, zero, slab, + tcache, arena); +} + +void * +arena_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t oldsize, + size_t size, size_t alignment, bool zero, bool slab, tcache_t *tcache, + hook_ralloc_args_t *hook_args) { + size_t usize = alignment == 0 ? sz_s2u(size) : sz_sa2u(size, alignment); + if (unlikely(usize == 0 || size > SC_LARGE_MAXCLASS)) { + return NULL; + } + + if (likely(slab)) { + assert(sz_can_use_slab(usize)); + /* Try to avoid moving the allocation. */ + UNUSED size_t newsize; + if (!arena_ralloc_no_move(tsdn, ptr, oldsize, usize, 0, zero, + &newsize)) { + hook_invoke_expand(hook_args->is_realloc + ? hook_expand_realloc : hook_expand_rallocx, + ptr, oldsize, usize, (uintptr_t)ptr, + hook_args->args); + return ptr; + } + } + + if (oldsize >= SC_LARGE_MINCLASS + && usize >= SC_LARGE_MINCLASS) { + return large_ralloc(tsdn, arena, ptr, usize, + alignment, zero, tcache, hook_args); + } + + /* + * size and oldsize are different enough that we need to move the + * object. In that case, fall back to allocating new space and copying. + */ + void *ret = arena_ralloc_move_helper(tsdn, arena, usize, alignment, + zero, slab, tcache); + if (ret == NULL) { + return NULL; + } + + hook_invoke_alloc(hook_args->is_realloc + ? hook_alloc_realloc : hook_alloc_rallocx, ret, (uintptr_t)ret, + hook_args->args); + hook_invoke_dalloc(hook_args->is_realloc + ? hook_dalloc_realloc : hook_dalloc_rallocx, ptr, hook_args->args); + + /* + * Junk/zero-filling were already done by + * ipalloc()/arena_malloc(). + */ + size_t copysize = (usize < oldsize) ? 
usize : oldsize; + memcpy(ret, ptr, copysize); + isdalloct(tsdn, ptr, oldsize, tcache, NULL, true); + return ret; +} + +ehooks_t * +arena_get_ehooks(arena_t *arena) { + return base_ehooks_get(arena->base); +} + +extent_hooks_t * +arena_set_extent_hooks(tsd_t *tsd, arena_t *arena, + extent_hooks_t *extent_hooks) { + background_thread_info_t *info; + if (have_background_thread) { + info = arena_background_thread_info_get(arena); + malloc_mutex_lock(tsd_tsdn(tsd), &info->mtx); + } + /* No using the HPA now that we have the custom hooks. */ + pa_shard_disable_hpa(tsd_tsdn(tsd), &arena->pa_shard); + extent_hooks_t *ret = base_extent_hooks_set(arena->base, extent_hooks); + if (have_background_thread) { + malloc_mutex_unlock(tsd_tsdn(tsd), &info->mtx); + } + + return ret; +} + +dss_prec_t +arena_dss_prec_get(arena_t *arena) { + return (dss_prec_t)atomic_load_u(&arena->dss_prec, ATOMIC_ACQUIRE); +} + +bool +arena_dss_prec_set(arena_t *arena, dss_prec_t dss_prec) { + if (!have_dss) { + return (dss_prec != dss_prec_disabled); + } + atomic_store_u(&arena->dss_prec, (unsigned)dss_prec, ATOMIC_RELEASE); + return false; +} + +void +arena_name_get(arena_t *arena, char *name) { + char *end = (char *)memchr((void *)arena->name, '\0', ARENA_NAME_LEN); + assert(end != NULL); + size_t len = (uintptr_t)end - (uintptr_t)arena->name + 1; + assert(len > 0 && len <= ARENA_NAME_LEN); + + strncpy(name, arena->name, len); +} + +void +arena_name_set(arena_t *arena, const char *name) { + strncpy(arena->name, name, ARENA_NAME_LEN); + arena->name[ARENA_NAME_LEN - 1] = '\0'; +} + +ssize_t +arena_dirty_decay_ms_default_get(void) { + return atomic_load_zd(&dirty_decay_ms_default, ATOMIC_RELAXED); +} + +bool +arena_dirty_decay_ms_default_set(ssize_t decay_ms) { + if (!decay_ms_valid(decay_ms)) { + return true; + } + atomic_store_zd(&dirty_decay_ms_default, decay_ms, ATOMIC_RELAXED); + return false; +} + +ssize_t +arena_muzzy_decay_ms_default_get(void) { + return atomic_load_zd(&muzzy_decay_ms_default, ATOMIC_RELAXED); +} + +bool +arena_muzzy_decay_ms_default_set(ssize_t decay_ms) { + if (!decay_ms_valid(decay_ms)) { + return true; + } + atomic_store_zd(&muzzy_decay_ms_default, decay_ms, ATOMIC_RELAXED); + return false; +} + +bool +arena_retain_grow_limit_get_set(tsd_t *tsd, arena_t *arena, size_t *old_limit, + size_t *new_limit) { + assert(opt_retain); + return pac_retain_grow_limit_get_set(tsd_tsdn(tsd), + &arena->pa_shard.pac, old_limit, new_limit); +} + +unsigned +arena_nthreads_get(arena_t *arena, bool internal) { + return atomic_load_u(&arena->nthreads[internal], ATOMIC_RELAXED); +} + +void +arena_nthreads_inc(arena_t *arena, bool internal) { + atomic_fetch_add_u(&arena->nthreads[internal], 1, ATOMIC_RELAXED); +} + +void +arena_nthreads_dec(arena_t *arena, bool internal) { + atomic_fetch_sub_u(&arena->nthreads[internal], 1, ATOMIC_RELAXED); +} + +arena_t * +arena_new(tsdn_t *tsdn, unsigned ind, const arena_config_t *config) { + arena_t *arena; + base_t *base; + + if (ind == 0) { + base = b0get(); + } else { + base = base_new(tsdn, ind, config->extent_hooks, + config->metadata_use_hooks); + if (base == NULL) { + return NULL; + } + } + + size_t arena_size = ALIGNMENT_CEILING(sizeof(arena_t), CACHELINE) + + sizeof(bin_with_batch_t) * bin_info_nbatched_bins + + sizeof(bin_t) * bin_info_nunbatched_bins; + arena = (arena_t *)base_alloc(tsdn, base, arena_size, CACHELINE); + if (arena == NULL) { + goto label_error; + } + + atomic_store_u(&arena->nthreads[0], 0, ATOMIC_RELAXED); + atomic_store_u(&arena->nthreads[1], 0, 
ATOMIC_RELAXED); + arena->last_thd = NULL; + + if (config_stats) { + if (arena_stats_init(tsdn, &arena->stats)) { + goto label_error; + } + + ql_new(&arena->tcache_ql); + ql_new(&arena->cache_bin_array_descriptor_ql); + if (malloc_mutex_init(&arena->tcache_ql_mtx, "tcache_ql", + WITNESS_RANK_TCACHE_QL, malloc_mutex_rank_exclusive)) { + goto label_error; + } + } + + atomic_store_u(&arena->dss_prec, (unsigned)extent_dss_prec_get(), + ATOMIC_RELAXED); + + edata_list_active_init(&arena->large); + if (malloc_mutex_init(&arena->large_mtx, "arena_large", + WITNESS_RANK_ARENA_LARGE, malloc_mutex_rank_exclusive)) { + goto label_error; + } + + nstime_t cur_time; + nstime_init_update(&cur_time); + if (pa_shard_init(tsdn, &arena->pa_shard, &arena_pa_central_global, + &arena_emap_global, base, ind, &arena->stats.pa_shard_stats, + LOCKEDINT_MTX(arena->stats.mtx), &cur_time, oversize_threshold, + arena_dirty_decay_ms_default_get(), + arena_muzzy_decay_ms_default_get())) { + goto label_error; + } + + /* Initialize bins. */ + atomic_store_u(&arena->binshard_next, 0, ATOMIC_RELEASE); + for (unsigned i = 0; i < SC_NBINS; i++) { + for (unsigned j = 0; j < bin_infos[i].n_shards; j++) { + bin_t *bin = arena_get_bin(arena, i, j); + bool err = bin_init(bin, i); + if (err) { + goto label_error; + } + } + } + + arena->base = base; + /* Set arena before creating background threads. */ + arena_set(ind, arena); + arena->ind = ind; + + /* Init the name. */ + malloc_snprintf(arena->name, sizeof(arena->name), "%s_%u", + arena_is_auto(arena) ? "auto" : "manual", arena->ind); + arena->name[ARENA_NAME_LEN - 1] = '\0'; + + nstime_init_update(&arena->create_time); + + /* + * We turn on the HPA if set to. There are two exceptions: + * - Custom extent hooks (we should only return memory allocated from + * them in that case). + * - Arena 0 initialization. In this case, we're mid-bootstrapping, and + * so arena_hpa_global is not yet initialized. + */ + if (opt_hpa && ehooks_are_default(base_ehooks_get(base)) && ind != 0) { + hpa_shard_opts_t hpa_shard_opts = opt_hpa_opts; + hpa_shard_opts.deferral_allowed = background_thread_enabled(); + if (pa_shard_enable_hpa(tsdn, &arena->pa_shard, + &hpa_shard_opts, &opt_hpa_sec_opts)) { + goto label_error; + } + } + + /* We don't support reentrancy for arena 0 bootstrapping. */ + if (ind != 0) { + /* + * If we're here, then arena 0 already exists, so bootstrapping + * is done enough that we should have tsd. + */ + assert(!tsdn_null(tsdn)); + pre_reentrancy(tsdn_tsd(tsdn), arena); + if (test_hooks_arena_new_hook) { + test_hooks_arena_new_hook(); + } + post_reentrancy(tsdn_tsd(tsdn)); + } + + return arena; +label_error: + if (ind != 0) { + base_delete(tsdn, base); + } + return NULL; +} + +static arena_t * +arena_create_huge_arena(tsd_t *tsd, unsigned ind) { + assert(ind != 0); + + arena_t *huge_arena = arena_get(tsd_tsdn(tsd), ind, true); + if (huge_arena == NULL) { + return NULL; + } + + char *huge_arena_name = "auto_oversize"; + strncpy(huge_arena->name, huge_arena_name, ARENA_NAME_LEN); + huge_arena->name[ARENA_NAME_LEN - 1] = '\0'; + + /* + * Purge eagerly for huge allocations, because: 1) number of huge + * allocations is usually small, which means ticker based decay is not + * reliable; and 2) less immediate reuse is expected for huge + * allocations. + * + * However, with background threads enabled, keep normal purging since + * the purging delay is bounded. 
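+ * "Purge eagerly" here concretely means a decay time of zero, set + * below for both dirty and muzzy pages.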
+ */ + if (!background_thread_enabled() + && arena_dirty_decay_ms_default_get() > 0) { + arena_decay_ms_set(tsd_tsdn(tsd), huge_arena, + extent_state_dirty, 0); + } + if (!background_thread_enabled() + && arena_muzzy_decay_ms_default_get() > 0) { + arena_decay_ms_set(tsd_tsdn(tsd), huge_arena, + extent_state_muzzy, 0); + } + + return huge_arena; +} + +arena_t * +arena_choose_huge(tsd_t *tsd) { + /* huge_arena_ind can be 0 during init (will use a0). */ + if (huge_arena_ind == 0) { + assert(!malloc_initialized()); + } + + arena_t *huge_arena = arena_get(tsd_tsdn(tsd), huge_arena_ind, false); + if (huge_arena == NULL) { + /* Create the huge arena on demand. */ + huge_arena = arena_create_huge_arena(tsd, huge_arena_ind); + } + + return huge_arena; +} + +bool +arena_init_huge(arena_t *a0) { + bool huge_enabled; + + /* The threshold should be large size class. */ + if (opt_oversize_threshold > SC_LARGE_MAXCLASS || + opt_oversize_threshold < SC_LARGE_MINCLASS) { + opt_oversize_threshold = 0; + oversize_threshold = SC_LARGE_MAXCLASS + PAGE; + huge_enabled = false; + } else { + /* Reserve the index for the huge arena. */ + huge_arena_ind = narenas_total_get(); + oversize_threshold = opt_oversize_threshold; + /* a0 init happened before malloc_conf_init. */ + atomic_store_zu(&a0->pa_shard.pac.oversize_threshold, + oversize_threshold, ATOMIC_RELAXED); + huge_enabled = true; + } + + return huge_enabled; +} + +bool +arena_boot(sc_data_t *sc_data, base_t *base, bool hpa) { + arena_dirty_decay_ms_default_set(opt_dirty_decay_ms); + arena_muzzy_decay_ms_default_set(opt_muzzy_decay_ms); + for (unsigned i = 0; i < SC_NBINS; i++) { + sc_t *sc = &sc_data->sc[i]; + div_init(&arena_binind_div_info[i], + (1U << sc->lg_base) + (sc->ndelta << sc->lg_delta)); + } + + JEMALLOC_SUPPRESS_WARN_ON_USAGE( + uint32_t cur_offset = (uint32_t)offsetof(arena_t, all_bins); + ) + for (szind_t i = 0; i < SC_NBINS; i++) { + arena_bin_offsets[i] = cur_offset; + uint32_t bin_sz = (i < bin_info_nbatched_sizes + ? 
sizeof(bin_with_batch_t) : sizeof(bin_t)); + cur_offset += (uint32_t)bin_infos[i].n_shards * bin_sz; + } + return pa_central_init(&arena_pa_central_global, base, hpa, + &hpa_hooks_default); +} + +void +arena_prefork0(tsdn_t *tsdn, arena_t *arena) { + pa_shard_prefork0(tsdn, &arena->pa_shard); +} + +void +arena_prefork1(tsdn_t *tsdn, arena_t *arena) { + if (config_stats) { + malloc_mutex_prefork(tsdn, &arena->tcache_ql_mtx); + } +} + +void +arena_prefork2(tsdn_t *tsdn, arena_t *arena) { + pa_shard_prefork2(tsdn, &arena->pa_shard); +} + +void +arena_prefork3(tsdn_t *tsdn, arena_t *arena) { + pa_shard_prefork3(tsdn, &arena->pa_shard); +} + +void +arena_prefork4(tsdn_t *tsdn, arena_t *arena) { + pa_shard_prefork4(tsdn, &arena->pa_shard); +} + +void +arena_prefork5(tsdn_t *tsdn, arena_t *arena) { + pa_shard_prefork5(tsdn, &arena->pa_shard); +} + +void +arena_prefork6(tsdn_t *tsdn, arena_t *arena) { + base_prefork(tsdn, arena->base); +} + +void +arena_prefork7(tsdn_t *tsdn, arena_t *arena) { + malloc_mutex_prefork(tsdn, &arena->large_mtx); +} + +void +arena_prefork8(tsdn_t *tsdn, arena_t *arena) { + for (szind_t i = 0; i < SC_NBINS; i++) { + for (unsigned j = 0; j < bin_infos[i].n_shards; j++) { + bin_t *bin = arena_get_bin(arena, i, j); + bin_prefork(tsdn, bin, arena_bin_has_batch(i)); + } + } +} + +void +arena_postfork_parent(tsdn_t *tsdn, arena_t *arena) { + for (szind_t i = 0; i < SC_NBINS; i++) { + for (unsigned j = 0; j < bin_infos[i].n_shards; j++) { + bin_t *bin = arena_get_bin(arena, i, j); + bin_postfork_parent(tsdn, bin, arena_bin_has_batch(i)); + } + } + + malloc_mutex_postfork_parent(tsdn, &arena->large_mtx); + base_postfork_parent(tsdn, arena->base); + pa_shard_postfork_parent(tsdn, &arena->pa_shard); + if (config_stats) { + malloc_mutex_postfork_parent(tsdn, &arena->tcache_ql_mtx); + } +} + +void +arena_postfork_child(tsdn_t *tsdn, arena_t *arena) { + atomic_store_u(&arena->nthreads[0], 0, ATOMIC_RELAXED); + atomic_store_u(&arena->nthreads[1], 0, ATOMIC_RELAXED); + if (tsd_arena_get(tsdn_tsd(tsdn)) == arena) { + arena_nthreads_inc(arena, false); + } + if (tsd_iarena_get(tsdn_tsd(tsdn)) == arena) { + arena_nthreads_inc(arena, true); + } + if (config_stats) { + ql_new(&arena->tcache_ql); + ql_new(&arena->cache_bin_array_descriptor_ql); + tcache_slow_t *tcache_slow = tcache_slow_get(tsdn_tsd(tsdn)); + if (tcache_slow != NULL && tcache_slow->arena == arena) { + tcache_t *tcache = tcache_slow->tcache; + ql_elm_new(tcache_slow, link); + ql_tail_insert(&arena->tcache_ql, tcache_slow, link); + cache_bin_array_descriptor_init( + &tcache_slow->cache_bin_array_descriptor, + tcache->bins); + ql_tail_insert(&arena->cache_bin_array_descriptor_ql, + &tcache_slow->cache_bin_array_descriptor, link); + } + } + + for (szind_t i = 0; i < SC_NBINS; i++) { + for (unsigned j = 0; j < bin_infos[i].n_shards; j++) { + bin_t *bin = arena_get_bin(arena, i, j); + bin_postfork_child(tsdn, bin, arena_bin_has_batch(i)); + } + } + + malloc_mutex_postfork_child(tsdn, &arena->large_mtx); + base_postfork_child(tsdn, arena->base); + pa_shard_postfork_child(tsdn, &arena->pa_shard); + if (config_stats) { + malloc_mutex_postfork_child(tsdn, &arena->tcache_ql_mtx); + } +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/background_thread.c b/src/duckdb/extension/jemalloc/jemalloc/src/background_thread.c new file mode 100644 index 000000000..c92fa2bc8 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/background_thread.c @@ -0,0 +1,836 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include 
"jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" + +JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS + +/******************************************************************************/ +/* Data. */ + +/* This option should be opt-in only. */ +#define BACKGROUND_THREAD_DEFAULT false +/* Read-only after initialization. */ +bool opt_background_thread = BACKGROUND_THREAD_DEFAULT; +size_t opt_max_background_threads = MAX_BACKGROUND_THREAD_LIMIT + 1; + +/* Used for thread creation, termination and stats. */ +malloc_mutex_t background_thread_lock; +/* Indicates global state. Atomic because decay reads this w/o locking. */ +atomic_b_t background_thread_enabled_state; +size_t n_background_threads; +size_t max_background_threads; +/* Thread info per-index. */ +background_thread_info_t *background_thread_info; + +/******************************************************************************/ + +#ifdef JEMALLOC_PTHREAD_CREATE_WRAPPER + +static int (*pthread_create_fptr)(pthread_t *__restrict, const pthread_attr_t *, + void *(*)(void *), void *__restrict); + +static void +pthread_create_wrapper_init(void) { +#ifdef JEMALLOC_LAZY_LOCK + if (!isthreaded) { + isthreaded = true; + } +#endif +} + +int +pthread_create_wrapper(pthread_t *__restrict thread, const pthread_attr_t *attr, + void *(*start_routine)(void *), void *__restrict arg) { + pthread_create_wrapper_init(); + + return pthread_create_fptr(thread, attr, start_routine, arg); +} + +#ifdef JEMALLOC_HAVE_DLSYM +#include +#endif + +static bool +pthread_create_fptr_init(void) { + if (pthread_create_fptr != NULL) { + return false; + } + /* + * Try the next symbol first, because 1) when use lazy_lock we have a + * wrapper for pthread_create; and 2) application may define its own + * wrapper as well (and can call malloc within the wrapper). + */ +#ifdef JEMALLOC_HAVE_DLSYM + pthread_create_fptr = dlsym(RTLD_NEXT, "pthread_create"); +#else + pthread_create_fptr = NULL; +#endif + if (pthread_create_fptr == NULL) { + if (config_lazy_lock) { + malloc_write(": Error in dlsym(RTLD_NEXT, " + "\"pthread_create\")\n"); + abort(); + } else { + /* Fall back to the default symbol. 
*/ + pthread_create_fptr = pthread_create; + } + } + + return false; +} +#endif /* JEMALLOC_PTHREAD_CREATE_WRAPPER */ + +#ifndef JEMALLOC_BACKGROUND_THREAD +#define NOT_REACHED { not_reached(); } +bool background_thread_create(tsd_t *tsd, unsigned arena_ind) NOT_REACHED +bool background_threads_enable(tsd_t *tsd) NOT_REACHED +bool background_threads_disable(tsd_t *tsd) NOT_REACHED +bool background_thread_is_started(background_thread_info_t *info) NOT_REACHED +void background_thread_wakeup_early(background_thread_info_t *info, + nstime_t *remaining_sleep) NOT_REACHED +void background_thread_prefork0(tsdn_t *tsdn) NOT_REACHED +void background_thread_prefork1(tsdn_t *tsdn) NOT_REACHED +void background_thread_postfork_parent(tsdn_t *tsdn) NOT_REACHED +void background_thread_postfork_child(tsdn_t *tsdn) NOT_REACHED +bool background_thread_stats_read(tsdn_t *tsdn, + background_thread_stats_t *stats) NOT_REACHED +void background_thread_ctl_init(tsdn_t *tsdn) NOT_REACHED +#undef NOT_REACHED +#else + +static bool background_thread_enabled_at_fork; + +static void +background_thread_info_init(tsdn_t *tsdn, background_thread_info_t *info) { + background_thread_wakeup_time_set(tsdn, info, 0); + info->npages_to_purge_new = 0; + if (config_stats) { + info->tot_n_runs = 0; + nstime_init_zero(&info->tot_sleep_time); + } +} + +static inline bool +set_current_thread_affinity(int cpu) { +#if defined(JEMALLOC_HAVE_SCHED_SETAFFINITY) || defined(JEMALLOC_HAVE_PTHREAD_SETAFFINITY_NP) +#if defined(JEMALLOC_HAVE_SCHED_SETAFFINITY) + cpu_set_t cpuset; +#else +# ifndef __NetBSD__ + cpuset_t cpuset; +# else + cpuset_t *cpuset; +# endif +#endif + +#ifndef __NetBSD__ + CPU_ZERO(&cpuset); + CPU_SET(cpu, &cpuset); +#else + cpuset = cpuset_create(); +#endif + +#if defined(JEMALLOC_HAVE_SCHED_SETAFFINITY) + return (sched_setaffinity(0, sizeof(cpu_set_t), &cpuset) != 0); +#else +# ifndef __NetBSD__ + int ret = pthread_setaffinity_np(pthread_self(), sizeof(cpuset_t), + &cpuset); +# else + int ret = pthread_setaffinity_np(pthread_self(), cpuset_size(cpuset), + cpuset); + cpuset_destroy(cpuset); +# endif + return ret != 0; +#endif +#else + return false; +#endif +} + +#define BILLION UINT64_C(1000000000) +/* Minimal sleep interval 100 ms. */ +#define BACKGROUND_THREAD_MIN_INTERVAL_NS (BILLION / 10) + +static void +background_thread_sleep(tsdn_t *tsdn, background_thread_info_t *info, + uint64_t interval) { + if (config_stats) { + info->tot_n_runs++; + } + info->npages_to_purge_new = 0; + + struct timeval tv; + /* Specific clock required by timedwait. */ + gettimeofday(&tv, NULL); + nstime_t before_sleep; + nstime_init2(&before_sleep, tv.tv_sec, tv.tv_usec * 1000); + + int ret; + if (interval == BACKGROUND_THREAD_INDEFINITE_SLEEP) { + background_thread_wakeup_time_set(tsdn, info, + BACKGROUND_THREAD_INDEFINITE_SLEEP); + ret = pthread_cond_wait(&info->cond, &info->mtx.lock); + assert(ret == 0); + } else { + assert(interval >= BACKGROUND_THREAD_MIN_INTERVAL_NS && + interval <= BACKGROUND_THREAD_INDEFINITE_SLEEP); + /* We need malloc clock (can be different from tv). 
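+ * The recorded wakeup time uses nstime_init_update() so it is + * comparable with other malloc-clock readings, while the timedwait + * deadline below is derived from gettimeofday(), which the condvar + * expects.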
*/ + nstime_t next_wakeup; + nstime_init_update(&next_wakeup); + nstime_iadd(&next_wakeup, interval); + assert(nstime_ns(&next_wakeup) < + BACKGROUND_THREAD_INDEFINITE_SLEEP); + background_thread_wakeup_time_set(tsdn, info, + nstime_ns(&next_wakeup)); + + nstime_t ts_wakeup; + nstime_copy(&ts_wakeup, &before_sleep); + nstime_iadd(&ts_wakeup, interval); + struct timespec ts; + ts.tv_sec = (size_t)nstime_sec(&ts_wakeup); + ts.tv_nsec = (size_t)nstime_nsec(&ts_wakeup); + + assert(!background_thread_indefinite_sleep(info)); + ret = pthread_cond_timedwait(&info->cond, &info->mtx.lock, &ts); + assert(ret == ETIMEDOUT || ret == 0); + } + if (config_stats) { + gettimeofday(&tv, NULL); + nstime_t after_sleep; + nstime_init2(&after_sleep, tv.tv_sec, tv.tv_usec * 1000); + if (nstime_compare(&after_sleep, &before_sleep) > 0) { + nstime_subtract(&after_sleep, &before_sleep); + nstime_add(&info->tot_sleep_time, &after_sleep); + } + } +} + +static bool +background_thread_pause_check(tsdn_t *tsdn, background_thread_info_t *info) { + if (unlikely(info->state == background_thread_paused)) { + malloc_mutex_unlock(tsdn, &info->mtx); + /* Wait on global lock to update status. */ + malloc_mutex_lock(tsdn, &background_thread_lock); + malloc_mutex_unlock(tsdn, &background_thread_lock); + malloc_mutex_lock(tsdn, &info->mtx); + return true; + } + + return false; +} + +static inline void +background_work_sleep_once(tsdn_t *tsdn, background_thread_info_t *info, + unsigned ind) { + uint64_t ns_until_deferred = BACKGROUND_THREAD_DEFERRED_MAX; + unsigned narenas = narenas_total_get(); + bool slept_indefinitely = background_thread_indefinite_sleep(info); + + for (unsigned i = ind; i < narenas; i += max_background_threads) { + arena_t *arena = arena_get(tsdn, i, false); + if (!arena) { + continue; + } + /* + * If thread was woken up from the indefinite sleep, don't + * do the work instantly, but rather check when the deferred + * work that caused this thread to wake up is scheduled for. + */ + if (!slept_indefinitely) { + arena_do_deferred_work(tsdn, arena); + } + if (ns_until_deferred <= BACKGROUND_THREAD_MIN_INTERVAL_NS) { + /* Min interval will be used. */ + continue; + } + uint64_t ns_arena_deferred = pa_shard_time_until_deferred_work( + tsdn, &arena->pa_shard); + if (ns_arena_deferred < ns_until_deferred) { + ns_until_deferred = ns_arena_deferred; + } + } + + uint64_t sleep_ns; + if (ns_until_deferred == BACKGROUND_THREAD_DEFERRED_MAX) { + sleep_ns = BACKGROUND_THREAD_INDEFINITE_SLEEP; + } else { + sleep_ns = + (ns_until_deferred < BACKGROUND_THREAD_MIN_INTERVAL_NS) + ? 
BACKGROUND_THREAD_MIN_INTERVAL_NS + : ns_until_deferred; + + } + + background_thread_sleep(tsdn, info, sleep_ns); +} + +static bool +background_threads_disable_single(tsd_t *tsd, background_thread_info_t *info) { + if (info == &background_thread_info[0]) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), + &background_thread_lock); + } else { + malloc_mutex_assert_not_owner(tsd_tsdn(tsd), + &background_thread_lock); + } + + pre_reentrancy(tsd, NULL); + malloc_mutex_lock(tsd_tsdn(tsd), &info->mtx); + bool has_thread; + assert(info->state != background_thread_paused); + if (info->state == background_thread_started) { + has_thread = true; + info->state = background_thread_stopped; + pthread_cond_signal(&info->cond); + } else { + has_thread = false; + } + malloc_mutex_unlock(tsd_tsdn(tsd), &info->mtx); + + if (!has_thread) { + post_reentrancy(tsd); + return false; + } + void *ret; + if (pthread_join(info->thread, &ret)) { + post_reentrancy(tsd); + return true; + } + assert(ret == NULL); + n_background_threads--; + post_reentrancy(tsd); + + return false; +} + +static void *background_thread_entry(void *ind_arg); + +static int +background_thread_create_signals_masked(pthread_t *thread, + const pthread_attr_t *attr, void *(*start_routine)(void *), void *arg) { + /* + * Mask signals during thread creation so that the thread inherits + * an empty signal set. + */ + sigset_t set; + sigfillset(&set); + sigset_t oldset; + int mask_err = pthread_sigmask(SIG_SETMASK, &set, &oldset); + if (mask_err != 0) { + return mask_err; + } + int create_err = pthread_create_wrapper(thread, attr, start_routine, + arg); + /* + * Restore the signal mask. Failure to restore the signal mask here + * changes program behavior. + */ + int restore_err = pthread_sigmask(SIG_SETMASK, &oldset, NULL); + if (restore_err != 0) { + malloc_printf("<jemalloc>: background thread creation " + "failed (%d), and signal mask restoration failed " + "(%d)\n", create_err, restore_err); + if (opt_abort) { + abort(); + } + } + return create_err; +} + +static bool +check_background_thread_creation(tsd_t *tsd, + const size_t const_max_background_threads, + unsigned *n_created, bool *created_threads) { + bool ret = false; + if (likely(*n_created == n_background_threads)) { + return ret; + } + + tsdn_t *tsdn = tsd_tsdn(tsd); + malloc_mutex_unlock(tsdn, &background_thread_info[0].mtx); + for (unsigned i = 1; i < const_max_background_threads; i++) { + if (created_threads[i]) { + continue; + } + background_thread_info_t *info = &background_thread_info[i]; + malloc_mutex_lock(tsdn, &info->mtx); + /* + * In case of the background_thread_paused state because of + * arena reset, delay the creation. + */ + bool create = (info->state == background_thread_started); + malloc_mutex_unlock(tsdn, &info->mtx); + if (!create) { + continue; + } + + pre_reentrancy(tsd, NULL); + int err = background_thread_create_signals_masked(&info->thread, + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + NULL, background_thread_entry, (void *)(uintptr_t)i); + post_reentrancy(tsd); + + if (err == 0) { + (*n_created)++; + created_threads[i] = true; + } else { + malloc_printf("<jemalloc>: background thread " + "creation failed (%d)\n", err); + if (opt_abort) { + abort(); + } + } + /* Return to restart the loop since we unlocked. */ + ret = true; + break; + } + malloc_mutex_lock(tsdn, &background_thread_info[0].mtx); + + return ret; +} + +static void +background_thread0_work(tsd_t *tsd) { + /* + * Thread0 is also responsible for launching / terminating threads. 
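+ * (It launches them in check_background_thread_creation() and tears + * them down in the shutdown loop at the end of this function.)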
+ * We are guaranteed that `max_background_threads` will not change + * underneath us. Unfortunately static analysis tools do not understand + * this, so we are extracting `max_background_threads` into a local + * variable solely for the sake of exposing this information to such + * tools. + */ + const size_t const_max_background_threads = max_background_threads; + assert(const_max_background_threads > 0); + VARIABLE_ARRAY(bool, created_threads, const_max_background_threads); + unsigned i; + for (i = 1; i < const_max_background_threads; i++) { + created_threads[i] = false; + } + /* Start working, and create more threads when asked. */ + unsigned n_created = 1; + while (background_thread_info[0].state != background_thread_stopped) { + if (background_thread_pause_check(tsd_tsdn(tsd), + &background_thread_info[0])) { + continue; + } + if (check_background_thread_creation(tsd, const_max_background_threads, + &n_created, (bool *)&created_threads)) { + continue; + } + background_work_sleep_once(tsd_tsdn(tsd), + &background_thread_info[0], 0); + } + + /* + * Shut down other threads at exit. Note that the ctl thread is holding + * the global background_thread mutex (and is waiting) for us. + */ + assert(!background_thread_enabled()); + for (i = 1; i < const_max_background_threads; i++) { + background_thread_info_t *info = &background_thread_info[i]; + assert(info->state != background_thread_paused); + if (created_threads[i]) { + background_threads_disable_single(tsd, info); + } else { + malloc_mutex_lock(tsd_tsdn(tsd), &info->mtx); + if (info->state != background_thread_stopped) { + /* The thread was not created. */ + assert(info->state == + background_thread_started); + n_background_threads--; + info->state = background_thread_stopped; + } + malloc_mutex_unlock(tsd_tsdn(tsd), &info->mtx); + } + } + background_thread_info[0].state = background_thread_stopped; + assert(n_background_threads == 1); +} + +static void +background_work(tsd_t *tsd, unsigned ind) { + background_thread_info_t *info = &background_thread_info[ind]; + + malloc_mutex_lock(tsd_tsdn(tsd), &info->mtx); + background_thread_wakeup_time_set(tsd_tsdn(tsd), info, + BACKGROUND_THREAD_INDEFINITE_SLEEP); + if (ind == 0) { + background_thread0_work(tsd); + } else { + while (info->state != background_thread_stopped) { + if (background_thread_pause_check(tsd_tsdn(tsd), + info)) { + continue; + } + background_work_sleep_once(tsd_tsdn(tsd), info, ind); + } + } + assert(info->state == background_thread_stopped); + background_thread_wakeup_time_set(tsd_tsdn(tsd), info, 0); + malloc_mutex_unlock(tsd_tsdn(tsd), &info->mtx); +} + +static void * +background_thread_entry(void *ind_arg) { + unsigned thread_ind = (unsigned)(uintptr_t)ind_arg; + assert(thread_ind < max_background_threads); +#ifdef JEMALLOC_HAVE_PTHREAD_SETNAME_NP + pthread_setname_np(pthread_self(), "jemalloc_bg_thd"); +#elif defined(JEMALLOC_HAVE_PTHREAD_SET_NAME_NP) + pthread_set_name_np(pthread_self(), "jemalloc_bg_thd"); +#endif + if (opt_percpu_arena != percpu_arena_disabled) { + set_current_thread_affinity((int)thread_ind); + } + /* + * Start periodic background work. We use internal tsd which avoids + * side effects, for example triggering new arena creation (which in + * turn triggers another background thread creation). 
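+ * Hence the tsd_internal_fetch() call below instead of a regular tsd + * fetch.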
*/ + background_work(tsd_internal_fetch(), thread_ind); + assert(pthread_equal(pthread_self(), + background_thread_info[thread_ind].thread)); + + return NULL; +} + +static void +background_thread_init(tsd_t *tsd, background_thread_info_t *info) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &background_thread_lock); + info->state = background_thread_started; + background_thread_info_init(tsd_tsdn(tsd), info); + n_background_threads++; +} + +static bool +background_thread_create_locked(tsd_t *tsd, unsigned arena_ind) { + assert(have_background_thread); + malloc_mutex_assert_owner(tsd_tsdn(tsd), &background_thread_lock); + + /* We create at most NCPUs threads. */ + size_t thread_ind = arena_ind % max_background_threads; + background_thread_info_t *info = &background_thread_info[thread_ind]; + + bool need_new_thread; + malloc_mutex_lock(tsd_tsdn(tsd), &info->mtx); + need_new_thread = background_thread_enabled() && + (info->state == background_thread_stopped); + if (need_new_thread) { + background_thread_init(tsd, info); + } + malloc_mutex_unlock(tsd_tsdn(tsd), &info->mtx); + if (!need_new_thread) { + return false; + } + if (arena_ind != 0) { + /* Threads are created asynchronously by Thread 0. */ + background_thread_info_t *t0 = &background_thread_info[0]; + malloc_mutex_lock(tsd_tsdn(tsd), &t0->mtx); + assert(t0->state == background_thread_started); + pthread_cond_signal(&t0->cond); + malloc_mutex_unlock(tsd_tsdn(tsd), &t0->mtx); + + return false; + } + + pre_reentrancy(tsd, NULL); + /* + * To avoid complications (besides reentrancy), create internal + * background threads with the underlying pthread_create. + */ + int err = background_thread_create_signals_masked(&info->thread, NULL, + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + background_thread_entry, (void *)thread_ind); + post_reentrancy(tsd); + + if (err != 0) { + malloc_printf("<jemalloc>: arena 0 background thread creation " + "failed (%d)\n", err); + malloc_mutex_lock(tsd_tsdn(tsd), &info->mtx); + info->state = background_thread_stopped; + n_background_threads--; + malloc_mutex_unlock(tsd_tsdn(tsd), &info->mtx); + + return true; + } + + return false; +} + +/* Create a new background thread if needed. */ +bool +background_thread_create(tsd_t *tsd, unsigned arena_ind) { + assert(have_background_thread); + + bool ret; + malloc_mutex_lock(tsd_tsdn(tsd), &background_thread_lock); + ret = background_thread_create_locked(tsd, arena_ind); + malloc_mutex_unlock(tsd_tsdn(tsd), &background_thread_lock); + + return ret; +} + +bool +background_threads_enable(tsd_t *tsd) { + assert(n_background_threads == 0); + assert(background_thread_enabled()); + malloc_mutex_assert_owner(tsd_tsdn(tsd), &background_thread_lock); + + VARIABLE_ARRAY(bool, marked, max_background_threads); + unsigned nmarked; + for (size_t i = 0; i < max_background_threads; i++) { + marked[i] = false; + } + nmarked = 0; + /* Thread 0 is required and created at the end. */ + marked[0] = true; + /* Mark the threads we need to create for thread 0. 
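+ * Only the info state is flipped here; the pthreads themselves are + * spawned later by thread 0 via check_background_thread_creation().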
*/ + unsigned narenas = narenas_total_get(); + for (unsigned i = 1; i < narenas; i++) { + if (marked[i % max_background_threads] || + arena_get(tsd_tsdn(tsd), i, false) == NULL) { + continue; + } + background_thread_info_t *info = &background_thread_info[ + i % max_background_threads]; + malloc_mutex_lock(tsd_tsdn(tsd), &info->mtx); + assert(info->state == background_thread_stopped); + background_thread_init(tsd, info); + malloc_mutex_unlock(tsd_tsdn(tsd), &info->mtx); + marked[i % max_background_threads] = true; + if (++nmarked == max_background_threads) { + break; + } + } + + bool err = background_thread_create_locked(tsd, 0); + if (err) { + return true; + } + for (unsigned i = 0; i < narenas; i++) { + arena_t *arena = arena_get(tsd_tsdn(tsd), i, false); + if (arena != NULL) { + pa_shard_set_deferral_allowed(tsd_tsdn(tsd), + &arena->pa_shard, true); + } + } + return false; +} + +bool +background_threads_disable(tsd_t *tsd) { + assert(!background_thread_enabled()); + malloc_mutex_assert_owner(tsd_tsdn(tsd), &background_thread_lock); + + /* Thread 0 will be responsible for terminating other threads. */ + if (background_threads_disable_single(tsd, + &background_thread_info[0])) { + return true; + } + assert(n_background_threads == 0); + unsigned narenas = narenas_total_get(); + for (unsigned i = 0; i < narenas; i++) { + arena_t *arena = arena_get(tsd_tsdn(tsd), i, false); + if (arena != NULL) { + pa_shard_set_deferral_allowed(tsd_tsdn(tsd), + &arena->pa_shard, false); + } + } + + return false; +} + +bool +background_thread_is_started(background_thread_info_t *info) { + return info->state == background_thread_started; +} + +void +background_thread_wakeup_early(background_thread_info_t *info, + nstime_t *remaining_sleep) { + /* + * This is an optimization to increase batching. At this point + * we know that background thread wakes up soon, so the time to cache + * the just freed memory is bounded and low. + */ + if (remaining_sleep != NULL && nstime_ns(remaining_sleep) < + BACKGROUND_THREAD_MIN_INTERVAL_NS) { + return; + } + pthread_cond_signal(&info->cond); +} + +void +background_thread_prefork0(tsdn_t *tsdn) { + malloc_mutex_prefork(tsdn, &background_thread_lock); + background_thread_enabled_at_fork = background_thread_enabled(); +} + +void +background_thread_prefork1(tsdn_t *tsdn) { + for (unsigned i = 0; i < max_background_threads; i++) { + malloc_mutex_prefork(tsdn, &background_thread_info[i].mtx); + } +} + +void +background_thread_postfork_parent(tsdn_t *tsdn) { + for (unsigned i = 0; i < max_background_threads; i++) { + malloc_mutex_postfork_parent(tsdn, + &background_thread_info[i].mtx); + } + malloc_mutex_postfork_parent(tsdn, &background_thread_lock); +} + +void +background_thread_postfork_child(tsdn_t *tsdn) { + for (unsigned i = 0; i < max_background_threads; i++) { + malloc_mutex_postfork_child(tsdn, + &background_thread_info[i].mtx); + } + malloc_mutex_postfork_child(tsdn, &background_thread_lock); + if (!background_thread_enabled_at_fork) { + return; + } + + /* Clear background_thread state (reset to disabled for child). 
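+ * The child inherits no threads from the parent, and the condition + * variables must be re-initialized before they can be used again.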
*/ + malloc_mutex_lock(tsdn, &background_thread_lock); + n_background_threads = 0; + background_thread_enabled_set(tsdn, false); + for (unsigned i = 0; i < max_background_threads; i++) { + background_thread_info_t *info = &background_thread_info[i]; + malloc_mutex_lock(tsdn, &info->mtx); + info->state = background_thread_stopped; + int ret = pthread_cond_init(&info->cond, NULL); + assert(ret == 0); + background_thread_info_init(tsdn, info); + malloc_mutex_unlock(tsdn, &info->mtx); + } + malloc_mutex_unlock(tsdn, &background_thread_lock); +} + +bool +background_thread_stats_read(tsdn_t *tsdn, background_thread_stats_t *stats) { + assert(config_stats); + malloc_mutex_lock(tsdn, &background_thread_lock); + if (!background_thread_enabled()) { + malloc_mutex_unlock(tsdn, &background_thread_lock); + return true; + } + + nstime_init_zero(&stats->run_interval); + memset(&stats->max_counter_per_bg_thd, 0, sizeof(mutex_prof_data_t)); + + uint64_t num_runs = 0; + stats->num_threads = n_background_threads; + for (unsigned i = 0; i < max_background_threads; i++) { + background_thread_info_t *info = &background_thread_info[i]; + if (malloc_mutex_trylock(tsdn, &info->mtx)) { + /* + * Each background thread run may take a long time; + * avoid waiting on the stats if the thread is active. + */ + continue; + } + if (info->state != background_thread_stopped) { + num_runs += info->tot_n_runs; + nstime_add(&stats->run_interval, &info->tot_sleep_time); + malloc_mutex_prof_max_update(tsdn, + &stats->max_counter_per_bg_thd, &info->mtx); + } + malloc_mutex_unlock(tsdn, &info->mtx); + } + stats->num_runs = num_runs; + if (num_runs > 0) { + nstime_idivide(&stats->run_interval, num_runs); + } + malloc_mutex_unlock(tsdn, &background_thread_lock); + + return false; +} + +#undef BACKGROUND_THREAD_NPAGES_THRESHOLD +#undef BILLION +#undef BACKGROUND_THREAD_MIN_INTERVAL_NS + +/* + * When lazy lock is enabled, we need to make sure setting isthreaded before + * taking any background_thread locks. This is called early in ctl (instead of + * wait for the pthread_create calls to trigger) because the mutex is required + * before creating background threads. 
+ */ +void +background_thread_ctl_init(tsdn_t *tsdn) { + malloc_mutex_assert_not_owner(tsdn, &background_thread_lock); +#ifdef JEMALLOC_PTHREAD_CREATE_WRAPPER + pthread_create_fptr_init(); + pthread_create_wrapper_init(); +#endif +} + +#endif /* defined(JEMALLOC_BACKGROUND_THREAD) */ + +bool +background_thread_boot0(void) { + if (!have_background_thread && opt_background_thread) { + malloc_printf("<jemalloc>: option background_thread currently " + "supports pthread only\n"); + return true; + } +#ifdef JEMALLOC_PTHREAD_CREATE_WRAPPER + if ((config_lazy_lock || opt_background_thread) && + pthread_create_fptr_init()) { + return true; + } +#endif + return false; +} + +bool +background_thread_boot1(tsdn_t *tsdn, base_t *base) { +#ifdef JEMALLOC_BACKGROUND_THREAD + assert(have_background_thread); + assert(narenas_total_get() > 0); + + if (opt_max_background_threads > MAX_BACKGROUND_THREAD_LIMIT) { + opt_max_background_threads = DEFAULT_NUM_BACKGROUND_THREAD; + } + max_background_threads = opt_max_background_threads; + + background_thread_enabled_set(tsdn, opt_background_thread); + if (malloc_mutex_init(&background_thread_lock, + "background_thread_global", + WITNESS_RANK_BACKGROUND_THREAD_GLOBAL, + malloc_mutex_rank_exclusive)) { + return true; + } + + background_thread_info = (background_thread_info_t *)base_alloc(tsdn, + base, opt_max_background_threads * + sizeof(background_thread_info_t), CACHELINE); + if (background_thread_info == NULL) { + return true; + } + + for (unsigned i = 0; i < max_background_threads; i++) { + background_thread_info_t *info = &background_thread_info[i]; + /* Thread mutex is rank_inclusive because of thread0. */ + if (malloc_mutex_init(&info->mtx, "background_thread", + WITNESS_RANK_BACKGROUND_THREAD, + malloc_mutex_address_ordered)) { + return true; + } + if (pthread_cond_init(&info->cond, NULL)) { + return true; + } + malloc_mutex_lock(tsdn, &info->mtx); + info->state = background_thread_stopped; + background_thread_info_init(tsdn, info); + malloc_mutex_unlock(tsdn, &info->mtx); + } +#endif + + return false; +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/base.c b/src/duckdb/extension/jemalloc/jemalloc/src/base.c new file mode 100644 index 000000000..1d5e8fcda --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/base.c @@ -0,0 +1,673 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/extent_mmap.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/sz.h" + +/* + * In auto mode, arenas switch to huge pages for the base allocator on the + * second base block. a0 switches to thp on the 5th block (after 20 megabytes + * of metadata), since more metadata (e.g. rtree nodes) come from a0's base. + */ + +#define BASE_AUTO_THP_THRESHOLD 2 +#define BASE_AUTO_THP_THRESHOLD_A0 5 + +/******************************************************************************/ +/* Data.
*/ + +static base_t *b0; + +metadata_thp_mode_t opt_metadata_thp = METADATA_THP_DEFAULT; + +const char *const metadata_thp_mode_names[] = { + "disabled", + "auto", + "always" +}; + +/******************************************************************************/ + +static inline bool +metadata_thp_madvise(void) { + return (metadata_thp_enabled() && + (init_system_thp_mode == thp_mode_default)); +} + +static void * +base_map(tsdn_t *tsdn, ehooks_t *ehooks, unsigned ind, size_t size) { + void *addr; + bool zero = true; + bool commit = true; + + /* Use huge page sizes and alignment regardless of opt_metadata_thp. */ + assert(size == HUGEPAGE_CEILING(size)); + size_t alignment = HUGEPAGE; + if (ehooks_are_default(ehooks)) { + addr = extent_alloc_mmap(NULL, size, alignment, &zero, &commit); + if (have_madvise_huge && addr) { + pages_set_thp_state(addr, size); + } + } else { + addr = ehooks_alloc(tsdn, ehooks, NULL, size, alignment, &zero, + &commit); + } + + return addr; +} + +static void +base_unmap(tsdn_t *tsdn, ehooks_t *ehooks, unsigned ind, void *addr, + size_t size) { + /* + * Cascade through dalloc, decommit, purge_forced, and purge_lazy, + * stopping at first success. This cascade is performed for consistency + * with the cascade in extent_dalloc_wrapper() because an application's + * custom hooks may not support e.g. dalloc. This function is only ever + * called as a side effect of arena destruction, so although it might + * seem pointless to do anything besides dalloc here, the application + * may in fact want the end state of all associated virtual memory to be + * in some consistent-but-allocated state. + */ + if (ehooks_are_default(ehooks)) { + if (!extent_dalloc_mmap(addr, size)) { + goto label_done; + } + if (!pages_decommit(addr, size)) { + goto label_done; + } + if (!pages_purge_forced(addr, size)) { + goto label_done; + } + if (!pages_purge_lazy(addr, size)) { + goto label_done; + } + /* Nothing worked. This should never happen. */ + not_reached(); + } else { + if (!ehooks_dalloc(tsdn, ehooks, addr, size, true)) { + goto label_done; + } + if (!ehooks_decommit(tsdn, ehooks, addr, size, 0, size)) { + goto label_done; + } + if (!ehooks_purge_forced(tsdn, ehooks, addr, size, 0, size)) { + goto label_done; + } + if (!ehooks_purge_lazy(tsdn, ehooks, addr, size, 0, size)) { + goto label_done; + } + /* Nothing worked. That's the application's problem. */ + } +label_done: + if (metadata_thp_madvise()) { + /* Set NOHUGEPAGE after unmap to avoid kernel defrag. */ + assert(((uintptr_t)addr & HUGEPAGE_MASK) == 0 && + (size & HUGEPAGE_MASK) == 0); + pages_nohuge(addr, size); + } +} + +static inline bool +base_edata_is_reused(edata_t *edata) { + /* + * Borrow the guarded bit to indicate if the extent is a recycled one, + * i.e. the ones returned to base for reuse; currently only tcache bin + * stacks. Skips stats updating if so (needed for this purpose only). + */ + return edata_guarded_get(edata); +} + +static void +base_edata_init(size_t *extent_sn_next, edata_t *edata, void *addr, + size_t size) { + size_t sn; + + sn = *extent_sn_next; + (*extent_sn_next)++; + + edata_binit(edata, addr, size, sn, false /* is_reused */); +} + +static size_t +base_get_num_blocks(base_t *base, bool with_new_block) { + base_block_t *b = base->blocks; + assert(b != NULL); + + size_t n_blocks = with_new_block ? 
2 : 1; + while (b->next != NULL) { + n_blocks++; + b = b->next; + } + + return n_blocks; +} + +static void +base_auto_thp_switch(tsdn_t *tsdn, base_t *base) { + assert(opt_metadata_thp == metadata_thp_auto); + malloc_mutex_assert_owner(tsdn, &base->mtx); + if (base->auto_thp_switched) { + return; + } + /* Called when adding a new block. */ + bool should_switch; + if (base_ind_get(base) != 0) { + should_switch = (base_get_num_blocks(base, true) == + BASE_AUTO_THP_THRESHOLD); + } else { + should_switch = (base_get_num_blocks(base, true) == + BASE_AUTO_THP_THRESHOLD_A0); + } + if (!should_switch) { + return; + } + + base->auto_thp_switched = true; + assert(!config_stats || base->n_thp == 0); + /* Make the initial blocks THP lazily. */ + base_block_t *block = base->blocks; + while (block != NULL) { + assert((block->size & HUGEPAGE_MASK) == 0); + pages_huge(block, block->size); + if (config_stats) { + base->n_thp += HUGEPAGE_CEILING(block->size - + edata_bsize_get(&block->edata)) >> LG_HUGEPAGE; + } + block = block->next; + assert(block == NULL || (base_ind_get(base) == 0)); + } +} + +static void * +base_extent_bump_alloc_helper(edata_t *edata, size_t *gap_size, size_t size, + size_t alignment) { + void *ret; + + assert(alignment == ALIGNMENT_CEILING(alignment, QUANTUM)); + assert(size == ALIGNMENT_CEILING(size, alignment)); + + *gap_size = ALIGNMENT_CEILING((uintptr_t)edata_addr_get(edata), + alignment) - (uintptr_t)edata_addr_get(edata); + ret = (void *)((byte_t *)edata_addr_get(edata) + *gap_size); + assert(edata_bsize_get(edata) >= *gap_size + size); + edata_binit(edata, (void *)((byte_t *)edata_addr_get(edata) + + *gap_size + size), edata_bsize_get(edata) - *gap_size - size, + edata_sn_get(edata), base_edata_is_reused(edata)); + return ret; +} + +static void +base_edata_heap_insert(tsdn_t *tsdn, base_t *base, edata_t *edata) { + malloc_mutex_assert_owner(tsdn, &base->mtx); + + size_t bsize = edata_bsize_get(edata); + assert(bsize > 0); + /* + * Compute the index for the largest size class that does not exceed + * extent's size. + */ + szind_t index_floor = sz_size2index(bsize + 1) - 1; + edata_heap_insert(&base->avail[index_floor], edata); +} + +/* + * Only can be called by top-level functions, since it may call base_alloc + * internally when cache is empty. + */ +static edata_t * +base_alloc_base_edata(tsdn_t *tsdn, base_t *base) { + edata_t *edata; + + malloc_mutex_lock(tsdn, &base->mtx); + edata = edata_avail_first(&base->edata_avail); + if (edata != NULL) { + edata_avail_remove(&base->edata_avail, edata); + } + malloc_mutex_unlock(tsdn, &base->mtx); + + if (edata == NULL) { + edata = base_alloc_edata(tsdn, base); + } + + return edata; +} + +static void +base_extent_bump_alloc_post(tsdn_t *tsdn, base_t *base, edata_t *edata, + size_t gap_size, void *addr, size_t size) { + if (edata_bsize_get(edata) > 0) { + base_edata_heap_insert(tsdn, base, edata); + } else { + /* Freed base edata_t stored in edata_avail. */ + edata_avail_insert(&base->edata_avail, edata); + } + + if (config_stats && !base_edata_is_reused(edata)) { + base->allocated += size; + /* + * Add one PAGE to base_resident for every page boundary that is + * crossed by the new allocation. Adjust n_thp similarly when + * metadata_thp is enabled. 
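+ * [Editor's note, not part of the upstream jemalloc source.] A worked
+ * example with a hypothetical 4 KiB page: for gap_size == 0x80,
+ * addr == 0x4000 and size == 0x100, the expression below evaluates to
+ * PAGE_CEILING(0x4100) - PAGE_CEILING(0x3f80) == 0x5000 - 0x4000, i.e. one
+ * PAGE: exactly one page boundary was newly crossed by this allocation.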
+ */ + base->resident += PAGE_CEILING((uintptr_t)addr + size) - + PAGE_CEILING((uintptr_t)addr - gap_size); + assert(base->allocated <= base->resident); + assert(base->resident <= base->mapped); + if (metadata_thp_madvise() && (opt_metadata_thp == + metadata_thp_always || base->auto_thp_switched)) { + base->n_thp += (HUGEPAGE_CEILING((uintptr_t)addr + size) + - HUGEPAGE_CEILING((uintptr_t)addr - gap_size)) >> + LG_HUGEPAGE; + assert(base->mapped >= base->n_thp << LG_HUGEPAGE); + } + } +} + +static void * +base_extent_bump_alloc(tsdn_t *tsdn, base_t *base, edata_t *edata, size_t size, + size_t alignment) { + void *ret; + size_t gap_size; + + ret = base_extent_bump_alloc_helper(edata, &gap_size, size, alignment); + base_extent_bump_alloc_post(tsdn, base, edata, gap_size, ret, size); + return ret; +} + +/* + * Allocate a block of virtual memory that is large enough to start with a + * base_block_t header, followed by an object of specified size and alignment. + * On success a pointer to the initialized base_block_t header is returned. + */ +static base_block_t * +base_block_alloc(tsdn_t *tsdn, base_t *base, ehooks_t *ehooks, unsigned ind, + pszind_t *pind_last, size_t *extent_sn_next, size_t size, + size_t alignment) { + alignment = ALIGNMENT_CEILING(alignment, QUANTUM); + size_t usize = ALIGNMENT_CEILING(size, alignment); + size_t header_size = sizeof(base_block_t); + size_t gap_size = ALIGNMENT_CEILING(header_size, alignment) - + header_size; + /* + * Create increasingly larger blocks in order to limit the total number + * of disjoint virtual memory ranges. Choose the next size in the page + * size class series (skipping size classes that are not a multiple of + * HUGEPAGE), or a size large enough to satisfy the requested size and + * alignment, whichever is larger. + */ + size_t min_block_size = HUGEPAGE_CEILING(sz_psz2u(header_size + gap_size + + usize)); + pszind_t pind_next = (*pind_last + 1 < sz_psz2ind(SC_LARGE_MAXCLASS)) ? + *pind_last + 1 : *pind_last; + size_t next_block_size = HUGEPAGE_CEILING(sz_pind2sz(pind_next)); + size_t block_size = (min_block_size > next_block_size) ? min_block_size + : next_block_size; + base_block_t *block = (base_block_t *)base_map(tsdn, ehooks, ind, + block_size); + if (block == NULL) { + return NULL; + } + + if (metadata_thp_madvise()) { + void *addr = (void *)block; + assert(((uintptr_t)addr & HUGEPAGE_MASK) == 0 && + (block_size & HUGEPAGE_MASK) == 0); + if (opt_metadata_thp == metadata_thp_always) { + pages_huge(addr, block_size); + } else if (opt_metadata_thp == metadata_thp_auto && + base != NULL) { + /* base != NULL indicates this is not a new base. */ + malloc_mutex_lock(tsdn, &base->mtx); + base_auto_thp_switch(tsdn, base); + if (base->auto_thp_switched) { + pages_huge(addr, block_size); + } + malloc_mutex_unlock(tsdn, &base->mtx); + } + } + + *pind_last = sz_psz2ind(block_size); + block->size = block_size; + block->next = NULL; + assert(block_size >= header_size); + base_edata_init(extent_sn_next, &block->edata, + (void *)((byte_t *)block + header_size), block_size - header_size); + return block; +} + +/* + * Allocate an extent that is at least as large as specified size, with + * specified alignment. + */ +static edata_t * +base_extent_alloc(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment) { + malloc_mutex_assert_owner(tsdn, &base->mtx); + + ehooks_t *ehooks = base_ehooks_get_for_metadata(base); + /* + * Drop mutex during base_block_alloc(), because an extent hook will be + * called. 
+ */ + malloc_mutex_unlock(tsdn, &base->mtx); + base_block_t *block = base_block_alloc(tsdn, base, ehooks, + base_ind_get(base), &base->pind_last, &base->extent_sn_next, size, + alignment); + malloc_mutex_lock(tsdn, &base->mtx); + if (block == NULL) { + return NULL; + } + block->next = base->blocks; + base->blocks = block; + if (config_stats) { + base->allocated += sizeof(base_block_t); + base->resident += PAGE_CEILING(sizeof(base_block_t)); + base->mapped += block->size; + if (metadata_thp_madvise() && + !(opt_metadata_thp == metadata_thp_auto + && !base->auto_thp_switched)) { + assert(base->n_thp > 0); + base->n_thp += HUGEPAGE_CEILING(sizeof(base_block_t)) >> + LG_HUGEPAGE; + } + assert(base->allocated <= base->resident); + assert(base->resident <= base->mapped); + assert(base->n_thp << LG_HUGEPAGE <= base->mapped); + } + return &block->edata; +} + +base_t * +b0get(void) { + return b0; +} + +base_t * +base_new(tsdn_t *tsdn, unsigned ind, const extent_hooks_t *extent_hooks, + bool metadata_use_hooks) { + pszind_t pind_last = 0; + size_t extent_sn_next = 0; + + /* + * The base will contain the ehooks eventually, but it itself is + * allocated using them. So we use some stack ehooks to bootstrap its + * memory, and then initialize the ehooks within the base_t. + */ + ehooks_t fake_ehooks; + ehooks_init(&fake_ehooks, metadata_use_hooks ? + (extent_hooks_t *)extent_hooks : + (extent_hooks_t *)&ehooks_default_extent_hooks, ind); + + base_block_t *block = base_block_alloc(tsdn, NULL, &fake_ehooks, ind, + &pind_last, &extent_sn_next, sizeof(base_t), QUANTUM); + if (block == NULL) { + return NULL; + } + + size_t gap_size; + size_t base_alignment = CACHELINE; + size_t base_size = ALIGNMENT_CEILING(sizeof(base_t), base_alignment); + base_t *base = (base_t *)base_extent_bump_alloc_helper(&block->edata, + &gap_size, base_size, base_alignment); + ehooks_init(&base->ehooks, (extent_hooks_t *)extent_hooks, ind); + ehooks_init(&base->ehooks_base, metadata_use_hooks ? + (extent_hooks_t *)extent_hooks : + (extent_hooks_t *)&ehooks_default_extent_hooks, ind); + if (malloc_mutex_init(&base->mtx, "base", WITNESS_RANK_BASE, + malloc_mutex_rank_exclusive)) { + base_unmap(tsdn, &fake_ehooks, ind, block, block->size); + return NULL; + } + base->pind_last = pind_last; + base->extent_sn_next = extent_sn_next; + base->blocks = block; + base->auto_thp_switched = false; + for (szind_t i = 0; i < SC_NSIZES; i++) { + edata_heap_new(&base->avail[i]); + } + edata_avail_new(&base->edata_avail); + + if (config_stats) { + base->edata_allocated = 0; + base->rtree_allocated = 0; + base->allocated = sizeof(base_block_t); + base->resident = PAGE_CEILING(sizeof(base_block_t)); + base->mapped = block->size; + base->n_thp = (opt_metadata_thp == metadata_thp_always) && + metadata_thp_madvise() ? HUGEPAGE_CEILING(sizeof(base_block_t)) + >> LG_HUGEPAGE : 0; + assert(base->allocated <= base->resident); + assert(base->resident <= base->mapped); + assert(base->n_thp << LG_HUGEPAGE <= base->mapped); + } + + /* Locking here is only necessary because of assertions. 
*/ + malloc_mutex_lock(tsdn, &base->mtx); + base_extent_bump_alloc_post(tsdn, base, &block->edata, gap_size, base, + base_size); + malloc_mutex_unlock(tsdn, &base->mtx); + + return base; +} + +void +base_delete(tsdn_t *tsdn, base_t *base) { + ehooks_t *ehooks = base_ehooks_get_for_metadata(base); + base_block_t *next = base->blocks; + do { + base_block_t *block = next; + next = block->next; + base_unmap(tsdn, ehooks, base_ind_get(base), block, + block->size); + } while (next != NULL); +} + +ehooks_t * +base_ehooks_get(base_t *base) { + return &base->ehooks; +} + +ehooks_t * +base_ehooks_get_for_metadata(base_t *base) { + return &base->ehooks_base; +} + +extent_hooks_t * +base_extent_hooks_set(base_t *base, extent_hooks_t *extent_hooks) { + extent_hooks_t *old_extent_hooks = + ehooks_get_extent_hooks_ptr(&base->ehooks); + ehooks_init(&base->ehooks, extent_hooks, ehooks_ind_get(&base->ehooks)); + return old_extent_hooks; +} + +static void * +base_alloc_impl(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment, + size_t *esn, size_t *ret_usize) { + alignment = QUANTUM_CEILING(alignment); + size_t usize = ALIGNMENT_CEILING(size, alignment); + size_t asize = usize + alignment - QUANTUM; + + edata_t *edata = NULL; + malloc_mutex_lock(tsdn, &base->mtx); + for (szind_t i = sz_size2index(asize); i < SC_NSIZES; i++) { + edata = edata_heap_remove_first(&base->avail[i]); + if (edata != NULL) { + /* Use existing space. */ + break; + } + } + if (edata == NULL) { + /* Try to allocate more space. */ + edata = base_extent_alloc(tsdn, base, usize, alignment); + } + void *ret; + if (edata == NULL) { + ret = NULL; + goto label_return; + } + + ret = base_extent_bump_alloc(tsdn, base, edata, usize, alignment); + if (esn != NULL) { + *esn = (size_t)edata_sn_get(edata); + } + if (ret_usize != NULL) { + *ret_usize = usize; + } +label_return: + malloc_mutex_unlock(tsdn, &base->mtx); + return ret; +} + +/* + * base_alloc() returns zeroed memory, which is always demand-zeroed for the + * auto arenas, in order to make multi-page sparse data structures such as radix + * tree nodes efficient with respect to physical memory usage. Upon success a + * pointer to at least size bytes with specified alignment is returned. Note + * that size is rounded up to the nearest multiple of alignment to avoid false + * sharing. + */ +void * +base_alloc(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment) { + return base_alloc_impl(tsdn, base, size, alignment, NULL, NULL); +} + +edata_t * +base_alloc_edata(tsdn_t *tsdn, base_t *base) { + size_t esn, usize; + edata_t *edata = base_alloc_impl(tsdn, base, sizeof(edata_t), + EDATA_ALIGNMENT, &esn, &usize); + if (edata == NULL) { + return NULL; + } + if (config_stats) { + base->edata_allocated += usize; + } + edata_esn_set(edata, esn); + return edata; +} + +void * +base_alloc_rtree(tsdn_t *tsdn, base_t *base, size_t size) { + size_t usize; + void *rtree = base_alloc_impl(tsdn, base, size, CACHELINE, NULL, + &usize); + if (rtree == NULL) { + return NULL; + } + if (config_stats) { + base->rtree_allocated += usize; + } + return rtree; +} + +static inline void +b0_alloc_header_size(size_t *header_size, size_t *alignment) { + *alignment = QUANTUM; + *header_size = QUANTUM > sizeof(edata_t *) ? QUANTUM : + sizeof(edata_t *); +} + +/* + * Each piece allocated here is managed by a separate edata, because it was bump + * allocated and cannot be merged back into the original base_block. 
This means + * it's not for general purpose: 1) they are not page aligned, nor page sized, + * and 2) the requested size should not be too small (as each piece comes with + * an edata_t). Only used for tcache bin stack allocation now. + */ +void * +b0_alloc_tcache_stack(tsdn_t *tsdn, size_t stack_size) { + base_t *base = b0get(); + edata_t *edata = base_alloc_base_edata(tsdn, base); + if (edata == NULL) { + return NULL; + } + + /* + * Reserve room for the header, which stores a pointer to the managing + * edata_t. The header itself is located right before the return + * address, so that edata can be retrieved on dalloc. Bump up to usize + * to improve reusability -- otherwise the freed stacks will be put back + * into the previous size class. + */ + size_t esn, alignment, header_size; + b0_alloc_header_size(&header_size, &alignment); + + size_t alloc_size = sz_s2u(stack_size + header_size); + void *addr = base_alloc_impl(tsdn, base, alloc_size, alignment, &esn, + NULL); + if (addr == NULL) { + edata_avail_insert(&base->edata_avail, edata); + return NULL; + } + + /* Set is_reused: see comments in base_edata_is_reused. */ + edata_binit(edata, addr, alloc_size, esn, true /* is_reused */); + *(edata_t **)addr = edata; + + return (byte_t *)addr + header_size; +} + +void +b0_dalloc_tcache_stack(tsdn_t *tsdn, void *tcache_stack) { + /* edata_t pointer stored in header. */ + size_t alignment, header_size; + b0_alloc_header_size(&header_size, &alignment); + + edata_t *edata = *(edata_t **)((byte_t *)tcache_stack - header_size); + void *addr = edata_addr_get(edata); + size_t bsize = edata_bsize_get(edata); + /* Marked as "reused" to avoid double counting stats. */ + assert(base_edata_is_reused(edata)); + assert(addr != NULL && bsize > 0); + + /* Zero out since base_alloc returns zeroed memory. 
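+ * [Editor's note, not part of the upstream jemalloc source.] base_alloc()
+ * documents that it returns zeroed memory, and the edata reinserted below
+ * feeds the same avail heaps that base_alloc_impl() pops from, so the
+ * zeroing invariant has to be restored by hand here. A minimal usage sketch
+ * of the alloc/dalloc pair defined in this file:
+ *
+ *   void *stack = b0_alloc_tcache_stack(tsdn, stack_size);
+ *   if (stack != NULL) {
+ *           (use stack_size bytes at stack)
+ *           b0_dalloc_tcache_stack(tsdn, stack);
+ *   }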
*/ + memset(addr, 0, bsize); + + base_t *base = b0get(); + malloc_mutex_lock(tsdn, &base->mtx); + base_edata_heap_insert(tsdn, base, edata); + malloc_mutex_unlock(tsdn, &base->mtx); +} + +void +base_stats_get(tsdn_t *tsdn, base_t *base, size_t *allocated, + size_t *edata_allocated, size_t *rtree_allocated, size_t *resident, + size_t *mapped, size_t *n_thp) { + cassert(config_stats); + + malloc_mutex_lock(tsdn, &base->mtx); + assert(base->allocated <= base->resident); + assert(base->resident <= base->mapped); + assert(base->edata_allocated + base->rtree_allocated <= base->allocated); + *allocated = base->allocated; + *edata_allocated = base->edata_allocated; + *rtree_allocated = base->rtree_allocated; + *resident = base->resident; + *mapped = base->mapped; + *n_thp = base->n_thp; + malloc_mutex_unlock(tsdn, &base->mtx); +} + +void +base_prefork(tsdn_t *tsdn, base_t *base) { + malloc_mutex_prefork(tsdn, &base->mtx); +} + +void +base_postfork_parent(tsdn_t *tsdn, base_t *base) { + malloc_mutex_postfork_parent(tsdn, &base->mtx); +} + +void +base_postfork_child(tsdn_t *tsdn, base_t *base) { + malloc_mutex_postfork_child(tsdn, &base->mtx); +} + +bool +base_boot(tsdn_t *tsdn) { + b0 = base_new(tsdn, 0, (extent_hooks_t *)&ehooks_default_extent_hooks, + /* metadata_use_hooks */ true); + return (b0 == NULL); +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/batcher.c b/src/duckdb/extension/jemalloc/jemalloc/src/batcher.c new file mode 100644 index 000000000..2570b3a91 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/batcher.c @@ -0,0 +1,96 @@ +#include "jemalloc/internal/jemalloc_preamble.h" + +#include "jemalloc/internal/batcher.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/atomic.h" + +void +batcher_init(batcher_t *batcher, size_t nelems_max) { + atomic_store_zu(&batcher->nelems, 0, ATOMIC_RELAXED); + batcher->nelems_max = nelems_max; + batcher->npushes = 0; + malloc_mutex_init(&batcher->mtx, "batcher", WITNESS_RANK_BATCHER, + malloc_mutex_rank_exclusive); +} + +/* + * Returns an index (into some user-owned array) to use for pushing, or + * BATCHER_NO_IDX if no index is free. + */ +size_t batcher_push_begin(tsdn_t *tsdn, batcher_t *batcher, + size_t elems_to_push) { + assert(elems_to_push > 0); + size_t nelems_guess = atomic_load_zu(&batcher->nelems, ATOMIC_RELAXED); + if (nelems_guess + elems_to_push > batcher->nelems_max) { + return BATCHER_NO_IDX; + } + malloc_mutex_lock(tsdn, &batcher->mtx); + size_t nelems = atomic_load_zu(&batcher->nelems, ATOMIC_RELAXED); + if (nelems + elems_to_push > batcher->nelems_max) { + malloc_mutex_unlock(tsdn, &batcher->mtx); + return BATCHER_NO_IDX; + } + assert(elems_to_push <= batcher->nelems_max - nelems); + /* + * We update nelems at push time (instead of during pop) so that other + * racing accesses of the batcher can fail fast instead of trying to + * acquire a mutex only to discover that there's no space for them. 
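+ * [Editor's note, not part of the upstream jemalloc source.] A minimal
+ * sketch of the push protocol, assuming a caller that owns a parallel array
+ * of slots (my_slots is hypothetical):
+ *
+ *   size_t idx = batcher_push_begin(tsdn, batcher, 1);
+ *   if (idx != BATCHER_NO_IDX) {
+ *           my_slots[idx] = item;
+ *           batcher_push_end(tsdn, batcher);
+ *   }
+ *
+ * On success batcher_push_begin() returns with batcher->mtx still held, so
+ * the slot write is protected until batcher_push_end() unlocks.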
+ */ + atomic_store_zu(&batcher->nelems, nelems + elems_to_push, ATOMIC_RELAXED); + batcher->npushes++; + return nelems; +} + +size_t +batcher_pop_get_pushes(tsdn_t *tsdn, batcher_t *batcher) { + malloc_mutex_assert_owner(tsdn, &batcher->mtx); + size_t npushes = batcher->npushes; + batcher->npushes = 0; + return npushes; +} + +void +batcher_push_end(tsdn_t *tsdn, batcher_t *batcher) { + malloc_mutex_assert_owner(tsdn, &batcher->mtx); + assert(atomic_load_zu(&batcher->nelems, ATOMIC_RELAXED) > 0); + malloc_mutex_unlock(tsdn, &batcher->mtx); +} + +size_t +batcher_pop_begin(tsdn_t *tsdn, batcher_t *batcher) { + size_t nelems_guess = atomic_load_zu(&batcher->nelems, ATOMIC_RELAXED); + assert(nelems_guess <= batcher->nelems_max); + if (nelems_guess == 0) { + return BATCHER_NO_IDX; + } + malloc_mutex_lock(tsdn, &batcher->mtx); + size_t nelems = atomic_load_zu(&batcher->nelems, ATOMIC_RELAXED); + assert(nelems <= batcher->nelems_max); + if (nelems == 0) { + malloc_mutex_unlock(tsdn, &batcher->mtx); + return BATCHER_NO_IDX; + } + atomic_store_zu(&batcher->nelems, 0, ATOMIC_RELAXED); + return nelems; +} + +void batcher_pop_end(tsdn_t *tsdn, batcher_t *batcher) { + assert(atomic_load_zu(&batcher->nelems, ATOMIC_RELAXED) == 0); + malloc_mutex_unlock(tsdn, &batcher->mtx); +} + +void +batcher_prefork(tsdn_t *tsdn, batcher_t *batcher) { + malloc_mutex_prefork(tsdn, &batcher->mtx); +} + +void +batcher_postfork_parent(tsdn_t *tsdn, batcher_t *batcher) { + malloc_mutex_postfork_parent(tsdn, &batcher->mtx); +} + +void +batcher_postfork_child(tsdn_t *tsdn, batcher_t *batcher) { + malloc_mutex_postfork_child(tsdn, &batcher->mtx); +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/bin.c b/src/duckdb/extension/jemalloc/jemalloc/src/bin.c new file mode 100644 index 000000000..267aa0f37 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/bin.c @@ -0,0 +1,109 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/bin.h" +#include "jemalloc/internal/sc.h" +#include "jemalloc/internal/witness.h" + +#ifdef JEMALLOC_JET +unsigned bin_batching_test_ndalloc_slabs_max = (unsigned)-1; +void (*bin_batching_test_after_push_hook)(size_t push_idx); +void (*bin_batching_test_mid_pop_hook)(size_t nelems_to_pop); +void (*bin_batching_test_after_unlock_hook)(unsigned slab_dalloc_count, + bool list_empty); +#endif + +bool +bin_update_shard_size(unsigned bin_shard_sizes[SC_NBINS], size_t start_size, + size_t end_size, size_t nshards) { + if (nshards > BIN_SHARDS_MAX || nshards == 0) { + return true; + } + + if (start_size > SC_SMALL_MAXCLASS) { + return false; + } + if (end_size > SC_SMALL_MAXCLASS) { + end_size = SC_SMALL_MAXCLASS; + } + + /* Compute the index since this may happen before sz init. */ + szind_t ind1 = sz_size2index_compute(start_size); + szind_t ind2 = sz_size2index_compute(end_size); + for (unsigned i = ind1; i <= ind2; i++) { + bin_shard_sizes[i] = (unsigned)nshards; + } + + return false; +} + +void +bin_shard_sizes_boot(unsigned bin_shard_sizes[SC_NBINS]) { + /* Load the default number of shards. 
*/ + for (unsigned i = 0; i < SC_NBINS; i++) { + bin_shard_sizes[i] = N_BIN_SHARDS_DEFAULT; + } +} + +bool +bin_init(bin_t *bin, unsigned binind) { + if (malloc_mutex_init(&bin->lock, "bin", WITNESS_RANK_BIN, + malloc_mutex_rank_exclusive)) { + return true; + } + bin->slabcur = NULL; + edata_heap_new(&bin->slabs_nonfull); + edata_list_active_init(&bin->slabs_full); + if (config_stats) { + memset(&bin->stats, 0, sizeof(bin_stats_t)); + } + if (arena_bin_has_batch(binind)) { + bin_with_batch_t *batched_bin = (bin_with_batch_t *)bin; + batcher_init(&batched_bin->remote_frees, + opt_bin_info_remote_free_max); + } + return false; +} + +void +bin_prefork(tsdn_t *tsdn, bin_t *bin, bool has_batch) { + malloc_mutex_prefork(tsdn, &bin->lock); + if (has_batch) { + /* + * The batch mutex has lower rank than the bin mutex (as it must + * -- it's acquired later). But during forking, we go + * bin-at-a-time, so that we acquire mutex on bin 0, then on + * the bin 0 batcher, then on bin 1. This is a safe ordering + * (it's ordered by the index of arenas and bins within those + * arenas), but will trigger witness errors that would + * otherwise force another level of arena forking that breaks + * bin encapsulation (because the witness API doesn't "know" + * about arena or bin ordering -- it just sees that the batcher + * has a lower rank than the bin). So instead we exclude the + * batcher mutex from witness checking during fork (which is + * the only time we touch multiple bins at once) by passing + * TSDN_NULL. + */ + bin_with_batch_t *batched = (bin_with_batch_t *)bin; + batcher_prefork(TSDN_NULL, &batched->remote_frees); + } +} + +void +bin_postfork_parent(tsdn_t *tsdn, bin_t *bin, bool has_batch) { + malloc_mutex_postfork_parent(tsdn, &bin->lock); + if (has_batch) { + bin_with_batch_t *batched = (bin_with_batch_t *)bin; + batcher_postfork_parent(TSDN_NULL, &batched->remote_frees); + } +} + +void +bin_postfork_child(tsdn_t *tsdn, bin_t *bin, bool has_batch) { + malloc_mutex_postfork_child(tsdn, &bin->lock); + if (has_batch) { + bin_with_batch_t *batched = (bin_with_batch_t *)bin; + batcher_postfork_child(TSDN_NULL, &batched->remote_frees); + } +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/bin_info.c b/src/duckdb/extension/jemalloc/jemalloc/src/bin_info.c new file mode 100644 index 000000000..f8a64ae31 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/bin_info.c @@ -0,0 +1,54 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/bin_info.h" + +/* + * We leave bin-batching disabled by default, with other settings chosen mostly + * empirically; across the test programs I looked at they provided the most bang + * for the buck. With other default settings, these choices for bin batching + * result in them consuming far less memory (even in the worst case) than the + * tcaches themselves, the arena, etc. + * Note that we always try to pop all bins on every arena cache bin lock + * operation, so the typical memory waste is far less than this (and only on + * hot bins, which tend to be large anyways). + */ +size_t opt_bin_info_max_batched_size = 0; /* 192 is a good default. 
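+ * [Editor's note, not part of the upstream jemalloc source.] This value is
+ * the cutoff used by bin_infos_init() below: every size class with
+ * reg_size <= opt_bin_info_max_batched_size gets a remote-free batcher, so
+ * e.g. 192 would batch the small bins up to 192 bytes, while the default of
+ * 0 disables batching entirely.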
*/ +size_t opt_bin_info_remote_free_max_batch = 4; +size_t opt_bin_info_remote_free_max = BIN_REMOTE_FREE_ELEMS_MAX; + +bin_info_t bin_infos[SC_NBINS]; + +szind_t bin_info_nbatched_sizes; +unsigned bin_info_nbatched_bins; +unsigned bin_info_nunbatched_bins; + +static void +bin_infos_init(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS], + bin_info_t infos[SC_NBINS]) { + for (unsigned i = 0; i < SC_NBINS; i++) { + bin_info_t *bin_info = &infos[i]; + sc_t *sc = &sc_data->sc[i]; + bin_info->reg_size = ((size_t)1U << sc->lg_base) + + ((size_t)sc->ndelta << sc->lg_delta); + bin_info->slab_size = (sc->pgs << LG_PAGE); + bin_info->nregs = + (uint32_t)(bin_info->slab_size / bin_info->reg_size); + bin_info->n_shards = bin_shard_sizes[i]; + bitmap_info_t bitmap_info = BITMAP_INFO_INITIALIZER( + bin_info->nregs); + bin_info->bitmap_info = bitmap_info; + if (bin_info->reg_size <= opt_bin_info_max_batched_size) { + bin_info_nbatched_sizes++; + bin_info_nbatched_bins += bin_info->n_shards; + } else { + bin_info_nunbatched_bins += bin_info->n_shards; + } + } +} + +void +bin_info_boot(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS]) { + assert(sc_data->initialized); + bin_infos_init(sc_data, bin_shard_sizes, bin_infos); +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/bitmap.c b/src/duckdb/extension/jemalloc/jemalloc/src/bitmap.c new file mode 100644 index 000000000..0ccedc5db --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/bitmap.c @@ -0,0 +1,120 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" + +/******************************************************************************/ + +#ifdef BITMAP_USE_TREE + +void +bitmap_info_init(bitmap_info_t *binfo, size_t nbits) { + unsigned i; + size_t group_count; + + assert(nbits > 0); + assert(nbits <= (ZU(1) << LG_BITMAP_MAXBITS)); + + /* + * Compute the number of groups necessary to store nbits bits, and + * progressively work upward through the levels until reaching a level + * that requires only one group. + */ + binfo->levels[0].group_offset = 0; + group_count = BITMAP_BITS2GROUPS(nbits); + for (i = 1; group_count > 1; i++) { + assert(i < BITMAP_MAX_LEVELS); + binfo->levels[i].group_offset = binfo->levels[i-1].group_offset + + group_count; + group_count = BITMAP_BITS2GROUPS(group_count); + } + binfo->levels[i].group_offset = binfo->levels[i-1].group_offset + + group_count; + assert(binfo->levels[i].group_offset <= BITMAP_GROUPS_MAX); + binfo->nlevels = i; + binfo->nbits = nbits; +} + +static size_t +bitmap_info_ngroups(const bitmap_info_t *binfo) { + return binfo->levels[binfo->nlevels].group_offset; +} + +void +bitmap_init(bitmap_t *bitmap, const bitmap_info_t *binfo, bool fill) { + size_t extra; + unsigned i; + + /* + * Bits are actually inverted with regard to the external bitmap + * interface. + */ + + if (fill) { + /* The "filled" bitmap starts out with all 0 bits. */ + memset(bitmap, 0, bitmap_size(binfo)); + return; + } + + /* + * The "empty" bitmap starts out with all 1 bits, except for trailing + * unused bits (if any). Note that each group uses bit 0 to correspond + * to the first logical bit in the group, so extra bits are the most + * significant bits of the last group. 
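+ * [Editor's note, not part of the upstream jemalloc source.] A worked
+ * example, assuming 64-bit groups: for nbits == 70 the last level-0 group
+ * has only 6 used bits, so extra == (64 - (70 & 63)) & 63 == 58, and the
+ * shift below leaves just the 6 low-order (logical) bits of that group set.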
+ */ + memset(bitmap, 0xffU, bitmap_size(binfo)); + extra = (BITMAP_GROUP_NBITS - (binfo->nbits & BITMAP_GROUP_NBITS_MASK)) + & BITMAP_GROUP_NBITS_MASK; + if (extra != 0) { + bitmap[binfo->levels[1].group_offset - 1] >>= extra; + } + for (i = 1; i < binfo->nlevels; i++) { + size_t group_count = binfo->levels[i].group_offset - + binfo->levels[i-1].group_offset; + extra = (BITMAP_GROUP_NBITS - (group_count & + BITMAP_GROUP_NBITS_MASK)) & BITMAP_GROUP_NBITS_MASK; + if (extra != 0) { + bitmap[binfo->levels[i+1].group_offset - 1] >>= extra; + } + } +} + +#else /* BITMAP_USE_TREE */ + +void +bitmap_info_init(bitmap_info_t *binfo, size_t nbits) { + assert(nbits > 0); + assert(nbits <= (ZU(1) << LG_BITMAP_MAXBITS)); + + binfo->ngroups = BITMAP_BITS2GROUPS(nbits); + binfo->nbits = nbits; +} + +static size_t +bitmap_info_ngroups(const bitmap_info_t *binfo) { + return binfo->ngroups; +} + +void +bitmap_init(bitmap_t *bitmap, const bitmap_info_t *binfo, bool fill) { + size_t extra; + + if (fill) { + memset(bitmap, 0, bitmap_size(binfo)); + return; + } + + memset(bitmap, 0xffU, bitmap_size(binfo)); + extra = (BITMAP_GROUP_NBITS - (binfo->nbits & BITMAP_GROUP_NBITS_MASK)) + & BITMAP_GROUP_NBITS_MASK; + if (extra != 0) { + bitmap[binfo->ngroups - 1] >>= extra; + } +} + +#endif /* BITMAP_USE_TREE */ + +size_t +bitmap_size(const bitmap_info_t *binfo) { + return (bitmap_info_ngroups(binfo) << LG_SIZEOF_BITMAP); +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/buf_writer.c b/src/duckdb/extension/jemalloc/jemalloc/src/buf_writer.c new file mode 100644 index 000000000..7c6f79403 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/buf_writer.c @@ -0,0 +1,144 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/buf_writer.h" +#include "jemalloc/internal/malloc_io.h" + +static void * +buf_writer_allocate_internal_buf(tsdn_t *tsdn, size_t buf_len) { +#ifdef JEMALLOC_JET + if (buf_len > SC_LARGE_MAXCLASS) { + return NULL; + } +#else + assert(buf_len <= SC_LARGE_MAXCLASS); +#endif + return iallocztm(tsdn, buf_len, sz_size2index(buf_len), false, NULL, + true, arena_get(tsdn, 0, false), true); +} + +static void +buf_writer_free_internal_buf(tsdn_t *tsdn, void *buf) { + if (buf != NULL) { + idalloctm(tsdn, buf, NULL, NULL, true, true); + } +} + +static void +buf_writer_assert(buf_writer_t *buf_writer) { + assert(buf_writer != NULL); + assert(buf_writer->write_cb != NULL); + if (buf_writer->buf != NULL) { + assert(buf_writer->buf_size > 0); + } else { + assert(buf_writer->buf_size == 0); + assert(buf_writer->internal_buf); + } + assert(buf_writer->buf_end <= buf_writer->buf_size); +} + +bool +buf_writer_init(tsdn_t *tsdn, buf_writer_t *buf_writer, write_cb_t *write_cb, + void *cbopaque, char *buf, size_t buf_len) { + if (write_cb != NULL) { + buf_writer->write_cb = write_cb; + } else { + buf_writer->write_cb = je_malloc_message != NULL ? + je_malloc_message : wrtmessage; + } + buf_writer->cbopaque = cbopaque; + assert(buf_len >= 2); + if (buf != NULL) { + buf_writer->buf = buf; + buf_writer->internal_buf = false; + } else { + buf_writer->buf = buf_writer_allocate_internal_buf(tsdn, + buf_len); + buf_writer->internal_buf = true; + } + if (buf_writer->buf != NULL) { + buf_writer->buf_size = buf_len - 1; /* Allowing for '\0'. 
*/ + } else { + buf_writer->buf_size = 0; + } + buf_writer->buf_end = 0; + buf_writer_assert(buf_writer); + return buf_writer->buf == NULL; +} + +void +buf_writer_flush(buf_writer_t *buf_writer) { + buf_writer_assert(buf_writer); + if (buf_writer->buf == NULL) { + return; + } + buf_writer->buf[buf_writer->buf_end] = '\0'; + buf_writer->write_cb(buf_writer->cbopaque, buf_writer->buf); + buf_writer->buf_end = 0; + buf_writer_assert(buf_writer); +} + +void +buf_writer_cb(void *buf_writer_arg, const char *s) { + buf_writer_t *buf_writer = (buf_writer_t *)buf_writer_arg; + buf_writer_assert(buf_writer); + if (buf_writer->buf == NULL) { + buf_writer->write_cb(buf_writer->cbopaque, s); + return; + } + size_t i, slen, n; + for (i = 0, slen = strlen(s); i < slen; i += n) { + if (buf_writer->buf_end == buf_writer->buf_size) { + buf_writer_flush(buf_writer); + } + size_t s_remain = slen - i; + size_t buf_remain = buf_writer->buf_size - buf_writer->buf_end; + n = s_remain < buf_remain ? s_remain : buf_remain; + memcpy(buf_writer->buf + buf_writer->buf_end, s + i, n); + buf_writer->buf_end += n; + buf_writer_assert(buf_writer); + } + assert(i == slen); +} + +void +buf_writer_terminate(tsdn_t *tsdn, buf_writer_t *buf_writer) { + buf_writer_assert(buf_writer); + buf_writer_flush(buf_writer); + if (buf_writer->internal_buf) { + buf_writer_free_internal_buf(tsdn, buf_writer->buf); + } +} + +void +buf_writer_pipe(buf_writer_t *buf_writer, read_cb_t *read_cb, + void *read_cbopaque) { + /* + * A tiny local buffer in case the buffered writer failed to allocate + * at init. + */ + static char backup_buf[16]; + static buf_writer_t backup_buf_writer; + + buf_writer_assert(buf_writer); + assert(read_cb != NULL); + if (buf_writer->buf == NULL) { + buf_writer_init(TSDN_NULL, &backup_buf_writer, + buf_writer->write_cb, buf_writer->cbopaque, backup_buf, + sizeof(backup_buf)); + buf_writer = &backup_buf_writer; + } + assert(buf_writer->buf != NULL); + ssize_t nread = 0; + do { + buf_writer->buf_end += nread; + buf_writer_assert(buf_writer); + if (buf_writer->buf_end == buf_writer->buf_size) { + buf_writer_flush(buf_writer); + } + nread = read_cb(read_cbopaque, + buf_writer->buf + buf_writer->buf_end, + buf_writer->buf_size - buf_writer->buf_end); + } while (nread > 0); + buf_writer_flush(buf_writer); +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/cache_bin.c b/src/duckdb/extension/jemalloc/jemalloc/src/cache_bin.c new file mode 100644 index 000000000..6438705f0 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/cache_bin.c @@ -0,0 +1,119 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/bit_util.h" +#include "jemalloc/internal/cache_bin.h" +#include "jemalloc/internal/safety_check.h" + +const uintptr_t disabled_bin = JUNK_ADDR; + +void +cache_bin_info_init(cache_bin_info_t *info, + cache_bin_sz_t ncached_max) { + assert(ncached_max <= CACHE_BIN_NCACHED_MAX); + size_t stack_size = (size_t)ncached_max * sizeof(void *); + assert(stack_size < ((size_t)1 << (sizeof(cache_bin_sz_t) * 8))); + info->ncached_max = (cache_bin_sz_t)ncached_max; +} + +bool +cache_bin_stack_use_thp(void) { + /* + * If metadata_thp is enabled, allocating tcache stack from the base + * allocator for efficiency gains. The downside, however, is that base + * allocator never purges freed memory, and may cache a fair amount of + * memory after many threads are terminated and not reused. 
+ */ + return metadata_thp_enabled(); +} + +void +cache_bin_info_compute_alloc(const cache_bin_info_t *infos, szind_t ninfos, + size_t *size, size_t *alignment) { + /* For the total bin stack region (per tcache), reserve 2 more slots so + * that + * 1) the empty position can be safely read on the fast path before + * checking "is_empty"; and + * 2) the cur_ptr can go beyond the empty position by 1 step safely on + * the fast path (i.e. no overflow). + */ + *size = sizeof(void *) * 2; + for (szind_t i = 0; i < ninfos; i++) { + *size += infos[i].ncached_max * sizeof(void *); + } + + /* + * When not using THP, align to at least PAGE, to minimize the # of TLBs + * needed by the smaller sizes; also helps if the larger sizes don't get + * used at all. + */ + *alignment = cache_bin_stack_use_thp() ? QUANTUM : PAGE; +} + +void +cache_bin_preincrement(const cache_bin_info_t *infos, szind_t ninfos, void *alloc, + size_t *cur_offset) { + if (config_debug) { + size_t computed_size; + size_t computed_alignment; + + /* Pointer should be as aligned as we asked for. */ + cache_bin_info_compute_alloc(infos, ninfos, &computed_size, + &computed_alignment); + assert(((uintptr_t)alloc & (computed_alignment - 1)) == 0); + } + + *(uintptr_t *)((byte_t *)alloc + *cur_offset) = + cache_bin_preceding_junk; + *cur_offset += sizeof(void *); +} + +void +cache_bin_postincrement(void *alloc, size_t *cur_offset) { + *(uintptr_t *)((byte_t *)alloc + *cur_offset) = + cache_bin_trailing_junk; + *cur_offset += sizeof(void *); +} + +void +cache_bin_init(cache_bin_t *bin, const cache_bin_info_t *info, void *alloc, + size_t *cur_offset) { + /* + * The full_position points to the lowest available space. Allocations + * will access the slots toward higher addresses (for the benefit of + * adjacent prefetch). + */ + void *stack_cur = (void *)((byte_t *)alloc + *cur_offset); + void *full_position = stack_cur; + uint16_t bin_stack_size = info->ncached_max * sizeof(void *); + + *cur_offset += bin_stack_size; + void *empty_position = (void *)((byte_t *)alloc + *cur_offset); + + /* Init to the empty position. 
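+ * [Editor's note, not part of the upstream jemalloc source.] The resulting
+ * layout for a hypothetical bin with ncached_max == 4, low addresses first:
+ *
+ *   full_position -> [slot 0][slot 1][slot 2][slot 3] <- empty_position
+ *
+ * stack_head starts at empty_position (zero cached items) and moves toward
+ * full_position as pointers are cached. Only the low 16 bits of these
+ * addresses are kept; differences are computed modulo 2^16, which is
+ * unambiguous because a bin's stack is smaller than 64 KiB (see the
+ * stack_size assertion in cache_bin_info_init()).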
*/ + bin->stack_head = (void **)empty_position; + bin->low_bits_low_water = (uint16_t)(uintptr_t)bin->stack_head; + bin->low_bits_full = (uint16_t)(uintptr_t)full_position; + bin->low_bits_empty = (uint16_t)(uintptr_t)empty_position; + cache_bin_info_init(&bin->bin_info, info->ncached_max); + cache_bin_sz_t free_spots = cache_bin_diff(bin, + bin->low_bits_full, (uint16_t)(uintptr_t)bin->stack_head); + assert(free_spots == bin_stack_size); + if (!cache_bin_disabled(bin)) { + assert(cache_bin_ncached_get_local(bin) == 0); + } + assert(cache_bin_empty_position_get(bin) == empty_position); + + assert(bin_stack_size > 0 || empty_position == full_position); +} + +void +cache_bin_init_disabled(cache_bin_t *bin, cache_bin_sz_t ncached_max) { + const void *fake_stack = cache_bin_disabled_bin_stack(); + size_t fake_offset = 0; + cache_bin_info_t fake_info; + cache_bin_info_init(&fake_info, 0); + cache_bin_init(bin, &fake_info, (void *)fake_stack, &fake_offset); + cache_bin_info_init(&bin->bin_info, ncached_max); + assert(fake_offset == 0); +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/ckh.c b/src/duckdb/extension/jemalloc/jemalloc/src/ckh.c new file mode 100644 index 000000000..8db4319c5 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/ckh.c @@ -0,0 +1,569 @@ +/* + ******************************************************************************* + * Implementation of (2^1+,2) cuckoo hashing, where 2^1+ indicates that each + * hash bucket contains 2^n cells, for n >= 1, and 2 indicates that two hash + * functions are employed. The original cuckoo hashing algorithm was described + * in: + * + * Pagh, R., F.F. Rodler (2004) Cuckoo Hashing. Journal of Algorithms + * 51(2):122-144. + * + * Generalization of cuckoo hashing was discussed in: + * + * Erlingsson, U., M. Manasse, F. McSherry (2006) A cool and practical + * alternative to traditional hash tables. In Proceedings of the 7th + * Workshop on Distributed Data and Structures (WDAS'06), Santa Clara, CA, + * January 2006. + * + * This implementation uses precisely two hash functions because that is the + * fewest that can work, and supporting multiple hashes is an implementation + * burden. Here is a reproduction of Figure 1 from Erlingsson et al. (2006) + * that shows approximate expected maximum load factors for various + * configurations: + * + * | #cells/bucket | + * #hashes | 1 | 2 | 4 | 8 | + * --------+-------+-------+-------+-------+ + * 1 | 0.006 | 0.006 | 0.03 | 0.12 | + * 2 | 0.49 | 0.86 |>0.93< |>0.96< | + * 3 | 0.91 | 0.97 | 0.98 | 0.999 | + * 4 | 0.97 | 0.99 | 0.999 | | + * + * The number of cells per bucket is chosen such that a bucket fits in one cache + * line. So, on 32- and 64-bit systems, we use (8,2) and (4,2) cuckoo hashing, + * respectively. + * + ******************************************************************************/ +#include "jemalloc/internal/jemalloc_preamble.h" + +#include "jemalloc/internal/ckh.h" + +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/hash.h" +#include "jemalloc/internal/malloc_io.h" +#include "jemalloc/internal/prng.h" +#include "jemalloc/internal/util.h" + +/******************************************************************************/ +/* Function prototypes for non-inline static functions. 
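+ * [Editor's note, not part of the upstream jemalloc source.] The functions
+ * below all derive an item's two candidate buckets the same way, e.g.:
+ *
+ *   size_t hashes[2];
+ *   ckh->hash(key, hashes);
+ *   size_t b0 = hashes[0] & ((ZU(1) << ckh->lg_curbuckets) - 1);
+ *   size_t b1 = hashes[1] & ((ZU(1) << ckh->lg_curbuckets) - 1);
+ *
+ * An item can only live in a cell of bucket b0 or b1, which is what makes
+ * lookup worst-case constant time.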
*/ + +static bool ckh_grow(tsd_t *tsd, ckh_t *ckh); +static void ckh_shrink(tsd_t *tsd, ckh_t *ckh); + +/******************************************************************************/ + +/* + * Search bucket for key and return the cell number if found; SIZE_T_MAX + * otherwise. + */ +static size_t +ckh_bucket_search(ckh_t *ckh, size_t bucket, const void *key) { + ckhc_t *cell; + unsigned i; + + for (i = 0; i < (ZU(1) << LG_CKH_BUCKET_CELLS); i++) { + cell = &ckh->tab[(bucket << LG_CKH_BUCKET_CELLS) + i]; + if (cell->key != NULL && ckh->keycomp(key, cell->key)) { + return (bucket << LG_CKH_BUCKET_CELLS) + i; + } + } + + return SIZE_T_MAX; +} + +/* + * Search table for key and return cell number if found; SIZE_T_MAX otherwise. + */ +static size_t +ckh_isearch(ckh_t *ckh, const void *key) { + size_t hashes[2], bucket, cell; + + assert(ckh != NULL); + + ckh->hash(key, hashes); + + /* Search primary bucket. */ + bucket = hashes[0] & ((ZU(1) << ckh->lg_curbuckets) - 1); + cell = ckh_bucket_search(ckh, bucket, key); + if (cell != SIZE_T_MAX) { + return cell; + } + + /* Search secondary bucket. */ + bucket = hashes[1] & ((ZU(1) << ckh->lg_curbuckets) - 1); + cell = ckh_bucket_search(ckh, bucket, key); + return cell; +} + +static bool +ckh_try_bucket_insert(ckh_t *ckh, size_t bucket, const void *key, + const void *data) { + ckhc_t *cell; + unsigned offset, i; + + /* + * Cycle through the cells in the bucket, starting at a random position. + * The randomness avoids worst-case search overhead as buckets fill up. + */ + offset = (unsigned)prng_lg_range_u64(&ckh->prng_state, + LG_CKH_BUCKET_CELLS); + for (i = 0; i < (ZU(1) << LG_CKH_BUCKET_CELLS); i++) { + cell = &ckh->tab[(bucket << LG_CKH_BUCKET_CELLS) + + ((i + offset) & ((ZU(1) << LG_CKH_BUCKET_CELLS) - 1))]; + if (cell->key == NULL) { + cell->key = key; + cell->data = data; + ckh->count++; + return false; + } + } + + return true; +} + +/* + * No space is available in bucket. Randomly evict an item, then try to find an + * alternate location for that item. Iteratively repeat this + * eviction/relocation procedure until either success or detection of an + * eviction/relocation bucket cycle. + */ +static bool +ckh_evict_reloc_insert(ckh_t *ckh, size_t argbucket, void const **argkey, + void const **argdata) { + const void *key, *data, *tkey, *tdata; + ckhc_t *cell; + size_t hashes[2], bucket, tbucket; + unsigned i; + + bucket = argbucket; + key = *argkey; + data = *argdata; + while (true) { + /* + * Choose a random item within the bucket to evict. This is + * critical to correct function, because without (eventually) + * evicting all items within a bucket during iteration, it + * would be possible to get stuck in an infinite loop if there + * were an item for which both hashes indicated the same + * bucket. + */ + i = (unsigned)prng_lg_range_u64(&ckh->prng_state, + LG_CKH_BUCKET_CELLS); + cell = &ckh->tab[(bucket << LG_CKH_BUCKET_CELLS) + i]; + assert(cell->key != NULL); + + /* Swap cell->{key,data} and {key,data} (evict). */ + tkey = cell->key; tdata = cell->data; + cell->key = key; cell->data = data; + key = tkey; data = tdata; + +#ifdef CKH_COUNT + ckh->nrelocs++; +#endif + + /* Find the alternate bucket for the evicted item. */ + ckh->hash(key, hashes); + tbucket = hashes[1] & ((ZU(1) << ckh->lg_curbuckets) - 1); + if (tbucket == bucket) { + tbucket = hashes[0] & ((ZU(1) << ckh->lg_curbuckets) + - 1); + /* + * It may be that (tbucket == bucket) still, if the + * item's hashes both indicate this bucket. 
However, + * we are guaranteed to eventually escape this bucket + * during iteration, assuming pseudo-random item + * selection (true randomness would make infinite + * looping a remote possibility). The reason we can + * never get trapped forever is that there are two + * cases: + * + * 1) This bucket == argbucket, so we will quickly + * detect an eviction cycle and terminate. + * 2) An item was evicted to this bucket from another, + * which means that at least one item in this bucket + * has hashes that indicate distinct buckets. + */ + } + /* Check for a cycle. */ + if (tbucket == argbucket) { + *argkey = key; + *argdata = data; + return true; + } + + bucket = tbucket; + if (!ckh_try_bucket_insert(ckh, bucket, key, data)) { + return false; + } + } +} + +static bool +ckh_try_insert(ckh_t *ckh, void const**argkey, void const**argdata) { + size_t hashes[2], bucket; + const void *key = *argkey; + const void *data = *argdata; + + ckh->hash(key, hashes); + + /* Try to insert in primary bucket. */ + bucket = hashes[0] & ((ZU(1) << ckh->lg_curbuckets) - 1); + if (!ckh_try_bucket_insert(ckh, bucket, key, data)) { + return false; + } + + /* Try to insert in secondary bucket. */ + bucket = hashes[1] & ((ZU(1) << ckh->lg_curbuckets) - 1); + if (!ckh_try_bucket_insert(ckh, bucket, key, data)) { + return false; + } + + /* + * Try to find a place for this item via iterative eviction/relocation. + */ + return ckh_evict_reloc_insert(ckh, bucket, argkey, argdata); +} + +/* + * Try to rebuild the hash table from scratch by inserting all items from the + * old table into the new. + */ +static bool +ckh_rebuild(ckh_t *ckh, ckhc_t *aTab) { + size_t count, i, nins; + const void *key, *data; + + count = ckh->count; + ckh->count = 0; + for (i = nins = 0; nins < count; i++) { + if (aTab[i].key != NULL) { + key = aTab[i].key; + data = aTab[i].data; + if (ckh_try_insert(ckh, &key, &data)) { + ckh->count = count; + return true; + } + nins++; + } + } + + return false; +} + +static bool +ckh_grow(tsd_t *tsd, ckh_t *ckh) { + bool ret; + ckhc_t *tab, *ttab; + unsigned lg_prevbuckets, lg_curcells; + +#ifdef CKH_COUNT + ckh->ngrows++; +#endif + + /* + * It is possible (though unlikely, given well behaved hashes) that the + * table will have to be doubled more than once in order to create a + * usable table. + */ + lg_prevbuckets = ckh->lg_curbuckets; + lg_curcells = ckh->lg_curbuckets + LG_CKH_BUCKET_CELLS; + while (true) { + size_t usize; + + lg_curcells++; + usize = sz_sa2u(sizeof(ckhc_t) << lg_curcells, CACHELINE); + if (unlikely(usize == 0 + || usize > SC_LARGE_MAXCLASS)) { + ret = true; + goto label_return; + } + tab = (ckhc_t *)ipallocztm(tsd_tsdn(tsd), usize, CACHELINE, + true, NULL, true, arena_ichoose(tsd, NULL)); + if (tab == NULL) { + ret = true; + goto label_return; + } + /* Swap in new table. */ + ttab = ckh->tab; + ckh->tab = tab; + tab = ttab; + ckh->lg_curbuckets = lg_curcells - LG_CKH_BUCKET_CELLS; + + if (!ckh_rebuild(ckh, tab)) { + idalloctm(tsd_tsdn(tsd), tab, NULL, NULL, true, true); + break; + } + + /* Rebuilding failed, so back out partially rebuilt table. */ + idalloctm(tsd_tsdn(tsd), ckh->tab, NULL, NULL, true, true); + ckh->tab = tab; + ckh->lg_curbuckets = lg_prevbuckets; + } + + ret = false; +label_return: + return ret; +} + +static void +ckh_shrink(tsd_t *tsd, ckh_t *ckh) { + ckhc_t *tab, *ttab; + size_t usize; + unsigned lg_prevbuckets, lg_curcells; + + /* + * It is possible (though unlikely, given well behaved hashes) that the + * table rebuild will fail. 
+ */ + lg_prevbuckets = ckh->lg_curbuckets; + lg_curcells = ckh->lg_curbuckets + LG_CKH_BUCKET_CELLS - 1; + usize = sz_sa2u(sizeof(ckhc_t) << lg_curcells, CACHELINE); + if (unlikely(usize == 0 || usize > SC_LARGE_MAXCLASS)) { + return; + } + tab = (ckhc_t *)ipallocztm(tsd_tsdn(tsd), usize, CACHELINE, true, NULL, + true, arena_ichoose(tsd, NULL)); + if (tab == NULL) { + /* + * An OOM error isn't worth propagating, since it doesn't + * prevent this or future operations from proceeding. + */ + return; + } + /* Swap in new table. */ + ttab = ckh->tab; + ckh->tab = tab; + tab = ttab; + ckh->lg_curbuckets = lg_curcells - LG_CKH_BUCKET_CELLS; + + if (!ckh_rebuild(ckh, tab)) { + idalloctm(tsd_tsdn(tsd), tab, NULL, NULL, true, true); +#ifdef CKH_COUNT + ckh->nshrinks++; +#endif + return; + } + + /* Rebuilding failed, so back out partially rebuilt table. */ + idalloctm(tsd_tsdn(tsd), ckh->tab, NULL, NULL, true, true); + ckh->tab = tab; + ckh->lg_curbuckets = lg_prevbuckets; +#ifdef CKH_COUNT + ckh->nshrinkfails++; +#endif +} + +bool +ckh_new(tsd_t *tsd, ckh_t *ckh, size_t minitems, ckh_hash_t *ckh_hash, + ckh_keycomp_t *keycomp) { + bool ret; + size_t mincells, usize; + unsigned lg_mincells; + + assert(minitems > 0); + assert(ckh_hash != NULL); + assert(keycomp != NULL); + +#ifdef CKH_COUNT + ckh->ngrows = 0; + ckh->nshrinks = 0; + ckh->nshrinkfails = 0; + ckh->ninserts = 0; + ckh->nrelocs = 0; +#endif + ckh->prng_state = 42; /* Value doesn't really matter. */ + ckh->count = 0; + + /* + * Find the minimum power of 2 that is large enough to fit minitems + * entries. We are using (2+,2) cuckoo hashing, which has an expected + * maximum load factor of at least ~0.86, so 0.75 is a conservative load + * factor that will typically allow mincells items to fit without ever + * growing the table. + */ + assert(LG_CKH_BUCKET_CELLS > 0); + mincells = ((minitems + (3 - (minitems % 3))) / 3) << 2; + for (lg_mincells = LG_CKH_BUCKET_CELLS; + (ZU(1) << lg_mincells) < mincells; + lg_mincells++) { + /* Do nothing. 
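+ * [Editor's note, not part of the upstream jemalloc source.] A worked
+ * example of the sizing above: for minitems == 64, mincells ==
+ * ((64 + 2) / 3) << 2 == 88 (roughly minitems / 0.75), and this loop then
+ * settles on lg_mincells == 7, the smallest power of two (128 cells) that
+ * holds 88 cells.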
*/ + } + ckh->lg_minbuckets = lg_mincells - LG_CKH_BUCKET_CELLS; + ckh->lg_curbuckets = lg_mincells - LG_CKH_BUCKET_CELLS; + ckh->hash = ckh_hash; + ckh->keycomp = keycomp; + + usize = sz_sa2u(sizeof(ckhc_t) << lg_mincells, CACHELINE); + if (unlikely(usize == 0 || usize > SC_LARGE_MAXCLASS)) { + ret = true; + goto label_return; + } + ckh->tab = (ckhc_t *)ipallocztm(tsd_tsdn(tsd), usize, CACHELINE, true, + NULL, true, arena_ichoose(tsd, NULL)); + if (ckh->tab == NULL) { + ret = true; + goto label_return; + } + + ret = false; +label_return: + return ret; +} + +void +ckh_delete(tsd_t *tsd, ckh_t *ckh) { + assert(ckh != NULL); + +#ifdef CKH_VERBOSE + malloc_printf( + "%s(%p): ngrows: %"FMTu64", nshrinks: %"FMTu64"," + " nshrinkfails: %"FMTu64", ninserts: %"FMTu64"," + " nrelocs: %"FMTu64"\n", __func__, ckh, + (unsigned long long)ckh->ngrows, + (unsigned long long)ckh->nshrinks, + (unsigned long long)ckh->nshrinkfails, + (unsigned long long)ckh->ninserts, + (unsigned long long)ckh->nrelocs); +#endif + + idalloctm(tsd_tsdn(tsd), ckh->tab, NULL, NULL, true, true); + if (config_debug) { + memset(ckh, JEMALLOC_FREE_JUNK, sizeof(ckh_t)); + } +} + +size_t +ckh_count(ckh_t *ckh) { + assert(ckh != NULL); + + return ckh->count; +} + +bool +ckh_iter(ckh_t *ckh, size_t *tabind, void **key, void **data) { + size_t i, ncells; + + for (i = *tabind, ncells = (ZU(1) << (ckh->lg_curbuckets + + LG_CKH_BUCKET_CELLS)); i < ncells; i++) { + if (ckh->tab[i].key != NULL) { + if (key != NULL) { + *key = (void *)ckh->tab[i].key; + } + if (data != NULL) { + *data = (void *)ckh->tab[i].data; + } + *tabind = i + 1; + return false; + } + } + + return true; +} + +bool +ckh_insert(tsd_t *tsd, ckh_t *ckh, const void *key, const void *data) { + bool ret; + + assert(ckh != NULL); + assert(ckh_search(ckh, key, NULL, NULL)); + +#ifdef CKH_COUNT + ckh->ninserts++; +#endif + + while (ckh_try_insert(ckh, &key, &data)) { + if (ckh_grow(tsd, ckh)) { + ret = true; + goto label_return; + } + } + + ret = false; +label_return: + return ret; +} + +bool +ckh_remove(tsd_t *tsd, ckh_t *ckh, const void *searchkey, void **key, + void **data) { + size_t cell; + + assert(ckh != NULL); + + cell = ckh_isearch(ckh, searchkey); + if (cell != SIZE_T_MAX) { + if (key != NULL) { + *key = (void *)ckh->tab[cell].key; + } + if (data != NULL) { + *data = (void *)ckh->tab[cell].data; + } + ckh->tab[cell].key = NULL; + ckh->tab[cell].data = NULL; /* Not necessary. */ + + ckh->count--; + /* Try to halve the table if it is less than 1/4 full. */ + if (ckh->count < (ZU(1) << (ckh->lg_curbuckets + + LG_CKH_BUCKET_CELLS - 2)) && ckh->lg_curbuckets + > ckh->lg_minbuckets) { + /* Ignore error due to OOM. 
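+			 * ckh_shrink() returns void; if its internal
+			 * allocation fails, the table simply remains at
+			 * its current size.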
*/ + ckh_shrink(tsd, ckh); + } + + return false; + } + + return true; +} + +bool +ckh_search(ckh_t *ckh, const void *searchkey, void **key, void **data) { + size_t cell; + + assert(ckh != NULL); + + cell = ckh_isearch(ckh, searchkey); + if (cell != SIZE_T_MAX) { + if (key != NULL) { + *key = (void *)ckh->tab[cell].key; + } + if (data != NULL) { + *data = (void *)ckh->tab[cell].data; + } + return false; + } + + return true; +} + +void +ckh_string_hash(const void *key, size_t r_hash[2]) { + hash(key, strlen((const char *)key), 0x94122f33U, r_hash); +} + +bool +ckh_string_keycomp(const void *k1, const void *k2) { + assert(k1 != NULL); + assert(k2 != NULL); + + return !strcmp((char *)k1, (char *)k2); +} + +void +ckh_pointer_hash(const void *key, size_t r_hash[2]) { + union { + const void *v; + size_t i; + } u; + + assert(sizeof(u.v) == sizeof(u.i)); + u.v = key; + hash(&u.i, sizeof(u.i), 0xd983396eU, r_hash); +} + +bool +ckh_pointer_keycomp(const void *k1, const void *k2) { + return (k1 == k2); +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/counter.c b/src/duckdb/extension/jemalloc/jemalloc/src/counter.c new file mode 100644 index 000000000..8f1ae3af4 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/counter.c @@ -0,0 +1,30 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/counter.h" + +bool +counter_accum_init(counter_accum_t *counter, uint64_t interval) { + if (LOCKEDINT_MTX_INIT(counter->mtx, "counter_accum", + WITNESS_RANK_COUNTER_ACCUM, malloc_mutex_rank_exclusive)) { + return true; + } + locked_init_u64_unsynchronized(&counter->accumbytes, 0); + counter->interval = interval; + return false; +} + +void +counter_prefork(tsdn_t *tsdn, counter_accum_t *counter) { + LOCKEDINT_MTX_PREFORK(tsdn, counter->mtx); +} + +void +counter_postfork_parent(tsdn_t *tsdn, counter_accum_t *counter) { + LOCKEDINT_MTX_POSTFORK_PARENT(tsdn, counter->mtx); +} + +void +counter_postfork_child(tsdn_t *tsdn, counter_accum_t *counter) { + LOCKEDINT_MTX_POSTFORK_CHILD(tsdn, counter->mtx); +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/ctl.c b/src/duckdb/extension/jemalloc/jemalloc/src/ctl.c new file mode 100644 index 000000000..ebe5c61c5 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/ctl.c @@ -0,0 +1,4717 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/ctl.h" +#include "jemalloc/internal/extent_dss.h" +#include "jemalloc/internal/extent_mmap.h" +#include "jemalloc/internal/inspect.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/nstime.h" +#include "jemalloc/internal/peak_event.h" +#include "jemalloc/internal/prof_data.h" +#include "jemalloc/internal/prof_log.h" +#include "jemalloc/internal/prof_recent.h" +#include "jemalloc/internal/prof_stats.h" +#include "jemalloc/internal/prof_sys.h" +#include "jemalloc/internal/safety_check.h" +#include "jemalloc/internal/sc.h" +#include "jemalloc/internal/util.h" + +/******************************************************************************/ +/* Data. */ + +/* + * ctl_mtx protects the following: + * - ctl_stats->* + */ +static malloc_mutex_t ctl_mtx; +static bool ctl_initialized; +static ctl_stats_t *ctl_stats; +static ctl_arenas_t *ctl_arenas; + +/******************************************************************************/ +/* Helpers for named and indexed nodes. 
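+ *
+ * Named nodes carry a fixed table of string-named children ("stats",
+ * "opt", ...); indexed nodes stand in for a numeric path component,
+ * such as the "0" in "stats.arenas.0.pactive", and resolve children
+ * through an *_index() callback instead of a name table. E.g. the
+ * public entry point mallctl("stats.arenas.0.pactive", &pactive, &sz,
+ * NULL, 0) lands in ctl_byname() below and is resolved against this
+ * tree: super_root_node -> "stats" -> "arenas" (indexed) ->
+ * stats_arenas_i_index(..., 0) -> "pactive".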
*/ + +static const ctl_named_node_t * +ctl_named_node(const ctl_node_t *node) { + return ((node->named) ? (const ctl_named_node_t *)node : NULL); +} + +static const ctl_named_node_t * +ctl_named_children(const ctl_named_node_t *node, size_t index) { + const ctl_named_node_t *children = ctl_named_node(node->children); + + return (children ? &children[index] : NULL); +} + +static const ctl_indexed_node_t * +ctl_indexed_node(const ctl_node_t *node) { + return (!node->named ? (const ctl_indexed_node_t *)node : NULL); +} + +/******************************************************************************/ +/* Function prototypes for non-inline static functions. */ + +#define CTL_PROTO(n) \ +static int n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, \ + void *oldp, size_t *oldlenp, void *newp, size_t newlen); + +#define INDEX_PROTO(n) \ +static const ctl_named_node_t *n##_index(tsdn_t *tsdn, \ + const size_t *mib, size_t miblen, size_t i); + +CTL_PROTO(version) +CTL_PROTO(epoch) +CTL_PROTO(background_thread) +CTL_PROTO(max_background_threads) +CTL_PROTO(thread_tcache_enabled) +CTL_PROTO(thread_tcache_max) +CTL_PROTO(thread_tcache_flush) +CTL_PROTO(thread_tcache_ncached_max_write) +CTL_PROTO(thread_tcache_ncached_max_read_sizeclass) +CTL_PROTO(thread_peak_read) +CTL_PROTO(thread_peak_reset) +CTL_PROTO(thread_prof_name) +CTL_PROTO(thread_prof_active) +CTL_PROTO(thread_arena) +CTL_PROTO(thread_allocated) +CTL_PROTO(thread_allocatedp) +CTL_PROTO(thread_deallocated) +CTL_PROTO(thread_deallocatedp) +CTL_PROTO(thread_idle) +CTL_PROTO(config_cache_oblivious) +CTL_PROTO(config_debug) +CTL_PROTO(config_fill) +CTL_PROTO(config_lazy_lock) +CTL_PROTO(config_malloc_conf) +CTL_PROTO(config_opt_safety_checks) +CTL_PROTO(config_prof) +CTL_PROTO(config_prof_libgcc) +CTL_PROTO(config_prof_libunwind) +CTL_PROTO(config_stats) +CTL_PROTO(config_utrace) +CTL_PROTO(config_xmalloc) +CTL_PROTO(opt_abort) +CTL_PROTO(opt_abort_conf) +CTL_PROTO(opt_cache_oblivious) +CTL_PROTO(opt_debug_double_free_max_scan) +CTL_PROTO(opt_trust_madvise) +CTL_PROTO(opt_confirm_conf) +CTL_PROTO(opt_hpa) +CTL_PROTO(opt_hpa_slab_max_alloc) +CTL_PROTO(opt_hpa_hugification_threshold) +CTL_PROTO(opt_hpa_hugify_delay_ms) +CTL_PROTO(opt_hpa_min_purge_interval_ms) +CTL_PROTO(opt_hpa_strict_min_purge_interval) +CTL_PROTO(opt_hpa_dirty_mult) +CTL_PROTO(opt_hpa_sec_nshards) +CTL_PROTO(opt_hpa_sec_max_alloc) +CTL_PROTO(opt_hpa_sec_max_bytes) +CTL_PROTO(opt_hpa_sec_bytes_after_flush) +CTL_PROTO(opt_hpa_sec_batch_fill_extra) +CTL_PROTO(opt_metadata_thp) +CTL_PROTO(opt_retain) +CTL_PROTO(opt_dss) +CTL_PROTO(opt_narenas) +CTL_PROTO(opt_percpu_arena) +CTL_PROTO(opt_oversize_threshold) +CTL_PROTO(opt_background_thread) +CTL_PROTO(opt_mutex_max_spin) +CTL_PROTO(opt_max_background_threads) +CTL_PROTO(opt_dirty_decay_ms) +CTL_PROTO(opt_muzzy_decay_ms) +CTL_PROTO(opt_stats_print) +CTL_PROTO(opt_stats_print_opts) +CTL_PROTO(opt_stats_interval) +CTL_PROTO(opt_stats_interval_opts) +CTL_PROTO(opt_junk) +CTL_PROTO(opt_zero) +CTL_PROTO(opt_utrace) +CTL_PROTO(opt_xmalloc) +CTL_PROTO(opt_experimental_infallible_new) +CTL_PROTO(opt_max_batched_size) +CTL_PROTO(opt_remote_free_max) +CTL_PROTO(opt_remote_free_max_batch) +CTL_PROTO(opt_tcache) +CTL_PROTO(opt_tcache_max) +CTL_PROTO(opt_tcache_nslots_small_min) +CTL_PROTO(opt_tcache_nslots_small_max) +CTL_PROTO(opt_tcache_nslots_large) +CTL_PROTO(opt_lg_tcache_nslots_mul) +CTL_PROTO(opt_tcache_gc_incr_bytes) +CTL_PROTO(opt_tcache_gc_delay_bytes) +CTL_PROTO(opt_lg_tcache_flush_small_div) 
+CTL_PROTO(opt_lg_tcache_flush_large_div) +CTL_PROTO(opt_thp) +CTL_PROTO(opt_lg_extent_max_active_fit) +CTL_PROTO(opt_prof) +CTL_PROTO(opt_prof_prefix) +CTL_PROTO(opt_prof_active) +CTL_PROTO(opt_prof_thread_active_init) +CTL_PROTO(opt_prof_bt_max) +CTL_PROTO(opt_lg_prof_sample) +CTL_PROTO(opt_lg_prof_interval) +CTL_PROTO(opt_prof_gdump) +CTL_PROTO(opt_prof_final) +CTL_PROTO(opt_prof_leak) +CTL_PROTO(opt_prof_leak_error) +CTL_PROTO(opt_prof_accum) +CTL_PROTO(opt_prof_pid_namespace) +CTL_PROTO(opt_prof_recent_alloc_max) +CTL_PROTO(opt_prof_stats) +CTL_PROTO(opt_prof_sys_thread_name) +CTL_PROTO(opt_prof_time_res) +CTL_PROTO(opt_lg_san_uaf_align) +CTL_PROTO(opt_zero_realloc) +CTL_PROTO(opt_malloc_conf_symlink) +CTL_PROTO(opt_malloc_conf_env_var) +CTL_PROTO(opt_malloc_conf_global_var) +CTL_PROTO(opt_malloc_conf_global_var_2_conf_harder) +CTL_PROTO(tcache_create) +CTL_PROTO(tcache_flush) +CTL_PROTO(tcache_destroy) +CTL_PROTO(arena_i_initialized) +CTL_PROTO(arena_i_decay) +CTL_PROTO(arena_i_purge) +CTL_PROTO(arena_i_reset) +CTL_PROTO(arena_i_destroy) +CTL_PROTO(arena_i_dss) +CTL_PROTO(arena_i_oversize_threshold) +CTL_PROTO(arena_i_dirty_decay_ms) +CTL_PROTO(arena_i_muzzy_decay_ms) +CTL_PROTO(arena_i_extent_hooks) +CTL_PROTO(arena_i_retain_grow_limit) +CTL_PROTO(arena_i_name) +INDEX_PROTO(arena_i) +CTL_PROTO(arenas_bin_i_size) +CTL_PROTO(arenas_bin_i_nregs) +CTL_PROTO(arenas_bin_i_slab_size) +CTL_PROTO(arenas_bin_i_nshards) +INDEX_PROTO(arenas_bin_i) +CTL_PROTO(arenas_lextent_i_size) +INDEX_PROTO(arenas_lextent_i) +CTL_PROTO(arenas_narenas) +CTL_PROTO(arenas_dirty_decay_ms) +CTL_PROTO(arenas_muzzy_decay_ms) +CTL_PROTO(arenas_quantum) +CTL_PROTO(arenas_page) +CTL_PROTO(arenas_hugepage) +CTL_PROTO(arenas_tcache_max) +CTL_PROTO(arenas_nbins) +CTL_PROTO(arenas_nhbins) +CTL_PROTO(arenas_nlextents) +CTL_PROTO(arenas_create) +CTL_PROTO(arenas_lookup) +CTL_PROTO(prof_thread_active_init) +CTL_PROTO(prof_active) +CTL_PROTO(prof_dump) +CTL_PROTO(prof_gdump) +CTL_PROTO(prof_prefix) +CTL_PROTO(prof_reset) +CTL_PROTO(prof_interval) +CTL_PROTO(lg_prof_sample) +CTL_PROTO(prof_log_start) +CTL_PROTO(prof_log_stop) +CTL_PROTO(prof_stats_bins_i_live) +CTL_PROTO(prof_stats_bins_i_accum) +INDEX_PROTO(prof_stats_bins_i) +CTL_PROTO(prof_stats_lextents_i_live) +CTL_PROTO(prof_stats_lextents_i_accum) +INDEX_PROTO(prof_stats_lextents_i) +CTL_PROTO(stats_arenas_i_small_allocated) +CTL_PROTO(stats_arenas_i_small_nmalloc) +CTL_PROTO(stats_arenas_i_small_ndalloc) +CTL_PROTO(stats_arenas_i_small_nrequests) +CTL_PROTO(stats_arenas_i_small_nfills) +CTL_PROTO(stats_arenas_i_small_nflushes) +CTL_PROTO(stats_arenas_i_large_allocated) +CTL_PROTO(stats_arenas_i_large_nmalloc) +CTL_PROTO(stats_arenas_i_large_ndalloc) +CTL_PROTO(stats_arenas_i_large_nrequests) +CTL_PROTO(stats_arenas_i_large_nfills) +CTL_PROTO(stats_arenas_i_large_nflushes) +CTL_PROTO(stats_arenas_i_bins_j_nmalloc) +CTL_PROTO(stats_arenas_i_bins_j_ndalloc) +CTL_PROTO(stats_arenas_i_bins_j_nrequests) +CTL_PROTO(stats_arenas_i_bins_j_curregs) +CTL_PROTO(stats_arenas_i_bins_j_nfills) +CTL_PROTO(stats_arenas_i_bins_j_nflushes) +CTL_PROTO(stats_arenas_i_bins_j_nslabs) +CTL_PROTO(stats_arenas_i_bins_j_nreslabs) +CTL_PROTO(stats_arenas_i_bins_j_curslabs) +CTL_PROTO(stats_arenas_i_bins_j_nonfull_slabs) +CTL_PROTO(stats_arenas_i_bins_j_batch_pops) +CTL_PROTO(stats_arenas_i_bins_j_batch_failed_pushes) +CTL_PROTO(stats_arenas_i_bins_j_batch_pushes) +CTL_PROTO(stats_arenas_i_bins_j_batch_pushed_elems) +INDEX_PROTO(stats_arenas_i_bins_j) +CTL_PROTO(stats_arenas_i_lextents_j_nmalloc) 
+CTL_PROTO(stats_arenas_i_lextents_j_ndalloc) +CTL_PROTO(stats_arenas_i_lextents_j_nrequests) +CTL_PROTO(stats_arenas_i_lextents_j_curlextents) +INDEX_PROTO(stats_arenas_i_lextents_j) +CTL_PROTO(stats_arenas_i_extents_j_ndirty) +CTL_PROTO(stats_arenas_i_extents_j_nmuzzy) +CTL_PROTO(stats_arenas_i_extents_j_nretained) +CTL_PROTO(stats_arenas_i_extents_j_dirty_bytes) +CTL_PROTO(stats_arenas_i_extents_j_muzzy_bytes) +CTL_PROTO(stats_arenas_i_extents_j_retained_bytes) +INDEX_PROTO(stats_arenas_i_extents_j) +CTL_PROTO(stats_arenas_i_hpa_shard_npurge_passes) +CTL_PROTO(stats_arenas_i_hpa_shard_npurges) +CTL_PROTO(stats_arenas_i_hpa_shard_nhugifies) +CTL_PROTO(stats_arenas_i_hpa_shard_ndehugifies) + +/* We have a set of stats for full slabs. */ +CTL_PROTO(stats_arenas_i_hpa_shard_full_slabs_npageslabs_nonhuge) +CTL_PROTO(stats_arenas_i_hpa_shard_full_slabs_npageslabs_huge) +CTL_PROTO(stats_arenas_i_hpa_shard_full_slabs_nactive_nonhuge) +CTL_PROTO(stats_arenas_i_hpa_shard_full_slabs_nactive_huge) +CTL_PROTO(stats_arenas_i_hpa_shard_full_slabs_ndirty_nonhuge) +CTL_PROTO(stats_arenas_i_hpa_shard_full_slabs_ndirty_huge) + +/* A parallel set for the empty slabs. */ +CTL_PROTO(stats_arenas_i_hpa_shard_empty_slabs_npageslabs_nonhuge) +CTL_PROTO(stats_arenas_i_hpa_shard_empty_slabs_npageslabs_huge) +CTL_PROTO(stats_arenas_i_hpa_shard_empty_slabs_nactive_nonhuge) +CTL_PROTO(stats_arenas_i_hpa_shard_empty_slabs_nactive_huge) +CTL_PROTO(stats_arenas_i_hpa_shard_empty_slabs_ndirty_nonhuge) +CTL_PROTO(stats_arenas_i_hpa_shard_empty_slabs_ndirty_huge) + +/* + * And one for the slabs that are neither empty nor full, but indexed by how + * full they are. + */ +CTL_PROTO(stats_arenas_i_hpa_shard_nonfull_slabs_j_npageslabs_nonhuge) +CTL_PROTO(stats_arenas_i_hpa_shard_nonfull_slabs_j_npageslabs_huge) +CTL_PROTO(stats_arenas_i_hpa_shard_nonfull_slabs_j_nactive_nonhuge) +CTL_PROTO(stats_arenas_i_hpa_shard_nonfull_slabs_j_nactive_huge) +CTL_PROTO(stats_arenas_i_hpa_shard_nonfull_slabs_j_ndirty_nonhuge) +CTL_PROTO(stats_arenas_i_hpa_shard_nonfull_slabs_j_ndirty_huge) + +INDEX_PROTO(stats_arenas_i_hpa_shard_nonfull_slabs_j) +CTL_PROTO(stats_arenas_i_nthreads) +CTL_PROTO(stats_arenas_i_uptime) +CTL_PROTO(stats_arenas_i_dss) +CTL_PROTO(stats_arenas_i_dirty_decay_ms) +CTL_PROTO(stats_arenas_i_muzzy_decay_ms) +CTL_PROTO(stats_arenas_i_pactive) +CTL_PROTO(stats_arenas_i_pdirty) +CTL_PROTO(stats_arenas_i_pmuzzy) +CTL_PROTO(stats_arenas_i_mapped) +CTL_PROTO(stats_arenas_i_retained) +CTL_PROTO(stats_arenas_i_extent_avail) +CTL_PROTO(stats_arenas_i_dirty_npurge) +CTL_PROTO(stats_arenas_i_dirty_nmadvise) +CTL_PROTO(stats_arenas_i_dirty_purged) +CTL_PROTO(stats_arenas_i_muzzy_npurge) +CTL_PROTO(stats_arenas_i_muzzy_nmadvise) +CTL_PROTO(stats_arenas_i_muzzy_purged) +CTL_PROTO(stats_arenas_i_base) +CTL_PROTO(stats_arenas_i_internal) +CTL_PROTO(stats_arenas_i_metadata_edata) +CTL_PROTO(stats_arenas_i_metadata_rtree) +CTL_PROTO(stats_arenas_i_metadata_thp) +CTL_PROTO(stats_arenas_i_tcache_bytes) +CTL_PROTO(stats_arenas_i_tcache_stashed_bytes) +CTL_PROTO(stats_arenas_i_resident) +CTL_PROTO(stats_arenas_i_abandoned_vm) +CTL_PROTO(stats_arenas_i_hpa_sec_bytes) +INDEX_PROTO(stats_arenas_i) +CTL_PROTO(stats_allocated) +CTL_PROTO(stats_active) +CTL_PROTO(stats_background_thread_num_threads) +CTL_PROTO(stats_background_thread_num_runs) +CTL_PROTO(stats_background_thread_run_interval) +CTL_PROTO(stats_metadata) +CTL_PROTO(stats_metadata_edata) +CTL_PROTO(stats_metadata_rtree) +CTL_PROTO(stats_metadata_thp) +CTL_PROTO(stats_resident) 
+CTL_PROTO(stats_mapped) +CTL_PROTO(stats_retained) +CTL_PROTO(stats_zero_reallocs) +CTL_PROTO(experimental_hooks_install) +CTL_PROTO(experimental_hooks_remove) +CTL_PROTO(experimental_hooks_prof_backtrace) +CTL_PROTO(experimental_hooks_prof_dump) +CTL_PROTO(experimental_hooks_prof_sample) +CTL_PROTO(experimental_hooks_prof_sample_free) +CTL_PROTO(experimental_hooks_safety_check_abort) +CTL_PROTO(experimental_thread_activity_callback) +CTL_PROTO(experimental_utilization_query) +CTL_PROTO(experimental_utilization_batch_query) +CTL_PROTO(experimental_arenas_i_pactivep) +INDEX_PROTO(experimental_arenas_i) +CTL_PROTO(experimental_prof_recent_alloc_max) +CTL_PROTO(experimental_prof_recent_alloc_dump) +CTL_PROTO(experimental_batch_alloc) +CTL_PROTO(experimental_arenas_create_ext) + +#define MUTEX_STATS_CTL_PROTO_GEN(n) \ +CTL_PROTO(stats_##n##_num_ops) \ +CTL_PROTO(stats_##n##_num_wait) \ +CTL_PROTO(stats_##n##_num_spin_acq) \ +CTL_PROTO(stats_##n##_num_owner_switch) \ +CTL_PROTO(stats_##n##_total_wait_time) \ +CTL_PROTO(stats_##n##_max_wait_time) \ +CTL_PROTO(stats_##n##_max_num_thds) + +/* Global mutexes. */ +#define OP(mtx) MUTEX_STATS_CTL_PROTO_GEN(mutexes_##mtx) +MUTEX_PROF_GLOBAL_MUTEXES +#undef OP + +/* Per arena mutexes. */ +#define OP(mtx) MUTEX_STATS_CTL_PROTO_GEN(arenas_i_mutexes_##mtx) +MUTEX_PROF_ARENA_MUTEXES +#undef OP + +/* Arena bin mutexes. */ +MUTEX_STATS_CTL_PROTO_GEN(arenas_i_bins_j_mutex) +#undef MUTEX_STATS_CTL_PROTO_GEN + +CTL_PROTO(stats_mutexes_reset) + +/******************************************************************************/ +/* mallctl tree. */ + +#define NAME(n) {true}, n +#define CHILD(t, c) \ + sizeof(c##_node) / sizeof(ctl_##t##_node_t), \ + (ctl_node_t *)c##_node, \ + NULL +#define CTL(c) 0, NULL, c##_ctl + +/* + * Only handles internal indexed nodes, since there are currently no external + * ones. 
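+ * INDEX(arena_i) thus expands to {false}, arena_i_index: an unnamed
+ * node whose child resolution is deferred to the corresponding
+ * *_index() function declared via INDEX_PROTO above.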
+ */ +#define INDEX(i) {false}, i##_index + +static const ctl_named_node_t thread_tcache_ncached_max_node[] = { + {NAME("read_sizeclass"), + CTL(thread_tcache_ncached_max_read_sizeclass)}, + {NAME("write"), CTL(thread_tcache_ncached_max_write)} +}; + +static const ctl_named_node_t thread_tcache_node[] = { + {NAME("enabled"), CTL(thread_tcache_enabled)}, + {NAME("max"), CTL(thread_tcache_max)}, + {NAME("flush"), CTL(thread_tcache_flush)}, + {NAME("ncached_max"), CHILD(named, thread_tcache_ncached_max)} +}; + +static const ctl_named_node_t thread_peak_node[] = { + {NAME("read"), CTL(thread_peak_read)}, + {NAME("reset"), CTL(thread_peak_reset)}, +}; + +static const ctl_named_node_t thread_prof_node[] = { + {NAME("name"), CTL(thread_prof_name)}, + {NAME("active"), CTL(thread_prof_active)} +}; + +static const ctl_named_node_t thread_node[] = { + {NAME("arena"), CTL(thread_arena)}, + {NAME("allocated"), CTL(thread_allocated)}, + {NAME("allocatedp"), CTL(thread_allocatedp)}, + {NAME("deallocated"), CTL(thread_deallocated)}, + {NAME("deallocatedp"), CTL(thread_deallocatedp)}, + {NAME("tcache"), CHILD(named, thread_tcache)}, + {NAME("peak"), CHILD(named, thread_peak)}, + {NAME("prof"), CHILD(named, thread_prof)}, + {NAME("idle"), CTL(thread_idle)} +}; + +static const ctl_named_node_t config_node[] = { + {NAME("cache_oblivious"), CTL(config_cache_oblivious)}, + {NAME("debug"), CTL(config_debug)}, + {NAME("fill"), CTL(config_fill)}, + {NAME("lazy_lock"), CTL(config_lazy_lock)}, + {NAME("malloc_conf"), CTL(config_malloc_conf)}, + {NAME("opt_safety_checks"), CTL(config_opt_safety_checks)}, + {NAME("prof"), CTL(config_prof)}, + {NAME("prof_libgcc"), CTL(config_prof_libgcc)}, + {NAME("prof_libunwind"), CTL(config_prof_libunwind)}, + {NAME("stats"), CTL(config_stats)}, + {NAME("utrace"), CTL(config_utrace)}, + {NAME("xmalloc"), CTL(config_xmalloc)} +}; + +static const ctl_named_node_t opt_malloc_conf_node[] = { + {NAME("symlink"), CTL(opt_malloc_conf_symlink)}, + {NAME("env_var"), CTL(opt_malloc_conf_env_var)}, + {NAME("global_var"), CTL(opt_malloc_conf_global_var)}, + {NAME("global_var_2_conf_harder"), + CTL(opt_malloc_conf_global_var_2_conf_harder)} +}; + +static const ctl_named_node_t opt_node[] = { + {NAME("abort"), CTL(opt_abort)}, + {NAME("abort_conf"), CTL(opt_abort_conf)}, + {NAME("cache_oblivious"), CTL(opt_cache_oblivious)}, + {NAME("trust_madvise"), CTL(opt_trust_madvise)}, + {NAME("confirm_conf"), CTL(opt_confirm_conf)}, + {NAME("hpa"), CTL(opt_hpa)}, + {NAME("hpa_slab_max_alloc"), CTL(opt_hpa_slab_max_alloc)}, + {NAME("hpa_hugification_threshold"), + CTL(opt_hpa_hugification_threshold)}, + {NAME("hpa_hugify_delay_ms"), CTL(opt_hpa_hugify_delay_ms)}, + {NAME("hpa_min_purge_interval_ms"), CTL(opt_hpa_min_purge_interval_ms)}, + {NAME("hpa_strict_min_purge_interval"), CTL(opt_hpa_strict_min_purge_interval)}, + {NAME("hpa_dirty_mult"), CTL(opt_hpa_dirty_mult)}, + {NAME("hpa_sec_nshards"), CTL(opt_hpa_sec_nshards)}, + {NAME("hpa_sec_max_alloc"), CTL(opt_hpa_sec_max_alloc)}, + {NAME("hpa_sec_max_bytes"), CTL(opt_hpa_sec_max_bytes)}, + {NAME("hpa_sec_bytes_after_flush"), + CTL(opt_hpa_sec_bytes_after_flush)}, + {NAME("hpa_sec_batch_fill_extra"), + CTL(opt_hpa_sec_batch_fill_extra)}, + {NAME("metadata_thp"), CTL(opt_metadata_thp)}, + {NAME("retain"), CTL(opt_retain)}, + {NAME("dss"), CTL(opt_dss)}, + {NAME("narenas"), CTL(opt_narenas)}, + {NAME("percpu_arena"), CTL(opt_percpu_arena)}, + {NAME("oversize_threshold"), CTL(opt_oversize_threshold)}, + {NAME("mutex_max_spin"), CTL(opt_mutex_max_spin)}, + 
{NAME("background_thread"), CTL(opt_background_thread)}, + {NAME("max_background_threads"), CTL(opt_max_background_threads)}, + {NAME("dirty_decay_ms"), CTL(opt_dirty_decay_ms)}, + {NAME("muzzy_decay_ms"), CTL(opt_muzzy_decay_ms)}, + {NAME("stats_print"), CTL(opt_stats_print)}, + {NAME("stats_print_opts"), CTL(opt_stats_print_opts)}, + {NAME("stats_interval"), CTL(opt_stats_interval)}, + {NAME("stats_interval_opts"), CTL(opt_stats_interval_opts)}, + {NAME("junk"), CTL(opt_junk)}, + {NAME("zero"), CTL(opt_zero)}, + {NAME("utrace"), CTL(opt_utrace)}, + {NAME("xmalloc"), CTL(opt_xmalloc)}, + {NAME("experimental_infallible_new"), + CTL(opt_experimental_infallible_new)}, + {NAME("max_batched_size"), CTL(opt_max_batched_size)}, + {NAME("remote_free_max"), CTL(opt_remote_free_max)}, + {NAME("remote_free_max_batch"), CTL(opt_remote_free_max_batch)}, + {NAME("tcache"), CTL(opt_tcache)}, + {NAME("tcache_max"), CTL(opt_tcache_max)}, + {NAME("tcache_nslots_small_min"), + CTL(opt_tcache_nslots_small_min)}, + {NAME("tcache_nslots_small_max"), + CTL(opt_tcache_nslots_small_max)}, + {NAME("tcache_nslots_large"), CTL(opt_tcache_nslots_large)}, + {NAME("lg_tcache_nslots_mul"), CTL(opt_lg_tcache_nslots_mul)}, + {NAME("tcache_gc_incr_bytes"), CTL(opt_tcache_gc_incr_bytes)}, + {NAME("tcache_gc_delay_bytes"), CTL(opt_tcache_gc_delay_bytes)}, + {NAME("lg_tcache_flush_small_div"), + CTL(opt_lg_tcache_flush_small_div)}, + {NAME("lg_tcache_flush_large_div"), + CTL(opt_lg_tcache_flush_large_div)}, + {NAME("thp"), CTL(opt_thp)}, + {NAME("lg_extent_max_active_fit"), CTL(opt_lg_extent_max_active_fit)}, + {NAME("prof"), CTL(opt_prof)}, + {NAME("prof_prefix"), CTL(opt_prof_prefix)}, + {NAME("prof_active"), CTL(opt_prof_active)}, + {NAME("prof_thread_active_init"), CTL(opt_prof_thread_active_init)}, + {NAME("prof_bt_max"), CTL(opt_prof_bt_max)}, + {NAME("lg_prof_sample"), CTL(opt_lg_prof_sample)}, + {NAME("lg_prof_interval"), CTL(opt_lg_prof_interval)}, + {NAME("prof_gdump"), CTL(opt_prof_gdump)}, + {NAME("prof_final"), CTL(opt_prof_final)}, + {NAME("prof_leak"), CTL(opt_prof_leak)}, + {NAME("prof_leak_error"), CTL(opt_prof_leak_error)}, + {NAME("prof_accum"), CTL(opt_prof_accum)}, + {NAME("prof_pid_namespace"), CTL(opt_prof_pid_namespace)}, + {NAME("prof_recent_alloc_max"), CTL(opt_prof_recent_alloc_max)}, + {NAME("prof_stats"), CTL(opt_prof_stats)}, + {NAME("prof_sys_thread_name"), CTL(opt_prof_sys_thread_name)}, + {NAME("prof_time_resolution"), CTL(opt_prof_time_res)}, + {NAME("lg_san_uaf_align"), CTL(opt_lg_san_uaf_align)}, + {NAME("zero_realloc"), CTL(opt_zero_realloc)}, + {NAME("debug_double_free_max_scan"), + CTL(opt_debug_double_free_max_scan)}, + {NAME("malloc_conf"), CHILD(named, opt_malloc_conf)} +}; + +static const ctl_named_node_t tcache_node[] = { + {NAME("create"), CTL(tcache_create)}, + {NAME("flush"), CTL(tcache_flush)}, + {NAME("destroy"), CTL(tcache_destroy)} +}; + +static const ctl_named_node_t arena_i_node[] = { + {NAME("initialized"), CTL(arena_i_initialized)}, + {NAME("decay"), CTL(arena_i_decay)}, + {NAME("purge"), CTL(arena_i_purge)}, + {NAME("reset"), CTL(arena_i_reset)}, + {NAME("destroy"), CTL(arena_i_destroy)}, + {NAME("dss"), CTL(arena_i_dss)}, + /* + * Undocumented for now, since we anticipate an arena API in flux after + * we cut the last 5-series release. 
+ */ + {NAME("oversize_threshold"), CTL(arena_i_oversize_threshold)}, + {NAME("dirty_decay_ms"), CTL(arena_i_dirty_decay_ms)}, + {NAME("muzzy_decay_ms"), CTL(arena_i_muzzy_decay_ms)}, + {NAME("extent_hooks"), CTL(arena_i_extent_hooks)}, + {NAME("retain_grow_limit"), CTL(arena_i_retain_grow_limit)}, + {NAME("name"), CTL(arena_i_name)} +}; +static const ctl_named_node_t super_arena_i_node[] = { + {NAME(""), CHILD(named, arena_i)} +}; + +static const ctl_indexed_node_t arena_node[] = { + {INDEX(arena_i)} +}; + +static const ctl_named_node_t arenas_bin_i_node[] = { + {NAME("size"), CTL(arenas_bin_i_size)}, + {NAME("nregs"), CTL(arenas_bin_i_nregs)}, + {NAME("slab_size"), CTL(arenas_bin_i_slab_size)}, + {NAME("nshards"), CTL(arenas_bin_i_nshards)} +}; +static const ctl_named_node_t super_arenas_bin_i_node[] = { + {NAME(""), CHILD(named, arenas_bin_i)} +}; + +static const ctl_indexed_node_t arenas_bin_node[] = { + {INDEX(arenas_bin_i)} +}; + +static const ctl_named_node_t arenas_lextent_i_node[] = { + {NAME("size"), CTL(arenas_lextent_i_size)} +}; +static const ctl_named_node_t super_arenas_lextent_i_node[] = { + {NAME(""), CHILD(named, arenas_lextent_i)} +}; + +static const ctl_indexed_node_t arenas_lextent_node[] = { + {INDEX(arenas_lextent_i)} +}; + +static const ctl_named_node_t arenas_node[] = { + {NAME("narenas"), CTL(arenas_narenas)}, + {NAME("dirty_decay_ms"), CTL(arenas_dirty_decay_ms)}, + {NAME("muzzy_decay_ms"), CTL(arenas_muzzy_decay_ms)}, + {NAME("quantum"), CTL(arenas_quantum)}, + {NAME("page"), CTL(arenas_page)}, + {NAME("hugepage"), CTL(arenas_hugepage)}, + {NAME("tcache_max"), CTL(arenas_tcache_max)}, + {NAME("nbins"), CTL(arenas_nbins)}, + {NAME("nhbins"), CTL(arenas_nhbins)}, + {NAME("bin"), CHILD(indexed, arenas_bin)}, + {NAME("nlextents"), CTL(arenas_nlextents)}, + {NAME("lextent"), CHILD(indexed, arenas_lextent)}, + {NAME("create"), CTL(arenas_create)}, + {NAME("lookup"), CTL(arenas_lookup)} +}; + +static const ctl_named_node_t prof_stats_bins_i_node[] = { + {NAME("live"), CTL(prof_stats_bins_i_live)}, + {NAME("accum"), CTL(prof_stats_bins_i_accum)} +}; + +static const ctl_named_node_t super_prof_stats_bins_i_node[] = { + {NAME(""), CHILD(named, prof_stats_bins_i)} +}; + +static const ctl_indexed_node_t prof_stats_bins_node[] = { + {INDEX(prof_stats_bins_i)} +}; + +static const ctl_named_node_t prof_stats_lextents_i_node[] = { + {NAME("live"), CTL(prof_stats_lextents_i_live)}, + {NAME("accum"), CTL(prof_stats_lextents_i_accum)} +}; + +static const ctl_named_node_t super_prof_stats_lextents_i_node[] = { + {NAME(""), CHILD(named, prof_stats_lextents_i)} +}; + +static const ctl_indexed_node_t prof_stats_lextents_node[] = { + {INDEX(prof_stats_lextents_i)} +}; + +static const ctl_named_node_t prof_stats_node[] = { + {NAME("bins"), CHILD(indexed, prof_stats_bins)}, + {NAME("lextents"), CHILD(indexed, prof_stats_lextents)}, +}; + +static const ctl_named_node_t prof_node[] = { + {NAME("thread_active_init"), CTL(prof_thread_active_init)}, + {NAME("active"), CTL(prof_active)}, + {NAME("dump"), CTL(prof_dump)}, + {NAME("gdump"), CTL(prof_gdump)}, + {NAME("prefix"), CTL(prof_prefix)}, + {NAME("reset"), CTL(prof_reset)}, + {NAME("interval"), CTL(prof_interval)}, + {NAME("lg_sample"), CTL(lg_prof_sample)}, + {NAME("log_start"), CTL(prof_log_start)}, + {NAME("log_stop"), CTL(prof_log_stop)}, + {NAME("stats"), CHILD(named, prof_stats)} +}; + +static const ctl_named_node_t stats_arenas_i_small_node[] = { + {NAME("allocated"), CTL(stats_arenas_i_small_allocated)}, + {NAME("nmalloc"), 
CTL(stats_arenas_i_small_nmalloc)}, + {NAME("ndalloc"), CTL(stats_arenas_i_small_ndalloc)}, + {NAME("nrequests"), CTL(stats_arenas_i_small_nrequests)}, + {NAME("nfills"), CTL(stats_arenas_i_small_nfills)}, + {NAME("nflushes"), CTL(stats_arenas_i_small_nflushes)} +}; + +static const ctl_named_node_t stats_arenas_i_large_node[] = { + {NAME("allocated"), CTL(stats_arenas_i_large_allocated)}, + {NAME("nmalloc"), CTL(stats_arenas_i_large_nmalloc)}, + {NAME("ndalloc"), CTL(stats_arenas_i_large_ndalloc)}, + {NAME("nrequests"), CTL(stats_arenas_i_large_nrequests)}, + {NAME("nfills"), CTL(stats_arenas_i_large_nfills)}, + {NAME("nflushes"), CTL(stats_arenas_i_large_nflushes)} +}; + +#define MUTEX_PROF_DATA_NODE(prefix) \ +static const ctl_named_node_t stats_##prefix##_node[] = { \ + {NAME("num_ops"), \ + CTL(stats_##prefix##_num_ops)}, \ + {NAME("num_wait"), \ + CTL(stats_##prefix##_num_wait)}, \ + {NAME("num_spin_acq"), \ + CTL(stats_##prefix##_num_spin_acq)}, \ + {NAME("num_owner_switch"), \ + CTL(stats_##prefix##_num_owner_switch)}, \ + {NAME("total_wait_time"), \ + CTL(stats_##prefix##_total_wait_time)}, \ + {NAME("max_wait_time"), \ + CTL(stats_##prefix##_max_wait_time)}, \ + {NAME("max_num_thds"), \ + CTL(stats_##prefix##_max_num_thds)} \ + /* Note that # of current waiting thread not provided. */ \ +}; + +MUTEX_PROF_DATA_NODE(arenas_i_bins_j_mutex) + +static const ctl_named_node_t stats_arenas_i_bins_j_node[] = { + {NAME("nmalloc"), CTL(stats_arenas_i_bins_j_nmalloc)}, + {NAME("ndalloc"), CTL(stats_arenas_i_bins_j_ndalloc)}, + {NAME("nrequests"), CTL(stats_arenas_i_bins_j_nrequests)}, + {NAME("curregs"), CTL(stats_arenas_i_bins_j_curregs)}, + {NAME("nfills"), CTL(stats_arenas_i_bins_j_nfills)}, + {NAME("nflushes"), CTL(stats_arenas_i_bins_j_nflushes)}, + {NAME("nslabs"), CTL(stats_arenas_i_bins_j_nslabs)}, + {NAME("nreslabs"), CTL(stats_arenas_i_bins_j_nreslabs)}, + {NAME("curslabs"), CTL(stats_arenas_i_bins_j_curslabs)}, + {NAME("nonfull_slabs"), CTL(stats_arenas_i_bins_j_nonfull_slabs)}, + {NAME("batch_pops"), + CTL(stats_arenas_i_bins_j_batch_pops)}, + {NAME("batch_failed_pushes"), + CTL(stats_arenas_i_bins_j_batch_failed_pushes)}, + {NAME("batch_pushes"), + CTL(stats_arenas_i_bins_j_batch_pushes)}, + {NAME("batch_pushed_elems"), + CTL(stats_arenas_i_bins_j_batch_pushed_elems)}, + {NAME("mutex"), CHILD(named, stats_arenas_i_bins_j_mutex)} +}; + +static const ctl_named_node_t super_stats_arenas_i_bins_j_node[] = { + {NAME(""), CHILD(named, stats_arenas_i_bins_j)} +}; + +static const ctl_indexed_node_t stats_arenas_i_bins_node[] = { + {INDEX(stats_arenas_i_bins_j)} +}; + +static const ctl_named_node_t stats_arenas_i_lextents_j_node[] = { + {NAME("nmalloc"), CTL(stats_arenas_i_lextents_j_nmalloc)}, + {NAME("ndalloc"), CTL(stats_arenas_i_lextents_j_ndalloc)}, + {NAME("nrequests"), CTL(stats_arenas_i_lextents_j_nrequests)}, + {NAME("curlextents"), CTL(stats_arenas_i_lextents_j_curlextents)} +}; +static const ctl_named_node_t super_stats_arenas_i_lextents_j_node[] = { + {NAME(""), CHILD(named, stats_arenas_i_lextents_j)} +}; + +static const ctl_indexed_node_t stats_arenas_i_lextents_node[] = { + {INDEX(stats_arenas_i_lextents_j)} +}; + +static const ctl_named_node_t stats_arenas_i_extents_j_node[] = { + {NAME("ndirty"), CTL(stats_arenas_i_extents_j_ndirty)}, + {NAME("nmuzzy"), CTL(stats_arenas_i_extents_j_nmuzzy)}, + {NAME("nretained"), CTL(stats_arenas_i_extents_j_nretained)}, + {NAME("dirty_bytes"), CTL(stats_arenas_i_extents_j_dirty_bytes)}, + {NAME("muzzy_bytes"), 
CTL(stats_arenas_i_extents_j_muzzy_bytes)}, + {NAME("retained_bytes"), CTL(stats_arenas_i_extents_j_retained_bytes)} +}; + +static const ctl_named_node_t super_stats_arenas_i_extents_j_node[] = { + {NAME(""), CHILD(named, stats_arenas_i_extents_j)} +}; + +static const ctl_indexed_node_t stats_arenas_i_extents_node[] = { + {INDEX(stats_arenas_i_extents_j)} +}; + +#define OP(mtx) MUTEX_PROF_DATA_NODE(arenas_i_mutexes_##mtx) +MUTEX_PROF_ARENA_MUTEXES +#undef OP + +static const ctl_named_node_t stats_arenas_i_mutexes_node[] = { +#define OP(mtx) {NAME(#mtx), CHILD(named, stats_arenas_i_mutexes_##mtx)}, +MUTEX_PROF_ARENA_MUTEXES +#undef OP +}; + +static const ctl_named_node_t stats_arenas_i_hpa_shard_full_slabs_node[] = { + {NAME("npageslabs_nonhuge"), + CTL(stats_arenas_i_hpa_shard_full_slabs_npageslabs_nonhuge)}, + {NAME("npageslabs_huge"), + CTL(stats_arenas_i_hpa_shard_full_slabs_npageslabs_huge)}, + {NAME("nactive_nonhuge"), + CTL(stats_arenas_i_hpa_shard_full_slabs_nactive_nonhuge)}, + {NAME("nactive_huge"), + CTL(stats_arenas_i_hpa_shard_full_slabs_nactive_huge)}, + {NAME("ndirty_nonhuge"), + CTL(stats_arenas_i_hpa_shard_full_slabs_ndirty_nonhuge)}, + {NAME("ndirty_huge"), + CTL(stats_arenas_i_hpa_shard_full_slabs_ndirty_huge)} +}; + +static const ctl_named_node_t stats_arenas_i_hpa_shard_empty_slabs_node[] = { + {NAME("npageslabs_nonhuge"), + CTL(stats_arenas_i_hpa_shard_empty_slabs_npageslabs_nonhuge)}, + {NAME("npageslabs_huge"), + CTL(stats_arenas_i_hpa_shard_empty_slabs_npageslabs_huge)}, + {NAME("nactive_nonhuge"), + CTL(stats_arenas_i_hpa_shard_empty_slabs_nactive_nonhuge)}, + {NAME("nactive_huge"), + CTL(stats_arenas_i_hpa_shard_empty_slabs_nactive_huge)}, + {NAME("ndirty_nonhuge"), + CTL(stats_arenas_i_hpa_shard_empty_slabs_ndirty_nonhuge)}, + {NAME("ndirty_huge"), + CTL(stats_arenas_i_hpa_shard_empty_slabs_ndirty_huge)} +}; + +static const ctl_named_node_t stats_arenas_i_hpa_shard_nonfull_slabs_j_node[] = { + {NAME("npageslabs_nonhuge"), + CTL(stats_arenas_i_hpa_shard_nonfull_slabs_j_npageslabs_nonhuge)}, + {NAME("npageslabs_huge"), + CTL(stats_arenas_i_hpa_shard_nonfull_slabs_j_npageslabs_huge)}, + {NAME("nactive_nonhuge"), + CTL(stats_arenas_i_hpa_shard_nonfull_slabs_j_nactive_nonhuge)}, + {NAME("nactive_huge"), + CTL(stats_arenas_i_hpa_shard_nonfull_slabs_j_nactive_huge)}, + {NAME("ndirty_nonhuge"), + CTL(stats_arenas_i_hpa_shard_nonfull_slabs_j_ndirty_nonhuge)}, + {NAME("ndirty_huge"), + CTL(stats_arenas_i_hpa_shard_nonfull_slabs_j_ndirty_huge)} +}; + +static const ctl_named_node_t super_stats_arenas_i_hpa_shard_nonfull_slabs_j_node[] = { + {NAME(""), + CHILD(named, stats_arenas_i_hpa_shard_nonfull_slabs_j)} +}; + +static const ctl_indexed_node_t stats_arenas_i_hpa_shard_nonfull_slabs_node[] = +{ + {INDEX(stats_arenas_i_hpa_shard_nonfull_slabs_j)} +}; + +static const ctl_named_node_t stats_arenas_i_hpa_shard_node[] = { + {NAME("full_slabs"), CHILD(named, + stats_arenas_i_hpa_shard_full_slabs)}, + {NAME("empty_slabs"), CHILD(named, + stats_arenas_i_hpa_shard_empty_slabs)}, + {NAME("nonfull_slabs"), CHILD(indexed, + stats_arenas_i_hpa_shard_nonfull_slabs)}, + + {NAME("npurge_passes"), CTL(stats_arenas_i_hpa_shard_npurge_passes)}, + {NAME("npurges"), CTL(stats_arenas_i_hpa_shard_npurges)}, + {NAME("nhugifies"), CTL(stats_arenas_i_hpa_shard_nhugifies)}, + {NAME("ndehugifies"), CTL(stats_arenas_i_hpa_shard_ndehugifies)} +}; + +static const ctl_named_node_t stats_arenas_i_node[] = { + {NAME("nthreads"), CTL(stats_arenas_i_nthreads)}, + {NAME("uptime"), CTL(stats_arenas_i_uptime)}, 
+ {NAME("dss"), CTL(stats_arenas_i_dss)}, + {NAME("dirty_decay_ms"), CTL(stats_arenas_i_dirty_decay_ms)}, + {NAME("muzzy_decay_ms"), CTL(stats_arenas_i_muzzy_decay_ms)}, + {NAME("pactive"), CTL(stats_arenas_i_pactive)}, + {NAME("pdirty"), CTL(stats_arenas_i_pdirty)}, + {NAME("pmuzzy"), CTL(stats_arenas_i_pmuzzy)}, + {NAME("mapped"), CTL(stats_arenas_i_mapped)}, + {NAME("retained"), CTL(stats_arenas_i_retained)}, + {NAME("extent_avail"), CTL(stats_arenas_i_extent_avail)}, + {NAME("dirty_npurge"), CTL(stats_arenas_i_dirty_npurge)}, + {NAME("dirty_nmadvise"), CTL(stats_arenas_i_dirty_nmadvise)}, + {NAME("dirty_purged"), CTL(stats_arenas_i_dirty_purged)}, + {NAME("muzzy_npurge"), CTL(stats_arenas_i_muzzy_npurge)}, + {NAME("muzzy_nmadvise"), CTL(stats_arenas_i_muzzy_nmadvise)}, + {NAME("muzzy_purged"), CTL(stats_arenas_i_muzzy_purged)}, + {NAME("base"), CTL(stats_arenas_i_base)}, + {NAME("internal"), CTL(stats_arenas_i_internal)}, + {NAME("metadata_edata"), CTL(stats_arenas_i_metadata_edata)}, + {NAME("metadata_rtree"), CTL(stats_arenas_i_metadata_rtree)}, + {NAME("metadata_thp"), CTL(stats_arenas_i_metadata_thp)}, + {NAME("tcache_bytes"), CTL(stats_arenas_i_tcache_bytes)}, + {NAME("tcache_stashed_bytes"), + CTL(stats_arenas_i_tcache_stashed_bytes)}, + {NAME("resident"), CTL(stats_arenas_i_resident)}, + {NAME("abandoned_vm"), CTL(stats_arenas_i_abandoned_vm)}, + {NAME("hpa_sec_bytes"), CTL(stats_arenas_i_hpa_sec_bytes)}, + {NAME("small"), CHILD(named, stats_arenas_i_small)}, + {NAME("large"), CHILD(named, stats_arenas_i_large)}, + {NAME("bins"), CHILD(indexed, stats_arenas_i_bins)}, + {NAME("lextents"), CHILD(indexed, stats_arenas_i_lextents)}, + {NAME("extents"), CHILD(indexed, stats_arenas_i_extents)}, + {NAME("mutexes"), CHILD(named, stats_arenas_i_mutexes)}, + {NAME("hpa_shard"), CHILD(named, stats_arenas_i_hpa_shard)} +}; +static const ctl_named_node_t super_stats_arenas_i_node[] = { + {NAME(""), CHILD(named, stats_arenas_i)} +}; + +static const ctl_indexed_node_t stats_arenas_node[] = { + {INDEX(stats_arenas_i)} +}; + +static const ctl_named_node_t stats_background_thread_node[] = { + {NAME("num_threads"), CTL(stats_background_thread_num_threads)}, + {NAME("num_runs"), CTL(stats_background_thread_num_runs)}, + {NAME("run_interval"), CTL(stats_background_thread_run_interval)} +}; + +#define OP(mtx) MUTEX_PROF_DATA_NODE(mutexes_##mtx) +MUTEX_PROF_GLOBAL_MUTEXES +#undef OP + +static const ctl_named_node_t stats_mutexes_node[] = { +#define OP(mtx) {NAME(#mtx), CHILD(named, stats_mutexes_##mtx)}, +MUTEX_PROF_GLOBAL_MUTEXES +#undef OP + {NAME("reset"), CTL(stats_mutexes_reset)} +}; +#undef MUTEX_PROF_DATA_NODE + +static const ctl_named_node_t stats_node[] = { + {NAME("allocated"), CTL(stats_allocated)}, + {NAME("active"), CTL(stats_active)}, + {NAME("metadata"), CTL(stats_metadata)}, + {NAME("metadata_edata"), CTL(stats_metadata_edata)}, + {NAME("metadata_rtree"), CTL(stats_metadata_rtree)}, + {NAME("metadata_thp"), CTL(stats_metadata_thp)}, + {NAME("resident"), CTL(stats_resident)}, + {NAME("mapped"), CTL(stats_mapped)}, + {NAME("retained"), CTL(stats_retained)}, + {NAME("background_thread"), + CHILD(named, stats_background_thread)}, + {NAME("mutexes"), CHILD(named, stats_mutexes)}, + {NAME("arenas"), CHILD(indexed, stats_arenas)}, + {NAME("zero_reallocs"), CTL(stats_zero_reallocs)}, +}; + +static const ctl_named_node_t experimental_hooks_node[] = { + {NAME("install"), CTL(experimental_hooks_install)}, + {NAME("remove"), CTL(experimental_hooks_remove)}, + {NAME("prof_backtrace"), 
CTL(experimental_hooks_prof_backtrace)}, + {NAME("prof_dump"), CTL(experimental_hooks_prof_dump)}, + {NAME("prof_sample"), CTL(experimental_hooks_prof_sample)}, + {NAME("prof_sample_free"), CTL(experimental_hooks_prof_sample_free)}, + {NAME("safety_check_abort"), CTL(experimental_hooks_safety_check_abort)}, +}; + +static const ctl_named_node_t experimental_thread_node[] = { + {NAME("activity_callback"), + CTL(experimental_thread_activity_callback)} +}; + +static const ctl_named_node_t experimental_utilization_node[] = { + {NAME("query"), CTL(experimental_utilization_query)}, + {NAME("batch_query"), CTL(experimental_utilization_batch_query)} +}; + +static const ctl_named_node_t experimental_arenas_i_node[] = { + {NAME("pactivep"), CTL(experimental_arenas_i_pactivep)} +}; +static const ctl_named_node_t super_experimental_arenas_i_node[] = { + {NAME(""), CHILD(named, experimental_arenas_i)} +}; + +static const ctl_indexed_node_t experimental_arenas_node[] = { + {INDEX(experimental_arenas_i)} +}; + +static const ctl_named_node_t experimental_prof_recent_node[] = { + {NAME("alloc_max"), CTL(experimental_prof_recent_alloc_max)}, + {NAME("alloc_dump"), CTL(experimental_prof_recent_alloc_dump)}, +}; + +static const ctl_named_node_t experimental_node[] = { + {NAME("hooks"), CHILD(named, experimental_hooks)}, + {NAME("utilization"), CHILD(named, experimental_utilization)}, + {NAME("arenas"), CHILD(indexed, experimental_arenas)}, + {NAME("arenas_create_ext"), CTL(experimental_arenas_create_ext)}, + {NAME("prof_recent"), CHILD(named, experimental_prof_recent)}, + {NAME("batch_alloc"), CTL(experimental_batch_alloc)}, + {NAME("thread"), CHILD(named, experimental_thread)} +}; + +static const ctl_named_node_t root_node[] = { + {NAME("version"), CTL(version)}, + {NAME("epoch"), CTL(epoch)}, + {NAME("background_thread"), CTL(background_thread)}, + {NAME("max_background_threads"), CTL(max_background_threads)}, + {NAME("thread"), CHILD(named, thread)}, + {NAME("config"), CHILD(named, config)}, + {NAME("opt"), CHILD(named, opt)}, + {NAME("tcache"), CHILD(named, tcache)}, + {NAME("arena"), CHILD(indexed, arena)}, + {NAME("arenas"), CHILD(named, arenas)}, + {NAME("prof"), CHILD(named, prof)}, + {NAME("stats"), CHILD(named, stats)}, + {NAME("experimental"), CHILD(named, experimental)} +}; +static const ctl_named_node_t super_root_node[] = { + {NAME(""), CHILD(named, root)} +}; + +#undef NAME +#undef CHILD +#undef CTL +#undef INDEX + +/******************************************************************************/ + +/* + * Sets *dst + *src non-atomically. This is safe, since everything is + * synchronized by the ctl mutex. + */ +static void +ctl_accum_locked_u64(locked_u64_t *dst, locked_u64_t *src) { + locked_inc_u64_unsynchronized(dst, + locked_read_u64_unsynchronized(src)); +} + +static void +ctl_accum_atomic_zu(atomic_zu_t *dst, atomic_zu_t *src) { + size_t cur_dst = atomic_load_zu(dst, ATOMIC_RELAXED); + size_t cur_src = atomic_load_zu(src, ATOMIC_RELAXED); + atomic_store_zu(dst, cur_dst + cur_src, ATOMIC_RELAXED); +} + +/******************************************************************************/ + +static unsigned +arenas_i2a_impl(size_t i, bool compat, bool validate) { + unsigned a; + + switch (i) { + case MALLCTL_ARENAS_ALL: + a = 0; + break; + case MALLCTL_ARENAS_DESTROYED: + a = 1; + break; + default: + if (compat && i == ctl_arenas->narenas) { + /* + * Provide deprecated backward compatibility for + * accessing the merged stats at index narenas rather + * than via MALLCTL_ARENAS_ALL. 
This is scheduled for + * removal in 6.0.0. + */ + a = 0; + } else if (validate && i >= ctl_arenas->narenas) { + a = UINT_MAX; + } else { + /* + * This function should never be called for an index + * more than one past the range of indices that have + * initialized ctl data. + */ + assert(i < ctl_arenas->narenas || (!validate && i == + ctl_arenas->narenas)); + a = (unsigned)i + 2; + } + break; + } + + return a; +} + +static unsigned +arenas_i2a(size_t i) { + return arenas_i2a_impl(i, true, false); +} + +static ctl_arena_t * +arenas_i_impl(tsd_t *tsd, size_t i, bool compat, bool init) { + ctl_arena_t *ret; + + assert(!compat || !init); + + ret = ctl_arenas->arenas[arenas_i2a_impl(i, compat, false)]; + if (init && ret == NULL) { + if (config_stats) { + struct container_s { + ctl_arena_t ctl_arena; + ctl_arena_stats_t astats; + }; + struct container_s *cont = + (struct container_s *)base_alloc(tsd_tsdn(tsd), + b0get(), sizeof(struct container_s), QUANTUM); + if (cont == NULL) { + return NULL; + } + ret = &cont->ctl_arena; + ret->astats = &cont->astats; + } else { + ret = (ctl_arena_t *)base_alloc(tsd_tsdn(tsd), b0get(), + sizeof(ctl_arena_t), QUANTUM); + if (ret == NULL) { + return NULL; + } + } + ret->arena_ind = (unsigned)i; + ctl_arenas->arenas[arenas_i2a_impl(i, compat, false)] = ret; + } + + assert(ret == NULL || arenas_i2a(ret->arena_ind) == arenas_i2a(i)); + return ret; +} + +static ctl_arena_t * +arenas_i(size_t i) { + ctl_arena_t *ret = arenas_i_impl(tsd_fetch(), i, true, false); + assert(ret != NULL); + return ret; +} + +static void +ctl_arena_clear(ctl_arena_t *ctl_arena) { + ctl_arena->nthreads = 0; + ctl_arena->dss = dss_prec_names[dss_prec_limit]; + ctl_arena->dirty_decay_ms = -1; + ctl_arena->muzzy_decay_ms = -1; + ctl_arena->pactive = 0; + ctl_arena->pdirty = 0; + ctl_arena->pmuzzy = 0; + if (config_stats) { + memset(ctl_arena->astats, 0, sizeof(*(ctl_arena->astats))); + } +} + +static void +ctl_arena_stats_amerge(tsdn_t *tsdn, ctl_arena_t *ctl_arena, arena_t *arena) { + unsigned i; + + if (config_stats) { + arena_stats_merge(tsdn, arena, &ctl_arena->nthreads, + &ctl_arena->dss, &ctl_arena->dirty_decay_ms, + &ctl_arena->muzzy_decay_ms, &ctl_arena->pactive, + &ctl_arena->pdirty, &ctl_arena->pmuzzy, + &ctl_arena->astats->astats, ctl_arena->astats->bstats, + ctl_arena->astats->lstats, ctl_arena->astats->estats, + &ctl_arena->astats->hpastats, &ctl_arena->astats->secstats); + + for (i = 0; i < SC_NBINS; i++) { + bin_stats_t *bstats = + &ctl_arena->astats->bstats[i].stats_data; + ctl_arena->astats->allocated_small += bstats->curregs * + sz_index2size(i); + ctl_arena->astats->nmalloc_small += bstats->nmalloc; + ctl_arena->astats->ndalloc_small += bstats->ndalloc; + ctl_arena->astats->nrequests_small += bstats->nrequests; + ctl_arena->astats->nfills_small += bstats->nfills; + ctl_arena->astats->nflushes_small += bstats->nflushes; + } + } else { + arena_basic_stats_merge(tsdn, arena, &ctl_arena->nthreads, + &ctl_arena->dss, &ctl_arena->dirty_decay_ms, + &ctl_arena->muzzy_decay_ms, &ctl_arena->pactive, + &ctl_arena->pdirty, &ctl_arena->pmuzzy); + } +} + +static void +ctl_arena_stats_sdmerge(ctl_arena_t *ctl_sdarena, ctl_arena_t *ctl_arena, + bool destroyed) { + unsigned i; + + if (!destroyed) { + ctl_sdarena->nthreads += ctl_arena->nthreads; + ctl_sdarena->pactive += ctl_arena->pactive; + ctl_sdarena->pdirty += ctl_arena->pdirty; + ctl_sdarena->pmuzzy += ctl_arena->pmuzzy; + } else { + assert(ctl_arena->nthreads == 0); + assert(ctl_arena->pactive == 0); + assert(ctl_arena->pdirty == 
0); + assert(ctl_arena->pmuzzy == 0); + } + + if (config_stats) { + ctl_arena_stats_t *sdstats = ctl_sdarena->astats; + ctl_arena_stats_t *astats = ctl_arena->astats; + + if (!destroyed) { + sdstats->astats.mapped += astats->astats.mapped; + sdstats->astats.pa_shard_stats.pac_stats.retained + += astats->astats.pa_shard_stats.pac_stats.retained; + sdstats->astats.pa_shard_stats.edata_avail + += astats->astats.pa_shard_stats.edata_avail; + } + + ctl_accum_locked_u64( + &sdstats->astats.pa_shard_stats.pac_stats.decay_dirty.npurge, + &astats->astats.pa_shard_stats.pac_stats.decay_dirty.npurge); + ctl_accum_locked_u64( + &sdstats->astats.pa_shard_stats.pac_stats.decay_dirty.nmadvise, + &astats->astats.pa_shard_stats.pac_stats.decay_dirty.nmadvise); + ctl_accum_locked_u64( + &sdstats->astats.pa_shard_stats.pac_stats.decay_dirty.purged, + &astats->astats.pa_shard_stats.pac_stats.decay_dirty.purged); + + ctl_accum_locked_u64( + &sdstats->astats.pa_shard_stats.pac_stats.decay_muzzy.npurge, + &astats->astats.pa_shard_stats.pac_stats.decay_muzzy.npurge); + ctl_accum_locked_u64( + &sdstats->astats.pa_shard_stats.pac_stats.decay_muzzy.nmadvise, + &astats->astats.pa_shard_stats.pac_stats.decay_muzzy.nmadvise); + ctl_accum_locked_u64( + &sdstats->astats.pa_shard_stats.pac_stats.decay_muzzy.purged, + &astats->astats.pa_shard_stats.pac_stats.decay_muzzy.purged); + +#define OP(mtx) malloc_mutex_prof_merge( \ + &(sdstats->astats.mutex_prof_data[ \ + arena_prof_mutex_##mtx]), \ + &(astats->astats.mutex_prof_data[ \ + arena_prof_mutex_##mtx])); +MUTEX_PROF_ARENA_MUTEXES +#undef OP + if (!destroyed) { + sdstats->astats.base += astats->astats.base; + sdstats->astats.metadata_edata += astats->astats + .metadata_edata; + sdstats->astats.metadata_rtree += astats->astats + .metadata_rtree; + sdstats->astats.resident += astats->astats.resident; + sdstats->astats.metadata_thp += astats->astats.metadata_thp; + ctl_accum_atomic_zu(&sdstats->astats.internal, + &astats->astats.internal); + } else { + assert(atomic_load_zu( + &astats->astats.internal, ATOMIC_RELAXED) == 0); + } + + if (!destroyed) { + sdstats->allocated_small += astats->allocated_small; + } else { + assert(astats->allocated_small == 0); + } + sdstats->nmalloc_small += astats->nmalloc_small; + sdstats->ndalloc_small += astats->ndalloc_small; + sdstats->nrequests_small += astats->nrequests_small; + sdstats->nfills_small += astats->nfills_small; + sdstats->nflushes_small += astats->nflushes_small; + + if (!destroyed) { + sdstats->astats.allocated_large += + astats->astats.allocated_large; + } else { + assert(astats->astats.allocated_large == 0); + } + sdstats->astats.nmalloc_large += astats->astats.nmalloc_large; + sdstats->astats.ndalloc_large += astats->astats.ndalloc_large; + sdstats->astats.nrequests_large + += astats->astats.nrequests_large; + sdstats->astats.nflushes_large += astats->astats.nflushes_large; + ctl_accum_atomic_zu( + &sdstats->astats.pa_shard_stats.pac_stats.abandoned_vm, + &astats->astats.pa_shard_stats.pac_stats.abandoned_vm); + + sdstats->astats.tcache_bytes += astats->astats.tcache_bytes; + sdstats->astats.tcache_stashed_bytes += + astats->astats.tcache_stashed_bytes; + + if (ctl_arena->arena_ind == 0) { + sdstats->astats.uptime = astats->astats.uptime; + } + + /* Merge bin stats. 
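+		 * For a destroyed arena the gauge-style fields (curregs,
+		 * curslabs, nonfull_slabs) are asserted to be zero below,
+		 * so only the monotonic counters are actually carried
+		 * over.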
*/ + for (i = 0; i < SC_NBINS; i++) { + bin_stats_t *bstats = &astats->bstats[i].stats_data; + bin_stats_t *merged = &sdstats->bstats[i].stats_data; + merged->nmalloc += bstats->nmalloc; + merged->ndalloc += bstats->ndalloc; + merged->nrequests += bstats->nrequests; + if (!destroyed) { + merged->curregs += bstats->curregs; + } else { + assert(bstats->curregs == 0); + } + merged->nfills += bstats->nfills; + merged->nflushes += bstats->nflushes; + merged->nslabs += bstats->nslabs; + merged->reslabs += bstats->reslabs; + if (!destroyed) { + merged->curslabs += bstats->curslabs; + merged->nonfull_slabs += bstats->nonfull_slabs; + } else { + assert(bstats->curslabs == 0); + assert(bstats->nonfull_slabs == 0); + } + + merged->batch_pops + += bstats->batch_pops; + merged->batch_failed_pushes + += bstats->batch_failed_pushes; + merged->batch_pushes + += bstats->batch_pushes; + merged->batch_pushed_elems + += bstats->batch_pushed_elems; + + malloc_mutex_prof_merge(&sdstats->bstats[i].mutex_data, + &astats->bstats[i].mutex_data); + } + + /* Merge stats for large allocations. */ + for (i = 0; i < SC_NSIZES - SC_NBINS; i++) { + ctl_accum_locked_u64(&sdstats->lstats[i].nmalloc, + &astats->lstats[i].nmalloc); + ctl_accum_locked_u64(&sdstats->lstats[i].ndalloc, + &astats->lstats[i].ndalloc); + ctl_accum_locked_u64(&sdstats->lstats[i].nrequests, + &astats->lstats[i].nrequests); + if (!destroyed) { + sdstats->lstats[i].curlextents += + astats->lstats[i].curlextents; + } else { + assert(astats->lstats[i].curlextents == 0); + } + } + + /* Merge extents stats. */ + for (i = 0; i < SC_NPSIZES; i++) { + sdstats->estats[i].ndirty += astats->estats[i].ndirty; + sdstats->estats[i].nmuzzy += astats->estats[i].nmuzzy; + sdstats->estats[i].nretained + += astats->estats[i].nretained; + sdstats->estats[i].dirty_bytes + += astats->estats[i].dirty_bytes; + sdstats->estats[i].muzzy_bytes + += astats->estats[i].muzzy_bytes; + sdstats->estats[i].retained_bytes + += astats->estats[i].retained_bytes; + } + + /* Merge HPA stats. */ + hpa_shard_stats_accum(&sdstats->hpastats, &astats->hpastats); + sec_stats_accum(&sdstats->secstats, &astats->secstats); + } +} + +static void +ctl_arena_refresh(tsdn_t *tsdn, arena_t *arena, ctl_arena_t *ctl_sdarena, + unsigned i, bool destroyed) { + ctl_arena_t *ctl_arena = arenas_i(i); + + ctl_arena_clear(ctl_arena); + ctl_arena_stats_amerge(tsdn, ctl_arena, arena); + /* Merge into sum stats as well. */ + ctl_arena_stats_sdmerge(ctl_sdarena, ctl_arena, destroyed); +} + +static unsigned +ctl_arena_init(tsd_t *tsd, const arena_config_t *config) { + unsigned arena_ind; + ctl_arena_t *ctl_arena; + + if ((ctl_arena = ql_last(&ctl_arenas->destroyed, destroyed_link)) != + NULL) { + ql_remove(&ctl_arenas->destroyed, ctl_arena, destroyed_link); + arena_ind = ctl_arena->arena_ind; + } else { + arena_ind = ctl_arenas->narenas; + } + + /* Trigger stats allocation. */ + if (arenas_i_impl(tsd, arena_ind, false, true) == NULL) { + return UINT_MAX; + } + + /* Initialize new arena. 
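+	 * The ctl stats container was allocated just above, so a failure
+	 * in arena_init() cannot leave a live arena that the ctl
+	 * machinery has no record of.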
*/ + if (arena_init(tsd_tsdn(tsd), arena_ind, config) == NULL) { + return UINT_MAX; + } + + if (arena_ind == ctl_arenas->narenas) { + ctl_arenas->narenas++; + } + + return arena_ind; +} + +static void +ctl_background_thread_stats_read(tsdn_t *tsdn) { + background_thread_stats_t *stats = &ctl_stats->background_thread; + if (!have_background_thread || + background_thread_stats_read(tsdn, stats)) { + memset(stats, 0, sizeof(background_thread_stats_t)); + nstime_init_zero(&stats->run_interval); + } + malloc_mutex_prof_copy( + &ctl_stats->mutex_prof_data[global_prof_mutex_max_per_bg_thd], + &stats->max_counter_per_bg_thd); +} + +static void +ctl_refresh(tsdn_t *tsdn) { + malloc_mutex_assert_owner(tsdn, &ctl_mtx); + /* + * We are guaranteed that `ctl_arenas->narenas` will not change + * underneath us since we hold `ctl_mtx` for the duration of this + * function. Unfortunately static analysis tools do not understand this, + * so we are extracting `narenas` into a local variable solely for the + * sake of exposing this information to such tools. + */ + const unsigned narenas = ctl_arenas->narenas; + assert(narenas > 0); + ctl_arena_t *ctl_sarena = arenas_i(MALLCTL_ARENAS_ALL); + VARIABLE_ARRAY_UNSAFE(arena_t *, tarenas, narenas); + + /* + * Clear sum stats, since they will be merged into by + * ctl_arena_refresh(). + */ + ctl_arena_clear(ctl_sarena); + + for (unsigned i = 0; i < narenas; i++) { + tarenas[i] = arena_get(tsdn, i, false); + } + + for (unsigned i = 0; i < narenas; i++) { + ctl_arena_t *ctl_arena = arenas_i(i); + bool initialized = (tarenas[i] != NULL); + + ctl_arena->initialized = initialized; + if (initialized) { + ctl_arena_refresh(tsdn, tarenas[i], ctl_sarena, i, + false); + } + } + + if (config_stats) { + ctl_stats->allocated = ctl_sarena->astats->allocated_small + + ctl_sarena->astats->astats.allocated_large; + ctl_stats->active = (ctl_sarena->pactive << LG_PAGE); + ctl_stats->metadata = ctl_sarena->astats->astats.base + + atomic_load_zu(&ctl_sarena->astats->astats.internal, + ATOMIC_RELAXED); + ctl_stats->metadata_edata = ctl_sarena->astats->astats + .metadata_edata; + ctl_stats->metadata_rtree = ctl_sarena->astats->astats + .metadata_rtree; + ctl_stats->resident = ctl_sarena->astats->astats.resident; + ctl_stats->metadata_thp = + ctl_sarena->astats->astats.metadata_thp; + ctl_stats->mapped = ctl_sarena->astats->astats.mapped; + ctl_stats->retained = ctl_sarena->astats->astats + .pa_shard_stats.pac_stats.retained; + + ctl_background_thread_stats_read(tsdn); + +#define READ_GLOBAL_MUTEX_PROF_DATA(i, mtx) \ + malloc_mutex_lock(tsdn, &mtx); \ + malloc_mutex_prof_read(tsdn, &ctl_stats->mutex_prof_data[i], &mtx); \ + malloc_mutex_unlock(tsdn, &mtx); + + if (config_prof && opt_prof) { + READ_GLOBAL_MUTEX_PROF_DATA( + global_prof_mutex_prof, bt2gctx_mtx); + READ_GLOBAL_MUTEX_PROF_DATA( + global_prof_mutex_prof_thds_data, tdatas_mtx); + READ_GLOBAL_MUTEX_PROF_DATA( + global_prof_mutex_prof_dump, prof_dump_mtx); + READ_GLOBAL_MUTEX_PROF_DATA( + global_prof_mutex_prof_recent_alloc, + prof_recent_alloc_mtx); + READ_GLOBAL_MUTEX_PROF_DATA( + global_prof_mutex_prof_recent_dump, + prof_recent_dump_mtx); + READ_GLOBAL_MUTEX_PROF_DATA( + global_prof_mutex_prof_stats, prof_stats_mtx); + } + if (have_background_thread) { + READ_GLOBAL_MUTEX_PROF_DATA( + global_prof_mutex_background_thread, + background_thread_lock); + } else { + memset(&ctl_stats->mutex_prof_data[ + global_prof_mutex_background_thread], 0, + sizeof(mutex_prof_data_t)); + } + /* We own ctl mutex already. 
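+		 * (asserted at the top of this function), so read its
+		 * stats directly instead of via
+		 * READ_GLOBAL_MUTEX_PROF_DATA, which would re-acquire
+		 * ctl_mtx and self-deadlock.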
*/ + malloc_mutex_prof_read(tsdn, + &ctl_stats->mutex_prof_data[global_prof_mutex_ctl], + &ctl_mtx); +#undef READ_GLOBAL_MUTEX_PROF_DATA + } + ctl_arenas->epoch++; +} + +static bool +ctl_init(tsd_t *tsd) { + bool ret; + tsdn_t *tsdn = tsd_tsdn(tsd); + + malloc_mutex_lock(tsdn, &ctl_mtx); + if (!ctl_initialized) { + ctl_arena_t *ctl_sarena, *ctl_darena; + unsigned i; + + /* + * Allocate demand-zeroed space for pointers to the full + * range of supported arena indices. + */ + if (ctl_arenas == NULL) { + ctl_arenas = (ctl_arenas_t *)base_alloc(tsdn, + b0get(), sizeof(ctl_arenas_t), QUANTUM); + if (ctl_arenas == NULL) { + ret = true; + goto label_return; + } + } + + if (config_stats && ctl_stats == NULL) { + ctl_stats = (ctl_stats_t *)base_alloc(tsdn, b0get(), + sizeof(ctl_stats_t), QUANTUM); + if (ctl_stats == NULL) { + ret = true; + goto label_return; + } + } + + /* + * Allocate space for the current full range of arenas + * here rather than doing it lazily elsewhere, in order + * to limit when OOM-caused errors can occur. + */ + if ((ctl_sarena = arenas_i_impl(tsd, MALLCTL_ARENAS_ALL, false, + true)) == NULL) { + ret = true; + goto label_return; + } + ctl_sarena->initialized = true; + + if ((ctl_darena = arenas_i_impl(tsd, MALLCTL_ARENAS_DESTROYED, + false, true)) == NULL) { + ret = true; + goto label_return; + } + ctl_arena_clear(ctl_darena); + /* + * Don't toggle ctl_darena to initialized until an arena is + * actually destroyed, so that arena..initialized can be used + * to query whether the stats are relevant. + */ + + ctl_arenas->narenas = narenas_total_get(); + for (i = 0; i < ctl_arenas->narenas; i++) { + if (arenas_i_impl(tsd, i, false, true) == NULL) { + ret = true; + goto label_return; + } + } + + ql_new(&ctl_arenas->destroyed); + ctl_refresh(tsdn); + + ctl_initialized = true; + } + + ret = false; +label_return: + malloc_mutex_unlock(tsdn, &ctl_mtx); + return ret; +} + +static int +ctl_lookup(tsdn_t *tsdn, const ctl_named_node_t *starting_node, + const char *name, const ctl_named_node_t **ending_nodep, size_t *mibp, + size_t *depthp) { + int ret; + const char *elm, *tdot, *dot; + size_t elen, i, j; + const ctl_named_node_t *node; + + elm = name; + /* Equivalent to strchrnul(). */ + dot = ((tdot = strchr(elm, '.')) != NULL) ? tdot : strchr(elm, '\0'); + elen = (size_t)((uintptr_t)dot - (uintptr_t)elm); + if (elen == 0) { + ret = ENOENT; + goto label_return; + } + node = starting_node; + for (i = 0; i < *depthp; i++) { + assert(node); + assert(node->nchildren > 0); + if (ctl_named_node(node->children) != NULL) { + const ctl_named_node_t *pnode = node; + + /* Children are named. */ + for (j = 0; j < node->nchildren; j++) { + const ctl_named_node_t *child = + ctl_named_children(node, j); + if (strlen(child->name) == elen && + strncmp(elm, child->name, elen) == 0) { + node = child; + mibp[i] = j; + break; + } + } + if (node == pnode) { + ret = ENOENT; + goto label_return; + } + } else { + uintmax_t index; + const ctl_indexed_node_t *inode; + + /* Children are indexed. */ + index = malloc_strtoumax(elm, NULL, 10); + if (index == UINTMAX_MAX || index > SIZE_T_MAX) { + ret = ENOENT; + goto label_return; + } + + inode = ctl_indexed_node(node->children); + node = inode->index(tsdn, mibp, *depthp, (size_t)index); + if (node == NULL) { + ret = ENOENT; + goto label_return; + } + + mibp[i] = (size_t)index; + } + + /* Reached the end? */ + if (node->ctl != NULL || *dot == '\0') { + /* Terminal node. 
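+			 * Either this node has a ctl function or the
+			 * name has been fully consumed; a leftover
+			 * ".suffix" past a terminal node is rejected as
+			 * ENOENT just below.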
*/ + if (*dot != '\0') { + /* + * The name contains more elements than are + * in this path through the tree. + */ + ret = ENOENT; + goto label_return; + } + /* Complete lookup successful. */ + *depthp = i + 1; + break; + } + + /* Update elm. */ + elm = &dot[1]; + dot = ((tdot = strchr(elm, '.')) != NULL) ? tdot : + strchr(elm, '\0'); + elen = (size_t)((uintptr_t)dot - (uintptr_t)elm); + } + if (ending_nodep != NULL) { + *ending_nodep = node; + } + + ret = 0; +label_return: + return ret; +} + +int +ctl_byname(tsd_t *tsd, const char *name, void *oldp, size_t *oldlenp, + void *newp, size_t newlen) { + int ret; + size_t depth; + size_t mib[CTL_MAX_DEPTH]; + const ctl_named_node_t *node; + + if (!ctl_initialized && ctl_init(tsd)) { + ret = EAGAIN; + goto label_return; + } + + depth = CTL_MAX_DEPTH; + ret = ctl_lookup(tsd_tsdn(tsd), super_root_node, name, &node, mib, + &depth); + if (ret != 0) { + goto label_return; + } + + if (node != NULL && node->ctl) { + ret = node->ctl(tsd, mib, depth, oldp, oldlenp, newp, newlen); + } else { + /* The name refers to a partial path through the ctl tree. */ + ret = ENOENT; + } + +label_return: + return(ret); +} + +int +ctl_nametomib(tsd_t *tsd, const char *name, size_t *mibp, size_t *miblenp) { + int ret; + + if (!ctl_initialized && ctl_init(tsd)) { + ret = EAGAIN; + goto label_return; + } + + ret = ctl_lookup(tsd_tsdn(tsd), super_root_node, name, NULL, mibp, + miblenp); +label_return: + return(ret); +} + +static int +ctl_lookupbymib(tsdn_t *tsdn, const ctl_named_node_t **ending_nodep, + const size_t *mib, size_t miblen) { + int ret; + + const ctl_named_node_t *node = super_root_node; + for (size_t i = 0; i < miblen; i++) { + assert(node); + assert(node->nchildren > 0); + if (ctl_named_node(node->children) != NULL) { + /* Children are named. */ + if (node->nchildren <= mib[i]) { + ret = ENOENT; + goto label_return; + } + node = ctl_named_children(node, mib[i]); + } else { + const ctl_indexed_node_t *inode; + + /* Indexed element. */ + inode = ctl_indexed_node(node->children); + node = inode->index(tsdn, mib, miblen, mib[i]); + if (node == NULL) { + ret = ENOENT; + goto label_return; + } + } + } + assert(ending_nodep != NULL); + *ending_nodep = node; + ret = 0; + +label_return: + return(ret); +} + +int +ctl_bymib(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, + size_t *oldlenp, void *newp, size_t newlen) { + int ret; + const ctl_named_node_t *node; + + if (!ctl_initialized && ctl_init(tsd)) { + ret = EAGAIN; + goto label_return; + } + + ret = ctl_lookupbymib(tsd_tsdn(tsd), &node, mib, miblen); + if (ret != 0) { + goto label_return; + } + + /* Call the ctl function. */ + if (node && node->ctl) { + ret = node->ctl(tsd, mib, miblen, oldp, oldlenp, newp, newlen); + } else { + /* Partial MIB. 
*/ + ret = ENOENT; + } + +label_return: + return(ret); +} + +int +ctl_mibnametomib(tsd_t *tsd, size_t *mib, size_t miblen, const char *name, + size_t *miblenp) { + int ret; + const ctl_named_node_t *node; + + if (!ctl_initialized && ctl_init(tsd)) { + ret = EAGAIN; + goto label_return; + } + + ret = ctl_lookupbymib(tsd_tsdn(tsd), &node, mib, miblen); + if (ret != 0) { + goto label_return; + } + if (node == NULL || node->ctl != NULL) { + ret = ENOENT; + goto label_return; + } + + assert(miblenp != NULL); + assert(*miblenp >= miblen); + *miblenp -= miblen; + ret = ctl_lookup(tsd_tsdn(tsd), node, name, NULL, mib + miblen, + miblenp); + *miblenp += miblen; +label_return: + return(ret); +} + +int +ctl_bymibname(tsd_t *tsd, size_t *mib, size_t miblen, const char *name, + size_t *miblenp, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + const ctl_named_node_t *node; + + if (!ctl_initialized && ctl_init(tsd)) { + ret = EAGAIN; + goto label_return; + } + + ret = ctl_lookupbymib(tsd_tsdn(tsd), &node, mib, miblen); + if (ret != 0) { + goto label_return; + } + if (node == NULL || node->ctl != NULL) { + ret = ENOENT; + goto label_return; + } + + assert(miblenp != NULL); + assert(*miblenp >= miblen); + *miblenp -= miblen; + /* + * The same node supplies the starting node and stores the ending node. + */ + ret = ctl_lookup(tsd_tsdn(tsd), node, name, &node, mib + miblen, + miblenp); + *miblenp += miblen; + if (ret != 0) { + goto label_return; + } + + if (node != NULL && node->ctl) { + ret = node->ctl(tsd, mib, *miblenp, oldp, oldlenp, newp, + newlen); + } else { + /* The name refers to a partial path through the ctl tree. */ + ret = ENOENT; + } + +label_return: + return(ret); +} + +bool +ctl_boot(void) { + if (malloc_mutex_init(&ctl_mtx, "ctl", WITNESS_RANK_CTL, + malloc_mutex_rank_exclusive)) { + return true; + } + + ctl_initialized = false; + + return false; +} + +void +ctl_prefork(tsdn_t *tsdn) { + malloc_mutex_prefork(tsdn, &ctl_mtx); +} + +void +ctl_postfork_parent(tsdn_t *tsdn) { + malloc_mutex_postfork_parent(tsdn, &ctl_mtx); +} + +void +ctl_postfork_child(tsdn_t *tsdn) { + malloc_mutex_postfork_child(tsdn, &ctl_mtx); +} + +void +ctl_mtx_assert_held(tsdn_t *tsdn) { + malloc_mutex_assert_owner(tsdn, &ctl_mtx); +} + +/******************************************************************************/ +/* *_ctl() functions. */ + +#define READONLY() do { \ + if (newp != NULL || newlen != 0) { \ + ret = EPERM; \ + goto label_return; \ + } \ +} while (0) + +#define WRITEONLY() do { \ + if (oldp != NULL || oldlenp != NULL) { \ + ret = EPERM; \ + goto label_return; \ + } \ +} while (0) + +/* Can read or write, but not both. */ +#define READ_XOR_WRITE() do { \ + if ((oldp != NULL && oldlenp != NULL) && (newp != NULL || \ + newlen != 0)) { \ + ret = EPERM; \ + goto label_return; \ + } \ +} while (0) + +/* Can neither read nor write. */ +#define NEITHER_READ_NOR_WRITE() do { \ + if (oldp != NULL || oldlenp != NULL || newp != NULL || \ + newlen != 0) { \ + ret = EPERM; \ + goto label_return; \ + } \ +} while (0) + +/* Verify that the space provided is enough. */ +#define VERIFY_READ(t) do { \ + if (oldp == NULL || oldlenp == NULL || *oldlenp != sizeof(t)) { \ + if (oldlenp != NULL) { \ + *oldlenp = 0; \ + } \ + ret = EINVAL; \ + goto label_return; \ + } \ +} while (0) + +#define READ(v, t) do { \ + if (oldp != NULL && oldlenp != NULL) { \ + if (*oldlenp != sizeof(t)) { \ + size_t copylen = (sizeof(t) <= *oldlenp) \ + ? 
sizeof(t) : *oldlenp; \ + memcpy(oldp, (void *)&(v), copylen); \ + *oldlenp = copylen; \ + ret = EINVAL; \ + goto label_return; \ + } \ + *(t *)oldp = (v); \ + } \ +} while (0) + +#define WRITE(v, t) do { \ + if (newp != NULL) { \ + if (newlen != sizeof(t)) { \ + ret = EINVAL; \ + goto label_return; \ + } \ + (v) = *(t *)newp; \ + } \ +} while (0) + +#define ASSURED_WRITE(v, t) do { \ + if (newp == NULL || newlen != sizeof(t)) { \ + ret = EINVAL; \ + goto label_return; \ + } \ + (v) = *(t *)newp; \ +} while (0) + +#define MIB_UNSIGNED(v, i) do { \ + if (mib[i] > UINT_MAX) { \ + ret = EFAULT; \ + goto label_return; \ + } \ + v = (unsigned)mib[i]; \ +} while (0) + +/* + * There's a lot of code duplication in the following macros due to limitations + * in how nested cpp macros are expanded. + */ +#define CTL_RO_CGEN(c, n, v, t) \ +static int \ +n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, \ + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { \ + int ret; \ + t oldval; \ + \ + if (!(c)) { \ + return ENOENT; \ + } \ + malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx); \ + READONLY(); \ + oldval = (v); \ + READ(oldval, t); \ + \ + ret = 0; \ +label_return: \ + malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx); \ + return ret; \ +} + +#define CTL_RO_GEN(n, v, t) \ +static int \ +n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, \ + size_t *oldlenp, void *newp, size_t newlen) { \ + int ret; \ + t oldval; \ + \ + malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx); \ + READONLY(); \ + oldval = (v); \ + READ(oldval, t); \ + \ + ret = 0; \ +label_return: \ + malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx); \ + return ret; \ +} + +/* + * ctl_mtx is not acquired, under the assumption that no pertinent data will + * mutate during the call. + */ +#define CTL_RO_NL_CGEN(c, n, v, t) \ +static int \ +n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, \ + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { \ + int ret; \ + t oldval; \ + \ + if (!(c)) { \ + return ENOENT; \ + } \ + READONLY(); \ + oldval = (v); \ + READ(oldval, t); \ + \ + ret = 0; \ +label_return: \ + return ret; \ +} + +#define CTL_RO_NL_GEN(n, v, t) \ +static int \ +n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, \ + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { \ + int ret; \ + t oldval; \ + \ + READONLY(); \ + oldval = (v); \ + READ(oldval, t); \ + \ + ret = 0; \ +label_return: \ + return ret; \ +} + +#define CTL_RO_CONFIG_GEN(n, t) \ +static int \ +n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, \ + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { \ + int ret; \ + t oldval; \ + \ + READONLY(); \ + oldval = n; \ + READ(oldval, t); \ + \ + ret = 0; \ +label_return: \ + return ret; \ +} + +/******************************************************************************/ + +CTL_RO_NL_GEN(version, JEMALLOC_VERSION, const char *) + +static int +epoch_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + UNUSED uint64_t newval; + + malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx); + WRITE(newval, uint64_t); + if (newp != NULL) { + ctl_refresh(tsd_tsdn(tsd)); + } + READ(ctl_arenas->epoch, uint64_t); + + ret = 0; +label_return: + malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx); + return ret; +} + +static int +background_thread_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, + void *newp, size_t newlen) { + int ret; + bool oldval; + + if (!have_background_thread) { + return ENOENT; + } + 
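+ /* + * Reads report whether the background threads are currently enabled; a + * bool write toggles that state, starting or stopping the threads as + * needed. Both ctl_mtx and background_thread_lock are held across the + * update below. + */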
background_thread_ctl_init(tsd_tsdn(tsd)); + + malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx); + malloc_mutex_lock(tsd_tsdn(tsd), &background_thread_lock); + if (newp == NULL) { + oldval = background_thread_enabled(); + READ(oldval, bool); + } else { + if (newlen != sizeof(bool)) { + ret = EINVAL; + goto label_return; + } + oldval = background_thread_enabled(); + READ(oldval, bool); + + bool newval = *(bool *)newp; + if (newval == oldval) { + ret = 0; + goto label_return; + } + + background_thread_enabled_set(tsd_tsdn(tsd), newval); + if (newval) { + if (background_threads_enable(tsd)) { + ret = EFAULT; + goto label_return; + } + } else { + if (background_threads_disable(tsd)) { + ret = EFAULT; + goto label_return; + } + } + } + ret = 0; +label_return: + malloc_mutex_unlock(tsd_tsdn(tsd), &background_thread_lock); + malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx); + + return ret; +} + +static int +max_background_threads_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { + int ret; + size_t oldval; + + if (!have_background_thread) { + return ENOENT; + } + background_thread_ctl_init(tsd_tsdn(tsd)); + + malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx); + malloc_mutex_lock(tsd_tsdn(tsd), &background_thread_lock); + if (newp == NULL) { + oldval = max_background_threads; + READ(oldval, size_t); + } else { + if (newlen != sizeof(size_t)) { + ret = EINVAL; + goto label_return; + } + oldval = max_background_threads; + READ(oldval, size_t); + + size_t newval = *(size_t *)newp; + if (newval == oldval) { + ret = 0; + goto label_return; + } + if (newval > opt_max_background_threads) { + ret = EINVAL; + goto label_return; + } + + if (background_thread_enabled()) { + background_thread_enabled_set(tsd_tsdn(tsd), false); + if (background_threads_disable(tsd)) { + ret = EFAULT; + goto label_return; + } + max_background_threads = newval; + background_thread_enabled_set(tsd_tsdn(tsd), true); + if (background_threads_enable(tsd)) { + ret = EFAULT; + goto label_return; + } + } else { + max_background_threads = newval; + } + } + ret = 0; +label_return: + malloc_mutex_unlock(tsd_tsdn(tsd), &background_thread_lock); + malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx); + + return ret; +} + +/******************************************************************************/ + +CTL_RO_CONFIG_GEN(config_cache_oblivious, bool) +CTL_RO_CONFIG_GEN(config_debug, bool) +CTL_RO_CONFIG_GEN(config_fill, bool) +CTL_RO_CONFIG_GEN(config_lazy_lock, bool) +CTL_RO_CONFIG_GEN(config_malloc_conf, const char *) +CTL_RO_CONFIG_GEN(config_opt_safety_checks, bool) +CTL_RO_CONFIG_GEN(config_prof, bool) +CTL_RO_CONFIG_GEN(config_prof_libgcc, bool) +CTL_RO_CONFIG_GEN(config_prof_libunwind, bool) +CTL_RO_CONFIG_GEN(config_stats, bool) +CTL_RO_CONFIG_GEN(config_utrace, bool) +CTL_RO_CONFIG_GEN(config_xmalloc, bool) + +/******************************************************************************/ + +CTL_RO_NL_GEN(opt_abort, opt_abort, bool) +CTL_RO_NL_GEN(opt_abort_conf, opt_abort_conf, bool) +CTL_RO_NL_GEN(opt_cache_oblivious, opt_cache_oblivious, bool) +CTL_RO_NL_GEN(opt_debug_double_free_max_scan, + opt_debug_double_free_max_scan, unsigned) +CTL_RO_NL_GEN(opt_trust_madvise, opt_trust_madvise, bool) +CTL_RO_NL_GEN(opt_confirm_conf, opt_confirm_conf, bool) + +/* HPA options. 
*/ +CTL_RO_NL_GEN(opt_hpa, opt_hpa, bool) +CTL_RO_NL_GEN(opt_hpa_hugification_threshold, + opt_hpa_opts.hugification_threshold, size_t) +CTL_RO_NL_GEN(opt_hpa_hugify_delay_ms, opt_hpa_opts.hugify_delay_ms, uint64_t) +CTL_RO_NL_GEN(opt_hpa_min_purge_interval_ms, opt_hpa_opts.min_purge_interval_ms, + uint64_t) +CTL_RO_NL_GEN(opt_hpa_strict_min_purge_interval, + opt_hpa_opts.strict_min_purge_interval, bool) + +/* + * This will have to change before we publicly document this option; fxp_t and + * its representation are internal implementation details. + */ +CTL_RO_NL_GEN(opt_hpa_dirty_mult, opt_hpa_opts.dirty_mult, fxp_t) +CTL_RO_NL_GEN(opt_hpa_slab_max_alloc, opt_hpa_opts.slab_max_alloc, size_t) + +/* HPA SEC options */ +CTL_RO_NL_GEN(opt_hpa_sec_nshards, opt_hpa_sec_opts.nshards, size_t) +CTL_RO_NL_GEN(opt_hpa_sec_max_alloc, opt_hpa_sec_opts.max_alloc, size_t) +CTL_RO_NL_GEN(opt_hpa_sec_max_bytes, opt_hpa_sec_opts.max_bytes, size_t) +CTL_RO_NL_GEN(opt_hpa_sec_bytes_after_flush, opt_hpa_sec_opts.bytes_after_flush, + size_t) +CTL_RO_NL_GEN(opt_hpa_sec_batch_fill_extra, opt_hpa_sec_opts.batch_fill_extra, + size_t) + +CTL_RO_NL_GEN(opt_metadata_thp, metadata_thp_mode_names[opt_metadata_thp], + const char *) +CTL_RO_NL_GEN(opt_retain, opt_retain, bool) +CTL_RO_NL_GEN(opt_dss, opt_dss, const char *) +CTL_RO_NL_GEN(opt_narenas, opt_narenas, unsigned) +CTL_RO_NL_GEN(opt_percpu_arena, percpu_arena_mode_names[opt_percpu_arena], + const char *) +CTL_RO_NL_GEN(opt_mutex_max_spin, opt_mutex_max_spin, int64_t) +CTL_RO_NL_GEN(opt_oversize_threshold, opt_oversize_threshold, size_t) +CTL_RO_NL_GEN(opt_background_thread, opt_background_thread, bool) +CTL_RO_NL_GEN(opt_max_background_threads, opt_max_background_threads, size_t) +CTL_RO_NL_GEN(opt_dirty_decay_ms, opt_dirty_decay_ms, ssize_t) +CTL_RO_NL_GEN(opt_muzzy_decay_ms, opt_muzzy_decay_ms, ssize_t) +CTL_RO_NL_GEN(opt_stats_print, opt_stats_print, bool) +CTL_RO_NL_GEN(opt_stats_print_opts, opt_stats_print_opts, const char *) +CTL_RO_NL_GEN(opt_stats_interval, opt_stats_interval, int64_t) +CTL_RO_NL_GEN(opt_stats_interval_opts, opt_stats_interval_opts, const char *) +CTL_RO_NL_CGEN(config_fill, opt_junk, opt_junk, const char *) +CTL_RO_NL_CGEN(config_fill, opt_zero, opt_zero, bool) +CTL_RO_NL_CGEN(config_utrace, opt_utrace, opt_utrace, bool) +CTL_RO_NL_CGEN(config_xmalloc, opt_xmalloc, opt_xmalloc, bool) +CTL_RO_NL_CGEN(config_enable_cxx, opt_experimental_infallible_new, + opt_experimental_infallible_new, bool) +CTL_RO_NL_GEN(opt_max_batched_size, opt_bin_info_max_batched_size, size_t) +CTL_RO_NL_GEN(opt_remote_free_max, opt_bin_info_remote_free_max, + size_t) +CTL_RO_NL_GEN(opt_remote_free_max_batch, opt_bin_info_remote_free_max_batch, + size_t) +CTL_RO_NL_GEN(opt_tcache, opt_tcache, bool) +CTL_RO_NL_GEN(opt_tcache_max, opt_tcache_max, size_t) +CTL_RO_NL_GEN(opt_tcache_nslots_small_min, opt_tcache_nslots_small_min, + unsigned) +CTL_RO_NL_GEN(opt_tcache_nslots_small_max, opt_tcache_nslots_small_max, + unsigned) +CTL_RO_NL_GEN(opt_tcache_nslots_large, opt_tcache_nslots_large, unsigned) +CTL_RO_NL_GEN(opt_lg_tcache_nslots_mul, opt_lg_tcache_nslots_mul, ssize_t) +CTL_RO_NL_GEN(opt_tcache_gc_incr_bytes, opt_tcache_gc_incr_bytes, size_t) +CTL_RO_NL_GEN(opt_tcache_gc_delay_bytes, opt_tcache_gc_delay_bytes, size_t) +CTL_RO_NL_GEN(opt_lg_tcache_flush_small_div, opt_lg_tcache_flush_small_div, + unsigned) +CTL_RO_NL_GEN(opt_lg_tcache_flush_large_div, opt_lg_tcache_flush_large_div, + unsigned) +CTL_RO_NL_GEN(opt_thp, thp_mode_names[opt_thp], const char *) 
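+ /* + * The opt.* nodes generated above and below simply mirror their + * run-time option values; callers read them through the usual mallctl + * interface. A minimal sketch (for a bool-typed option): bool v; + * size_t sz = sizeof(v); mallctl("opt.tcache", &v, &sz, NULL, 0); + */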
+CTL_RO_NL_GEN(opt_lg_extent_max_active_fit, opt_lg_extent_max_active_fit, + size_t) +CTL_RO_NL_CGEN(config_prof, opt_prof, opt_prof, bool) +CTL_RO_NL_CGEN(config_prof, opt_prof_prefix, opt_prof_prefix, const char *) +CTL_RO_NL_CGEN(config_prof, opt_prof_active, opt_prof_active, bool) +CTL_RO_NL_CGEN(config_prof, opt_prof_thread_active_init, + opt_prof_thread_active_init, bool) +CTL_RO_NL_CGEN(config_prof, opt_prof_bt_max, opt_prof_bt_max, unsigned) +CTL_RO_NL_CGEN(config_prof, opt_lg_prof_sample, opt_lg_prof_sample, size_t) +CTL_RO_NL_CGEN(config_prof, opt_prof_accum, opt_prof_accum, bool) +CTL_RO_NL_CGEN(config_prof, opt_prof_pid_namespace, opt_prof_pid_namespace, + bool) +CTL_RO_NL_CGEN(config_prof, opt_lg_prof_interval, opt_lg_prof_interval, ssize_t) +CTL_RO_NL_CGEN(config_prof, opt_prof_gdump, opt_prof_gdump, bool) +CTL_RO_NL_CGEN(config_prof, opt_prof_final, opt_prof_final, bool) +CTL_RO_NL_CGEN(config_prof, opt_prof_leak, opt_prof_leak, bool) +CTL_RO_NL_CGEN(config_prof, opt_prof_leak_error, opt_prof_leak_error, bool) +CTL_RO_NL_CGEN(config_prof, opt_prof_recent_alloc_max, + opt_prof_recent_alloc_max, ssize_t) +CTL_RO_NL_CGEN(config_prof, opt_prof_stats, opt_prof_stats, bool) +CTL_RO_NL_CGEN(config_prof, opt_prof_sys_thread_name, opt_prof_sys_thread_name, + bool) +CTL_RO_NL_CGEN(config_prof, opt_prof_time_res, + prof_time_res_mode_names[opt_prof_time_res], const char *) +CTL_RO_NL_CGEN(config_uaf_detection, opt_lg_san_uaf_align, + opt_lg_san_uaf_align, ssize_t) +CTL_RO_NL_GEN(opt_zero_realloc, + zero_realloc_mode_names[opt_zero_realloc_action], const char *) + +/* malloc_conf options */ +CTL_RO_NL_CGEN(opt_malloc_conf_symlink, opt_malloc_conf_symlink, + opt_malloc_conf_symlink, const char *) +CTL_RO_NL_CGEN(opt_malloc_conf_env_var, opt_malloc_conf_env_var, + opt_malloc_conf_env_var, const char *) +CTL_RO_NL_CGEN(je_malloc_conf, opt_malloc_conf_global_var, je_malloc_conf, + const char *) +CTL_RO_NL_CGEN(je_malloc_conf_2_conf_harder, + opt_malloc_conf_global_var_2_conf_harder, je_malloc_conf_2_conf_harder, + const char *) + +/******************************************************************************/ + +static int +thread_arena_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + arena_t *oldarena; + unsigned newind, oldind; + + oldarena = arena_choose(tsd, NULL); + if (oldarena == NULL) { + return EAGAIN; + } + newind = oldind = arena_ind_get(oldarena); + WRITE(newind, unsigned); + READ(oldind, unsigned); + + if (newind != oldind) { + arena_t *newarena; + + if (newind >= narenas_total_get()) { + /* New arena index is out of range. */ + ret = EFAULT; + goto label_return; + } + + if (have_percpu_arena && + PERCPU_ARENA_ENABLED(opt_percpu_arena)) { + if (newind < percpu_arena_ind_limit(opt_percpu_arena)) { + /* + * If perCPU arena is enabled, thread_arena + * control is not allowed for the auto arena + * range. + */ + ret = EPERM; + goto label_return; + } + } + + /* Initialize arena if necessary. */ + newarena = arena_get(tsd_tsdn(tsd), newind, true); + if (newarena == NULL) { + ret = EAGAIN; + goto label_return; + } + /* Set new arena/tcache associations. 
*/ + arena_migrate(tsd, oldarena, newarena); + if (tcache_available(tsd)) { + tcache_arena_reassociate(tsd_tsdn(tsd), + tsd_tcache_slowp_get(tsd), tsd_tcachep_get(tsd), + newarena); + } + } + + ret = 0; +label_return: + return ret; +} + +CTL_RO_NL_GEN(thread_allocated, tsd_thread_allocated_get(tsd), uint64_t) +CTL_RO_NL_GEN(thread_allocatedp, tsd_thread_allocatedp_get(tsd), uint64_t *) + +static int +thread_tcache_ncached_max_read_sizeclass_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { + int ret; + size_t bin_size = 0; + + /* Read the bin size from newp. */ + if (newp == NULL) { + ret = EINVAL; + goto label_return; + } + WRITE(bin_size, size_t); + + cache_bin_sz_t ncached_max = 0; + if (tcache_bin_ncached_max_read(tsd, bin_size, &ncached_max)) { + ret = EINVAL; + goto label_return; + } + size_t result = (size_t)ncached_max; + READ(result, size_t); + ret = 0; +label_return: + return ret; +} + +static int +thread_tcache_ncached_max_write_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { + int ret; + WRITEONLY(); + if (newp != NULL) { + if (!tcache_available(tsd)) { + ret = ENOENT; + goto label_return; + } + char *settings = NULL; + WRITE(settings, char *); + if (settings == NULL) { + ret = EINVAL; + goto label_return; + } + /* Get the length of the setting string safely. */ + char *end = (char *)memchr(settings, '\0', + CTL_MULTI_SETTING_MAX_LEN); + if (end == NULL) { + ret = EINVAL; + goto label_return; + } + /* + * Exclude the last '\0' for len since it is not handled by + * multi_setting_parse_next. + */ + size_t len = (uintptr_t)end - (uintptr_t)settings; + if (len == 0) { + ret = 0; + goto label_return; + } + + if (tcache_bins_ncached_max_write(tsd, settings, len)) { + ret = EINVAL; + goto label_return; + } + } + + ret = 0; +label_return: + return ret; +} + +CTL_RO_NL_GEN(thread_deallocated, tsd_thread_deallocated_get(tsd), uint64_t) +CTL_RO_NL_GEN(thread_deallocatedp, tsd_thread_deallocatedp_get(tsd), uint64_t *) + +static int +thread_tcache_enabled_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { + int ret; + bool oldval; + + oldval = tcache_enabled_get(tsd); + if (newp != NULL) { + if (newlen != sizeof(bool)) { + ret = EINVAL; + goto label_return; + } + tcache_enabled_set(tsd, *(bool *)newp); + } + READ(oldval, bool); + + ret = 0; +label_return: + return ret; +} + +static int +thread_tcache_max_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { + int ret; + size_t oldval; + + /* pointer to tcache_t always exists even with tcache disabled. 
*/ + tcache_t *tcache = tsd_tcachep_get(tsd); + assert(tcache != NULL); + oldval = tcache_max_get(tcache->tcache_slow); + READ(oldval, size_t); + + if (newp != NULL) { + if (newlen != sizeof(size_t)) { + ret = EINVAL; + goto label_return; + } + size_t new_tcache_max = oldval; + WRITE(new_tcache_max, size_t); + if (new_tcache_max > TCACHE_MAXCLASS_LIMIT) { + new_tcache_max = TCACHE_MAXCLASS_LIMIT; + } + new_tcache_max = sz_s2u(new_tcache_max); + if (new_tcache_max != oldval) { + thread_tcache_max_set(tsd, new_tcache_max); + } + } + + ret = 0; +label_return: + return ret; +} + +static int +thread_tcache_flush_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { + int ret; + + if (!tcache_available(tsd)) { + ret = EFAULT; + goto label_return; + } + + NEITHER_READ_NOR_WRITE(); + + tcache_flush(tsd); + + ret = 0; +label_return: + return ret; +} + +static int +thread_peak_read_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { + int ret; + if (!config_stats) { + return ENOENT; + } + READONLY(); + peak_event_update(tsd); + uint64_t result = peak_event_max(tsd); + READ(result, uint64_t); + ret = 0; +label_return: + return ret; +} + +static int +thread_peak_reset_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { + int ret; + if (!config_stats) { + return ENOENT; + } + NEITHER_READ_NOR_WRITE(); + peak_event_zero(tsd); + ret = 0; +label_return: + return ret; +} + +static int +thread_prof_name_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { + int ret; + + if (!config_prof || !opt_prof) { + return ENOENT; + } + + READ_XOR_WRITE(); + + if (newp != NULL) { + const char *newval = *(const char **)newp; + if (newlen != sizeof(const char *) || newval == NULL) { + ret = EINVAL; + goto label_return; + } + + if ((ret = prof_thread_name_set(tsd, newval)) != 0) { + goto label_return; + } + } else { + const char *oldname = prof_thread_name_get(tsd); + READ(oldname, const char *); + } + + ret = 0; +label_return: + return ret; +} + +static int +thread_prof_active_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { + int ret; + bool oldval; + + if (!config_prof) { + return ENOENT; + } + + oldval = opt_prof ? prof_thread_active_get(tsd) : false; + if (newp != NULL) { + if (!opt_prof) { + ret = ENOENT; + goto label_return; + } + if (newlen != sizeof(bool)) { + ret = EINVAL; + goto label_return; + } + if (prof_thread_active_set(tsd, *(bool *)newp)) { + ret = EAGAIN; + goto label_return; + } + } + READ(oldval, bool); + + ret = 0; +label_return: + return ret; +} + +static int +thread_idle_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { + int ret; + + NEITHER_READ_NOR_WRITE(); + + if (tcache_available(tsd)) { + tcache_flush(tsd); + } + /* + * This heuristic is perhaps not the most well-considered. But it + * matches the only idling policy we have experience with in the status + * quo. Over time we should investigate more principled approaches. + */ + if (opt_narenas > ncpus * 2) { + arena_t *arena = arena_choose(tsd, NULL); + if (arena != NULL) { + arena_decay(tsd_tsdn(tsd), arena, false, true); + } + /* + * The missing arena case is not actually an error; a thread + * might be idle before it associates itself to one. This is + * unusual, but not wrong.
+ */ + } + + ret = 0; +label_return: + return ret; +} + +/******************************************************************************/ + +static int +tcache_create_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + unsigned tcache_ind; + + READONLY(); + VERIFY_READ(unsigned); + if (tcaches_create(tsd, b0get(), &tcache_ind)) { + ret = EFAULT; + goto label_return; + } + READ(tcache_ind, unsigned); + + ret = 0; +label_return: + return ret; +} + +static int +tcache_flush_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + unsigned tcache_ind; + + WRITEONLY(); + ASSURED_WRITE(tcache_ind, unsigned); + tcaches_flush(tsd, tcache_ind); + + ret = 0; +label_return: + return ret; +} + +static int +tcache_destroy_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + unsigned tcache_ind; + + WRITEONLY(); + ASSURED_WRITE(tcache_ind, unsigned); + tcaches_destroy(tsd, tcache_ind); + + ret = 0; +label_return: + return ret; +} + +/******************************************************************************/ + +static int +arena_i_initialized_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + tsdn_t *tsdn = tsd_tsdn(tsd); + unsigned arena_ind; + bool initialized; + + READONLY(); + MIB_UNSIGNED(arena_ind, 1); + + malloc_mutex_lock(tsdn, &ctl_mtx); + initialized = arenas_i(arena_ind)->initialized; + malloc_mutex_unlock(tsdn, &ctl_mtx); + + READ(initialized, bool); + + ret = 0; +label_return: + return ret; +} + +static void +arena_i_decay(tsdn_t *tsdn, unsigned arena_ind, bool all) { + malloc_mutex_lock(tsdn, &ctl_mtx); + { + unsigned narenas = ctl_arenas->narenas; + + /* + * Access via index narenas is deprecated, and scheduled for + * removal in 6.0.0. + */ + if (arena_ind == MALLCTL_ARENAS_ALL || arena_ind == narenas) { + unsigned i; + VARIABLE_ARRAY_UNSAFE(arena_t *, tarenas, narenas); + + for (i = 0; i < narenas; i++) { + tarenas[i] = arena_get(tsdn, i, false); + } + + /* + * No further need to hold ctl_mtx, since narenas and + * tarenas contain everything needed below. + */ + malloc_mutex_unlock(tsdn, &ctl_mtx); + + for (i = 0; i < narenas; i++) { + if (tarenas[i] != NULL) { + arena_decay(tsdn, tarenas[i], false, + all); + } + } + } else { + arena_t *tarena; + + assert(arena_ind < narenas); + + tarena = arena_get(tsdn, arena_ind, false); + + /* No further need to hold ctl_mtx. 
*/ + malloc_mutex_unlock(tsdn, &ctl_mtx); + + if (tarena != NULL) { + arena_decay(tsdn, tarena, false, all); + } + } + } +} + +static int +arena_i_decay_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, + size_t *oldlenp, void *newp, size_t newlen) { + int ret; + unsigned arena_ind; + + NEITHER_READ_NOR_WRITE(); + MIB_UNSIGNED(arena_ind, 1); + arena_i_decay(tsd_tsdn(tsd), arena_ind, false); + + ret = 0; +label_return: + return ret; +} + +static int +arena_i_purge_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, + size_t *oldlenp, void *newp, size_t newlen) { + int ret; + unsigned arena_ind; + + NEITHER_READ_NOR_WRITE(); + MIB_UNSIGNED(arena_ind, 1); + arena_i_decay(tsd_tsdn(tsd), arena_ind, true); + + ret = 0; +label_return: + return ret; +} + +static int +arena_i_reset_destroy_helper(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen, unsigned *arena_ind, + arena_t **arena) { + int ret; + + NEITHER_READ_NOR_WRITE(); + MIB_UNSIGNED(*arena_ind, 1); + + *arena = arena_get(tsd_tsdn(tsd), *arena_ind, false); + if (*arena == NULL || arena_is_auto(*arena)) { + ret = EFAULT; + goto label_return; + } + + ret = 0; +label_return: + return ret; +} + +static void +arena_reset_prepare_background_thread(tsd_t *tsd, unsigned arena_ind) { + /* Temporarily disable the background thread during arena reset. */ + if (have_background_thread) { + malloc_mutex_lock(tsd_tsdn(tsd), &background_thread_lock); + if (background_thread_enabled()) { + background_thread_info_t *info = + background_thread_info_get(arena_ind); + assert(info->state == background_thread_started); + malloc_mutex_lock(tsd_tsdn(tsd), &info->mtx); + info->state = background_thread_paused; + malloc_mutex_unlock(tsd_tsdn(tsd), &info->mtx); + } + } +} + +static void +arena_reset_finish_background_thread(tsd_t *tsd, unsigned arena_ind) { + if (have_background_thread) { + if (background_thread_enabled()) { + background_thread_info_t *info = + background_thread_info_get(arena_ind); + assert(info->state == background_thread_paused); + malloc_mutex_lock(tsd_tsdn(tsd), &info->mtx); + info->state = background_thread_started; + malloc_mutex_unlock(tsd_tsdn(tsd), &info->mtx); + } + malloc_mutex_unlock(tsd_tsdn(tsd), &background_thread_lock); + } +} + +static int +arena_i_reset_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, + size_t *oldlenp, void *newp, size_t newlen) { + int ret; + unsigned arena_ind; + arena_t *arena; + + ret = arena_i_reset_destroy_helper(tsd, mib, miblen, oldp, oldlenp, + newp, newlen, &arena_ind, &arena); + if (ret != 0) { + return ret; + } + + arena_reset_prepare_background_thread(tsd, arena_ind); + arena_reset(tsd, arena); + arena_reset_finish_background_thread(tsd, arena_ind); + + return ret; +} + +static int +arena_i_destroy_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, + size_t *oldlenp, void *newp, size_t newlen) { + int ret; + unsigned arena_ind; + arena_t *arena; + ctl_arena_t *ctl_darena, *ctl_arena; + + malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx); + + ret = arena_i_reset_destroy_helper(tsd, mib, miblen, oldp, oldlenp, + newp, newlen, &arena_ind, &arena); + if (ret != 0) { + goto label_return; + } + + if (arena_nthreads_get(arena, false) != 0 || arena_nthreads_get(arena, + true) != 0) { + ret = EFAULT; + goto label_return; + } + + arena_reset_prepare_background_thread(tsd, arena_ind); + /* Merge stats after resetting and purging arena. 
*/ + arena_reset(tsd, arena); + arena_decay(tsd_tsdn(tsd), arena, false, true); + ctl_darena = arenas_i(MALLCTL_ARENAS_DESTROYED); + ctl_darena->initialized = true; + ctl_arena_refresh(tsd_tsdn(tsd), arena, ctl_darena, arena_ind, true); + /* Destroy arena. */ + arena_destroy(tsd, arena); + ctl_arena = arenas_i(arena_ind); + ctl_arena->initialized = false; + /* Record arena index for later recycling via arenas.create. */ + ql_elm_new(ctl_arena, destroyed_link); + ql_tail_insert(&ctl_arenas->destroyed, ctl_arena, destroyed_link); + arena_reset_finish_background_thread(tsd, arena_ind); + + assert(ret == 0); +label_return: + malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx); + + return ret; +} + +static int +arena_i_dss_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, + size_t *oldlenp, void *newp, size_t newlen) { + int ret; + const char *dss = NULL; + unsigned arena_ind; + dss_prec_t dss_prec = dss_prec_limit; + + malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx); + WRITE(dss, const char *); + MIB_UNSIGNED(arena_ind, 1); + if (dss != NULL) { + int i; + bool match = false; + + for (i = 0; i < dss_prec_limit; i++) { + if (strcmp(dss_prec_names[i], dss) == 0) { + dss_prec = i; + match = true; + break; + } + } + + if (!match) { + ret = EINVAL; + goto label_return; + } + } + + /* + * Access via index narenas is deprecated, and scheduled for removal in + * 6.0.0. + */ + dss_prec_t dss_prec_old; + if (arena_ind == MALLCTL_ARENAS_ALL || arena_ind == + ctl_arenas->narenas) { + if (dss_prec != dss_prec_limit && + extent_dss_prec_set(dss_prec)) { + ret = EFAULT; + goto label_return; + } + dss_prec_old = extent_dss_prec_get(); + } else { + arena_t *arena = arena_get(tsd_tsdn(tsd), arena_ind, false); + if (arena == NULL || (dss_prec != dss_prec_limit && + arena_dss_prec_set(arena, dss_prec))) { + ret = EFAULT; + goto label_return; + } + dss_prec_old = arena_dss_prec_get(arena); + } + + dss = dss_prec_names[dss_prec_old]; + READ(dss, const char *); + + ret = 0; +label_return: + malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx); + return ret; +} + +static int +arena_i_oversize_threshold_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + + unsigned arena_ind; + MIB_UNSIGNED(arena_ind, 1); + + arena_t *arena = arena_get(tsd_tsdn(tsd), arena_ind, false); + if (arena == NULL) { + ret = EFAULT; + goto label_return; + } + + if (oldp != NULL && oldlenp != NULL) { + size_t oldval = atomic_load_zu( + &arena->pa_shard.pac.oversize_threshold, ATOMIC_RELAXED); + READ(oldval, size_t); + } + if (newp != NULL) { + if (newlen != sizeof(size_t)) { + ret = EINVAL; + goto label_return; + } + atomic_store_zu(&arena->pa_shard.pac.oversize_threshold, + *(size_t *)newp, ATOMIC_RELAXED); + } + ret = 0; +label_return: + return ret; +} + +static int +arena_i_decay_ms_ctl_impl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen, bool dirty) { + int ret; + unsigned arena_ind; + arena_t *arena; + + MIB_UNSIGNED(arena_ind, 1); + arena = arena_get(tsd_tsdn(tsd), arena_ind, false); + if (arena == NULL) { + ret = EFAULT; + goto label_return; + } + extent_state_t state = dirty ? 
extent_state_dirty : extent_state_muzzy; + + if (oldp != NULL && oldlenp != NULL) { + size_t oldval = arena_decay_ms_get(arena, state); + READ(oldval, ssize_t); + } + if (newp != NULL) { + if (newlen != sizeof(ssize_t)) { + ret = EINVAL; + goto label_return; + } + + if (arena_decay_ms_set(tsd_tsdn(tsd), arena, state, + *(ssize_t *)newp)) { + ret = EFAULT; + goto label_return; + } + } + + ret = 0; +label_return: + return ret; +} + +static int +arena_i_dirty_decay_ms_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + return arena_i_decay_ms_ctl_impl(tsd, mib, miblen, oldp, oldlenp, newp, + newlen, true); +} + +static int +arena_i_muzzy_decay_ms_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + return arena_i_decay_ms_ctl_impl(tsd, mib, miblen, oldp, oldlenp, newp, + newlen, false); +} + +static int +arena_i_extent_hooks_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + unsigned arena_ind; + arena_t *arena; + + malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx); + MIB_UNSIGNED(arena_ind, 1); + if (arena_ind < narenas_total_get()) { + extent_hooks_t *old_extent_hooks; + arena = arena_get(tsd_tsdn(tsd), arena_ind, false); + if (arena == NULL) { + if (arena_ind >= narenas_auto) { + ret = EFAULT; + goto label_return; + } + old_extent_hooks = + (extent_hooks_t *)&ehooks_default_extent_hooks; + READ(old_extent_hooks, extent_hooks_t *); + if (newp != NULL) { + /* Initialize a new arena as a side effect. */ + extent_hooks_t *new_extent_hooks + JEMALLOC_CC_SILENCE_INIT(NULL); + WRITE(new_extent_hooks, extent_hooks_t *); + arena_config_t config = arena_config_default; + config.extent_hooks = new_extent_hooks; + + arena = arena_init(tsd_tsdn(tsd), arena_ind, + &config); + if (arena == NULL) { + ret = EFAULT; + goto label_return; + } + } + } else { + if (newp != NULL) { + extent_hooks_t *new_extent_hooks + JEMALLOC_CC_SILENCE_INIT(NULL); + WRITE(new_extent_hooks, extent_hooks_t *); + old_extent_hooks = arena_set_extent_hooks(tsd, + arena, new_extent_hooks); + READ(old_extent_hooks, extent_hooks_t *); + } else { + old_extent_hooks = + ehooks_get_extent_hooks_ptr( + arena_get_ehooks(arena)); + READ(old_extent_hooks, extent_hooks_t *); + } + } + } else { + ret = EFAULT; + goto label_return; + } + ret = 0; +label_return: + malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx); + return ret; +} + +static int +arena_i_retain_grow_limit_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { + int ret; + unsigned arena_ind; + arena_t *arena; + + if (!opt_retain) { + /* Only relevant when retain is enabled. */ + return ENOENT; + } + + malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx); + MIB_UNSIGNED(arena_ind, 1); + if (arena_ind < narenas_total_get() && (arena = + arena_get(tsd_tsdn(tsd), arena_ind, false)) != NULL) { + size_t old_limit, new_limit; + if (newp != NULL) { + WRITE(new_limit, size_t); + } + bool err = arena_retain_grow_limit_get_set(tsd, arena, + &old_limit, newp != NULL ? &new_limit : NULL); + if (!err) { + READ(old_limit, size_t); + ret = 0; + } else { + ret = EFAULT; + } + } else { + ret = EFAULT; + } +label_return: + malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx); + return ret; +} + +/* + * When writing, newp should point to a char array storing the name to be set. + * A name longer than ARENA_NAME_LEN will be arbitrarily cut. 
When reading, + * oldp should point to a char array whose length is no shorter than + * ARENA_NAME_LEN or the length of the name when it was set. + */ +static int +arena_i_name_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + unsigned arena_ind; + char *name; + + malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx); + MIB_UNSIGNED(arena_ind, 1); + if (arena_ind == MALLCTL_ARENAS_ALL || arena_ind >= + ctl_arenas->narenas) { + ret = EINVAL; + goto label_return; + } + arena_t *arena = arena_get(tsd_tsdn(tsd), arena_ind, false); + if (arena == NULL) { + ret = EFAULT; + goto label_return; + } + + if (oldp != NULL && oldlenp != NULL) { + /* + * Read the arena name. When reading, the input oldp should + * point to an array with a length no shorter than + * ARENA_NAME_LEN or the length when it was set. + */ + if (*oldlenp != sizeof(char *)) { + ret = EINVAL; + goto label_return; + } + name = *(char **)oldp; + arena_name_get(arena, name); + } + + if (newp != NULL) { + /* Write the arena name. */ + WRITE(name, char *); + if (name == NULL) { + ret = EINVAL; + goto label_return; + } + arena_name_set(arena, name); + } + ret = 0; +label_return: + malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx); + return ret; +} + +static const ctl_named_node_t * +arena_i_index(tsdn_t *tsdn, const size_t *mib, size_t miblen, + size_t i) { + const ctl_named_node_t *ret; + + malloc_mutex_lock(tsdn, &ctl_mtx); + switch (i) { + case MALLCTL_ARENAS_ALL: + case MALLCTL_ARENAS_DESTROYED: + break; + default: + if (i > ctl_arenas->narenas) { + ret = NULL; + goto label_return; + } + break; + } + + ret = super_arena_i_node; +label_return: + malloc_mutex_unlock(tsdn, &ctl_mtx); + return ret; +} + +/******************************************************************************/ + +static int +arenas_narenas_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + unsigned narenas; + + malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx); + READONLY(); + narenas = ctl_arenas->narenas; + READ(narenas, unsigned); + + ret = 0; +label_return: + malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx); + return ret; +} + +static int +arenas_decay_ms_ctl_impl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen, bool dirty) { + int ret; + + if (oldp != NULL && oldlenp != NULL) { + size_t oldval = (dirty ? arena_dirty_decay_ms_default_get() : + arena_muzzy_decay_ms_default_get()); + READ(oldval, ssize_t); + } + if (newp != NULL) { + if (newlen != sizeof(ssize_t)) { + ret = EINVAL; + goto label_return; + } + if (dirty ? 
arena_dirty_decay_ms_default_set(*(ssize_t *)newp) + : arena_muzzy_decay_ms_default_set(*(ssize_t *)newp)) { + ret = EFAULT; + goto label_return; + } + } + + ret = 0; +label_return: + return ret; +} + +static int +arenas_dirty_decay_ms_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + return arenas_decay_ms_ctl_impl(tsd, mib, miblen, oldp, oldlenp, newp, + newlen, true); +} + +static int +arenas_muzzy_decay_ms_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + return arenas_decay_ms_ctl_impl(tsd, mib, miblen, oldp, oldlenp, newp, + newlen, false); +} + +CTL_RO_NL_GEN(arenas_quantum, QUANTUM, size_t) +CTL_RO_NL_GEN(arenas_page, PAGE, size_t) +CTL_RO_NL_GEN(arenas_hugepage, HUGEPAGE, size_t) +CTL_RO_NL_GEN(arenas_tcache_max, global_do_not_change_tcache_maxclass, size_t) +CTL_RO_NL_GEN(arenas_nbins, SC_NBINS, unsigned) +CTL_RO_NL_GEN(arenas_nhbins, global_do_not_change_tcache_nbins, unsigned) +CTL_RO_NL_GEN(arenas_bin_i_size, bin_infos[mib[2]].reg_size, size_t) +CTL_RO_NL_GEN(arenas_bin_i_nregs, bin_infos[mib[2]].nregs, uint32_t) +CTL_RO_NL_GEN(arenas_bin_i_slab_size, bin_infos[mib[2]].slab_size, size_t) +CTL_RO_NL_GEN(arenas_bin_i_nshards, bin_infos[mib[2]].n_shards, uint32_t) +static const ctl_named_node_t * +arenas_bin_i_index(tsdn_t *tsdn, const size_t *mib, + size_t miblen, size_t i) { + if (i > SC_NBINS) { + return NULL; + } + return super_arenas_bin_i_node; +} + +CTL_RO_NL_GEN(arenas_nlextents, SC_NSIZES - SC_NBINS, unsigned) +CTL_RO_NL_GEN(arenas_lextent_i_size, sz_index2size(SC_NBINS+(szind_t)mib[2]), + size_t) +static const ctl_named_node_t * +arenas_lextent_i_index(tsdn_t *tsdn, const size_t *mib, + size_t miblen, size_t i) { + if (i > SC_NSIZES - SC_NBINS) { + return NULL; + } + return super_arenas_lextent_i_node; +} + +static int +arenas_create_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + unsigned arena_ind; + + malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx); + + VERIFY_READ(unsigned); + arena_config_t config = arena_config_default; + WRITE(config.extent_hooks, extent_hooks_t *); + if ((arena_ind = ctl_arena_init(tsd, &config)) == UINT_MAX) { + ret = EAGAIN; + goto label_return; + } + READ(arena_ind, unsigned); + + ret = 0; +label_return: + malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx); + return ret; +} + +static int +experimental_arenas_create_ext_ctl(tsd_t *tsd, + const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + unsigned arena_ind; + + malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx); + + arena_config_t config = arena_config_default; + VERIFY_READ(unsigned); + WRITE(config, arena_config_t); + + if ((arena_ind = ctl_arena_init(tsd, &config)) == UINT_MAX) { + ret = EAGAIN; + goto label_return; + } + READ(arena_ind, unsigned); + ret = 0; +label_return: + malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx); + return ret; +} + +static int +arenas_lookup_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { + int ret; + unsigned arena_ind; + void *ptr; + emap_full_alloc_ctx_t alloc_ctx; + bool ptr_not_present; + arena_t *arena; + + ptr = NULL; + ret = EINVAL; + malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx); + WRITE(ptr, void *); + ptr_not_present = emap_full_alloc_ctx_try_lookup(tsd_tsdn(tsd), &arena_emap_global, ptr, + &alloc_ctx); + if (ptr_not_present || alloc_ctx.edata == NULL) { + goto 
label_return; + } + + arena = arena_get_from_edata(alloc_ctx.edata); + if (arena == NULL) { + goto label_return; + } + + arena_ind = arena_ind_get(arena); + READ(arena_ind, unsigned); + + ret = 0; +label_return: + malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx); + return ret; +} + +/******************************************************************************/ + +static int +prof_thread_active_init_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { + int ret; + bool oldval; + + if (!config_prof) { + return ENOENT; + } + + if (newp != NULL) { + if (!opt_prof) { + ret = ENOENT; + goto label_return; + } + if (newlen != sizeof(bool)) { + ret = EINVAL; + goto label_return; + } + oldval = prof_thread_active_init_set(tsd_tsdn(tsd), + *(bool *)newp); + } else { + oldval = opt_prof ? prof_thread_active_init_get(tsd_tsdn(tsd)) : + false; + } + READ(oldval, bool); + + ret = 0; +label_return: + return ret; +} + +static int +prof_active_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + bool oldval; + + if (!config_prof) { + ret = ENOENT; + goto label_return; + } + + if (newp != NULL) { + if (newlen != sizeof(bool)) { + ret = EINVAL; + goto label_return; + } + bool val = *(bool *)newp; + if (!opt_prof) { + if (val) { + ret = ENOENT; + goto label_return; + } else { + /* No change needed (already off). */ + oldval = false; + } + } else { + oldval = prof_active_set(tsd_tsdn(tsd), val); + } + } else { + oldval = opt_prof ? prof_active_get(tsd_tsdn(tsd)) : false; + } + READ(oldval, bool); + + ret = 0; +label_return: + return ret; +} + +static int +prof_dump_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + const char *filename = NULL; + + if (!config_prof || !opt_prof) { + return ENOENT; + } + + WRITEONLY(); + WRITE(filename, const char *); + + if (prof_mdump(tsd, filename)) { + ret = EFAULT; + goto label_return; + } + + ret = 0; +label_return: + return ret; +} + +static int +prof_gdump_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + bool oldval; + + if (!config_prof) { + return ENOENT; + } + + if (newp != NULL) { + if (!opt_prof) { + ret = ENOENT; + goto label_return; + } + if (newlen != sizeof(bool)) { + ret = EINVAL; + goto label_return; + } + oldval = prof_gdump_set(tsd_tsdn(tsd), *(bool *)newp); + } else { + oldval = opt_prof ? prof_gdump_get(tsd_tsdn(tsd)) : false; + } + READ(oldval, bool); + + ret = 0; +label_return: + return ret; +} + +static int +prof_prefix_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + const char *prefix = NULL; + + if (!config_prof || !opt_prof) { + return ENOENT; + } + + malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx); + WRITEONLY(); + WRITE(prefix, const char *); + + ret = prof_prefix_set(tsd_tsdn(tsd), prefix) ? 
EFAULT : 0; +label_return: + malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx); + return ret; +} + +static int +prof_reset_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + size_t lg_sample = lg_prof_sample; + + if (!config_prof || !opt_prof) { + return ENOENT; + } + + WRITEONLY(); + WRITE(lg_sample, size_t); + if (lg_sample >= (sizeof(uint64_t) << 3)) { + lg_sample = (sizeof(uint64_t) << 3) - 1; + } + + prof_reset(tsd, lg_sample); + + ret = 0; +label_return: + return ret; +} + +CTL_RO_NL_CGEN(config_prof, prof_interval, prof_interval, uint64_t) +CTL_RO_NL_CGEN(config_prof, lg_prof_sample, lg_prof_sample, size_t) + +static int +prof_log_start_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, + size_t *oldlenp, void *newp, size_t newlen) { + int ret; + + const char *filename = NULL; + + if (!config_prof || !opt_prof) { + return ENOENT; + } + + WRITEONLY(); + WRITE(filename, const char *); + + if (prof_log_start(tsd_tsdn(tsd), filename)) { + ret = EFAULT; + goto label_return; + } + + ret = 0; +label_return: + return ret; +} + +static int +prof_log_stop_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, + size_t *oldlenp, void *newp, size_t newlen) { + if (!config_prof || !opt_prof) { + return ENOENT; + } + + if (prof_log_stop(tsd_tsdn(tsd))) { + return EFAULT; + } + + return 0; +} + +static int +experimental_hooks_prof_backtrace_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + + if (oldp == NULL && newp == NULL) { + ret = EINVAL; + goto label_return; + } + if (oldp != NULL) { + prof_backtrace_hook_t old_hook = + prof_backtrace_hook_get(); + READ(old_hook, prof_backtrace_hook_t); + } + if (newp != NULL) { + if (!opt_prof) { + ret = ENOENT; + goto label_return; + } + prof_backtrace_hook_t new_hook JEMALLOC_CC_SILENCE_INIT(NULL); + WRITE(new_hook, prof_backtrace_hook_t); + if (new_hook == NULL) { + ret = EINVAL; + goto label_return; + } + prof_backtrace_hook_set(new_hook); + } + ret = 0; +label_return: + return ret; +} + +static int +experimental_hooks_prof_dump_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + + if (oldp == NULL && newp == NULL) { + ret = EINVAL; + goto label_return; + } + if (oldp != NULL) { + prof_dump_hook_t old_hook = + prof_dump_hook_get(); + READ(old_hook, prof_dump_hook_t); + } + if (newp != NULL) { + if (!opt_prof) { + ret = ENOENT; + goto label_return; + } + prof_dump_hook_t new_hook JEMALLOC_CC_SILENCE_INIT(NULL); + WRITE(new_hook, prof_dump_hook_t); + prof_dump_hook_set(new_hook); + } + ret = 0; +label_return: + return ret; +} + +static int +experimental_hooks_prof_sample_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + + if (oldp == NULL && newp == NULL) { + ret = EINVAL; + goto label_return; + } + if (oldp != NULL) { + prof_sample_hook_t old_hook = + prof_sample_hook_get(); + READ(old_hook, prof_sample_hook_t); + } + if (newp != NULL) { + if (!opt_prof) { + ret = ENOENT; + goto label_return; + } + prof_sample_hook_t new_hook JEMALLOC_CC_SILENCE_INIT(NULL); + WRITE(new_hook, prof_sample_hook_t); + prof_sample_hook_set(new_hook); + } + ret = 0; +label_return: + return ret; +} + +static int +experimental_hooks_prof_sample_free_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + + if (oldp == 
NULL && newp == NULL) { + ret = EINVAL; + goto label_return; + } + if (oldp != NULL) { + prof_sample_free_hook_t old_hook = + prof_sample_free_hook_get(); + READ(old_hook, prof_sample_free_hook_t); + } + if (newp != NULL) { + if (!opt_prof) { + ret = ENOENT; + goto label_return; + } + prof_sample_free_hook_t new_hook JEMALLOC_CC_SILENCE_INIT(NULL); + WRITE(new_hook, prof_sample_free_hook_t); + prof_sample_free_hook_set(new_hook); + } + ret = 0; +label_return: + return ret; +} + +/* For integration test purpose only. No plan to move out of experimental. */ +static int +experimental_hooks_safety_check_abort_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + + WRITEONLY(); + if (newp != NULL) { + if (newlen != sizeof(safety_check_abort_hook_t)) { + ret = EINVAL; + goto label_return; + } + safety_check_abort_hook_t hook JEMALLOC_CC_SILENCE_INIT(NULL); + WRITE(hook, safety_check_abort_hook_t); + safety_check_set_abort(hook); + } + ret = 0; +label_return: + return ret; +} + +/******************************************************************************/ + +CTL_RO_CGEN(config_stats, stats_allocated, ctl_stats->allocated, size_t) +CTL_RO_CGEN(config_stats, stats_active, ctl_stats->active, size_t) +CTL_RO_CGEN(config_stats, stats_metadata, ctl_stats->metadata, size_t) +CTL_RO_CGEN(config_stats, stats_metadata_edata, ctl_stats->metadata_edata, + size_t) +CTL_RO_CGEN(config_stats, stats_metadata_rtree, ctl_stats->metadata_rtree, + size_t) +CTL_RO_CGEN(config_stats, stats_metadata_thp, ctl_stats->metadata_thp, size_t) +CTL_RO_CGEN(config_stats, stats_resident, ctl_stats->resident, size_t) +CTL_RO_CGEN(config_stats, stats_mapped, ctl_stats->mapped, size_t) +CTL_RO_CGEN(config_stats, stats_retained, ctl_stats->retained, size_t) + +CTL_RO_CGEN(config_stats, stats_background_thread_num_threads, + ctl_stats->background_thread.num_threads, size_t) +CTL_RO_CGEN(config_stats, stats_background_thread_num_runs, + ctl_stats->background_thread.num_runs, uint64_t) +CTL_RO_CGEN(config_stats, stats_background_thread_run_interval, + nstime_ns(&ctl_stats->background_thread.run_interval), uint64_t) + +CTL_RO_CGEN(config_stats, stats_zero_reallocs, + atomic_load_zu(&zero_realloc_count, ATOMIC_RELAXED), size_t) + +CTL_RO_GEN(stats_arenas_i_dss, arenas_i(mib[2])->dss, const char *) +CTL_RO_GEN(stats_arenas_i_dirty_decay_ms, arenas_i(mib[2])->dirty_decay_ms, + ssize_t) +CTL_RO_GEN(stats_arenas_i_muzzy_decay_ms, arenas_i(mib[2])->muzzy_decay_ms, + ssize_t) +CTL_RO_GEN(stats_arenas_i_nthreads, arenas_i(mib[2])->nthreads, unsigned) +CTL_RO_GEN(stats_arenas_i_uptime, + nstime_ns(&arenas_i(mib[2])->astats->astats.uptime), uint64_t) +CTL_RO_GEN(stats_arenas_i_pactive, arenas_i(mib[2])->pactive, size_t) +CTL_RO_GEN(stats_arenas_i_pdirty, arenas_i(mib[2])->pdirty, size_t) +CTL_RO_GEN(stats_arenas_i_pmuzzy, arenas_i(mib[2])->pmuzzy, size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_mapped, + arenas_i(mib[2])->astats->astats.mapped, size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_retained, + arenas_i(mib[2])->astats->astats.pa_shard_stats.pac_stats.retained, size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_extent_avail, + arenas_i(mib[2])->astats->astats.pa_shard_stats.edata_avail, size_t) + +CTL_RO_CGEN(config_stats, stats_arenas_i_dirty_npurge, + locked_read_u64_unsynchronized( + &arenas_i(mib[2])->astats->astats.pa_shard_stats.pac_stats.decay_dirty.npurge), + uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_dirty_nmadvise, + 
locked_read_u64_unsynchronized( + &arenas_i(mib[2])->astats->astats.pa_shard_stats.pac_stats.decay_dirty.nmadvise), + uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_dirty_purged, + locked_read_u64_unsynchronized( + &arenas_i(mib[2])->astats->astats.pa_shard_stats.pac_stats.decay_dirty.purged), + uint64_t) + +CTL_RO_CGEN(config_stats, stats_arenas_i_muzzy_npurge, + locked_read_u64_unsynchronized( + &arenas_i(mib[2])->astats->astats.pa_shard_stats.pac_stats.decay_muzzy.npurge), + uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_muzzy_nmadvise, + locked_read_u64_unsynchronized( + &arenas_i(mib[2])->astats->astats.pa_shard_stats.pac_stats.decay_muzzy.nmadvise), + uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_muzzy_purged, + locked_read_u64_unsynchronized( + &arenas_i(mib[2])->astats->astats.pa_shard_stats.pac_stats.decay_muzzy.purged), + uint64_t) + +CTL_RO_CGEN(config_stats, stats_arenas_i_base, + arenas_i(mib[2])->astats->astats.base, + size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_internal, + atomic_load_zu(&arenas_i(mib[2])->astats->astats.internal, ATOMIC_RELAXED), + size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_metadata_edata, + arenas_i(mib[2])->astats->astats.metadata_edata, size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_metadata_rtree, + arenas_i(mib[2])->astats->astats.metadata_rtree, size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_metadata_thp, + arenas_i(mib[2])->astats->astats.metadata_thp, size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_tcache_bytes, + arenas_i(mib[2])->astats->astats.tcache_bytes, size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_tcache_stashed_bytes, + arenas_i(mib[2])->astats->astats.tcache_stashed_bytes, size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_resident, + arenas_i(mib[2])->astats->astats.resident, + size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_abandoned_vm, + atomic_load_zu( + &arenas_i(mib[2])->astats->astats.pa_shard_stats.pac_stats.abandoned_vm, + ATOMIC_RELAXED), size_t) + +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_sec_bytes, + arenas_i(mib[2])->astats->secstats.bytes, size_t) + +CTL_RO_CGEN(config_stats, stats_arenas_i_small_allocated, + arenas_i(mib[2])->astats->allocated_small, size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_small_nmalloc, + arenas_i(mib[2])->astats->nmalloc_small, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_small_ndalloc, + arenas_i(mib[2])->astats->ndalloc_small, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_small_nrequests, + arenas_i(mib[2])->astats->nrequests_small, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_small_nfills, + arenas_i(mib[2])->astats->nfills_small, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_small_nflushes, + arenas_i(mib[2])->astats->nflushes_small, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_large_allocated, + arenas_i(mib[2])->astats->astats.allocated_large, size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_large_nmalloc, + arenas_i(mib[2])->astats->astats.nmalloc_large, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_large_ndalloc, + arenas_i(mib[2])->astats->astats.ndalloc_large, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_large_nrequests, + arenas_i(mib[2])->astats->astats.nrequests_large, uint64_t) +/* + * Note: "nmalloc_large" here instead of "nfills" in the read. This is + * intentional (large has no batch fill). 
+ */ +CTL_RO_CGEN(config_stats, stats_arenas_i_large_nfills, + arenas_i(mib[2])->astats->astats.nmalloc_large, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_large_nflushes, + arenas_i(mib[2])->astats->astats.nflushes_large, uint64_t) + +/* Lock profiling related APIs below. */ +#define RO_MUTEX_CTL_GEN(n, l) \ +CTL_RO_CGEN(config_stats, stats_##n##_num_ops, \ + l.n_lock_ops, uint64_t) \ +CTL_RO_CGEN(config_stats, stats_##n##_num_wait, \ + l.n_wait_times, uint64_t) \ +CTL_RO_CGEN(config_stats, stats_##n##_num_spin_acq, \ + l.n_spin_acquired, uint64_t) \ +CTL_RO_CGEN(config_stats, stats_##n##_num_owner_switch, \ + l.n_owner_switches, uint64_t) \ +CTL_RO_CGEN(config_stats, stats_##n##_total_wait_time, \ + nstime_ns(&l.tot_wait_time), uint64_t) \ +CTL_RO_CGEN(config_stats, stats_##n##_max_wait_time, \ + nstime_ns(&l.max_wait_time), uint64_t) \ +CTL_RO_CGEN(config_stats, stats_##n##_max_num_thds, \ + l.max_n_thds, uint32_t) + +/* Global mutexes. */ +#define OP(mtx) \ + RO_MUTEX_CTL_GEN(mutexes_##mtx, \ + ctl_stats->mutex_prof_data[global_prof_mutex_##mtx]) +MUTEX_PROF_GLOBAL_MUTEXES +#undef OP + +/* Per arena mutexes */ +#define OP(mtx) RO_MUTEX_CTL_GEN(arenas_i_mutexes_##mtx, \ + arenas_i(mib[2])->astats->astats.mutex_prof_data[arena_prof_mutex_##mtx]) +MUTEX_PROF_ARENA_MUTEXES +#undef OP + +/* tcache bin mutex */ +RO_MUTEX_CTL_GEN(arenas_i_bins_j_mutex, + arenas_i(mib[2])->astats->bstats[mib[4]].mutex_data) +#undef RO_MUTEX_CTL_GEN + +/* Resets all mutex stats, including global, arena and bin mutexes. */ +static int +stats_mutexes_reset_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, + void *newp, size_t newlen) { + if (!config_stats) { + return ENOENT; + } + + tsdn_t *tsdn = tsd_tsdn(tsd); + +#define MUTEX_PROF_RESET(mtx) \ + malloc_mutex_lock(tsdn, &mtx); \ + malloc_mutex_prof_data_reset(tsdn, &mtx); \ + malloc_mutex_unlock(tsdn, &mtx); + + /* Global mutexes: ctl and prof. */ + MUTEX_PROF_RESET(ctl_mtx); + if (have_background_thread) { + MUTEX_PROF_RESET(background_thread_lock); + } + if (config_prof && opt_prof) { + MUTEX_PROF_RESET(bt2gctx_mtx); + MUTEX_PROF_RESET(tdatas_mtx); + MUTEX_PROF_RESET(prof_dump_mtx); + MUTEX_PROF_RESET(prof_recent_alloc_mtx); + MUTEX_PROF_RESET(prof_recent_dump_mtx); + MUTEX_PROF_RESET(prof_stats_mtx); + } + + /* Per arena mutexes. 
*/ + unsigned n = narenas_total_get(); + + for (unsigned i = 0; i < n; i++) { + arena_t *arena = arena_get(tsdn, i, false); + if (!arena) { + continue; + } + MUTEX_PROF_RESET(arena->large_mtx); + MUTEX_PROF_RESET(arena->pa_shard.edata_cache.mtx); + MUTEX_PROF_RESET(arena->pa_shard.pac.ecache_dirty.mtx); + MUTEX_PROF_RESET(arena->pa_shard.pac.ecache_muzzy.mtx); + MUTEX_PROF_RESET(arena->pa_shard.pac.ecache_retained.mtx); + MUTEX_PROF_RESET(arena->pa_shard.pac.decay_dirty.mtx); + MUTEX_PROF_RESET(arena->pa_shard.pac.decay_muzzy.mtx); + MUTEX_PROF_RESET(arena->tcache_ql_mtx); + MUTEX_PROF_RESET(arena->base->mtx); + + for (szind_t j = 0; j < SC_NBINS; j++) { + for (unsigned k = 0; k < bin_infos[j].n_shards; k++) { + bin_t *bin = arena_get_bin(arena, j, k); + MUTEX_PROF_RESET(bin->lock); + } + } + } +#undef MUTEX_PROF_RESET + return 0; +} + +CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_nmalloc, + arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.nmalloc, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_ndalloc, + arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.ndalloc, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_nrequests, + arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.nrequests, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_curregs, + arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.curregs, size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_nfills, + arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.nfills, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_nflushes, + arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.nflushes, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_nslabs, + arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.nslabs, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_nreslabs, + arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.reslabs, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_curslabs, + arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.curslabs, size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_nonfull_slabs, + arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.nonfull_slabs, size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_batch_pops, + arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.batch_pops, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_batch_failed_pushes, + arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.batch_failed_pushes, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_batch_pushes, + arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.batch_pushes, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_batch_pushed_elems, + arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.batch_pushed_elems, uint64_t) + +static const ctl_named_node_t * +stats_arenas_i_bins_j_index(tsdn_t *tsdn, const size_t *mib, + size_t miblen, size_t j) { + if (j > SC_NBINS) { + return NULL; + } + return super_stats_arenas_i_bins_j_node; +} + +CTL_RO_CGEN(config_stats, stats_arenas_i_lextents_j_nmalloc, + locked_read_u64_unsynchronized( + &arenas_i(mib[2])->astats->lstats[mib[4]].nmalloc), uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_lextents_j_ndalloc, + locked_read_u64_unsynchronized( + &arenas_i(mib[2])->astats->lstats[mib[4]].ndalloc), uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_lextents_j_nrequests, + locked_read_u64_unsynchronized( + &arenas_i(mib[2])->astats->lstats[mib[4]].nrequests), uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_lextents_j_curlextents, + 
arenas_i(mib[2])->astats->lstats[mib[4]].curlextents, size_t) + +static const ctl_named_node_t * +stats_arenas_i_lextents_j_index(tsdn_t *tsdn, const size_t *mib, + size_t miblen, size_t j) { + if (j > SC_NSIZES - SC_NBINS) { + return NULL; + } + return super_stats_arenas_i_lextents_j_node; +} + +CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_ndirty, + arenas_i(mib[2])->astats->estats[mib[4]].ndirty, size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_nmuzzy, + arenas_i(mib[2])->astats->estats[mib[4]].nmuzzy, size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_nretained, + arenas_i(mib[2])->astats->estats[mib[4]].nretained, size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_dirty_bytes, + arenas_i(mib[2])->astats->estats[mib[4]].dirty_bytes, size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_muzzy_bytes, + arenas_i(mib[2])->astats->estats[mib[4]].muzzy_bytes, size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_retained_bytes, + arenas_i(mib[2])->astats->estats[mib[4]].retained_bytes, size_t); + +static const ctl_named_node_t * +stats_arenas_i_extents_j_index(tsdn_t *tsdn, const size_t *mib, + size_t miblen, size_t j) { + if (j >= SC_NPSIZES) { + return NULL; + } + return super_stats_arenas_i_extents_j_node; +} + +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_npurge_passes, + arenas_i(mib[2])->astats->hpastats.nonderived_stats.npurge_passes, uint64_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_npurges, + arenas_i(mib[2])->astats->hpastats.nonderived_stats.npurges, uint64_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_nhugifies, + arenas_i(mib[2])->astats->hpastats.nonderived_stats.nhugifies, uint64_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_ndehugifies, + arenas_i(mib[2])->astats->hpastats.nonderived_stats.ndehugifies, uint64_t); + +/* Full, nonhuge */ +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_full_slabs_npageslabs_nonhuge, + arenas_i(mib[2])->astats->hpastats.psset_stats.full_slabs[0].npageslabs, + size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_full_slabs_nactive_nonhuge, + arenas_i(mib[2])->astats->hpastats.psset_stats.full_slabs[0].nactive, size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_full_slabs_ndirty_nonhuge, + arenas_i(mib[2])->astats->hpastats.psset_stats.full_slabs[0].ndirty, size_t); + +/* Full, huge */ +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_full_slabs_npageslabs_huge, + arenas_i(mib[2])->astats->hpastats.psset_stats.full_slabs[1].npageslabs, + size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_full_slabs_nactive_huge, + arenas_i(mib[2])->astats->hpastats.psset_stats.full_slabs[1].nactive, size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_full_slabs_ndirty_huge, + arenas_i(mib[2])->astats->hpastats.psset_stats.full_slabs[1].ndirty, size_t); + +/* Empty, nonhuge */ +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_empty_slabs_npageslabs_nonhuge, + arenas_i(mib[2])->astats->hpastats.psset_stats.empty_slabs[0].npageslabs, + size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_empty_slabs_nactive_nonhuge, + arenas_i(mib[2])->astats->hpastats.psset_stats.empty_slabs[0].nactive, size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_empty_slabs_ndirty_nonhuge, + arenas_i(mib[2])->astats->hpastats.psset_stats.empty_slabs[0].ndirty, size_t); + +/* Empty, huge */ +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_empty_slabs_npageslabs_huge, + 
arenas_i(mib[2])->astats->hpastats.psset_stats.empty_slabs[1].npageslabs, + size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_empty_slabs_nactive_huge, + arenas_i(mib[2])->astats->hpastats.psset_stats.empty_slabs[1].nactive, size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_empty_slabs_ndirty_huge, + arenas_i(mib[2])->astats->hpastats.psset_stats.empty_slabs[1].ndirty, size_t); + +/* Nonfull, nonhuge */ +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_nonfull_slabs_j_npageslabs_nonhuge, + arenas_i(mib[2])->astats->hpastats.psset_stats.nonfull_slabs[mib[5]][0].npageslabs, + size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_nonfull_slabs_j_nactive_nonhuge, + arenas_i(mib[2])->astats->hpastats.psset_stats.nonfull_slabs[mib[5]][0].nactive, + size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_nonfull_slabs_j_ndirty_nonhuge, + arenas_i(mib[2])->astats->hpastats.psset_stats.nonfull_slabs[mib[5]][0].ndirty, + size_t); + +/* Nonfull, huge */ +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_nonfull_slabs_j_npageslabs_huge, + arenas_i(mib[2])->astats->hpastats.psset_stats.nonfull_slabs[mib[5]][1].npageslabs, + size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_nonfull_slabs_j_nactive_huge, + arenas_i(mib[2])->astats->hpastats.psset_stats.nonfull_slabs[mib[5]][1].nactive, + size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_nonfull_slabs_j_ndirty_huge, + arenas_i(mib[2])->astats->hpastats.psset_stats.nonfull_slabs[mib[5]][1].ndirty, + size_t); + +static const ctl_named_node_t * +stats_arenas_i_hpa_shard_nonfull_slabs_j_index(tsdn_t *tsdn, const size_t *mib, + size_t miblen, size_t j) { + if (j >= PSSET_NPSIZES) { + return NULL; + } + return super_stats_arenas_i_hpa_shard_nonfull_slabs_j_node; +} + +static bool +ctl_arenas_i_verify(size_t i) { + size_t a = arenas_i2a_impl(i, true, true); + if (a == UINT_MAX || !ctl_arenas->arenas[a]->initialized) { + return true; + } + + return false; +} + +static const ctl_named_node_t * +stats_arenas_i_index(tsdn_t *tsdn, const size_t *mib, + size_t miblen, size_t i) { + const ctl_named_node_t *ret; + + malloc_mutex_lock(tsdn, &ctl_mtx); + if (ctl_arenas_i_verify(i)) { + ret = NULL; + goto label_return; + } + + ret = super_stats_arenas_i_node; +label_return: + malloc_mutex_unlock(tsdn, &ctl_mtx); + return ret; +} + +static int +experimental_hooks_install_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + if (oldp == NULL || oldlenp == NULL|| newp == NULL) { + ret = EINVAL; + goto label_return; + } + /* + * Note: this is a *private* struct. This is an experimental interface; + * forcing the user to know the jemalloc internals well enough to + * extract the ABI hopefully ensures nobody gets too comfortable with + * this API, which can change at a moment's notice. 
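+	 *
+	 * A hedged caller-side sketch (illustrative only; the hooks_t
+	 * layout must be extracted from the matching jemalloc internals
+	 * and may change at any time):
+	 *
+	 *   hooks_t hooks = {...};
+	 *   void *handle;
+	 *   size_t sz = sizeof(handle);
+	 *   mallctl("experimental.hooks.install", &handle, &sz,
+	 *       &hooks, sizeof(hooks));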
+ */ + hooks_t hooks; + WRITE(hooks, hooks_t); + void *handle = hook_install(tsd_tsdn(tsd), &hooks); + if (handle == NULL) { + ret = EAGAIN; + goto label_return; + } + READ(handle, void *); + + ret = 0; +label_return: + return ret; +} + +static int +experimental_hooks_remove_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + WRITEONLY(); + void *handle = NULL; + WRITE(handle, void *); + if (handle == NULL) { + ret = EINVAL; + goto label_return; + } + hook_remove(tsd_tsdn(tsd), handle); + ret = 0; +label_return: + return ret; +} + +static int +experimental_thread_activity_callback_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + + if (!config_stats) { + return ENOENT; + } + + activity_callback_thunk_t t_old = tsd_activity_callback_thunk_get(tsd); + READ(t_old, activity_callback_thunk_t); + + if (newp != NULL) { + /* + * This initialization is unnecessary. If it's omitted, though, + * clang gets confused and warns on the subsequent use of t_new. + */ + activity_callback_thunk_t t_new = {NULL, NULL}; + WRITE(t_new, activity_callback_thunk_t); + tsd_activity_callback_thunk_set(tsd, t_new); + } + ret = 0; +label_return: + return ret; +} + +/* + * Output six memory utilization entries for an input pointer, the first one of + * type (void *) and the remaining five of type size_t, describing the following + * (in the same order): + * + * (a) memory address of the extent a potential reallocation would go into, + * == the five fields below describe about the extent the pointer resides in == + * (b) number of free regions in the extent, + * (c) number of regions in the extent, + * (d) size of the extent in terms of bytes, + * (e) total number of free regions in the bin the extent belongs to, and + * (f) total number of regions in the bin the extent belongs to. + * + * Note that "(e)" and "(f)" are only available when stats are enabled; + * otherwise their values are undefined. + * + * This API is mainly intended for small class allocations, where extents are + * used as slab. Note that if the bin the extent belongs to is completely + * full, "(a)" will be NULL. + * + * In case of large class allocations, "(a)" will be NULL, and "(e)" and "(f)" + * will be zero (if stats are enabled; otherwise undefined). The other three + * fields will be properly set though the values are trivial: "(b)" will be 0, + * "(c)" will be 1, and "(d)" will be the usable size. + * + * The input pointer and size are respectively passed in by newp and newlen, + * and the output fields and size are respectively oldp and *oldlenp. + * + * It can be beneficial to define the following macros to make it easier to + * access the output: + * + * #define SLABCUR_READ(out) (*(void **)out) + * #define COUNTS(out) ((size_t *)((void **)out + 1)) + * #define NFREE_READ(out) COUNTS(out)[0] + * #define NREGS_READ(out) COUNTS(out)[1] + * #define SIZE_READ(out) COUNTS(out)[2] + * #define BIN_NFREE_READ(out) COUNTS(out)[3] + * #define BIN_NREGS_READ(out) COUNTS(out)[4] + * + * and then write e.g. NFREE_READ(oldp) to fetch the output. See the unit test + * test_query in test/unit/extent_util.c for an example. + * + * For a typical defragmentation workflow making use of this API for + * understanding the fragmentation level, please refer to the comment for + * experimental_utilization_batch_query_ctl. 
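+ *
+ * A hedged call sketch (caller side, reusing the macros above; error
+ * handling elided):
+ *
+ *   char buf[sizeof(void *) + sizeof(size_t) * 5];
+ *   size_t sz = sizeof(buf);
+ *   mallctl("experimental.utilization.query", buf, &sz, &ptr,
+ *       sizeof(ptr));
+ *   size_t util_nfree = NFREE_READ(buf), util_nregs = NREGS_READ(buf);
+ *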
+ * + * It's up to the application how to determine the significance of + * fragmentation relying on the outputs returned. Possible choices are: + * + * (a) if extent utilization ratio is below certain threshold, + * (b) if extent memory consumption is above certain threshold, + * (c) if extent utilization ratio is significantly below bin utilization ratio, + * (d) if input pointer deviates a lot from potential reallocation address, or + * (e) some selection/combination of the above. + * + * The caller needs to make sure that the input/output arguments are valid, + * in particular, that the size of the output is correct, i.e.: + * + * *oldlenp = sizeof(void *) + sizeof(size_t) * 5 + * + * Otherwise, the function immediately returns EINVAL without touching anything. + * + * In the rare case where there's no associated extent found for the input + * pointer, the function zeros out all output fields and return. Please refer + * to the comment for experimental_utilization_batch_query_ctl to understand the + * motivation from C++. + */ +static int +experimental_utilization_query_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + + assert(sizeof(inspect_extent_util_stats_verbose_t) + == sizeof(void *) + sizeof(size_t) * 5); + + if (oldp == NULL || oldlenp == NULL + || *oldlenp != sizeof(inspect_extent_util_stats_verbose_t) + || newp == NULL) { + ret = EINVAL; + goto label_return; + } + + void *ptr = NULL; + WRITE(ptr, void *); + inspect_extent_util_stats_verbose_t *util_stats + = (inspect_extent_util_stats_verbose_t *)oldp; + inspect_extent_util_stats_verbose_get(tsd_tsdn(tsd), ptr, + &util_stats->nfree, &util_stats->nregs, &util_stats->size, + &util_stats->bin_nfree, &util_stats->bin_nregs, + &util_stats->slabcur_addr); + ret = 0; + +label_return: + return ret; +} + +/* + * Given an input array of pointers, output three memory utilization entries of + * type size_t for each input pointer about the extent it resides in: + * + * (a) number of free regions in the extent, + * (b) number of regions in the extent, and + * (c) size of the extent in terms of bytes. + * + * This API is mainly intended for small class allocations, where extents are + * used as slab. In case of large class allocations, the outputs are trivial: + * "(a)" will be 0, "(b)" will be 1, and "(c)" will be the usable size. + * + * Note that multiple input pointers may reside on a same extent so the output + * fields may contain duplicates. + * + * The format of the input/output looks like: + * + * input[0]: 1st_pointer_to_query | output[0]: 1st_extent_n_free_regions + * | output[1]: 1st_extent_n_regions + * | output[2]: 1st_extent_size + * input[1]: 2nd_pointer_to_query | output[3]: 2nd_extent_n_free_regions + * | output[4]: 2nd_extent_n_regions + * | output[5]: 2nd_extent_size + * ... | ... + * + * The input array and size are respectively passed in by newp and newlen, and + * the output array and size are respectively oldp and *oldlenp. + * + * It can be beneficial to define the following macros to make it easier to + * access the output: + * + * #define NFREE_READ(out, i) out[(i) * 3] + * #define NREGS_READ(out, i) out[(i) * 3 + 1] + * #define SIZE_READ(out, i) out[(i) * 3 + 2] + * + * and then write e.g. NFREE_READ(oldp, i) to fetch the output. See the unit + * test test_batch in test/unit/extent_util.c for a concrete example. + * + * A typical workflow would be composed of the following steps: + * + * (1) flush tcache: mallctl("thread.tcache.flush", ...) 
+ * (2) initialize input array of pointers to query fragmentation + * (3) allocate output array to hold utilization statistics + * (4) query utilization: mallctl("experimental.utilization.batch_query", ...) + * (5) (optional) decide if it's worthwhile to defragment; otherwise stop here + * (6) disable tcache: mallctl("thread.tcache.enabled", ...) + * (7) defragment allocations with significant fragmentation, e.g.: + * for each allocation { + * if it's fragmented { + * malloc(...); + * memcpy(...); + * free(...); + * } + * } + * (8) enable tcache: mallctl("thread.tcache.enabled", ...) + * + * The application can determine the significance of fragmentation themselves + * relying on the statistics returned, both at the overall level i.e. step "(5)" + * and at individual allocation level i.e. within step "(7)". Possible choices + * are: + * + * (a) whether memory utilization ratio is below certain threshold, + * (b) whether memory consumption is above certain threshold, or + * (c) some combination of the two. + * + * The caller needs to make sure that the input/output arrays are valid and + * their sizes are proper as well as matched, meaning: + * + * (a) newlen = n_pointers * sizeof(const void *) + * (b) *oldlenp = n_pointers * sizeof(size_t) * 3 + * (c) n_pointers > 0 + * + * Otherwise, the function immediately returns EINVAL without touching anything. + * + * In the rare case where there's no associated extent found for some pointers, + * rather than immediately terminating the computation and raising an error, + * the function simply zeros out the corresponding output fields and continues + * the computation until all input pointers are handled. The motivations of + * such a design are as follows: + * + * (a) The function always either processes nothing or processes everything, and + * never leaves the output half touched and half untouched. + * + * (b) It facilitates usage needs especially common in C++. A vast variety of + * C++ objects are instantiated with multiple dynamic memory allocations. For + * example, std::string and std::vector typically use at least two allocations, + * one for the metadata and one for the actual content. Other types may use + * even more allocations. When inquiring about utilization statistics, the + * caller often wants to examine into all such allocations, especially internal + * one(s), rather than just the topmost one. The issue comes when some + * implementations do certain optimizations to reduce/aggregate some internal + * allocations, e.g. putting short strings directly into the metadata, and such + * decisions are not known to the caller. Therefore, we permit pointers to + * memory usages that may not be returned by previous malloc calls, and we + * provide the caller a convenient way to identify such cases. 
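+ *
+ * A hedged call sketch for n pointers (caller side, reusing the macros
+ * above; "n", "in" and "out" are illustrative names):
+ *
+ *   const void *in[n] = {...};
+ *   size_t out[n * 3];
+ *   size_t sz = sizeof(out);
+ *   mallctl("experimental.utilization.batch_query", out, &sz, in,
+ *       sizeof(in));
+ *   ... then NFREE_READ(out, i) etc. per pointer ...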
+ */ +static int +experimental_utilization_batch_query_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + + assert(sizeof(inspect_extent_util_stats_t) == sizeof(size_t) * 3); + + const size_t len = newlen / sizeof(const void *); + if (oldp == NULL || oldlenp == NULL || newp == NULL || newlen == 0 + || newlen != len * sizeof(const void *) + || *oldlenp != len * sizeof(inspect_extent_util_stats_t)) { + ret = EINVAL; + goto label_return; + } + + void **ptrs = (void **)newp; + inspect_extent_util_stats_t *util_stats = + (inspect_extent_util_stats_t *)oldp; + size_t i; + for (i = 0; i < len; ++i) { + inspect_extent_util_stats_get(tsd_tsdn(tsd), ptrs[i], + &util_stats[i].nfree, &util_stats[i].nregs, + &util_stats[i].size); + } + ret = 0; + +label_return: + return ret; +} + +static const ctl_named_node_t * +experimental_arenas_i_index(tsdn_t *tsdn, const size_t *mib, + size_t miblen, size_t i) { + const ctl_named_node_t *ret; + + malloc_mutex_lock(tsdn, &ctl_mtx); + if (ctl_arenas_i_verify(i)) { + ret = NULL; + goto label_return; + } + ret = super_experimental_arenas_i_node; +label_return: + malloc_mutex_unlock(tsdn, &ctl_mtx); + return ret; +} + +static int +experimental_arenas_i_pactivep_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + if (!config_stats) { + return ENOENT; + } + if (oldp == NULL || oldlenp == NULL || *oldlenp != sizeof(size_t *)) { + return EINVAL; + } + + unsigned arena_ind; + arena_t *arena; + int ret; + size_t *pactivep; + + malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx); + READONLY(); + MIB_UNSIGNED(arena_ind, 2); + if (arena_ind < narenas_total_get() && (arena = + arena_get(tsd_tsdn(tsd), arena_ind, false)) != NULL) { +#if defined(JEMALLOC_GCC_ATOMIC_ATOMICS) || \ + defined(JEMALLOC_GCC_SYNC_ATOMICS) || defined(_MSC_VER) + /* Expose the underlying counter for fast read. 
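+		 * Once returned, the pointer may simply be dereferenced
+		 * on later samples (e.g. "*pactivep") with no further
+		 * mallctl round trips; that is the point of exposing it.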
*/ + pactivep = (size_t *)&(arena->pa_shard.nactive.repr); + READ(pactivep, size_t *); + ret = 0; +#else + ret = EFAULT; +#endif + } else { + ret = EFAULT; + } +label_return: + malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx); + return ret; +} + +static int +experimental_prof_recent_alloc_max_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + + if (!(config_prof && opt_prof)) { + ret = ENOENT; + goto label_return; + } + + ssize_t old_max; + if (newp != NULL) { + if (newlen != sizeof(ssize_t)) { + ret = EINVAL; + goto label_return; + } + ssize_t max = *(ssize_t *)newp; + if (max < -1) { + ret = EINVAL; + goto label_return; + } + old_max = prof_recent_alloc_max_ctl_write(tsd, max); + } else { + old_max = prof_recent_alloc_max_ctl_read(); + } + READ(old_max, ssize_t); + + ret = 0; + +label_return: + return ret; +} + +typedef struct write_cb_packet_s write_cb_packet_t; +struct write_cb_packet_s { + write_cb_t *write_cb; + void *cbopaque; +}; + +static int +experimental_prof_recent_alloc_dump_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + + if (!(config_prof && opt_prof)) { + ret = ENOENT; + goto label_return; + } + + assert(sizeof(write_cb_packet_t) == sizeof(void *) * 2); + + WRITEONLY(); + write_cb_packet_t write_cb_packet; + ASSURED_WRITE(write_cb_packet, write_cb_packet_t); + + prof_recent_alloc_dump(tsd, write_cb_packet.write_cb, + write_cb_packet.cbopaque); + + ret = 0; + +label_return: + return ret; +} + +typedef struct batch_alloc_packet_s batch_alloc_packet_t; +struct batch_alloc_packet_s { + void **ptrs; + size_t num; + size_t size; + int flags; +}; + +static int +experimental_batch_alloc_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + + VERIFY_READ(size_t); + + batch_alloc_packet_t batch_alloc_packet; + ASSURED_WRITE(batch_alloc_packet, batch_alloc_packet_t); + size_t filled = batch_alloc(batch_alloc_packet.ptrs, + batch_alloc_packet.num, batch_alloc_packet.size, + batch_alloc_packet.flags); + READ(filled, size_t); + + ret = 0; + +label_return: + return ret; +} + +static int +prof_stats_bins_i_live_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + unsigned binind; + prof_stats_t stats; + + if (!(config_prof && opt_prof && opt_prof_stats)) { + ret = ENOENT; + goto label_return; + } + + READONLY(); + MIB_UNSIGNED(binind, 3); + if (binind >= SC_NBINS) { + ret = EINVAL; + goto label_return; + } + prof_stats_get_live(tsd, (szind_t)binind, &stats); + READ(stats, prof_stats_t); + + ret = 0; +label_return: + return ret; +} + +static int +prof_stats_bins_i_accum_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + unsigned binind; + prof_stats_t stats; + + if (!(config_prof && opt_prof && opt_prof_stats)) { + ret = ENOENT; + goto label_return; + } + + READONLY(); + MIB_UNSIGNED(binind, 3); + if (binind >= SC_NBINS) { + ret = EINVAL; + goto label_return; + } + prof_stats_get_accum(tsd, (szind_t)binind, &stats); + READ(stats, prof_stats_t); + + ret = 0; +label_return: + return ret; +} + +static const ctl_named_node_t * +prof_stats_bins_i_index(tsdn_t *tsdn, const size_t *mib, size_t miblen, + size_t i) { + if (!(config_prof && opt_prof && opt_prof_stats)) { + return NULL; + } + if (i >= SC_NBINS) { + return NULL; + } + return 
super_prof_stats_bins_i_node; +} + +static int +prof_stats_lextents_i_live_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + unsigned lextent_ind; + prof_stats_t stats; + + if (!(config_prof && opt_prof && opt_prof_stats)) { + ret = ENOENT; + goto label_return; + } + + READONLY(); + MIB_UNSIGNED(lextent_ind, 3); + if (lextent_ind >= SC_NSIZES - SC_NBINS) { + ret = EINVAL; + goto label_return; + } + prof_stats_get_live(tsd, (szind_t)(lextent_ind + SC_NBINS), &stats); + READ(stats, prof_stats_t); + + ret = 0; +label_return: + return ret; +} + +static int +prof_stats_lextents_i_accum_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + unsigned lextent_ind; + prof_stats_t stats; + + if (!(config_prof && opt_prof && opt_prof_stats)) { + ret = ENOENT; + goto label_return; + } + + READONLY(); + MIB_UNSIGNED(lextent_ind, 3); + if (lextent_ind >= SC_NSIZES - SC_NBINS) { + ret = EINVAL; + goto label_return; + } + prof_stats_get_accum(tsd, (szind_t)(lextent_ind + SC_NBINS), &stats); + READ(stats, prof_stats_t); + + ret = 0; +label_return: + return ret; +} + +static const ctl_named_node_t * +prof_stats_lextents_i_index(tsdn_t *tsdn, const size_t *mib, size_t miblen, + size_t i) { + if (!(config_prof && opt_prof && opt_prof_stats)) { + return NULL; + } + if (i >= SC_NSIZES - SC_NBINS) { + return NULL; + } + return super_prof_stats_lextents_i_node; +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/decay.c b/src/duckdb/extension/jemalloc/jemalloc/src/decay.c new file mode 100644 index 000000000..f75696ddb --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/decay.c @@ -0,0 +1,296 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/decay.h" + +static const uint64_t h_steps[SMOOTHSTEP_NSTEPS] = { +#define STEP(step, h, x, y) \ + h, + SMOOTHSTEP +#undef STEP +}; + +/* + * Generate a new deadline that is uniformly random within the next epoch after + * the current one. 
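+ * Concretely: deadline = epoch + interval + jitter, where jitter is
+ * drawn uniformly from [0, interval) whenever decay is enabled.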
+ */ +static void +decay_deadline_init(decay_t *decay) { + nstime_copy(&decay->deadline, &decay->epoch); + nstime_add(&decay->deadline, &decay->interval); + if (decay_ms_read(decay) > 0) { + nstime_t jitter; + + nstime_init(&jitter, prng_range_u64(&decay->jitter_state, + nstime_ns(&decay->interval))); + nstime_add(&decay->deadline, &jitter); + } +} + +void +decay_reinit(decay_t *decay, nstime_t *cur_time, ssize_t decay_ms) { + atomic_store_zd(&decay->time_ms, decay_ms, ATOMIC_RELAXED); + if (decay_ms > 0) { + nstime_init(&decay->interval, (uint64_t)decay_ms * + KQU(1000000)); + nstime_idivide(&decay->interval, SMOOTHSTEP_NSTEPS); + } + + nstime_copy(&decay->epoch, cur_time); + decay->jitter_state = (uint64_t)(uintptr_t)decay; + decay_deadline_init(decay); + decay->nunpurged = 0; + memset(decay->backlog, 0, SMOOTHSTEP_NSTEPS * sizeof(size_t)); +} + +bool +decay_init(decay_t *decay, nstime_t *cur_time, ssize_t decay_ms) { + if (config_debug) { + for (size_t i = 0; i < sizeof(decay_t); i++) { + assert(((char *)decay)[i] == 0); + } + decay->ceil_npages = 0; + } + if (malloc_mutex_init(&decay->mtx, "decay", WITNESS_RANK_DECAY, + malloc_mutex_rank_exclusive)) { + return true; + } + decay->purging = false; + decay_reinit(decay, cur_time, decay_ms); + return false; +} + +bool +decay_ms_valid(ssize_t decay_ms) { + if (decay_ms < -1) { + return false; + } + if (decay_ms == -1 || (uint64_t)decay_ms <= NSTIME_SEC_MAX * + KQU(1000)) { + return true; + } + return false; +} + +static void +decay_maybe_update_time(decay_t *decay, nstime_t *new_time) { + if (unlikely(!nstime_monotonic() && nstime_compare(&decay->epoch, + new_time) > 0)) { + /* + * Time went backwards. Move the epoch back in time and + * generate a new deadline, with the expectation that time + * typically flows forward for long enough periods of time that + * epochs complete. Unfortunately, this strategy is susceptible + * to clock jitter triggering premature epoch advances, but + * clock jitter estimation and compensation isn't feasible here + * because calls into this code are event-driven. + */ + nstime_copy(&decay->epoch, new_time); + decay_deadline_init(decay); + } else { + /* Verify that time does not go backwards. */ + assert(nstime_compare(&decay->epoch, new_time) <= 0); + } +} + +static size_t +decay_backlog_npages_limit(const decay_t *decay) { + /* + * For each element of decay_backlog, multiply by the corresponding + * fixed-point smoothstep decay factor. Sum the products, then divide + * to round down to the nearest whole number of pages. + */ + uint64_t sum = 0; + for (unsigned i = 0; i < SMOOTHSTEP_NSTEPS; i++) { + sum += decay->backlog[i] * h_steps[i]; + } + size_t npages_limit_backlog = (size_t)(sum >> SMOOTHSTEP_BFP); + + return npages_limit_backlog; +} + +/* + * Update backlog, assuming that 'nadvance_u64' time intervals have passed. + * Trailing 'nadvance_u64' records should be erased and 'current_npages' is + * placed as the newest record. 
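+ *
+ * A small worked example (SMOOTHSTEP_NSTEPS shortened to 4 for
+ * illustration): backlog [a, b, c, d] with nadvance_u64 == 2 becomes
+ * [c, d, 0, delta], where delta is the growth in unpurged pages since
+ * the previous epoch advance, clamped at zero.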
+ */ +static void +decay_backlog_update(decay_t *decay, uint64_t nadvance_u64, + size_t current_npages) { + if (nadvance_u64 >= SMOOTHSTEP_NSTEPS) { + memset(decay->backlog, 0, (SMOOTHSTEP_NSTEPS-1) * + sizeof(size_t)); + } else { + size_t nadvance_z = (size_t)nadvance_u64; + + assert((uint64_t)nadvance_z == nadvance_u64); + + memmove(decay->backlog, &decay->backlog[nadvance_z], + (SMOOTHSTEP_NSTEPS - nadvance_z) * sizeof(size_t)); + if (nadvance_z > 1) { + memset(&decay->backlog[SMOOTHSTEP_NSTEPS - + nadvance_z], 0, (nadvance_z-1) * sizeof(size_t)); + } + } + + size_t npages_delta = (current_npages > decay->nunpurged) ? + current_npages - decay->nunpurged : 0; + decay->backlog[SMOOTHSTEP_NSTEPS-1] = npages_delta; + + if (config_debug) { + if (current_npages > decay->ceil_npages) { + decay->ceil_npages = current_npages; + } + size_t npages_limit = decay_backlog_npages_limit(decay); + assert(decay->ceil_npages >= npages_limit); + if (decay->ceil_npages > npages_limit) { + decay->ceil_npages = npages_limit; + } + } +} + +static inline bool +decay_deadline_reached(const decay_t *decay, const nstime_t *time) { + return (nstime_compare(&decay->deadline, time) <= 0); +} + +uint64_t +decay_npages_purge_in(decay_t *decay, nstime_t *time, size_t npages_new) { + uint64_t decay_interval_ns = decay_epoch_duration_ns(decay); + assert(decay_interval_ns != 0); + size_t n_epoch = (size_t)(nstime_ns(time) / decay_interval_ns); + + uint64_t npages_purge; + if (n_epoch >= SMOOTHSTEP_NSTEPS) { + npages_purge = npages_new; + } else { + uint64_t h_steps_max = h_steps[SMOOTHSTEP_NSTEPS - 1]; + assert(h_steps_max >= + h_steps[SMOOTHSTEP_NSTEPS - 1 - n_epoch]); + npages_purge = npages_new * (h_steps_max - + h_steps[SMOOTHSTEP_NSTEPS - 1 - n_epoch]); + npages_purge >>= SMOOTHSTEP_BFP; + } + return npages_purge; +} + +bool +decay_maybe_advance_epoch(decay_t *decay, nstime_t *new_time, + size_t npages_current) { + /* Handle possible non-monotonicity of time. */ + decay_maybe_update_time(decay, new_time); + + if (!decay_deadline_reached(decay, new_time)) { + return false; + } + nstime_t delta; + nstime_copy(&delta, new_time); + nstime_subtract(&delta, &decay->epoch); + + uint64_t nadvance_u64 = nstime_divide(&delta, &decay->interval); + assert(nadvance_u64 > 0); + + /* Add nadvance_u64 decay intervals to epoch. */ + nstime_copy(&delta, &decay->interval); + nstime_imultiply(&delta, nadvance_u64); + nstime_add(&decay->epoch, &delta); + + /* Set a new deadline. */ + decay_deadline_init(decay); + + /* Update the backlog. */ + decay_backlog_update(decay, nadvance_u64, npages_current); + + decay->npages_limit = decay_backlog_npages_limit(decay); + decay->nunpurged = (decay->npages_limit > npages_current) ? + decay->npages_limit : npages_current; + + return true; +} + +/* + * Calculate how many pages should be purged after 'interval'. + * + * First, calculate how many pages should remain at the moment, then subtract + * the number of pages that should remain after 'interval'. The difference is + * how many pages should be purged until then. + * + * The number of pages that should remain at a specific moment is calculated + * like this: pages(now) = sum(backlog[i] * h_steps[i]). After 'interval' + * passes, backlog would shift 'interval' positions to the left and sigmoid + * curve would be applied starting with backlog[interval]. + * + * The implementation doesn't directly map to the description, but it's + * essentially the same calculation, optimized to avoid iterating over + * [interval..SMOOTHSTEP_NSTEPS) twice. 
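+ *
+ * In formula form, with B = backlog and H = h_steps:
+ *
+ *   npurge(interval) = sum_{i < interval} B[i] * H[i]
+ *       + sum_{i >= interval} B[i] * (H[i] - H[i - interval])
+ *
+ * i.e. pages(now) - pages(after 'interval'), kept in fixed point until
+ * the final right shift by SMOOTHSTEP_BFP.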
+ */ +static inline size_t +decay_npurge_after_interval(decay_t *decay, size_t interval) { + size_t i; + uint64_t sum = 0; + for (i = 0; i < interval; i++) { + sum += decay->backlog[i] * h_steps[i]; + } + for (; i < SMOOTHSTEP_NSTEPS; i++) { + sum += decay->backlog[i] * + (h_steps[i] - h_steps[i - interval]); + } + + return (size_t)(sum >> SMOOTHSTEP_BFP); +} + +uint64_t decay_ns_until_purge(decay_t *decay, size_t npages_current, + uint64_t npages_threshold) { + if (!decay_gradually(decay)) { + return DECAY_UNBOUNDED_TIME_TO_PURGE; + } + uint64_t decay_interval_ns = decay_epoch_duration_ns(decay); + assert(decay_interval_ns > 0); + if (npages_current == 0) { + unsigned i; + for (i = 0; i < SMOOTHSTEP_NSTEPS; i++) { + if (decay->backlog[i] > 0) { + break; + } + } + if (i == SMOOTHSTEP_NSTEPS) { + /* No dirty pages recorded. Sleep indefinitely. */ + return DECAY_UNBOUNDED_TIME_TO_PURGE; + } + } + if (npages_current <= npages_threshold) { + /* Use max interval. */ + return decay_interval_ns * SMOOTHSTEP_NSTEPS; + } + + /* Minimal 2 intervals to ensure reaching next epoch deadline. */ + size_t lb = 2; + size_t ub = SMOOTHSTEP_NSTEPS; + + size_t npurge_lb, npurge_ub; + npurge_lb = decay_npurge_after_interval(decay, lb); + if (npurge_lb > npages_threshold) { + return decay_interval_ns * lb; + } + npurge_ub = decay_npurge_after_interval(decay, ub); + if (npurge_ub < npages_threshold) { + return decay_interval_ns * ub; + } + + unsigned n_search = 0; + size_t target, npurge; + while ((npurge_lb + npages_threshold < npurge_ub) && (lb + 2 < ub)) { + target = (lb + ub) / 2; + npurge = decay_npurge_after_interval(decay, target); + if (npurge > npages_threshold) { + ub = target; + npurge_ub = npurge; + } else { + lb = target; + npurge_lb = npurge; + } + assert(n_search < lg_floor(SMOOTHSTEP_NSTEPS) + 1); + ++n_search; + } + return decay_interval_ns * (ub + lb) / 2; +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/div.c b/src/duckdb/extension/jemalloc/jemalloc/src/div.c new file mode 100644 index 000000000..808892a13 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/div.c @@ -0,0 +1,55 @@ +#include "jemalloc/internal/jemalloc_preamble.h" + +#include "jemalloc/internal/div.h" + +#include "jemalloc/internal/assert.h" + +/* + * Suppose we have n = q * d, all integers. We know n and d, and want q = n / d. + * + * For any k, we have (here, all division is exact; not C-style rounding): + * floor(ceil(2^k / d) * n / 2^k) = floor((2^k + r) / d * n / 2^k), where + * r = (-2^k) mod d. + * + * Expanding this out: + * ... = floor(2^k / d * n / 2^k + r / d * n / 2^k) + * = floor(n / d + (r / d) * (n / 2^k)). + * + * The fractional part of n / d is 0 (because of the assumption that d divides n + * exactly), so we have: + * ... = n / d + floor((r / d) * (n / 2^k)) + * + * So that our initial expression is equal to the quantity we seek, so long as + * (r / d) * (n / 2^k) < 1. + * + * r is a remainder mod d, so r < d and r / d < 1 always. We can make + * n / 2 ^ k < 1 by setting k = 32. This gets us a value of magic that works. + */ + +void +div_init(div_info_t *div_info, size_t d) { + /* Nonsensical. */ + assert(d != 0); + /* + * This would make the value of magic too high to fit into a uint32_t + * (we would want magic = 2^32 exactly). This would mess with code gen + * on 32-bit machines. + */ + assert(d != 1); + + uint64_t two_to_k = ((uint64_t)1 << 32); + uint32_t magic = (uint32_t)(two_to_k / d); + + /* + * We want magic = ceil(2^k / d), but C gives us floor. 
We have to + * increment it unless the result was exact (i.e. unless d is a power of + * two). + */ + if (two_to_k % d != 0) { + magic++; + } + div_info->magic = magic; +#ifdef JEMALLOC_DEBUG + div_info->d = d; +#endif +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/ecache.c b/src/duckdb/extension/jemalloc/jemalloc/src/ecache.c new file mode 100644 index 000000000..a242227d3 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/ecache.c @@ -0,0 +1,35 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/san.h" + +bool +ecache_init(tsdn_t *tsdn, ecache_t *ecache, extent_state_t state, unsigned ind, + bool delay_coalesce) { + if (malloc_mutex_init(&ecache->mtx, "extents", WITNESS_RANK_EXTENTS, + malloc_mutex_rank_exclusive)) { + return true; + } + ecache->state = state; + ecache->ind = ind; + ecache->delay_coalesce = delay_coalesce; + eset_init(&ecache->eset, state); + eset_init(&ecache->guarded_eset, state); + + return false; +} + +void +ecache_prefork(tsdn_t *tsdn, ecache_t *ecache) { + malloc_mutex_prefork(tsdn, &ecache->mtx); +} + +void +ecache_postfork_parent(tsdn_t *tsdn, ecache_t *ecache) { + malloc_mutex_postfork_parent(tsdn, &ecache->mtx); +} + +void +ecache_postfork_child(tsdn_t *tsdn, ecache_t *ecache) { + malloc_mutex_postfork_child(tsdn, &ecache->mtx); +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/edata.c b/src/duckdb/extension/jemalloc/jemalloc/src/edata.c new file mode 100644 index 000000000..82b6f5654 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/edata.c @@ -0,0 +1,6 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +ph_gen(, edata_avail, edata_t, avail_link, + edata_esnead_comp) +ph_gen(, edata_heap, edata_t, heap_link, edata_snad_comp) diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/edata_cache.c b/src/duckdb/extension/jemalloc/jemalloc/src/edata_cache.c new file mode 100644 index 000000000..6bc1848cb --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/edata_cache.c @@ -0,0 +1,154 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +bool +edata_cache_init(edata_cache_t *edata_cache, base_t *base) { + edata_avail_new(&edata_cache->avail); + /* + * This is not strictly necessary, since the edata_cache_t is only + * created inside an arena, which is zeroed on creation. But this is + * handy as a safety measure. 
+ */ + atomic_store_zu(&edata_cache->count, 0, ATOMIC_RELAXED); + if (malloc_mutex_init(&edata_cache->mtx, "edata_cache", + WITNESS_RANK_EDATA_CACHE, malloc_mutex_rank_exclusive)) { + return true; + } + edata_cache->base = base; + return false; +} + +edata_t * +edata_cache_get(tsdn_t *tsdn, edata_cache_t *edata_cache) { + malloc_mutex_lock(tsdn, &edata_cache->mtx); + edata_t *edata = edata_avail_first(&edata_cache->avail); + if (edata == NULL) { + malloc_mutex_unlock(tsdn, &edata_cache->mtx); + return base_alloc_edata(tsdn, edata_cache->base); + } + edata_avail_remove(&edata_cache->avail, edata); + atomic_load_sub_store_zu(&edata_cache->count, 1); + malloc_mutex_unlock(tsdn, &edata_cache->mtx); + return edata; +} + +void +edata_cache_put(tsdn_t *tsdn, edata_cache_t *edata_cache, edata_t *edata) { + malloc_mutex_lock(tsdn, &edata_cache->mtx); + edata_avail_insert(&edata_cache->avail, edata); + atomic_load_add_store_zu(&edata_cache->count, 1); + malloc_mutex_unlock(tsdn, &edata_cache->mtx); +} + +void +edata_cache_prefork(tsdn_t *tsdn, edata_cache_t *edata_cache) { + malloc_mutex_prefork(tsdn, &edata_cache->mtx); +} + +void +edata_cache_postfork_parent(tsdn_t *tsdn, edata_cache_t *edata_cache) { + malloc_mutex_postfork_parent(tsdn, &edata_cache->mtx); +} + +void +edata_cache_postfork_child(tsdn_t *tsdn, edata_cache_t *edata_cache) { + malloc_mutex_postfork_child(tsdn, &edata_cache->mtx); +} + +void +edata_cache_fast_init(edata_cache_fast_t *ecs, edata_cache_t *fallback) { + edata_list_inactive_init(&ecs->list); + ecs->fallback = fallback; + ecs->disabled = false; +} + +static void +edata_cache_fast_try_fill_from_fallback(tsdn_t *tsdn, + edata_cache_fast_t *ecs) { + edata_t *edata; + malloc_mutex_lock(tsdn, &ecs->fallback->mtx); + for (int i = 0; i < EDATA_CACHE_FAST_FILL; i++) { + edata = edata_avail_remove_first(&ecs->fallback->avail); + if (edata == NULL) { + break; + } + edata_list_inactive_append(&ecs->list, edata); + atomic_load_sub_store_zu(&ecs->fallback->count, 1); + } + malloc_mutex_unlock(tsdn, &ecs->fallback->mtx); +} + +edata_t * +edata_cache_fast_get(tsdn_t *tsdn, edata_cache_fast_t *ecs) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_EDATA_CACHE, 0); + + if (ecs->disabled) { + assert(edata_list_inactive_first(&ecs->list) == NULL); + return edata_cache_get(tsdn, ecs->fallback); + } + + edata_t *edata = edata_list_inactive_first(&ecs->list); + if (edata != NULL) { + edata_list_inactive_remove(&ecs->list, edata); + return edata; + } + /* Slow path; requires synchronization. */ + edata_cache_fast_try_fill_from_fallback(tsdn, ecs); + edata = edata_list_inactive_first(&ecs->list); + if (edata != NULL) { + edata_list_inactive_remove(&ecs->list, edata); + } else { + /* + * Slowest path (fallback was also empty); allocate something + * new. + */ + edata = base_alloc_edata(tsdn, ecs->fallback->base); + } + return edata; +} + +static void +edata_cache_fast_flush_all(tsdn_t *tsdn, edata_cache_fast_t *ecs) { + /* + * You could imagine smarter cache management policies (like + * only flushing down to some threshold in anticipation of + * future get requests). But just flushing everything provides + * a good opportunity to defrag too, and lets us share code between the + * flush and disable pathways. 
+ */ + edata_t *edata; + size_t nflushed = 0; + malloc_mutex_lock(tsdn, &ecs->fallback->mtx); + while ((edata = edata_list_inactive_first(&ecs->list)) != NULL) { + edata_list_inactive_remove(&ecs->list, edata); + edata_avail_insert(&ecs->fallback->avail, edata); + nflushed++; + } + atomic_load_add_store_zu(&ecs->fallback->count, nflushed); + malloc_mutex_unlock(tsdn, &ecs->fallback->mtx); +} + +void +edata_cache_fast_put(tsdn_t *tsdn, edata_cache_fast_t *ecs, edata_t *edata) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_EDATA_CACHE, 0); + + if (ecs->disabled) { + assert(edata_list_inactive_first(&ecs->list) == NULL); + edata_cache_put(tsdn, ecs->fallback, edata); + return; + } + + /* + * Prepend rather than append, to do LIFO ordering in the hopes of some + * cache locality. + */ + edata_list_inactive_prepend(&ecs->list, edata); +} + +void +edata_cache_fast_disable(tsdn_t *tsdn, edata_cache_fast_t *ecs) { + edata_cache_fast_flush_all(tsdn, ecs); + ecs->disabled = true; +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/ehooks.c b/src/duckdb/extension/jemalloc/jemalloc/src/ehooks.c new file mode 100644 index 000000000..fc2355e60 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/ehooks.c @@ -0,0 +1,275 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/ehooks.h" +#include "jemalloc/internal/extent_mmap.h" + +void +ehooks_init(ehooks_t *ehooks, extent_hooks_t *extent_hooks, unsigned ind) { + /* All other hooks are optional; this one is not. */ + assert(extent_hooks->alloc != NULL); + ehooks->ind = ind; + ehooks_set_extent_hooks_ptr(ehooks, extent_hooks); +} + +/* + * If the caller specifies (!*zero), it is still possible to receive zeroed + * memory, in which case *zero is toggled to true. arena_extent_alloc() takes + * advantage of this to avoid demanding zeroed extents, but taking advantage of + * them if they are returned. + */ +static void * +extent_alloc_core(tsdn_t *tsdn, arena_t *arena, void *new_addr, size_t size, + size_t alignment, bool *zero, bool *commit, dss_prec_t dss_prec) { + void *ret; + + assert(size != 0); + assert(alignment != 0); + + /* "primary" dss. */ + if (have_dss && dss_prec == dss_prec_primary && (ret = + extent_alloc_dss(tsdn, arena, new_addr, size, alignment, zero, + commit)) != NULL) { + return ret; + } + /* mmap. */ + if ((ret = extent_alloc_mmap(new_addr, size, alignment, zero, commit)) + != NULL) { + return ret; + } + /* "secondary" dss. */ + if (have_dss && dss_prec == dss_prec_secondary && (ret = + extent_alloc_dss(tsdn, arena, new_addr, size, alignment, zero, + commit)) != NULL) { + return ret; + } + + /* All strategies for allocation failed. */ + return NULL; +} + +void * +ehooks_default_alloc_impl(tsdn_t *tsdn, void *new_addr, size_t size, + size_t alignment, bool *zero, bool *commit, unsigned arena_ind) { + arena_t *arena = arena_get(tsdn, arena_ind, false); + /* NULL arena indicates arena_create. */ + assert(arena != NULL || alignment == HUGEPAGE); + dss_prec_t dss = (arena == NULL) ? 
dss_prec_disabled : + (dss_prec_t)atomic_load_u(&arena->dss_prec, ATOMIC_RELAXED); + void *ret = extent_alloc_core(tsdn, arena, new_addr, size, alignment, + zero, commit, dss); + if (have_madvise_huge && ret) { + pages_set_thp_state(ret, size); + } + return ret; +} + +static void * +ehooks_default_alloc(extent_hooks_t *extent_hooks, void *new_addr, size_t size, + size_t alignment, bool *zero, bool *commit, unsigned arena_ind) { + return ehooks_default_alloc_impl(tsdn_fetch(), new_addr, size, + ALIGNMENT_CEILING(alignment, PAGE), zero, commit, arena_ind); +} + +bool +ehooks_default_dalloc_impl(void *addr, size_t size) { + if (!have_dss || !extent_in_dss(addr)) { + return extent_dalloc_mmap(addr, size); + } + return true; +} + +static bool +ehooks_default_dalloc(extent_hooks_t *extent_hooks, void *addr, size_t size, + bool committed, unsigned arena_ind) { + return ehooks_default_dalloc_impl(addr, size); +} + +void +ehooks_default_destroy_impl(void *addr, size_t size) { + if (!have_dss || !extent_in_dss(addr)) { + pages_unmap(addr, size); + } +} + +static void +ehooks_default_destroy(extent_hooks_t *extent_hooks, void *addr, size_t size, + bool committed, unsigned arena_ind) { + ehooks_default_destroy_impl(addr, size); +} + +bool +ehooks_default_commit_impl(void *addr, size_t offset, size_t length) { + return pages_commit((void *)((byte_t *)addr + (uintptr_t)offset), + length); +} + +static bool +ehooks_default_commit(extent_hooks_t *extent_hooks, void *addr, size_t size, + size_t offset, size_t length, unsigned arena_ind) { + return ehooks_default_commit_impl(addr, offset, length); +} + +bool +ehooks_default_decommit_impl(void *addr, size_t offset, size_t length) { + return pages_decommit((void *)((byte_t *)addr + (uintptr_t)offset), + length); +} + +static bool +ehooks_default_decommit(extent_hooks_t *extent_hooks, void *addr, size_t size, + size_t offset, size_t length, unsigned arena_ind) { + return ehooks_default_decommit_impl(addr, offset, length); +} + +#ifdef PAGES_CAN_PURGE_LAZY +bool +ehooks_default_purge_lazy_impl(void *addr, size_t offset, size_t length) { + return pages_purge_lazy((void *)((byte_t *)addr + (uintptr_t)offset), + length); +} + +static bool +ehooks_default_purge_lazy(extent_hooks_t *extent_hooks, void *addr, size_t size, + size_t offset, size_t length, unsigned arena_ind) { + assert(addr != NULL); + assert((offset & PAGE_MASK) == 0); + assert(length != 0); + assert((length & PAGE_MASK) == 0); + return ehooks_default_purge_lazy_impl(addr, offset, length); +} +#endif + +#ifdef PAGES_CAN_PURGE_FORCED +bool +ehooks_default_purge_forced_impl(void *addr, size_t offset, size_t length) { + return pages_purge_forced((void *)((byte_t *)addr + + (uintptr_t)offset), length); +} + +static bool +ehooks_default_purge_forced(extent_hooks_t *extent_hooks, void *addr, + size_t size, size_t offset, size_t length, unsigned arena_ind) { + assert(addr != NULL); + assert((offset & PAGE_MASK) == 0); + assert(length != 0); + assert((length & PAGE_MASK) == 0); + return ehooks_default_purge_forced_impl(addr, offset, length); +} +#endif + +bool +ehooks_default_split_impl(void) { + if (!maps_coalesce) { + /* + * Without retain, only whole regions can be purged (required by + * MEM_RELEASE on Windows) -- therefore disallow splitting. See + * comments in extent_head_no_merge(). 
+ */ + return !opt_retain; + } + + return false; +} + +static bool +ehooks_default_split(extent_hooks_t *extent_hooks, void *addr, size_t size, + size_t size_a, size_t size_b, bool committed, unsigned arena_ind) { + return ehooks_default_split_impl(); +} + +bool +ehooks_default_merge_impl(tsdn_t *tsdn, void *addr_a, void *addr_b) { + assert(addr_a < addr_b); + /* + * For non-DSS cases -- + * a) W/o maps_coalesce, merge is not always allowed (Windows): + * 1) w/o retain, never merge (first branch below). + * 2) with retain, only merge extents from the same VirtualAlloc + * region (in which case MEM_DECOMMIT is utilized for purging). + * + * b) With maps_coalesce, it's always possible to merge. + * 1) w/o retain, always allow merge (only about dirty / muzzy). + * 2) with retain, to preserve the SN / first-fit, merge is still + * disallowed if b is a head extent, i.e. no merging across + * different mmap regions. + * + * a2) and b2) are implemented in emap_try_acquire_edata_neighbor, and + * sanity checked in the second branch below. + */ + if (!maps_coalesce && !opt_retain) { + return true; + } + if (config_debug) { + edata_t *a = emap_edata_lookup(tsdn, &arena_emap_global, + addr_a); + bool head_a = edata_is_head_get(a); + edata_t *b = emap_edata_lookup(tsdn, &arena_emap_global, + addr_b); + bool head_b = edata_is_head_get(b); + emap_assert_mapped(tsdn, &arena_emap_global, a); + emap_assert_mapped(tsdn, &arena_emap_global, b); + assert(extent_neighbor_head_state_mergeable(head_a, head_b, + /* forward */ true)); + } + if (have_dss && !extent_dss_mergeable(addr_a, addr_b)) { + return true; + } + + return false; +} + +bool +ehooks_default_merge(extent_hooks_t *extent_hooks, void *addr_a, size_t size_a, + void *addr_b, size_t size_b, bool committed, unsigned arena_ind) { + tsdn_t *tsdn = tsdn_fetch(); + + return ehooks_default_merge_impl(tsdn, addr_a, addr_b); +} + +void +ehooks_default_zero_impl(void *addr, size_t size) { + /* + * By default, we try to zero out memory using OS-provided demand-zeroed + * pages. If the user has specifically requested hugepages, though, we + * don't want to purge in the middle of a hugepage (which would break it + * up), so we act conservatively and use memset. 
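+	 * (pages_purge_forced() follows the usual jemalloc convention
+	 * of returning true on failure, which is why its result feeds
+	 * needs_memset directly.)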
+ */ + bool needs_memset = true; + if (opt_thp != thp_mode_always) { + needs_memset = pages_purge_forced(addr, size); + } + if (needs_memset) { + memset(addr, 0, size); + } +} + +void +ehooks_default_guard_impl(void *guard1, void *guard2) { + pages_mark_guards(guard1, guard2); +} + +void +ehooks_default_unguard_impl(void *guard1, void *guard2) { + pages_unmark_guards(guard1, guard2); +} + +const extent_hooks_t ehooks_default_extent_hooks = { + ehooks_default_alloc, + ehooks_default_dalloc, + ehooks_default_destroy, + ehooks_default_commit, + ehooks_default_decommit, +#ifdef PAGES_CAN_PURGE_LAZY + ehooks_default_purge_lazy, +#else + NULL, +#endif +#ifdef PAGES_CAN_PURGE_FORCED + ehooks_default_purge_forced, +#else + NULL, +#endif + ehooks_default_split, + ehooks_default_merge +}; diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/emap.c b/src/duckdb/extension/jemalloc/jemalloc/src/emap.c new file mode 100644 index 000000000..f7d5c25a5 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/emap.c @@ -0,0 +1,386 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/emap.h" + +enum emap_lock_result_e { + emap_lock_result_success, + emap_lock_result_failure, + emap_lock_result_no_extent +}; +typedef enum emap_lock_result_e emap_lock_result_t; + +bool +emap_init(emap_t *emap, base_t *base, bool zeroed) { + return rtree_new(&emap->rtree, base, zeroed); +} + +void +emap_update_edata_state(tsdn_t *tsdn, emap_t *emap, edata_t *edata, + extent_state_t state) { + witness_assert_positive_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE); + + edata_state_set(edata, state); + + EMAP_DECLARE_RTREE_CTX; + rtree_leaf_elm_t *elm1 = rtree_leaf_elm_lookup(tsdn, &emap->rtree, + rtree_ctx, (uintptr_t)edata_base_get(edata), /* dependent */ true, + /* init_missing */ false); + assert(elm1 != NULL); + rtree_leaf_elm_t *elm2 = edata_size_get(edata) == PAGE ? NULL : + rtree_leaf_elm_lookup(tsdn, &emap->rtree, rtree_ctx, + (uintptr_t)edata_last_get(edata), /* dependent */ true, + /* init_missing */ false); + + rtree_leaf_elm_state_update(tsdn, &emap->rtree, elm1, elm2, state); + + emap_assert_mapped(tsdn, emap, edata); +} + +static inline edata_t * +emap_try_acquire_edata_neighbor_impl(tsdn_t *tsdn, emap_t *emap, edata_t *edata, + extent_pai_t pai, extent_state_t expected_state, bool forward, + bool expanding) { + witness_assert_positive_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE); + assert(!edata_guarded_get(edata)); + assert(!expanding || forward); + assert(!edata_state_in_transition(expected_state)); + assert(expected_state == extent_state_dirty || + expected_state == extent_state_muzzy || + expected_state == extent_state_retained); + + void *neighbor_addr = forward ? edata_past_get(edata) : + edata_before_get(edata); + /* + * This is subtle; the rtree code asserts that its input pointer is + * non-NULL, and this is a useful thing to check. But it's possible + * that edata corresponds to an address of (void *)PAGE (in practice, + * this has only been observed on FreeBSD when address-space + * randomization is on, but it could in principle happen anywhere). In + * this case, edata_before_get(edata) is NULL, triggering the assert. 
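+	 * Hence the explicit NULL check below, rather than letting the
+	 * rtree lookup assert on a NULL key.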
+ */ + if (neighbor_addr == NULL) { + return NULL; + } + + EMAP_DECLARE_RTREE_CTX; + rtree_leaf_elm_t *elm = rtree_leaf_elm_lookup(tsdn, &emap->rtree, + rtree_ctx, (uintptr_t)neighbor_addr, /* dependent*/ false, + /* init_missing */ false); + if (elm == NULL) { + return NULL; + } + + rtree_contents_t neighbor_contents = rtree_leaf_elm_read(tsdn, + &emap->rtree, elm, /* dependent */ false); + if (!extent_can_acquire_neighbor(edata, neighbor_contents, pai, + expected_state, forward, expanding)) { + return NULL; + } + + /* From this point, the neighbor edata can be safely acquired. */ + edata_t *neighbor = neighbor_contents.edata; + assert(edata_state_get(neighbor) == expected_state); + emap_update_edata_state(tsdn, emap, neighbor, extent_state_merging); + if (expanding) { + extent_assert_can_expand(edata, neighbor); + } else { + extent_assert_can_coalesce(edata, neighbor); + } + + return neighbor; +} + +edata_t * +emap_try_acquire_edata_neighbor(tsdn_t *tsdn, emap_t *emap, edata_t *edata, + extent_pai_t pai, extent_state_t expected_state, bool forward) { + return emap_try_acquire_edata_neighbor_impl(tsdn, emap, edata, pai, + expected_state, forward, /* expand */ false); +} + +edata_t * +emap_try_acquire_edata_neighbor_expand(tsdn_t *tsdn, emap_t *emap, + edata_t *edata, extent_pai_t pai, extent_state_t expected_state) { + /* Try expanding forward. */ + return emap_try_acquire_edata_neighbor_impl(tsdn, emap, edata, pai, + expected_state, /* forward */ true, /* expand */ true); +} + +void +emap_release_edata(tsdn_t *tsdn, emap_t *emap, edata_t *edata, + extent_state_t new_state) { + assert(emap_edata_in_transition(tsdn, emap, edata)); + assert(emap_edata_is_acquired(tsdn, emap, edata)); + + emap_update_edata_state(tsdn, emap, edata, new_state); +} + +static bool +emap_rtree_leaf_elms_lookup(tsdn_t *tsdn, emap_t *emap, rtree_ctx_t *rtree_ctx, + const edata_t *edata, bool dependent, bool init_missing, + rtree_leaf_elm_t **r_elm_a, rtree_leaf_elm_t **r_elm_b) { + *r_elm_a = rtree_leaf_elm_lookup(tsdn, &emap->rtree, rtree_ctx, + (uintptr_t)edata_base_get(edata), dependent, init_missing); + if (!dependent && *r_elm_a == NULL) { + return true; + } + assert(*r_elm_a != NULL); + + *r_elm_b = rtree_leaf_elm_lookup(tsdn, &emap->rtree, rtree_ctx, + (uintptr_t)edata_last_get(edata), dependent, init_missing); + if (!dependent && *r_elm_b == NULL) { + return true; + } + assert(*r_elm_b != NULL); + + return false; +} + +static void +emap_rtree_write_acquired(tsdn_t *tsdn, emap_t *emap, rtree_leaf_elm_t *elm_a, + rtree_leaf_elm_t *elm_b, edata_t *edata, szind_t szind, bool slab) { + rtree_contents_t contents; + contents.edata = edata; + contents.metadata.szind = szind; + contents.metadata.slab = slab; + contents.metadata.is_head = (edata == NULL) ? false : + edata_is_head_get(edata); + contents.metadata.state = (edata == NULL) ? 
0 : edata_state_get(edata); + rtree_leaf_elm_write(tsdn, &emap->rtree, elm_a, contents); + if (elm_b != NULL) { + rtree_leaf_elm_write(tsdn, &emap->rtree, elm_b, contents); + } +} + +bool +emap_register_boundary(tsdn_t *tsdn, emap_t *emap, edata_t *edata, + szind_t szind, bool slab) { + assert(edata_state_get(edata) == extent_state_active); + EMAP_DECLARE_RTREE_CTX; + + rtree_leaf_elm_t *elm_a, *elm_b; + bool err = emap_rtree_leaf_elms_lookup(tsdn, emap, rtree_ctx, edata, + false, true, &elm_a, &elm_b); + if (err) { + return true; + } + assert(rtree_leaf_elm_read(tsdn, &emap->rtree, elm_a, + /* dependent */ false).edata == NULL); + assert(rtree_leaf_elm_read(tsdn, &emap->rtree, elm_b, + /* dependent */ false).edata == NULL); + emap_rtree_write_acquired(tsdn, emap, elm_a, elm_b, edata, szind, slab); + return false; +} + +/* Invoked *after* emap_register_boundary. */ +void +emap_register_interior(tsdn_t *tsdn, emap_t *emap, edata_t *edata, + szind_t szind) { + EMAP_DECLARE_RTREE_CTX; + + assert(edata_slab_get(edata)); + assert(edata_state_get(edata) == extent_state_active); + + if (config_debug) { + /* Making sure the boundary is registered already. */ + rtree_leaf_elm_t *elm_a, *elm_b; + bool err = emap_rtree_leaf_elms_lookup(tsdn, emap, rtree_ctx, + edata, /* dependent */ true, /* init_missing */ false, + &elm_a, &elm_b); + assert(!err); + rtree_contents_t contents_a, contents_b; + contents_a = rtree_leaf_elm_read(tsdn, &emap->rtree, elm_a, + /* dependent */ true); + contents_b = rtree_leaf_elm_read(tsdn, &emap->rtree, elm_b, + /* dependent */ true); + assert(contents_a.edata == edata && contents_b.edata == edata); + assert(contents_a.metadata.slab && contents_b.metadata.slab); + } + + rtree_contents_t contents; + contents.edata = edata; + contents.metadata.szind = szind; + contents.metadata.slab = true; + contents.metadata.state = extent_state_active; + contents.metadata.is_head = false; /* Not allowed to access. */ + + assert(edata_size_get(edata) > (2 << LG_PAGE)); + rtree_write_range(tsdn, &emap->rtree, rtree_ctx, + (uintptr_t)edata_base_get(edata) + PAGE, + (uintptr_t)edata_last_get(edata) - PAGE, contents); +} + +void +emap_deregister_boundary(tsdn_t *tsdn, emap_t *emap, edata_t *edata) { + /* + * The edata must be either in an acquired state, or protected by state + * based locks. 
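+ * (emap_edata_is_acquired() below distinguishes the two cases; the
+ * witness assertion applies only to the lock-protected one.)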
+ */ + if (!emap_edata_is_acquired(tsdn, emap, edata)) { + witness_assert_positive_depth_to_rank( + tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE); + } + + EMAP_DECLARE_RTREE_CTX; + rtree_leaf_elm_t *elm_a, *elm_b; + + emap_rtree_leaf_elms_lookup(tsdn, emap, rtree_ctx, edata, + true, false, &elm_a, &elm_b); + emap_rtree_write_acquired(tsdn, emap, elm_a, elm_b, NULL, SC_NSIZES, + false); +} + +void +emap_deregister_interior(tsdn_t *tsdn, emap_t *emap, edata_t *edata) { + EMAP_DECLARE_RTREE_CTX; + + assert(edata_slab_get(edata)); + if (edata_size_get(edata) > (2 << LG_PAGE)) { + rtree_clear_range(tsdn, &emap->rtree, rtree_ctx, + (uintptr_t)edata_base_get(edata) + PAGE, + (uintptr_t)edata_last_get(edata) - PAGE); + } +} + +void +emap_remap(tsdn_t *tsdn, emap_t *emap, edata_t *edata, szind_t szind, + bool slab) { + EMAP_DECLARE_RTREE_CTX; + + if (szind != SC_NSIZES) { + rtree_contents_t contents; + contents.edata = edata; + contents.metadata.szind = szind; + contents.metadata.slab = slab; + contents.metadata.is_head = edata_is_head_get(edata); + contents.metadata.state = edata_state_get(edata); + + rtree_write(tsdn, &emap->rtree, rtree_ctx, + (uintptr_t)edata_addr_get(edata), contents); + /* + * Recall that this is called only for active->inactive and + * inactive->active transitions (since only active extents have + * meaningful values for szind and slab). Active, non-slab + * extents only need to handle lookups at their head (on + * deallocation), so we don't bother filling in the end + * boundary. + * + * For slab extents, we do the end-mapping change. This still + * leaves the interior unmodified; an emap_register_interior + * call is coming in those cases, though. + */ + if (slab && edata_size_get(edata) > PAGE) { + uintptr_t key = (uintptr_t)edata_past_get(edata) + - (uintptr_t)PAGE; + rtree_write(tsdn, &emap->rtree, rtree_ctx, key, + contents); + } + } +} + +bool +emap_split_prepare(tsdn_t *tsdn, emap_t *emap, emap_prepare_t *prepare, + edata_t *edata, size_t size_a, edata_t *trail, size_t size_b) { + EMAP_DECLARE_RTREE_CTX; + + /* + * We use incorrect constants for things like arena ind, zero, ranged, + * and commit state, and head status. This is a fake edata_t, used to + * facilitate a lookup. + */ + edata_t lead = {0}; + edata_init(&lead, 0U, edata_addr_get(edata), size_a, false, 0, 0, + extent_state_active, false, false, EXTENT_PAI_PAC, EXTENT_NOT_HEAD); + + emap_rtree_leaf_elms_lookup(tsdn, emap, rtree_ctx, &lead, false, true, + &prepare->lead_elm_a, &prepare->lead_elm_b); + emap_rtree_leaf_elms_lookup(tsdn, emap, rtree_ctx, trail, false, true, + &prepare->trail_elm_a, &prepare->trail_elm_b); + + if (prepare->lead_elm_a == NULL || prepare->lead_elm_b == NULL + || prepare->trail_elm_a == NULL || prepare->trail_elm_b == NULL) { + return true; + } + return false; +} + +void +emap_split_commit(tsdn_t *tsdn, emap_t *emap, emap_prepare_t *prepare, + edata_t *lead, size_t size_a, edata_t *trail, size_t size_b) { + /* + * We should think about not writing to the lead leaf element. We can + * get into situations where a racing realloc-like call can disagree + * with a size lookup request. I think it's fine to declare that these + * situations are race bugs, but there's an argument to be made that for + * things like xallocx, a size lookup call should return either the old + * size or the new size, but not anything else. 
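+ * (For now, the lead elements are rewritten unconditionally below all
+ * the same.)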
+ */ + emap_rtree_write_acquired(tsdn, emap, prepare->lead_elm_a, + prepare->lead_elm_b, lead, SC_NSIZES, /* slab */ false); + emap_rtree_write_acquired(tsdn, emap, prepare->trail_elm_a, + prepare->trail_elm_b, trail, SC_NSIZES, /* slab */ false); +} + +void +emap_merge_prepare(tsdn_t *tsdn, emap_t *emap, emap_prepare_t *prepare, + edata_t *lead, edata_t *trail) { + EMAP_DECLARE_RTREE_CTX; + emap_rtree_leaf_elms_lookup(tsdn, emap, rtree_ctx, lead, true, false, + &prepare->lead_elm_a, &prepare->lead_elm_b); + emap_rtree_leaf_elms_lookup(tsdn, emap, rtree_ctx, trail, true, false, + &prepare->trail_elm_a, &prepare->trail_elm_b); +} + +void +emap_merge_commit(tsdn_t *tsdn, emap_t *emap, emap_prepare_t *prepare, + edata_t *lead, edata_t *trail) { + rtree_contents_t clear_contents; + clear_contents.edata = NULL; + clear_contents.metadata.szind = SC_NSIZES; + clear_contents.metadata.slab = false; + clear_contents.metadata.is_head = false; + clear_contents.metadata.state = (extent_state_t)0; + + if (prepare->lead_elm_b != NULL) { + rtree_leaf_elm_write(tsdn, &emap->rtree, + prepare->lead_elm_b, clear_contents); + } + + rtree_leaf_elm_t *merged_b; + if (prepare->trail_elm_b != NULL) { + rtree_leaf_elm_write(tsdn, &emap->rtree, + prepare->trail_elm_a, clear_contents); + merged_b = prepare->trail_elm_b; + } else { + merged_b = prepare->trail_elm_a; + } + + emap_rtree_write_acquired(tsdn, emap, prepare->lead_elm_a, merged_b, + lead, SC_NSIZES, false); +} + +void +emap_do_assert_mapped(tsdn_t *tsdn, emap_t *emap, edata_t *edata) { + EMAP_DECLARE_RTREE_CTX; + + rtree_contents_t contents = rtree_read(tsdn, &emap->rtree, rtree_ctx, + (uintptr_t)edata_base_get(edata)); + assert(contents.edata == edata); + assert(contents.metadata.is_head == edata_is_head_get(edata)); + assert(contents.metadata.state == edata_state_get(edata)); +} + +void +emap_do_assert_not_mapped(tsdn_t *tsdn, emap_t *emap, edata_t *edata) { + emap_full_alloc_ctx_t context1 = {0}; + emap_full_alloc_ctx_try_lookup(tsdn, emap, edata_base_get(edata), + &context1); + assert(context1.edata == NULL); + + emap_full_alloc_ctx_t context2 = {0}; + emap_full_alloc_ctx_try_lookup(tsdn, emap, edata_last_get(edata), + &context2); + assert(context2.edata == NULL); +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/eset.c b/src/duckdb/extension/jemalloc/jemalloc/src/eset.c new file mode 100644 index 000000000..6f8f335e1 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/eset.c @@ -0,0 +1,282 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/eset.h" + +#define ESET_NPSIZES (SC_NPSIZES + 1) + +static void +eset_bin_init(eset_bin_t *bin) { + edata_heap_new(&bin->heap); + /* + * heap_min doesn't need initialization; it gets filled in when the bin + * goes from empty to non-empty. 
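+ * (eset_insert() sets heap_min on the first insert into an empty bin,
+ * and updates it whenever a lower element arrives.)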
+ */ +} + +static void +eset_bin_stats_init(eset_bin_stats_t *bin_stats) { + atomic_store_zu(&bin_stats->nextents, 0, ATOMIC_RELAXED); + atomic_store_zu(&bin_stats->nbytes, 0, ATOMIC_RELAXED); +} + +void +eset_init(eset_t *eset, extent_state_t state) { + for (unsigned i = 0; i < ESET_NPSIZES; i++) { + eset_bin_init(&eset->bins[i]); + eset_bin_stats_init(&eset->bin_stats[i]); + } + fb_init(eset->bitmap, ESET_NPSIZES); + edata_list_inactive_init(&eset->lru); + eset->state = state; +} + +size_t +eset_npages_get(eset_t *eset) { + return atomic_load_zu(&eset->npages, ATOMIC_RELAXED); +} + +size_t +eset_nextents_get(eset_t *eset, pszind_t pind) { + return atomic_load_zu(&eset->bin_stats[pind].nextents, ATOMIC_RELAXED); +} + +size_t +eset_nbytes_get(eset_t *eset, pszind_t pind) { + return atomic_load_zu(&eset->bin_stats[pind].nbytes, ATOMIC_RELAXED); +} + +static void +eset_stats_add(eset_t *eset, pszind_t pind, size_t sz) { + size_t cur = atomic_load_zu(&eset->bin_stats[pind].nextents, + ATOMIC_RELAXED); + atomic_store_zu(&eset->bin_stats[pind].nextents, cur + 1, + ATOMIC_RELAXED); + cur = atomic_load_zu(&eset->bin_stats[pind].nbytes, ATOMIC_RELAXED); + atomic_store_zu(&eset->bin_stats[pind].nbytes, cur + sz, + ATOMIC_RELAXED); +} + +static void +eset_stats_sub(eset_t *eset, pszind_t pind, size_t sz) { + size_t cur = atomic_load_zu(&eset->bin_stats[pind].nextents, + ATOMIC_RELAXED); + atomic_store_zu(&eset->bin_stats[pind].nextents, cur - 1, + ATOMIC_RELAXED); + cur = atomic_load_zu(&eset->bin_stats[pind].nbytes, ATOMIC_RELAXED); + atomic_store_zu(&eset->bin_stats[pind].nbytes, cur - sz, + ATOMIC_RELAXED); +} + +void +eset_insert(eset_t *eset, edata_t *edata) { + assert(edata_state_get(edata) == eset->state); + + size_t size = edata_size_get(edata); + size_t psz = sz_psz_quantize_floor(size); + pszind_t pind = sz_psz2ind(psz); + + edata_cmp_summary_t edata_cmp_summary = edata_cmp_summary_get(edata); + if (edata_heap_empty(&eset->bins[pind].heap)) { + fb_set(eset->bitmap, ESET_NPSIZES, (size_t)pind); + /* Only element is automatically the min element. */ + eset->bins[pind].heap_min = edata_cmp_summary; + } else { + /* + * There's already a min element; update the summary if we're + * about to insert a lower one. + */ + if (edata_cmp_summary_comp(edata_cmp_summary, + eset->bins[pind].heap_min) < 0) { + eset->bins[pind].heap_min = edata_cmp_summary; + } + } + edata_heap_insert(&eset->bins[pind].heap, edata); + + if (config_stats) { + eset_stats_add(eset, pind, size); + } + + edata_list_inactive_append(&eset->lru, edata); + size_t npages = size >> LG_PAGE; + /* + * All modifications to npages hold the mutex (as asserted above), so we + * don't need an atomic fetch-add; we can get by with a load followed by + * a store. 
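+ * In effect this is npages += size >> LG_PAGE, kept visible to
+ * relaxed-order readers such as eset_npages_get().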
+ */ + size_t cur_eset_npages = + atomic_load_zu(&eset->npages, ATOMIC_RELAXED); + atomic_store_zu(&eset->npages, cur_eset_npages + npages, + ATOMIC_RELAXED); +} + +void +eset_remove(eset_t *eset, edata_t *edata) { + assert(edata_state_get(edata) == eset->state || + edata_state_in_transition(edata_state_get(edata))); + + size_t size = edata_size_get(edata); + size_t psz = sz_psz_quantize_floor(size); + pszind_t pind = sz_psz2ind(psz); + if (config_stats) { + eset_stats_sub(eset, pind, size); + } + + edata_cmp_summary_t edata_cmp_summary = edata_cmp_summary_get(edata); + edata_heap_remove(&eset->bins[pind].heap, edata); + if (edata_heap_empty(&eset->bins[pind].heap)) { + fb_unset(eset->bitmap, ESET_NPSIZES, (size_t)pind); + } else { + /* + * This is a little weird; we compare if the summaries are + * equal, rather than if the edata we removed was the heap + * minimum. The reason why is that getting the heap minimum + * can cause a pairing heap merge operation. We can avoid this + * if we only update the min if it's changed, in which case the + * summaries of the removed element and the min element should + * compare equal. + */ + if (edata_cmp_summary_comp(edata_cmp_summary, + eset->bins[pind].heap_min) == 0) { + eset->bins[pind].heap_min = edata_cmp_summary_get( + edata_heap_first(&eset->bins[pind].heap)); + } + } + edata_list_inactive_remove(&eset->lru, edata); + size_t npages = size >> LG_PAGE; + /* + * As in eset_insert, we hold eset->mtx and so don't need atomic + * operations for updating eset->npages. + */ + size_t cur_extents_npages = + atomic_load_zu(&eset->npages, ATOMIC_RELAXED); + assert(cur_extents_npages >= npages); + atomic_store_zu(&eset->npages, + cur_extents_npages - (size >> LG_PAGE), ATOMIC_RELAXED); +} + +/* + * Find an extent with size [min_size, max_size) to satisfy the alignment + * requirement. For each size, try only the first extent in the heap. + */ +static edata_t * +eset_fit_alignment(eset_t *eset, size_t min_size, size_t max_size, + size_t alignment) { + pszind_t pind = sz_psz2ind(sz_psz_quantize_ceil(min_size)); + pszind_t pind_max = sz_psz2ind(sz_psz_quantize_ceil(max_size)); + + for (pszind_t i = + (pszind_t)fb_ffs(eset->bitmap, ESET_NPSIZES, (size_t)pind); + i < pind_max; + i = (pszind_t)fb_ffs(eset->bitmap, ESET_NPSIZES, (size_t)i + 1)) { + assert(i < SC_NPSIZES); + assert(!edata_heap_empty(&eset->bins[i].heap)); + edata_t *edata = edata_heap_first(&eset->bins[i].heap); + uintptr_t base = (uintptr_t)edata_base_get(edata); + size_t candidate_size = edata_size_get(edata); + assert(candidate_size >= min_size); + + uintptr_t next_align = ALIGNMENT_CEILING((uintptr_t)base, + PAGE_CEILING(alignment)); + if (base > next_align || base + candidate_size <= next_align) { + /* Overflow or not crossing the next alignment. */ + continue; + } + + size_t leadsize = next_align - base; + if (candidate_size - leadsize >= min_size) { + return edata; + } + } + + return NULL; +} + +/* + * Do first-fit extent selection, i.e. select the oldest/lowest extent that is + * large enough. + * + * lg_max_fit is the (log of the) maximum ratio between the requested size and + * the returned size that we'll allow. This can reduce fragmentation by + * avoiding reusing and splitting large extents for smaller sizes. In practice, + * it's set to opt_lg_extent_max_active_fit for the dirty eset and SC_PTR_BITS + * for others. 
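+ * For example, lg_max_fit == 2 lets a 16 KiB request be served from
+ * extents of up to 64 KiB, but from nothing larger.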
+ */ +static edata_t * +eset_first_fit(eset_t *eset, size_t size, bool exact_only, + unsigned lg_max_fit) { + edata_t *ret = NULL; + edata_cmp_summary_t ret_summ JEMALLOC_CC_SILENCE_INIT({0}); + + pszind_t pind = sz_psz2ind(sz_psz_quantize_ceil(size)); + + if (exact_only) { + return edata_heap_empty(&eset->bins[pind].heap) ? NULL : + edata_heap_first(&eset->bins[pind].heap); + } + + for (pszind_t i = + (pszind_t)fb_ffs(eset->bitmap, ESET_NPSIZES, (size_t)pind); + i < ESET_NPSIZES; + i = (pszind_t)fb_ffs(eset->bitmap, ESET_NPSIZES, (size_t)i + 1)) { + assert(!edata_heap_empty(&eset->bins[i].heap)); + if (lg_max_fit == SC_PTR_BITS) { + /* + * We'll shift by this below, and shifting out all the + * bits is undefined. Decreasing is safe, since the + * page size is larger than 1 byte. + */ + lg_max_fit = SC_PTR_BITS - 1; + } + if ((sz_pind2sz(i) >> lg_max_fit) > size) { + break; + } + if (ret == NULL || edata_cmp_summary_comp( + eset->bins[i].heap_min, ret_summ) < 0) { + /* + * We grab the edata as early as possible, even though + * we might change it later. Practically, a large + * portion of eset_fit calls succeed at the first valid + * index, so this doesn't cost much, and we get the + * effect of prefetching the edata as early as possible. + */ + edata_t *edata = edata_heap_first(&eset->bins[i].heap); + assert(edata_size_get(edata) >= size); + assert(ret == NULL || edata_snad_comp(edata, ret) < 0); + assert(ret == NULL || edata_cmp_summary_comp( + eset->bins[i].heap_min, + edata_cmp_summary_get(edata)) == 0); + ret = edata; + ret_summ = eset->bins[i].heap_min; + } + if (i == SC_NPSIZES) { + break; + } + assert(i < SC_NPSIZES); + } + + return ret; +} + +edata_t * +eset_fit(eset_t *eset, size_t esize, size_t alignment, bool exact_only, + unsigned lg_max_fit) { + size_t max_size = esize + PAGE_CEILING(alignment) - PAGE; + /* Beware size_t wrap-around. */ + if (max_size < esize) { + return NULL; + } + + edata_t *edata = eset_first_fit(eset, max_size, exact_only, lg_max_fit); + + if (alignment > PAGE && edata == NULL) { + /* + * max_size guarantees the alignment requirement but is rather + * pessimistic. Next we try to satisfy the aligned allocation + * with sizes in [esize, max_size). + */ + edata = eset_fit_alignment(eset, esize, max_size, alignment); + } + + return edata; +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/exp_grow.c b/src/duckdb/extension/jemalloc/jemalloc/src/exp_grow.c new file mode 100644 index 000000000..386471f49 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/exp_grow.c @@ -0,0 +1,8 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +void +exp_grow_init(exp_grow_t *exp_grow) { + exp_grow->next = sz_psz2ind(HUGEPAGE); + exp_grow->limit = sz_psz2ind(SC_LARGE_MAXCLASS); +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/extent.c b/src/duckdb/extension/jemalloc/jemalloc/src/extent.c new file mode 100644 index 000000000..2efc7938b --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/extent.c @@ -0,0 +1,1333 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/emap.h" +#include "jemalloc/internal/extent_dss.h" +#include "jemalloc/internal/extent_mmap.h" +#include "jemalloc/internal/ph.h" +#include "jemalloc/internal/mutex.h" + +/******************************************************************************/ +/* Data. 
*/ + +size_t opt_lg_extent_max_active_fit = LG_EXTENT_MAX_ACTIVE_FIT_DEFAULT; + +static bool extent_commit_impl(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + size_t offset, size_t length, bool growing_retained); +static bool extent_purge_lazy_impl(tsdn_t *tsdn, ehooks_t *ehooks, + edata_t *edata, size_t offset, size_t length, bool growing_retained); +static bool extent_purge_forced_impl(tsdn_t *tsdn, ehooks_t *ehooks, + edata_t *edata, size_t offset, size_t length, bool growing_retained); +static edata_t *extent_split_impl(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + edata_t *edata, size_t size_a, size_t size_b, bool holding_core_locks); +static bool extent_merge_impl(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + edata_t *a, edata_t *b, bool holding_core_locks); + +/* Used exclusively for gdump triggering. */ +static atomic_zu_t curpages; +static atomic_zu_t highpages; + +/******************************************************************************/ +/* + * Function prototypes for static functions that are referenced prior to + * definition. + */ + +static void extent_deregister(tsdn_t *tsdn, pac_t *pac, edata_t *edata); +static edata_t *extent_recycle(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + ecache_t *ecache, edata_t *expand_edata, size_t usize, size_t alignment, + bool zero, bool *commit, bool growing_retained, bool guarded); +static edata_t *extent_try_coalesce(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + ecache_t *ecache, edata_t *edata, bool *coalesced); +static edata_t *extent_alloc_retained(tsdn_t *tsdn, pac_t *pac, + ehooks_t *ehooks, edata_t *expand_edata, size_t size, size_t alignment, + bool zero, bool *commit, bool guarded); +static bool extent_decommit_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, + edata_t *edata, size_t offset, size_t length); + +/******************************************************************************/ + +size_t +extent_sn_next(pac_t *pac) { + return atomic_fetch_add_zu(&pac->extent_sn_next, 1, ATOMIC_RELAXED); +} + +static inline bool +extent_may_force_decay(pac_t *pac) { + return !(pac_decay_ms_get(pac, extent_state_dirty) == -1 + || pac_decay_ms_get(pac, extent_state_muzzy) == -1); +} + +static bool +extent_try_delayed_coalesce(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + ecache_t *ecache, edata_t *edata) { + emap_update_edata_state(tsdn, pac->emap, edata, extent_state_active); + + bool coalesced; + edata = extent_try_coalesce(tsdn, pac, ehooks, ecache, + edata, &coalesced); + emap_update_edata_state(tsdn, pac->emap, edata, ecache->state); + + if (!coalesced) { + return true; + } + eset_insert(&ecache->eset, edata); + return false; +} + +edata_t * +ecache_alloc(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache, + edata_t *expand_edata, size_t size, size_t alignment, bool zero, + bool guarded) { + assert(size != 0); + assert(alignment != 0); + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + + bool commit = true; + edata_t *edata = extent_recycle(tsdn, pac, ehooks, ecache, expand_edata, + size, alignment, zero, &commit, false, guarded); + assert(edata == NULL || edata_pai_get(edata) == EXTENT_PAI_PAC); + assert(edata == NULL || edata_guarded_get(edata) == guarded); + return edata; +} + +edata_t * +ecache_alloc_grow(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache, + edata_t *expand_edata, size_t size, size_t alignment, bool zero, + bool guarded) { + assert(size != 0); + assert(alignment != 0); + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + + 
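+ /*
+ * Try recycling from / growing the retained extents first; fall back
+ * to mapping new memory below only when that fails.
+ */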
bool commit = true; + edata_t *edata = extent_alloc_retained(tsdn, pac, ehooks, expand_edata, + size, alignment, zero, &commit, guarded); + if (edata == NULL) { + if (opt_retain && expand_edata != NULL) { + /* + * When retain is enabled and trying to expand, we do + * not attempt extent_alloc_wrapper which does mmap that + * is very unlikely to succeed (unless it happens to be + * at the end). + */ + return NULL; + } + if (guarded) { + /* + * Means no cached guarded extents available (and no + * grow_retained was attempted). The pac_alloc flow + * will alloc regular extents to make new guarded ones. + */ + return NULL; + } + void *new_addr = (expand_edata == NULL) ? NULL : + edata_past_get(expand_edata); + edata = extent_alloc_wrapper(tsdn, pac, ehooks, new_addr, + size, alignment, zero, &commit, + /* growing_retained */ false); + } + + assert(edata == NULL || edata_pai_get(edata) == EXTENT_PAI_PAC); + return edata; +} + +void +ecache_dalloc(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache, + edata_t *edata) { + assert(edata_base_get(edata) != NULL); + assert(edata_size_get(edata) != 0); + assert(edata_pai_get(edata) == EXTENT_PAI_PAC); + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + + edata_addr_set(edata, edata_base_get(edata)); + edata_zeroed_set(edata, false); + + extent_record(tsdn, pac, ehooks, ecache, edata); +} + +edata_t * +ecache_evict(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + ecache_t *ecache, size_t npages_min) { + malloc_mutex_lock(tsdn, &ecache->mtx); + + /* + * Get the LRU coalesced extent, if any. If coalescing was delayed, + * the loop will iterate until the LRU extent is fully coalesced. + */ + edata_t *edata; + while (true) { + /* Get the LRU extent, if any. */ + eset_t *eset = &ecache->eset; + edata = edata_list_inactive_first(&eset->lru); + if (edata == NULL) { + /* + * Next check if there are guarded extents. They are + * more expensive to purge (since they are not + * mergeable), thus in favor of caching them longer. + */ + eset = &ecache->guarded_eset; + edata = edata_list_inactive_first(&eset->lru); + if (edata == NULL) { + goto label_return; + } + } + /* Check the eviction limit. */ + size_t extents_npages = ecache_npages_get(ecache); + if (extents_npages <= npages_min) { + edata = NULL; + goto label_return; + } + eset_remove(eset, edata); + if (!ecache->delay_coalesce || edata_guarded_get(edata)) { + break; + } + /* Try to coalesce. */ + if (extent_try_delayed_coalesce(tsdn, pac, ehooks, ecache, + edata)) { + break; + } + /* + * The LRU extent was just coalesced and the result placed in + * the LRU at its neighbor's position. Start over. + */ + } + + /* + * Either mark the extent active or deregister it to protect against + * concurrent operations. + */ + switch (ecache->state) { + case extent_state_dirty: + case extent_state_muzzy: + emap_update_edata_state(tsdn, pac->emap, edata, + extent_state_active); + break; + case extent_state_retained: + extent_deregister(tsdn, pac, edata); + break; + case extent_state_active: + case extent_state_transition: + case extent_state_merging: + default: + not_reached(); + } + +label_return: + malloc_mutex_unlock(tsdn, &ecache->mtx); + return edata; +} + +/* + * This can only happen when we fail to allocate a new extent struct (which + * indicates OOM), e.g. when trying to split an existing extent. 
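+ * The pages are purged below before the edata struct is returned to
+ * the cache, so only the address range itself is abandoned.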
+ */ +static void +extents_abandon_vm(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache, + edata_t *edata, bool growing_retained) { + size_t sz = edata_size_get(edata); + if (config_stats) { + atomic_fetch_add_zu(&pac->stats->abandoned_vm, sz, + ATOMIC_RELAXED); + } + /* + * Leak extent after making sure its pages have already been purged, so + * that this is only a virtual memory leak. + */ + if (ecache->state == extent_state_dirty) { + if (extent_purge_lazy_impl(tsdn, ehooks, edata, 0, sz, + growing_retained)) { + extent_purge_forced_impl(tsdn, ehooks, edata, 0, + edata_size_get(edata), growing_retained); + } + } + edata_cache_put(tsdn, pac->edata_cache, edata); +} + +static void +extent_deactivate_locked_impl(tsdn_t *tsdn, pac_t *pac, ecache_t *ecache, + edata_t *edata) { + malloc_mutex_assert_owner(tsdn, &ecache->mtx); + assert(edata_arena_ind_get(edata) == ecache_ind_get(ecache)); + + emap_update_edata_state(tsdn, pac->emap, edata, ecache->state); + eset_t *eset = edata_guarded_get(edata) ? &ecache->guarded_eset : + &ecache->eset; + eset_insert(eset, edata); +} + +static void +extent_deactivate_locked(tsdn_t *tsdn, pac_t *pac, ecache_t *ecache, + edata_t *edata) { + assert(edata_state_get(edata) == extent_state_active); + extent_deactivate_locked_impl(tsdn, pac, ecache, edata); +} + +static void +extent_deactivate_check_state_locked(tsdn_t *tsdn, pac_t *pac, ecache_t *ecache, + edata_t *edata, extent_state_t expected_state) { + assert(edata_state_get(edata) == expected_state); + extent_deactivate_locked_impl(tsdn, pac, ecache, edata); +} + +static void +extent_activate_locked(tsdn_t *tsdn, pac_t *pac, ecache_t *ecache, eset_t *eset, + edata_t *edata) { + assert(edata_arena_ind_get(edata) == ecache_ind_get(ecache)); + assert(edata_state_get(edata) == ecache->state || + edata_state_get(edata) == extent_state_merging); + + eset_remove(eset, edata); + emap_update_edata_state(tsdn, pac->emap, edata, extent_state_active); +} + +void +extent_gdump_add(tsdn_t *tsdn, const edata_t *edata) { + cassert(config_prof); + /* prof_gdump() requirement. */ + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + + if (opt_prof && edata_state_get(edata) == extent_state_active) { + size_t nadd = edata_size_get(edata) >> LG_PAGE; + size_t cur = atomic_fetch_add_zu(&curpages, nadd, + ATOMIC_RELAXED) + nadd; + size_t high = atomic_load_zu(&highpages, ATOMIC_RELAXED); + while (cur > high && !atomic_compare_exchange_weak_zu( + &highpages, &high, cur, ATOMIC_RELAXED, ATOMIC_RELAXED)) { + /* + * Don't refresh cur, because it may have decreased + * since this thread lost the highpages update race. + * Note that high is updated in case of CAS failure. + */ + } + if (cur > high && prof_gdump_get_unlocked()) { + prof_gdump(tsdn); + } + } +} + +static void +extent_gdump_sub(tsdn_t *tsdn, const edata_t *edata) { + cassert(config_prof); + + if (opt_prof && edata_state_get(edata) == extent_state_active) { + size_t nsub = edata_size_get(edata) >> LG_PAGE; + assert(atomic_load_zu(&curpages, ATOMIC_RELAXED) >= nsub); + atomic_fetch_sub_zu(&curpages, nsub, ATOMIC_RELAXED); + } +} + +static bool +extent_register_impl(tsdn_t *tsdn, pac_t *pac, edata_t *edata, bool gdump_add) { + assert(edata_state_get(edata) == extent_state_active); + /* + * No locking needed, as the edata must be in active state, which + * prevents other threads from accessing the edata. 
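+ * (The only failure mode is emap_register_boundary() failing to get
+ * rtree leaf elements, which we report by returning true.)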
+ */ + if (emap_register_boundary(tsdn, pac->emap, edata, SC_NSIZES, + /* slab */ false)) { + return true; + } + + if (config_prof && gdump_add) { + extent_gdump_add(tsdn, edata); + } + + return false; +} + +static bool +extent_register(tsdn_t *tsdn, pac_t *pac, edata_t *edata) { + return extent_register_impl(tsdn, pac, edata, true); +} + +static bool +extent_register_no_gdump_add(tsdn_t *tsdn, pac_t *pac, edata_t *edata) { + return extent_register_impl(tsdn, pac, edata, false); +} + +static void +extent_reregister(tsdn_t *tsdn, pac_t *pac, edata_t *edata) { + bool err = extent_register(tsdn, pac, edata); + assert(!err); +} + +/* + * Removes all pointers to the given extent from the global rtree. + */ +static void +extent_deregister_impl(tsdn_t *tsdn, pac_t *pac, edata_t *edata, + bool gdump) { + emap_deregister_boundary(tsdn, pac->emap, edata); + + if (config_prof && gdump) { + extent_gdump_sub(tsdn, edata); + } +} + +static void +extent_deregister(tsdn_t *tsdn, pac_t *pac, edata_t *edata) { + extent_deregister_impl(tsdn, pac, edata, true); +} + +static void +extent_deregister_no_gdump_sub(tsdn_t *tsdn, pac_t *pac, + edata_t *edata) { + extent_deregister_impl(tsdn, pac, edata, false); +} + +/* + * Tries to find and remove an extent from ecache that can be used for the + * given allocation request. + */ +static edata_t * +extent_recycle_extract(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + ecache_t *ecache, edata_t *expand_edata, size_t size, size_t alignment, + bool guarded) { + malloc_mutex_assert_owner(tsdn, &ecache->mtx); + assert(alignment > 0); + if (config_debug && expand_edata != NULL) { + /* + * Non-NULL expand_edata indicates in-place expanding realloc. + * new_addr must either refer to a non-existing extent, or to + * the base of an extant extent, since only active slabs support + * interior lookups (which of course cannot be recycled). + */ + void *new_addr = edata_past_get(expand_edata); + assert(PAGE_ADDR2BASE(new_addr) == new_addr); + assert(alignment <= PAGE); + } + + edata_t *edata; + eset_t *eset = guarded ? &ecache->guarded_eset : &ecache->eset; + if (expand_edata != NULL) { + edata = emap_try_acquire_edata_neighbor_expand(tsdn, pac->emap, + expand_edata, EXTENT_PAI_PAC, ecache->state); + if (edata != NULL) { + /* NOLINTNEXTLINE(readability-suspicious-call-argument) */ + extent_assert_can_expand(expand_edata, edata); + if (edata_size_get(edata) < size) { + emap_release_edata(tsdn, pac->emap, edata, + ecache->state); + edata = NULL; + } + } + } else { + /* + * A large extent might be broken up from its original size to + * some small size to satisfy a small request. When that small + * request is freed, though, it won't merge back with the larger + * extent if delayed coalescing is on. The large extent can + * then no longer satisfy a request for its original size. To + * limit this effect, when delayed coalescing is enabled, we + * put a cap on how big an extent we can split for a request. + */ + unsigned lg_max_fit = ecache->delay_coalesce + ? (unsigned)opt_lg_extent_max_active_fit : SC_PTR_BITS; + + /* + * If split and merge are not allowed (Windows w/o retain), try + * exact fit only. + * + * For simplicity purposes, splitting guarded extents is not + * supported. Hence, we do only exact fit for guarded + * allocations. 
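+ * As a result, a guarded request below can only be served by a cached
+ * guarded extent of exactly the requested size.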
+ */ + bool exact_only = (!maps_coalesce && !opt_retain) || guarded; + edata = eset_fit(eset, size, alignment, exact_only, + lg_max_fit); + } + if (edata == NULL) { + return NULL; + } + assert(!guarded || edata_guarded_get(edata)); + extent_activate_locked(tsdn, pac, ecache, eset, edata); + + return edata; +} + +/* + * Given an allocation request and an extent guaranteed to be able to satisfy + * it, this splits off lead and trail extents, leaving edata pointing to an + * extent satisfying the allocation. + * This function doesn't put lead or trail into any ecache; it's the caller's + * job to ensure that they can be reused. + */ +typedef enum { + /* + * Split successfully. lead, edata, and trail, are modified to extents + * describing the ranges before, in, and after the given allocation. + */ + extent_split_interior_ok, + /* + * The extent can't satisfy the given allocation request. None of the + * input edata_t *s are touched. + */ + extent_split_interior_cant_alloc, + /* + * In a potentially invalid state. Must leak (if *to_leak is non-NULL), + * and salvage what's still salvageable (if *to_salvage is non-NULL). + * None of lead, edata, or trail are valid. + */ + extent_split_interior_error +} extent_split_interior_result_t; + +static extent_split_interior_result_t +extent_split_interior(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + /* The result of splitting, in case of success. */ + edata_t **edata, edata_t **lead, edata_t **trail, + /* The mess to clean up, in case of error. */ + edata_t **to_leak, edata_t **to_salvage, + edata_t *expand_edata, size_t size, size_t alignment) { + size_t leadsize = ALIGNMENT_CEILING((uintptr_t)edata_base_get(*edata), + PAGE_CEILING(alignment)) - (uintptr_t)edata_base_get(*edata); + assert(expand_edata == NULL || leadsize == 0); + if (edata_size_get(*edata) < leadsize + size) { + return extent_split_interior_cant_alloc; + } + size_t trailsize = edata_size_get(*edata) - leadsize - size; + + *lead = NULL; + *trail = NULL; + *to_leak = NULL; + *to_salvage = NULL; + + /* Split the lead. */ + if (leadsize != 0) { + assert(!edata_guarded_get(*edata)); + *lead = *edata; + *edata = extent_split_impl(tsdn, pac, ehooks, *lead, leadsize, + size + trailsize, /* holding_core_locks*/ true); + if (*edata == NULL) { + *to_leak = *lead; + *lead = NULL; + return extent_split_interior_error; + } + } + + /* Split the trail. */ + if (trailsize != 0) { + assert(!edata_guarded_get(*edata)); + *trail = extent_split_impl(tsdn, pac, ehooks, *edata, size, + trailsize, /* holding_core_locks */ true); + if (*trail == NULL) { + *to_leak = *edata; + *to_salvage = *lead; + *lead = NULL; + *edata = NULL; + return extent_split_interior_error; + } + } + + return extent_split_interior_ok; +} + +/* + * This fulfills the indicated allocation request out of the given extent (which + * the caller should have ensured was big enough). If there's any unused space + * before or after the resulting allocation, that space is given its own extent + * and put back into ecache. 
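+ * (extent_deactivate_locked() below is what hands lead and trail back
+ * to the ecache.)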
+ */ +static edata_t * +extent_recycle_split(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + ecache_t *ecache, edata_t *expand_edata, size_t size, size_t alignment, + edata_t *edata, bool growing_retained) { + assert(!edata_guarded_get(edata) || size == edata_size_get(edata)); + malloc_mutex_assert_owner(tsdn, &ecache->mtx); + + edata_t *lead; + edata_t *trail; + edata_t *to_leak JEMALLOC_CC_SILENCE_INIT(NULL); + edata_t *to_salvage JEMALLOC_CC_SILENCE_INIT(NULL); + + extent_split_interior_result_t result = extent_split_interior( + tsdn, pac, ehooks, &edata, &lead, &trail, &to_leak, &to_salvage, + expand_edata, size, alignment); + + if (!maps_coalesce && result != extent_split_interior_ok + && !opt_retain) { + /* + * Split isn't supported (implies Windows w/o retain). Avoid + * leaking the extent. + */ + assert(to_leak != NULL && lead == NULL && trail == NULL); + extent_deactivate_locked(tsdn, pac, ecache, to_leak); + return NULL; + } + + if (result == extent_split_interior_ok) { + if (lead != NULL) { + extent_deactivate_locked(tsdn, pac, ecache, lead); + } + if (trail != NULL) { + extent_deactivate_locked(tsdn, pac, ecache, trail); + } + return edata; + } else { + /* + * We should have picked an extent that was large enough to + * fulfill our allocation request. + */ + assert(result == extent_split_interior_error); + if (to_salvage != NULL) { + extent_deregister(tsdn, pac, to_salvage); + } + if (to_leak != NULL) { + extent_deregister_no_gdump_sub(tsdn, pac, to_leak); + /* + * May go down the purge path (which assume no ecache + * locks). Only happens with OOM caused split failures. + */ + malloc_mutex_unlock(tsdn, &ecache->mtx); + extents_abandon_vm(tsdn, pac, ehooks, ecache, to_leak, + growing_retained); + malloc_mutex_lock(tsdn, &ecache->mtx); + } + return NULL; + } + unreachable(); +} + +/* + * Tries to satisfy the given allocation request by reusing one of the extents + * in the given ecache_t. + */ +static edata_t * +extent_recycle(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache, + edata_t *expand_edata, size_t size, size_t alignment, bool zero, + bool *commit, bool growing_retained, bool guarded) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, growing_retained ? 1 : 0); + assert(!guarded || expand_edata == NULL); + assert(!guarded || alignment <= PAGE); + + malloc_mutex_lock(tsdn, &ecache->mtx); + + edata_t *edata = extent_recycle_extract(tsdn, pac, ehooks, ecache, + expand_edata, size, alignment, guarded); + if (edata == NULL) { + malloc_mutex_unlock(tsdn, &ecache->mtx); + return NULL; + } + + edata = extent_recycle_split(tsdn, pac, ehooks, ecache, expand_edata, + size, alignment, edata, growing_retained); + malloc_mutex_unlock(tsdn, &ecache->mtx); + if (edata == NULL) { + return NULL; + } + + assert(edata_state_get(edata) == extent_state_active); + if (extent_commit_zero(tsdn, ehooks, edata, *commit, zero, + growing_retained)) { + extent_record(tsdn, pac, ehooks, ecache, edata); + return NULL; + } + if (edata_committed_get(edata)) { + /* + * This reverses the purpose of this variable - previously it + * was treated as an input parameter, now it turns into an + * output parameter, reporting if the edata has actually been + * committed. + */ + *commit = true; + } + return edata; +} + +/* + * If virtual memory is retained, create increasingly larger extents from which + * to split requested extents in order to limit the total number of disjoint + * virtual memory ranges retained by each shard. 
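+ * (exp_grow_init() starts this series at HUGEPAGE and caps it at
+ * SC_LARGE_MAXCLASS.)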
+ */ +static edata_t * +extent_grow_retained(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + size_t size, size_t alignment, bool zero, bool *commit) { + malloc_mutex_assert_owner(tsdn, &pac->grow_mtx); + + size_t alloc_size_min = size + PAGE_CEILING(alignment) - PAGE; + /* Beware size_t wrap-around. */ + if (alloc_size_min < size) { + goto label_err; + } + /* + * Find the next extent size in the series that would be large enough to + * satisfy this request. + */ + size_t alloc_size; + pszind_t exp_grow_skip; + bool err = exp_grow_size_prepare(&pac->exp_grow, alloc_size_min, + &alloc_size, &exp_grow_skip); + if (err) { + goto label_err; + } + + edata_t *edata = edata_cache_get(tsdn, pac->edata_cache); + if (edata == NULL) { + goto label_err; + } + bool zeroed = false; + bool committed = false; + + void *ptr = ehooks_alloc(tsdn, ehooks, NULL, alloc_size, PAGE, &zeroed, + &committed); + + if (ptr == NULL) { + edata_cache_put(tsdn, pac->edata_cache, edata); + goto label_err; + } + + edata_init(edata, ecache_ind_get(&pac->ecache_retained), ptr, + alloc_size, false, SC_NSIZES, extent_sn_next(pac), + extent_state_active, zeroed, committed, EXTENT_PAI_PAC, + EXTENT_IS_HEAD); + + if (extent_register_no_gdump_add(tsdn, pac, edata)) { + edata_cache_put(tsdn, pac->edata_cache, edata); + goto label_err; + } + + if (edata_committed_get(edata)) { + *commit = true; + } + + edata_t *lead; + edata_t *trail; + edata_t *to_leak JEMALLOC_CC_SILENCE_INIT(NULL); + edata_t *to_salvage JEMALLOC_CC_SILENCE_INIT(NULL); + + extent_split_interior_result_t result = extent_split_interior(tsdn, + pac, ehooks, &edata, &lead, &trail, &to_leak, &to_salvage, NULL, + size, alignment); + + if (result == extent_split_interior_ok) { + if (lead != NULL) { + extent_record(tsdn, pac, ehooks, &pac->ecache_retained, + lead); + } + if (trail != NULL) { + extent_record(tsdn, pac, ehooks, &pac->ecache_retained, + trail); + } + } else { + /* + * We should have allocated a sufficiently large extent; the + * cant_alloc case should not occur. + */ + assert(result == extent_split_interior_error); + if (to_salvage != NULL) { + if (config_prof) { + extent_gdump_add(tsdn, to_salvage); + } + extent_record(tsdn, pac, ehooks, &pac->ecache_retained, + to_salvage); + } + if (to_leak != NULL) { + extent_deregister_no_gdump_sub(tsdn, pac, to_leak); + extents_abandon_vm(tsdn, pac, ehooks, + &pac->ecache_retained, to_leak, true); + } + goto label_err; + } + + if (*commit && !edata_committed_get(edata)) { + if (extent_commit_impl(tsdn, ehooks, edata, 0, + edata_size_get(edata), true)) { + extent_record(tsdn, pac, ehooks, + &pac->ecache_retained, edata); + goto label_err; + } + /* A successful commit should return zeroed memory. */ + if (config_debug) { + void *addr = edata_addr_get(edata); + size_t *p = (size_t *)addr; + /* Check the first page only. */ + for (size_t i = 0; i < PAGE / sizeof(size_t); i++) { + assert(p[i] == 0); + } + } + } + + /* + * Increment extent_grow_next if doing so wouldn't exceed the allowed + * range. + */ + /* All opportunities for failure are past. */ + exp_grow_size_commit(&pac->exp_grow, exp_grow_skip); + malloc_mutex_unlock(tsdn, &pac->grow_mtx); + + if (config_prof) { + /* Adjust gdump stats now that extent is final size. 
*/ + extent_gdump_add(tsdn, edata); + } + if (zero && !edata_zeroed_get(edata)) { + ehooks_zero(tsdn, ehooks, edata_base_get(edata), + edata_size_get(edata)); + } + return edata; +label_err: + malloc_mutex_unlock(tsdn, &pac->grow_mtx); + return NULL; +} + +static edata_t * +extent_alloc_retained(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + edata_t *expand_edata, size_t size, size_t alignment, bool zero, + bool *commit, bool guarded) { + assert(size != 0); + assert(alignment != 0); + + malloc_mutex_lock(tsdn, &pac->grow_mtx); + + edata_t *edata = extent_recycle(tsdn, pac, ehooks, + &pac->ecache_retained, expand_edata, size, alignment, zero, commit, + /* growing_retained */ true, guarded); + if (edata != NULL) { + malloc_mutex_unlock(tsdn, &pac->grow_mtx); + if (config_prof) { + extent_gdump_add(tsdn, edata); + } + } else if (opt_retain && expand_edata == NULL && !guarded) { + edata = extent_grow_retained(tsdn, pac, ehooks, size, + alignment, zero, commit); + /* extent_grow_retained() always releases pac->grow_mtx. */ + } else { + malloc_mutex_unlock(tsdn, &pac->grow_mtx); + } + malloc_mutex_assert_not_owner(tsdn, &pac->grow_mtx); + + return edata; +} + +static bool +extent_coalesce(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache, + edata_t *inner, edata_t *outer, bool forward) { + extent_assert_can_coalesce(inner, outer); + eset_remove(&ecache->eset, outer); + + bool err = extent_merge_impl(tsdn, pac, ehooks, + forward ? inner : outer, forward ? outer : inner, + /* holding_core_locks */ true); + if (err) { + extent_deactivate_check_state_locked(tsdn, pac, ecache, outer, + extent_state_merging); + } + + return err; +} + +static edata_t * +extent_try_coalesce_impl(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + ecache_t *ecache, edata_t *edata, bool *coalesced) { + assert(!edata_guarded_get(edata)); + assert(coalesced != NULL); + /* + * We avoid checking / locking inactive neighbors for large size + * classes, since they are eagerly coalesced on deallocation which can + * cause lock contention. + */ + /* + * Continue attempting to coalesce until failure, to protect against + * races with other threads that are thwarted by this one. + */ + bool again; + do { + again = false; + + /* Try to coalesce forward. */ + edata_t *next = emap_try_acquire_edata_neighbor(tsdn, pac->emap, + edata, EXTENT_PAI_PAC, ecache->state, /* forward */ true); + if (next != NULL) { + if (!extent_coalesce(tsdn, pac, ehooks, ecache, edata, + next, true)) { + if (ecache->delay_coalesce) { + /* Do minimal coalescing. */ + *coalesced = true; + return edata; + } + again = true; + } + } + + /* Try to coalesce backward. */ + edata_t *prev = emap_try_acquire_edata_neighbor(tsdn, pac->emap, + edata, EXTENT_PAI_PAC, ecache->state, /* forward */ false); + if (prev != NULL) { + if (!extent_coalesce(tsdn, pac, ehooks, ecache, edata, + prev, false)) { + edata = prev; + if (ecache->delay_coalesce) { + /* Do minimal coalescing. 
*/ + *coalesced = true; + return edata; + } + again = true; + } + } + } while (again); + + if (ecache->delay_coalesce) { + *coalesced = false; + } + return edata; +} + +static edata_t * +extent_try_coalesce(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + ecache_t *ecache, edata_t *edata, bool *coalesced) { + return extent_try_coalesce_impl(tsdn, pac, ehooks, ecache, edata, + coalesced); +} + +static edata_t * +extent_try_coalesce_large(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + ecache_t *ecache, edata_t *edata, bool *coalesced) { + return extent_try_coalesce_impl(tsdn, pac, ehooks, ecache, edata, + coalesced); +} + +/* Purge a single extent to retained / unmapped directly. */ +static void +extent_maximally_purge(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + edata_t *edata) { + size_t extent_size = edata_size_get(edata); + extent_dalloc_wrapper(tsdn, pac, ehooks, edata); + if (config_stats) { + /* Update stats accordingly. */ + LOCKEDINT_MTX_LOCK(tsdn, *pac->stats_mtx); + locked_inc_u64(tsdn, + LOCKEDINT_MTX(*pac->stats_mtx), + &pac->stats->decay_dirty.nmadvise, 1); + locked_inc_u64(tsdn, + LOCKEDINT_MTX(*pac->stats_mtx), + &pac->stats->decay_dirty.purged, + extent_size >> LG_PAGE); + LOCKEDINT_MTX_UNLOCK(tsdn, *pac->stats_mtx); + atomic_fetch_sub_zu(&pac->stats->pac_mapped, extent_size, + ATOMIC_RELAXED); + } +} + +/* + * Does the metadata management portions of putting an unused extent into the + * given ecache_t (coalesces and inserts into the eset). + */ +void +extent_record(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache, + edata_t *edata) { + assert((ecache->state != extent_state_dirty && + ecache->state != extent_state_muzzy) || + !edata_zeroed_get(edata)); + + malloc_mutex_lock(tsdn, &ecache->mtx); + + emap_assert_mapped(tsdn, pac->emap, edata); + + if (edata_guarded_get(edata)) { + goto label_skip_coalesce; + } + if (!ecache->delay_coalesce) { + bool coalesced_unused; + edata = extent_try_coalesce(tsdn, pac, ehooks, ecache, edata, + &coalesced_unused); + } else if (edata_size_get(edata) >= SC_LARGE_MINCLASS) { + assert(ecache == &pac->ecache_dirty); + /* Always coalesce large extents eagerly. */ + bool coalesced; + do { + assert(edata_state_get(edata) == extent_state_active); + edata = extent_try_coalesce_large(tsdn, pac, ehooks, + ecache, edata, &coalesced); + } while (coalesced); + if (edata_size_get(edata) >= + atomic_load_zu(&pac->oversize_threshold, ATOMIC_RELAXED) + && !background_thread_enabled() + && extent_may_force_decay(pac)) { + /* Shortcut to purge the oversize extent eagerly. */ + malloc_mutex_unlock(tsdn, &ecache->mtx); + extent_maximally_purge(tsdn, pac, ehooks, edata); + return; + } + } +label_skip_coalesce: + extent_deactivate_locked(tsdn, pac, ecache, edata); + + malloc_mutex_unlock(tsdn, &ecache->mtx); +} + +void +extent_dalloc_gap(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + edata_t *edata) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + + if (extent_register(tsdn, pac, edata)) { + edata_cache_put(tsdn, pac->edata_cache, edata); + return; + } + extent_dalloc_wrapper(tsdn, pac, ehooks, edata); +} + +static bool +extent_dalloc_wrapper_try(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + edata_t *edata) { + bool err; + + assert(edata_base_get(edata) != NULL); + assert(edata_size_get(edata) != 0); + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + + edata_addr_set(edata, edata_base_get(edata)); + + /* Try to deallocate. 
*/ + err = ehooks_dalloc(tsdn, ehooks, edata_base_get(edata), + edata_size_get(edata), edata_committed_get(edata)); + + if (!err) { + edata_cache_put(tsdn, pac->edata_cache, edata); + } + + return err; +} + +edata_t * +extent_alloc_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + void *new_addr, size_t size, size_t alignment, bool zero, bool *commit, + bool growing_retained) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, growing_retained ? 1 : 0); + + edata_t *edata = edata_cache_get(tsdn, pac->edata_cache); + if (edata == NULL) { + return NULL; + } + size_t palignment = ALIGNMENT_CEILING(alignment, PAGE); + void *addr = ehooks_alloc(tsdn, ehooks, new_addr, size, palignment, + &zero, commit); + if (addr == NULL) { + edata_cache_put(tsdn, pac->edata_cache, edata); + return NULL; + } + edata_init(edata, ecache_ind_get(&pac->ecache_dirty), addr, + size, /* slab */ false, SC_NSIZES, extent_sn_next(pac), + extent_state_active, zero, *commit, EXTENT_PAI_PAC, + opt_retain ? EXTENT_IS_HEAD : EXTENT_NOT_HEAD); + /* + * Retained memory is not counted towards gdump. Only if an extent is + * allocated as a separate mapping, i.e. growing_retained is false, then + * gdump should be updated. + */ + bool gdump_add = !growing_retained; + if (extent_register_impl(tsdn, pac, edata, gdump_add)) { + edata_cache_put(tsdn, pac->edata_cache, edata); + return NULL; + } + + return edata; +} + +void +extent_dalloc_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + edata_t *edata) { + assert(edata_pai_get(edata) == EXTENT_PAI_PAC); + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + + /* Avoid calling the default extent_dalloc unless have to. */ + if (!ehooks_dalloc_will_fail(ehooks)) { + /* Remove guard pages for dalloc / unmap. */ + if (edata_guarded_get(edata)) { + assert(ehooks_are_default(ehooks)); + san_unguard_pages_two_sided(tsdn, ehooks, edata, + pac->emap); + } + /* + * Deregister first to avoid a race with other allocating + * threads, and reregister if deallocation fails. + */ + extent_deregister(tsdn, pac, edata); + if (!extent_dalloc_wrapper_try(tsdn, pac, ehooks, edata)) { + return; + } + extent_reregister(tsdn, pac, edata); + } + + /* Try to decommit; purge if that fails. 
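+ * Decommit and forced purge leave zeroed pages behind; lazy purge
+ * does not, which is what the zeroed flag below records.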
*/ + bool zeroed; + if (!edata_committed_get(edata)) { + zeroed = true; + } else if (!extent_decommit_wrapper(tsdn, ehooks, edata, 0, + edata_size_get(edata))) { + zeroed = true; + } else if (!ehooks_purge_forced(tsdn, ehooks, edata_base_get(edata), + edata_size_get(edata), 0, edata_size_get(edata))) { + zeroed = true; + } else if (edata_state_get(edata) == extent_state_muzzy || + !ehooks_purge_lazy(tsdn, ehooks, edata_base_get(edata), + edata_size_get(edata), 0, edata_size_get(edata))) { + zeroed = false; + } else { + zeroed = false; + } + edata_zeroed_set(edata, zeroed); + + if (config_prof) { + extent_gdump_sub(tsdn, edata); + } + + extent_record(tsdn, pac, ehooks, &pac->ecache_retained, edata); +} + +void +extent_destroy_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + edata_t *edata) { + assert(edata_base_get(edata) != NULL); + assert(edata_size_get(edata) != 0); + extent_state_t state = edata_state_get(edata); + assert(state == extent_state_retained || state == extent_state_active); + assert(emap_edata_is_acquired(tsdn, pac->emap, edata)); + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + + if (edata_guarded_get(edata)) { + assert(opt_retain); + san_unguard_pages_pre_destroy(tsdn, ehooks, edata, pac->emap); + } + edata_addr_set(edata, edata_base_get(edata)); + + /* Try to destroy; silently fail otherwise. */ + ehooks_destroy(tsdn, ehooks, edata_base_get(edata), + edata_size_get(edata), edata_committed_get(edata)); + + edata_cache_put(tsdn, pac->edata_cache, edata); +} + +static bool +extent_commit_impl(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + size_t offset, size_t length, bool growing_retained) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, growing_retained ? 1 : 0); + bool err = ehooks_commit(tsdn, ehooks, edata_base_get(edata), + edata_size_get(edata), offset, length); + edata_committed_set(edata, edata_committed_get(edata) || !err); + return err; +} + +bool +extent_commit_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + size_t offset, size_t length) { + return extent_commit_impl(tsdn, ehooks, edata, offset, length, + /* growing_retained */ false); +} + +static bool +extent_decommit_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + size_t offset, size_t length) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + bool err = ehooks_decommit(tsdn, ehooks, edata_base_get(edata), + edata_size_get(edata), offset, length); + edata_committed_set(edata, edata_committed_get(edata) && err); + return err; +} + +static bool +extent_purge_lazy_impl(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + size_t offset, size_t length, bool growing_retained) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, growing_retained ? 1 : 0); + bool err = ehooks_purge_lazy(tsdn, ehooks, edata_base_get(edata), + edata_size_get(edata), offset, length); + return err; +} + +bool +extent_purge_lazy_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + size_t offset, size_t length) { + return extent_purge_lazy_impl(tsdn, ehooks, edata, offset, + length, false); +} + +static bool +extent_purge_forced_impl(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + size_t offset, size_t length, bool growing_retained) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, growing_retained ? 
1 : 0); + bool err = ehooks_purge_forced(tsdn, ehooks, edata_base_get(edata), + edata_size_get(edata), offset, length); + return err; +} + +bool +extent_purge_forced_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + size_t offset, size_t length) { + return extent_purge_forced_impl(tsdn, ehooks, edata, offset, length, + false); +} + +/* + * Accepts the extent to split, and the characteristics of each side of the + * split. The 'a' parameters go with the 'lead' of the resulting pair of + * extents (the lower addressed portion of the split), and the 'b' parameters go + * with the trail (the higher addressed portion). This makes 'extent' the lead, + * and returns the trail (except in case of error). + */ +static edata_t * +extent_split_impl(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + edata_t *edata, size_t size_a, size_t size_b, bool holding_core_locks) { + assert(edata_size_get(edata) == size_a + size_b); + /* Only the shrink path may split w/o holding core locks. */ + if (holding_core_locks) { + witness_assert_positive_depth_to_rank( + tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE); + } else { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + } + + if (ehooks_split_will_fail(ehooks)) { + return NULL; + } + + edata_t *trail = edata_cache_get(tsdn, pac->edata_cache); + if (trail == NULL) { + goto label_error_a; + } + + edata_init(trail, edata_arena_ind_get(edata), + (void *)((byte_t *)edata_base_get(edata) + size_a), size_b, + /* slab */ false, SC_NSIZES, edata_sn_get(edata), + edata_state_get(edata), edata_zeroed_get(edata), + edata_committed_get(edata), EXTENT_PAI_PAC, EXTENT_NOT_HEAD); + emap_prepare_t prepare; + bool err = emap_split_prepare(tsdn, pac->emap, &prepare, edata, + size_a, trail, size_b); + if (err) { + goto label_error_b; + } + + /* + * No need to acquire trail or edata, because: 1) trail was new (just + * allocated); and 2) edata is either an active allocation (the shrink + * path), or in an acquired state (extracted from the ecache on the + * extent_recycle_split path). + */ + assert(emap_edata_is_acquired(tsdn, pac->emap, edata)); + assert(emap_edata_is_acquired(tsdn, pac->emap, trail)); + + err = ehooks_split(tsdn, ehooks, edata_base_get(edata), size_a + size_b, + size_a, size_b, edata_committed_get(edata)); + + if (err) { + goto label_error_b; + } + + edata_size_set(edata, size_a); + emap_split_commit(tsdn, pac->emap, &prepare, edata, size_a, trail, + size_b); + + return trail; +label_error_b: + edata_cache_put(tsdn, pac->edata_cache, trail); +label_error_a: + return NULL; +} + +edata_t * +extent_split_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, edata_t *edata, + size_t size_a, size_t size_b, bool holding_core_locks) { + return extent_split_impl(tsdn, pac, ehooks, edata, size_a, size_b, + holding_core_locks); +} + +static bool +extent_merge_impl(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, edata_t *a, + edata_t *b, bool holding_core_locks) { + /* Only the expanding path may merge w/o holding ecache locks. 
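+ * (extent_merge_wrapper() is that path; it passes
+ * holding_core_locks == false.)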
*/ + if (holding_core_locks) { + witness_assert_positive_depth_to_rank( + tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE); + } else { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + } + + assert(edata_base_get(a) < edata_base_get(b)); + assert(edata_arena_ind_get(a) == edata_arena_ind_get(b)); + assert(edata_arena_ind_get(a) == ehooks_ind_get(ehooks)); + emap_assert_mapped(tsdn, pac->emap, a); + emap_assert_mapped(tsdn, pac->emap, b); + + bool err = ehooks_merge(tsdn, ehooks, edata_base_get(a), + edata_size_get(a), edata_base_get(b), edata_size_get(b), + edata_committed_get(a)); + + if (err) { + return true; + } + + /* + * The rtree writes must happen while all the relevant elements are + * owned, so the following code uses decomposed helper functions rather + * than extent_{,de}register() to do things in the right order. + */ + emap_prepare_t prepare; + emap_merge_prepare(tsdn, pac->emap, &prepare, a, b); + + assert(edata_state_get(a) == extent_state_active || + edata_state_get(a) == extent_state_merging); + edata_state_set(a, extent_state_active); + edata_size_set(a, edata_size_get(a) + edata_size_get(b)); + edata_sn_set(a, (edata_sn_get(a) < edata_sn_get(b)) ? + edata_sn_get(a) : edata_sn_get(b)); + edata_zeroed_set(a, edata_zeroed_get(a) && edata_zeroed_get(b)); + + emap_merge_commit(tsdn, pac->emap, &prepare, a, b); + + edata_cache_put(tsdn, pac->edata_cache, b); + + return false; +} + +bool +extent_merge_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + edata_t *a, edata_t *b) { + return extent_merge_impl(tsdn, pac, ehooks, a, b, + /* holding_core_locks */ false); +} + +bool +extent_commit_zero(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + bool commit, bool zero, bool growing_retained) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, growing_retained ? 1 : 0); + + if (commit && !edata_committed_get(edata)) { + if (extent_commit_impl(tsdn, ehooks, edata, 0, + edata_size_get(edata), growing_retained)) { + return true; + } + } + if (zero && !edata_zeroed_get(edata)) { + void *addr = edata_base_get(edata); + size_t size = edata_size_get(edata); + ehooks_zero(tsdn, ehooks, addr, size); + } + return false; +} + +bool +extent_boot(void) { + assert(sizeof(slab_data_t) >= sizeof(e_prof_info_t)); + + if (have_dss) { + extent_dss_boot(); + } + + return false; +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/extent_dss.c b/src/duckdb/extension/jemalloc/jemalloc/src/extent_dss.c new file mode 100644 index 000000000..32fb41122 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/extent_dss.c @@ -0,0 +1,280 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/extent_dss.h" +#include "jemalloc/internal/spin.h" + +/******************************************************************************/ +/* Data. */ + +/* NOLINTNEXTLINE(performance-no-int-to-ptr) */ +#define SBRK_INVALID ((void *)-1) + +const char *opt_dss = DSS_DEFAULT; + +const char *const dss_prec_names[] = { + "disabled", + "primary", + "secondary", + "N/A" +}; + +/* + * Current dss precedence default, used when creating new arenas. NB: This is + * stored as unsigned rather than dss_prec_t because in principle there's no + * guarantee that sizeof(dss_prec_t) is the same as sizeof(unsigned), and we use + * atomic operations to synchronize the setting. 
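The comment above describes storing an enum through an atomic unsigned; a minimal sketch of that pattern using C11 stdatomic (jemalloc uses its own atomic_u_t wrappers instead, and the names here are illustrative):

```c
#include <stdatomic.h>
#include <stdio.h>

typedef enum { prec_disabled, prec_primary, prec_secondary } prec_t;

/* Stored as unsigned: no guarantee sizeof(prec_t) == sizeof(unsigned). */
static atomic_uint prec_default = prec_primary;

int
main(void) {
	atomic_store_explicit(&prec_default, (unsigned)prec_secondary,
	    memory_order_release);
	prec_t p = (prec_t)atomic_load_explicit(&prec_default,
	    memory_order_acquire);
	printf("precedence: %u\n", (unsigned)p);	/* prints 2 */
	return 0;
}
```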
+ */ +static atomic_u_t dss_prec_default = ATOMIC_INIT( + (unsigned)DSS_PREC_DEFAULT); + +/* Base address of the DSS. */ +static void *dss_base; +/* Atomic boolean indicating whether a thread is currently extending DSS. */ +static atomic_b_t dss_extending; +/* Atomic boolean indicating whether the DSS is exhausted. */ +static atomic_b_t dss_exhausted; +/* Atomic current upper limit on DSS addresses. */ +static atomic_p_t dss_max; + +/******************************************************************************/ + +static void * +extent_dss_sbrk(intptr_t increment) { +#ifdef JEMALLOC_DSS + return sbrk(increment); +#else + not_implemented(); + return NULL; +#endif +} + +dss_prec_t +extent_dss_prec_get(void) { + dss_prec_t ret; + + if (!have_dss) { + return dss_prec_disabled; + } + ret = (dss_prec_t)atomic_load_u(&dss_prec_default, ATOMIC_ACQUIRE); + return ret; +} + +bool +extent_dss_prec_set(dss_prec_t dss_prec) { + if (!have_dss) { + return (dss_prec != dss_prec_disabled); + } + atomic_store_u(&dss_prec_default, (unsigned)dss_prec, ATOMIC_RELEASE); + return false; +} + +static void +extent_dss_extending_start(void) { + spin_t spinner = SPIN_INITIALIZER; + while (true) { + bool expected = false; + if (atomic_compare_exchange_weak_b(&dss_extending, &expected, + true, ATOMIC_ACQ_REL, ATOMIC_RELAXED)) { + break; + } + spin_adaptive(&spinner); + } +} + +static void +extent_dss_extending_finish(void) { + assert(atomic_load_b(&dss_extending, ATOMIC_RELAXED)); + + atomic_store_b(&dss_extending, false, ATOMIC_RELEASE); +} + +static void * +extent_dss_max_update(void *new_addr) { + /* + * Get the current end of the DSS as max_cur and assure that dss_max is + * up to date. + */ + void *max_cur = extent_dss_sbrk(0); + if (max_cur == SBRK_INVALID) { + return NULL; + } + atomic_store_p(&dss_max, max_cur, ATOMIC_RELEASE); + /* Fixed new_addr can only be supported if it is at the edge of DSS. */ + if (new_addr != NULL && max_cur != new_addr) { + return NULL; + } + return max_cur; +} + +void * +extent_alloc_dss(tsdn_t *tsdn, arena_t *arena, void *new_addr, size_t size, + size_t alignment, bool *zero, bool *commit) { + edata_t *gap; + + cassert(have_dss); + assert(size > 0); + assert(alignment == ALIGNMENT_CEILING(alignment, PAGE)); + + /* + * sbrk() uses a signed increment argument, so take care not to + * interpret a large allocation request as a negative increment. + */ + if ((intptr_t)size < 0) { + return NULL; + } + + gap = edata_cache_get(tsdn, &arena->pa_shard.edata_cache); + if (gap == NULL) { + return NULL; + } + + extent_dss_extending_start(); + if (!atomic_load_b(&dss_exhausted, ATOMIC_ACQUIRE)) { + /* + * The loop is necessary to recover from races with other + * threads that are using the DSS for something other than + * malloc. + */ + while (true) { + void *max_cur = extent_dss_max_update(new_addr); + if (max_cur == NULL) { + goto label_oom; + } + + bool head_state = opt_retain ? EXTENT_IS_HEAD : + EXTENT_NOT_HEAD; + /* + * Compute how much page-aligned gap space (if any) is + * necessary to satisfy alignment. This space can be + * recycled for later use. 
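Worked numbers for the gap computation that follows, assuming 4 KiB pages and a 64 KiB alignment request (the addresses are illustrative):

```c
#include <stdint.h>
#include <stdio.h>

/* Round x up to a multiple of the power-of-two a. */
#define ALIGN_UP(x, a) (((x) + ((uintptr_t)(a) - 1)) & ~((uintptr_t)(a) - 1))

int
main(void) {
	uintptr_t max_cur = 0x601234;	/* current end of the DSS */
	uintptr_t page = 0x1000, alignment = 0x10000;

	uintptr_t gap_addr_page = ALIGN_UP(max_cur, page);      /* 0x602000 */
	uintptr_t ret = ALIGN_UP(gap_addr_page, alignment);     /* 0x610000 */
	uintptr_t gap_size_page = ret - gap_addr_page;          /* 0xe000 */

	printf("gap at %#lx, %#lx bytes recyclable; allocation at %#lx\n",
	    (unsigned long)gap_addr_page, (unsigned long)gap_size_page,
	    (unsigned long)ret);
	return 0;
}
```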
+ */ + void *gap_addr_page = ALIGNMENT_ADDR2CEILING(max_cur, + PAGE); + void *ret = ALIGNMENT_ADDR2CEILING( + gap_addr_page, alignment); + size_t gap_size_page = (uintptr_t)ret - + (uintptr_t)gap_addr_page; + if (gap_size_page != 0) { + edata_init(gap, arena_ind_get(arena), + gap_addr_page, gap_size_page, false, + SC_NSIZES, extent_sn_next( + &arena->pa_shard.pac), + extent_state_active, false, true, + EXTENT_PAI_PAC, head_state); + } + /* + * Compute the address just past the end of the desired + * allocation space. + */ + void *dss_next = (void *)((byte_t *)ret + size); + if ((uintptr_t)ret < (uintptr_t)max_cur || + (uintptr_t)dss_next < (uintptr_t)max_cur) { + goto label_oom; /* Wrap-around. */ + } + /* Compute the increment, including subpage bytes. */ + void *gap_addr_subpage = max_cur; + size_t gap_size_subpage = (uintptr_t)ret - + (uintptr_t)gap_addr_subpage; + intptr_t incr = gap_size_subpage + size; + + assert((uintptr_t)max_cur + incr == (uintptr_t)ret + + size); + + /* Try to allocate. */ + void *dss_prev = extent_dss_sbrk(incr); + if (dss_prev == max_cur) { + /* Success. */ + atomic_store_p(&dss_max, dss_next, + ATOMIC_RELEASE); + extent_dss_extending_finish(); + + if (gap_size_page != 0) { + ehooks_t *ehooks = arena_get_ehooks( + arena); + extent_dalloc_gap(tsdn, + &arena->pa_shard.pac, ehooks, gap); + } else { + edata_cache_put(tsdn, + &arena->pa_shard.edata_cache, gap); + } + if (!*commit) { + *commit = pages_decommit(ret, size); + } + if (*zero && *commit) { + edata_t edata = {0}; + ehooks_t *ehooks = arena_get_ehooks( + arena); + + edata_init(&edata, + arena_ind_get(arena), ret, size, + size, false, SC_NSIZES, + extent_state_active, false, true, + EXTENT_PAI_PAC, head_state); + if (extent_purge_forced_wrapper(tsdn, + ehooks, &edata, 0, size)) { + memset(ret, 0, size); + } + } + return ret; + } + /* + * Failure, whether due to OOM or a race with a raw + * sbrk() call from outside the allocator. + */ + if (dss_prev == SBRK_INVALID) { + /* OOM. 
*/ + atomic_store_b(&dss_exhausted, true, + ATOMIC_RELEASE); + goto label_oom; + } + } + } +label_oom: + extent_dss_extending_finish(); + edata_cache_put(tsdn, &arena->pa_shard.edata_cache, gap); + return NULL; +} + +static bool +extent_in_dss_helper(void *addr, void *max) { + return ((uintptr_t)addr >= (uintptr_t)dss_base && (uintptr_t)addr < + (uintptr_t)max); +} + +bool +extent_in_dss(void *addr) { + cassert(have_dss); + + return extent_in_dss_helper(addr, atomic_load_p(&dss_max, + ATOMIC_ACQUIRE)); +} + +bool +extent_dss_mergeable(void *addr_a, void *addr_b) { + void *max; + + cassert(have_dss); + + if ((uintptr_t)addr_a < (uintptr_t)dss_base && (uintptr_t)addr_b < + (uintptr_t)dss_base) { + return true; + } + + max = atomic_load_p(&dss_max, ATOMIC_ACQUIRE); + return (extent_in_dss_helper(addr_a, max) == + extent_in_dss_helper(addr_b, max)); +} + +void +extent_dss_boot(void) { + cassert(have_dss); + + dss_base = extent_dss_sbrk(0); + atomic_store_b(&dss_extending, false, ATOMIC_RELAXED); + atomic_store_b(&dss_exhausted, dss_base == SBRK_INVALID, ATOMIC_RELAXED); + atomic_store_p(&dss_max, dss_base, ATOMIC_RELAXED); +} + +/******************************************************************************/ diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/extent_mmap.c b/src/duckdb/extension/jemalloc/jemalloc/src/extent_mmap.c new file mode 100644 index 000000000..5f0ee2d24 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/extent_mmap.c @@ -0,0 +1,41 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/extent_mmap.h" + +/******************************************************************************/ +/* Data. */ + +bool opt_retain = +#ifdef JEMALLOC_RETAIN + true +#else + false +#endif + ; + +/******************************************************************************/ + +void * +extent_alloc_mmap(void *new_addr, size_t size, size_t alignment, bool *zero, + bool *commit) { + assert(alignment == ALIGNMENT_CEILING(alignment, PAGE)); + void *ret = pages_map(new_addr, size, alignment, commit); + if (ret == NULL) { + return NULL; + } + assert(ret != NULL); + if (*commit) { + *zero = true; + } + return ret; +} + +bool +extent_dalloc_mmap(void *addr, size_t size) { + if (!opt_retain) { + pages_unmap(addr, size); + } + return opt_retain; +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/fxp.c b/src/duckdb/extension/jemalloc/jemalloc/src/fxp.c new file mode 100644 index 000000000..96585f0a6 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/fxp.c @@ -0,0 +1,124 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/fxp.h" + +static bool +fxp_isdigit(char c) { + return '0' <= c && c <= '9'; +} + +bool +fxp_parse(fxp_t *result, const char *str, char **end) { + /* + * Using malloc_strtoumax in this method isn't as handy as you might + * expect (I tried). In the fractional part, significant leading zeros + * mean that you still need to do your own parsing, now with trickier + * math. In the integer part, the casting (uintmax_t to uint32_t) + * forces more reasoning about bounds than just checking for overflow as + * we parse. + */ + uint32_t integer_part = 0; + + const char *cur = str; + + /* The string must start with a digit or a decimal point. */ + if (*cur != '.' 
&& !fxp_isdigit(*cur)) { + return true; + } + + while ('0' <= *cur && *cur <= '9') { + integer_part *= 10; + integer_part += *cur - '0'; + if (integer_part >= (1U << 16)) { + return true; + } + cur++; + } + + /* + * We've parsed all digits at the beginning of the string, without + * overflow. Either we're done, or there's a fractional part. + */ + if (*cur != '.') { + *result = (integer_part << 16); + if (end != NULL) { + *end = (char *)cur; + } + return false; + } + + /* There's a fractional part. */ + cur++; + if (!fxp_isdigit(*cur)) { + /* Shouldn't end on the decimal point. */ + return true; + } + + /* + * We use a lot of precision for the fractional part, even though we'll + * discard most of it; this lets us get exact values for the important + * special case where the denominator is a small power of 2 (for + * instance, 1/512 == 0.001953125 is exactly representable even with + * only 16 bits of fractional precision). We need to left-shift by 16 + * before dividing so we pick the number of digits to be + * floor(log(2**48)) = 14. + */ + uint64_t fractional_part = 0; + uint64_t frac_div = 1; + for (int i = 0; i < FXP_FRACTIONAL_PART_DIGITS; i++) { + fractional_part *= 10; + frac_div *= 10; + if (fxp_isdigit(*cur)) { + fractional_part += *cur - '0'; + cur++; + } + } + /* + * We only parse the first maxdigits characters, but we can still ignore + * any digits after that. + */ + while (fxp_isdigit(*cur)) { + cur++; + } + + assert(fractional_part < frac_div); + uint32_t fractional_repr = (uint32_t)( + (fractional_part << 16) / frac_div); + + /* Success! */ + *result = (integer_part << 16) + fractional_repr; + if (end != NULL) { + *end = (char *)cur; + } + return false; +} + +void +fxp_print(fxp_t a, char buf[FXP_BUF_SIZE]) { + uint32_t integer_part = fxp_round_down(a); + uint32_t fractional_part = (a & ((1U << 16) - 1)); + + int leading_fraction_zeros = 0; + uint64_t fraction_digits = fractional_part; + for (int i = 0; i < FXP_FRACTIONAL_PART_DIGITS; i++) { + if (fraction_digits < (1U << 16) + && fraction_digits * 10 >= (1U << 16)) { + leading_fraction_zeros = i; + } + fraction_digits *= 10; + } + fraction_digits >>= 16; + while (fraction_digits > 0 && fraction_digits % 10 == 0) { + fraction_digits /= 10; + } + + size_t printed = malloc_snprintf(buf, FXP_BUF_SIZE, "%"FMTu32".", + integer_part); + for (int i = 0; i < leading_fraction_zeros; i++) { + buf[printed] = '0'; + printed++; + } + malloc_snprintf(&buf[printed], FXP_BUF_SIZE - printed, "%"FMTu64, + fraction_digits); +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/hook.c b/src/duckdb/extension/jemalloc/jemalloc/src/hook.c new file mode 100644 index 000000000..77a988d72 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/hook.c @@ -0,0 +1,195 @@ +#include "jemalloc/internal/jemalloc_preamble.h" + +#include "jemalloc/internal/hook.h" + +#include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/seq.h" + +typedef struct hooks_internal_s hooks_internal_t; +struct hooks_internal_s { + hooks_t hooks; + bool in_use; +}; + +seq_define(hooks_internal_t, hooks) + +static atomic_u_t nhooks = ATOMIC_INIT(0); +static seq_hooks_t hooks[HOOK_MAX]; +static malloc_mutex_t hooks_mu; + +bool +hook_boot(void) { + return malloc_mutex_init(&hooks_mu, "hooks", WITNESS_RANK_HOOK, + malloc_mutex_rank_exclusive); +} + +static void * +hook_install_locked(hooks_t *to_install) { + hooks_internal_t hooks_internal; + for (int i = 0; i < HOOK_MAX; i++) { + bool success = 
seq_try_load_hooks(&hooks_internal, &hooks[i]); + /* We hold mu; no concurrent access. */ + assert(success); + if (!hooks_internal.in_use) { + hooks_internal.hooks = *to_install; + hooks_internal.in_use = true; + seq_store_hooks(&hooks[i], &hooks_internal); + atomic_store_u(&nhooks, + atomic_load_u(&nhooks, ATOMIC_RELAXED) + 1, + ATOMIC_RELAXED); + return &hooks[i]; + } + } + return NULL; +} + +void * +hook_install(tsdn_t *tsdn, hooks_t *to_install) { + malloc_mutex_lock(tsdn, &hooks_mu); + void *ret = hook_install_locked(to_install); + if (ret != NULL) { + tsd_global_slow_inc(tsdn); + } + malloc_mutex_unlock(tsdn, &hooks_mu); + return ret; +} + +static void +hook_remove_locked(seq_hooks_t *to_remove) { + hooks_internal_t hooks_internal; + bool success = seq_try_load_hooks(&hooks_internal, to_remove); + /* We hold mu; no concurrent access. */ + assert(success); + /* Should only remove hooks that were added. */ + assert(hooks_internal.in_use); + hooks_internal.in_use = false; + seq_store_hooks(to_remove, &hooks_internal); + atomic_store_u(&nhooks, atomic_load_u(&nhooks, ATOMIC_RELAXED) - 1, + ATOMIC_RELAXED); +} + +void +hook_remove(tsdn_t *tsdn, void *opaque) { + if (config_debug) { + char *hooks_begin = (char *)&hooks[0]; + char *hooks_end = (char *)&hooks[HOOK_MAX]; + char *hook = (char *)opaque; + assert(hooks_begin <= hook && hook < hooks_end + && (hook - hooks_begin) % sizeof(seq_hooks_t) == 0); + } + malloc_mutex_lock(tsdn, &hooks_mu); + hook_remove_locked((seq_hooks_t *)opaque); + tsd_global_slow_dec(tsdn); + malloc_mutex_unlock(tsdn, &hooks_mu); +} + +#define FOR_EACH_HOOK_BEGIN(hooks_internal_ptr) \ +for (int for_each_hook_counter = 0; \ + for_each_hook_counter < HOOK_MAX; \ + for_each_hook_counter++) { \ + bool for_each_hook_success = seq_try_load_hooks( \ + (hooks_internal_ptr), &hooks[for_each_hook_counter]); \ + if (!for_each_hook_success) { \ + continue; \ + } \ + if (!(hooks_internal_ptr)->in_use) { \ + continue; \ + } +#define FOR_EACH_HOOK_END \ +} + +static bool * +hook_reentrantp(void) { + /* + * We prevent user reentrancy within hooks. This is basically just a + * thread-local bool that triggers an early-exit. + * + * We don't fold in_hook into reentrancy. There are two reasons for + * this: + * - Right now, we turn on reentrancy during things like extent hook + * execution. Allocating during extent hooks is not officially + * supported, but we don't want to break it for the time being. These + * sorts of allocations should probably still be hooked, though. + * - If a hook allocates, we may want it to be relatively fast (after + * all, it executes on every allocator operation). Turning on + * reentrancy is a fairly heavyweight mode (disabling tcache, + * redirecting to arena 0, etc.). It's possible we may one day want + * to turn on reentrant mode here, if it proves too difficult to keep + * this working. But that's fairly easy for us to see; OTOH, people + * not using hooks because they're too slow is easy for us to miss. + * + * The tricky part is + * that this code might get invoked even if we don't have access to tsd. + * This function mimics getting a pointer to thread-local data, except + * that it might secretly return a pointer to some global data if we + * know that the caller will take the early-exit path. + * If we return a bool that indicates that we are reentrant, then the + * caller will go down the early exit path, leaving the global + * untouched. 
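A reduced sketch of the guard shape this comment describes, with a C11 _Thread_local standing in for jemalloc's tsd machinery (names here are illustrative):

```c
#include <stdbool.h>
#include <stdio.h>

static _Thread_local bool in_hook_tls = false;
static bool in_hook_global = true;	/* always "reentrant": forces early exit */

static bool *
reentrantp(bool tsd_available) {
	/* Without tsd, hand back the global; the caller will just bail. */
	return tsd_available ? &in_hook_tls : &in_hook_global;
}

static void
invoke_hooks(bool tsd_available) {
	bool *in_hook = reentrantp(tsd_available);
	if (*in_hook) {
		return;		/* already inside a hook, or no tsd */
	}
	*in_hook = true;
	puts("hook body runs exactly once, never reentrantly");
	*in_hook = false;
}

int
main(void) {
	invoke_hooks(true);	/* runs */
	invoke_hooks(false);	/* early exit via the global guard */
	return 0;
}
```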
+ */ + static bool in_hook_global = true; + tsdn_t *tsdn = tsdn_fetch(); + bool *in_hook = tsdn_in_hookp_get(tsdn); + if (in_hook != NULL) { + return in_hook; + } + return &in_hook_global; +} + +#define HOOK_PROLOGUE \ + if (likely(atomic_load_u(&nhooks, ATOMIC_RELAXED) == 0)) { \ + return; \ + } \ + bool *in_hook = hook_reentrantp(); \ + if (*in_hook) { \ + return; \ + } \ + *in_hook = true; + +#define HOOK_EPILOGUE \ + *in_hook = false; + +void +hook_invoke_alloc(hook_alloc_t type, void *result, uintptr_t result_raw, + uintptr_t args_raw[3]) { + HOOK_PROLOGUE + + hooks_internal_t hook; + FOR_EACH_HOOK_BEGIN(&hook) + hook_alloc h = hook.hooks.alloc_hook; + if (h != NULL) { + h(hook.hooks.extra, type, result, result_raw, args_raw); + } + FOR_EACH_HOOK_END + + HOOK_EPILOGUE +} + +void +hook_invoke_dalloc(hook_dalloc_t type, void *address, uintptr_t args_raw[3]) { + HOOK_PROLOGUE + hooks_internal_t hook; + FOR_EACH_HOOK_BEGIN(&hook) + hook_dalloc h = hook.hooks.dalloc_hook; + if (h != NULL) { + h(hook.hooks.extra, type, address, args_raw); + } + FOR_EACH_HOOK_END + HOOK_EPILOGUE +} + +void +hook_invoke_expand(hook_expand_t type, void *address, size_t old_usize, + size_t new_usize, uintptr_t result_raw, uintptr_t args_raw[4]) { + HOOK_PROLOGUE + hooks_internal_t hook; + FOR_EACH_HOOK_BEGIN(&hook) + hook_expand h = hook.hooks.expand_hook; + if (h != NULL) { + h(hook.hooks.extra, type, address, old_usize, new_usize, + result_raw, args_raw); + } + FOR_EACH_HOOK_END + HOOK_EPILOGUE +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/hpa.c b/src/duckdb/extension/jemalloc/jemalloc/src/hpa.c new file mode 100644 index 000000000..49d6b037c --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/hpa.c @@ -0,0 +1,1074 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/hpa.h" + +#include "jemalloc/internal/fb.h" +#include "jemalloc/internal/witness.h" + +#define HPA_EDEN_SIZE (128 * HUGEPAGE) + +static edata_t *hpa_alloc(tsdn_t *tsdn, pai_t *self, size_t size, + size_t alignment, bool zero, bool guarded, bool frequent_reuse, + bool *deferred_work_generated); +static size_t hpa_alloc_batch(tsdn_t *tsdn, pai_t *self, size_t size, + size_t nallocs, edata_list_active_t *results, bool frequent_reuse, + bool *deferred_work_generated); +static bool hpa_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata, + size_t old_size, size_t new_size, bool zero, bool *deferred_work_generated); +static bool hpa_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata, + size_t old_size, size_t new_size, bool *deferred_work_generated); +static void hpa_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata, + bool *deferred_work_generated); +static void hpa_dalloc_batch(tsdn_t *tsdn, pai_t *self, + edata_list_active_t *list, bool *deferred_work_generated); +static uint64_t hpa_time_until_deferred_work(tsdn_t *tsdn, pai_t *self); + +bool +hpa_supported(void) { +#ifdef _WIN32 + /* + * At least until the API and implementation are somewhat settled, we + * don't want to try to debug the VM subsystem on the hardest-to-test + * platform. + */ + return false; +#endif + if (!pages_can_hugify) { + return false; + } + /* + * We fundamentally rely on an address-space-hungry growth strategy for + * hugepages. + */ + if (LG_SIZEOF_PTR != 3) { + return false; + } + /* + * If we couldn't detect the value of HUGEPAGE, HUGEPAGE_PAGES becomes + * this sentinel value -- see the comment in pages.h.
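Concrete numbers behind the two checks around this point, assuming x86-64's 4 KiB pages and 2 MiB hugepages (a sketch; the real constants come from jemalloc's configure-time detection):

```c
#include <stdio.h>

int
main(void) {
	/* LG_SIZEOF_PTR == 3 means sizeof(void *) == 2^3 == 8: a 64-bit build. */
	int lg_sizeof_ptr = sizeof(void *) == 8 ? 3 : 2;

	/* 2 MiB / 4 KiB = 512 pages per hugepage; 1 is the "couldn't detect
	 * HUGEPAGE" sentinel the comment above refers to. */
	size_t page = 4096, hugepage = 2u * 1024 * 1024;
	size_t hugepage_pages = hugepage / page;

	printf("64-bit: %s, HUGEPAGE_PAGES: %zu\n",
	    lg_sizeof_ptr == 3 ? "yes" : "no", hugepage_pages);
	return 0;
}
```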
+ */ + if (HUGEPAGE_PAGES == 1) { + return false; + } + return true; +} + +static void +hpa_do_consistency_checks(hpa_shard_t *shard) { + assert(shard->base != NULL); +} + +bool +hpa_central_init(hpa_central_t *central, base_t *base, const hpa_hooks_t *hooks) { + /* malloc_conf processing should have filtered out these cases. */ + assert(hpa_supported()); + bool err; + err = malloc_mutex_init(¢ral->grow_mtx, "hpa_central_grow", + WITNESS_RANK_HPA_CENTRAL_GROW, malloc_mutex_rank_exclusive); + if (err) { + return true; + } + + central->base = base; + central->eden = NULL; + central->eden_len = 0; + central->age_counter = 0; + central->hooks = *hooks; + return false; +} + +static hpdata_t * +hpa_alloc_ps(tsdn_t *tsdn, hpa_central_t *central) { + return (hpdata_t *)base_alloc(tsdn, central->base, sizeof(hpdata_t), + CACHELINE); +} + +static hpdata_t * +hpa_central_extract(tsdn_t *tsdn, hpa_central_t *central, size_t size, + bool *oom) { + /* Don't yet support big allocations; these should get filtered out. */ + assert(size <= HUGEPAGE); + /* + * Should only try to extract from the central allocator if the local + * shard is exhausted. We should hold the grow_mtx on that shard. + */ + witness_assert_positive_depth_to_rank( + tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_HPA_SHARD_GROW); + + malloc_mutex_lock(tsdn, ¢ral->grow_mtx); + *oom = false; + + hpdata_t *ps = NULL; + + /* Is eden a perfect fit? */ + if (central->eden != NULL && central->eden_len == HUGEPAGE) { + ps = hpa_alloc_ps(tsdn, central); + if (ps == NULL) { + *oom = true; + malloc_mutex_unlock(tsdn, ¢ral->grow_mtx); + return NULL; + } + hpdata_init(ps, central->eden, central->age_counter++); + central->eden = NULL; + central->eden_len = 0; + malloc_mutex_unlock(tsdn, ¢ral->grow_mtx); + return ps; + } + + /* + * We're about to try to allocate from eden by splitting. If eden is + * NULL, we have to allocate it too. Otherwise, we just have to + * allocate an edata_t for the new psset. + */ + if (central->eden == NULL) { + /* + * During development, we're primarily concerned with systems + * with overcommit. Eventually, we should be more careful here. + */ + bool commit = true; + /* Allocate address space, bailing if we fail. */ + void *new_eden = pages_map(NULL, HPA_EDEN_SIZE, HUGEPAGE, + &commit); + if (new_eden == NULL) { + *oom = true; + malloc_mutex_unlock(tsdn, ¢ral->grow_mtx); + return NULL; + } + ps = hpa_alloc_ps(tsdn, central); + if (ps == NULL) { + pages_unmap(new_eden, HPA_EDEN_SIZE); + *oom = true; + malloc_mutex_unlock(tsdn, ¢ral->grow_mtx); + return NULL; + } + central->eden = new_eden; + central->eden_len = HPA_EDEN_SIZE; + } else { + /* Eden is already nonempty; only need an edata for ps. */ + ps = hpa_alloc_ps(tsdn, central); + if (ps == NULL) { + *oom = true; + malloc_mutex_unlock(tsdn, ¢ral->grow_mtx); + return NULL; + } + } + assert(ps != NULL); + assert(central->eden != NULL); + assert(central->eden_len > HUGEPAGE); + assert(central->eden_len % HUGEPAGE == 0); + assert(HUGEPAGE_ADDR2BASE(central->eden) == central->eden); + + hpdata_init(ps, central->eden, central->age_counter++); + + char *eden_char = (char *)central->eden; + eden_char += HUGEPAGE; + central->eden = (void *)eden_char; + central->eden_len -= HUGEPAGE; + + malloc_mutex_unlock(tsdn, ¢ral->grow_mtx); + + return ps; +} + +bool +hpa_shard_init(hpa_shard_t *shard, hpa_central_t *central, emap_t *emap, + base_t *base, edata_cache_t *edata_cache, unsigned ind, + const hpa_shard_opts_t *opts) { + /* malloc_conf processing should have filtered out these cases. 
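The eden bookkeeping above in miniature: each extraction peels one hugepage off the front of the cached address range. A standalone sketch with an illustrative address:

```c
#include <stdio.h>

#define HUGEPAGE (2u * 1024 * 1024)
#define HPA_EDEN_SIZE (128 * (size_t)HUGEPAGE)

int
main(void) {
	/* Freshly mapped eden: 128 hugepages of address space. */
	char *eden = (char *)0x7f0000000000ULL;
	size_t eden_len = HPA_EDEN_SIZE;

	/* One extraction: the new pageslab takes the first hugepage. */
	char *ps_addr = eden;
	eden += HUGEPAGE;
	eden_len -= HUGEPAGE;

	printf("pageslab at %p; eden now %p, %zu hugepages left\n",
	    (void *)ps_addr, (void *)eden, eden_len / HUGEPAGE); /* 127 left */
	return 0;
}
```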
*/ + assert(hpa_supported()); + bool err; + err = malloc_mutex_init(&shard->grow_mtx, "hpa_shard_grow", + WITNESS_RANK_HPA_SHARD_GROW, malloc_mutex_rank_exclusive); + if (err) { + return true; + } + err = malloc_mutex_init(&shard->mtx, "hpa_shard", + WITNESS_RANK_HPA_SHARD, malloc_mutex_rank_exclusive); + if (err) { + return true; + } + + assert(edata_cache != NULL); + shard->central = central; + shard->base = base; + edata_cache_fast_init(&shard->ecf, edata_cache); + psset_init(&shard->psset); + shard->age_counter = 0; + shard->ind = ind; + shard->emap = emap; + + shard->opts = *opts; + + shard->npending_purge = 0; + nstime_init_zero(&shard->last_purge); + + shard->stats.npurge_passes = 0; + shard->stats.npurges = 0; + shard->stats.nhugifies = 0; + shard->stats.ndehugifies = 0; + + /* + * Fill these in last, so that if an hpa_shard gets used despite + * initialization failing, we'll at least crash instead of just + * operating on corrupted data. + */ + shard->pai.alloc = &hpa_alloc; + shard->pai.alloc_batch = &hpa_alloc_batch; + shard->pai.expand = &hpa_expand; + shard->pai.shrink = &hpa_shrink; + shard->pai.dalloc = &hpa_dalloc; + shard->pai.dalloc_batch = &hpa_dalloc_batch; + shard->pai.time_until_deferred_work = &hpa_time_until_deferred_work; + + hpa_do_consistency_checks(shard); + + return false; +} + +/* + * Note that the stats functions here follow the usual stats naming conventions; + * "merge" obtains the stats from some live object instance, while "accum" + * only combines the stats from one stats object to another. Hence the lack of + * locking here. + */ +static void +hpa_shard_nonderived_stats_accum(hpa_shard_nonderived_stats_t *dst, + hpa_shard_nonderived_stats_t *src) { + dst->npurge_passes += src->npurge_passes; + dst->npurges += src->npurges; + dst->nhugifies += src->nhugifies; + dst->ndehugifies += src->ndehugifies; +} + +void +hpa_shard_stats_accum(hpa_shard_stats_t *dst, hpa_shard_stats_t *src) { + psset_stats_accum(&dst->psset_stats, &src->psset_stats); + hpa_shard_nonderived_stats_accum(&dst->nonderived_stats, + &src->nonderived_stats); +} + +void +hpa_shard_stats_merge(tsdn_t *tsdn, hpa_shard_t *shard, + hpa_shard_stats_t *dst) { + hpa_do_consistency_checks(shard); + + malloc_mutex_lock(tsdn, &shard->grow_mtx); + malloc_mutex_lock(tsdn, &shard->mtx); + psset_stats_accum(&dst->psset_stats, &shard->psset.stats); + hpa_shard_nonderived_stats_accum(&dst->nonderived_stats, &shard->stats); + malloc_mutex_unlock(tsdn, &shard->mtx); + malloc_mutex_unlock(tsdn, &shard->grow_mtx); +} + +static bool +hpa_good_hugification_candidate(hpa_shard_t *shard, hpdata_t *ps) { + /* + * Note that this needs to be >= rather than just >, because of the + * important special case in which the hugification threshold is exactly + * HUGEPAGE.
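Worked numbers for the comparison just below: with 4 KiB pages, a 2 MiB hugepage, and an assumed hugification threshold of 95% of a hugepage, a pageslab qualifies once 487 of its 512 pages are active. The >= matters in the edge case where the threshold is exactly HUGEPAGE, so a fully active slab still qualifies:

```c
#include <stdio.h>

int
main(void) {
	size_t page = 4096;
	size_t hugepage = 2u * 1024 * 1024;
	size_t threshold = hugepage / 100 * 95;	/* assumed: 95% of a hugepage */

	/* Smallest nactive satisfying nactive * page >= threshold. */
	size_t min_active = (threshold + page - 1) / page;
	printf("hugify once %zu of %zu pages are active\n",
	    min_active, hugepage / page);	/* 487 of 512 */
	return 0;
}
```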
+ */ + return hpdata_nactive_get(ps) * PAGE + >= shard->opts.hugification_threshold; +} + +static size_t +hpa_adjusted_ndirty(tsdn_t *tsdn, hpa_shard_t *shard) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + return psset_ndirty(&shard->psset) - shard->npending_purge; +} + +static size_t +hpa_ndirty_max(tsdn_t *tsdn, hpa_shard_t *shard) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + if (shard->opts.dirty_mult == (fxp_t)-1) { + return (size_t)-1; + } + return fxp_mul_frac(psset_nactive(&shard->psset), + shard->opts.dirty_mult); +} + +static bool +hpa_hugify_blocked_by_ndirty(tsdn_t *tsdn, hpa_shard_t *shard) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + hpdata_t *to_hugify = psset_pick_hugify(&shard->psset); + if (to_hugify == NULL) { + return false; + } + return hpa_adjusted_ndirty(tsdn, shard) + + hpdata_nretained_get(to_hugify) > hpa_ndirty_max(tsdn, shard); +} + +static bool +hpa_should_purge(tsdn_t *tsdn, hpa_shard_t *shard) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + if (hpa_adjusted_ndirty(tsdn, shard) > hpa_ndirty_max(tsdn, shard)) { + return true; + } + if (hpa_hugify_blocked_by_ndirty(tsdn, shard)) { + return true; + } + return false; +} + +static void +hpa_update_purge_hugify_eligibility(tsdn_t *tsdn, hpa_shard_t *shard, + hpdata_t *ps) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + if (hpdata_changing_state_get(ps)) { + hpdata_purge_allowed_set(ps, false); + hpdata_disallow_hugify(ps); + return; + } + /* + * Hugepages are distinctly costly to purge, so try to avoid it unless + * they're *particularly* full of dirty pages. Eventually, we should + * use a smarter / more dynamic heuristic for situations where we have + * to manually hugify. + * + * In situations where we don't manually hugify, this problem is + * reduced. The "bad" situation we're trying to avoid is one that's + * common in some Linux configurations (where both enabled and defrag + * are set to madvise) that can lead to long latency spikes on the first + * access after a hugification. The ideal policy in such configurations + * is probably time-based for both purging and hugifying; only hugify a + * hugepage if it's met the criteria for some extended period of time, + * and only dehugify it if it's failed to meet the criteria for an + * extended period of time. When background threads are on, we should + * try to take this hit on one of them, as well. + * + * I think the ideal setting is THP always enabled, and defrag set to + * deferred; in that case we don't need any explicit calls on the + * allocator's end at all; we just try to pack allocations in a + * hugepage-friendly manner and let the OS hugify in the background. + */ + hpdata_purge_allowed_set(ps, hpdata_ndirty_get(ps) > 0); + if (hpa_good_hugification_candidate(shard, ps) + && !hpdata_huge_get(ps)) { + nstime_t now; + shard->central->hooks.curtime(&now, /* first_reading */ true); + hpdata_allow_hugify(ps, now); + } + /* + * Once a hugepage has become eligible for hugification, we don't mark + * it as ineligible just because it stops meeting the criteria (this + * could lead to situations where a hugepage that spends most of its + * time meeting the criteria never quite gets hugified if there are + * intervening deallocations). The idea is that the hugification delay + * will allow them to get purged, resetting their "hugify-allowed" bit. + * If they don't get purged, then the hugification isn't hurting and + * might help.
As an exception, we don't hugify hugepages that are now + * empty; it definitely doesn't help there until the hugepage gets + * reused, which is likely not for a while. + */ + if (hpdata_nactive_get(ps) == 0) { + hpdata_disallow_hugify(ps); + } +} + +static bool +hpa_shard_has_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + hpdata_t *to_hugify = psset_pick_hugify(&shard->psset); + return to_hugify != NULL || hpa_should_purge(tsdn, shard); +} + +/* Returns whether or not we purged anything. */ +static bool +hpa_try_purge(tsdn_t *tsdn, hpa_shard_t *shard) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + + /* + * Make sure we respect purge interval setting and don't purge + * too frequently. + */ + if (shard->opts.strict_min_purge_interval) { + uint64_t since_last_purge_ms = shard->central->hooks.ms_since( + &shard->last_purge); + if (since_last_purge_ms < shard->opts.min_purge_interval_ms) { + return false; + } + } + + hpdata_t *to_purge = psset_pick_purge(&shard->psset); + if (to_purge == NULL) { + return false; + } + assert(hpdata_purge_allowed_get(to_purge)); + assert(!hpdata_changing_state_get(to_purge)); + + /* + * Don't let anyone else purge or hugify this page while + * we're purging it (allocations and deallocations are + * OK). + */ + psset_update_begin(&shard->psset, to_purge); + assert(hpdata_alloc_allowed_get(to_purge)); + hpdata_mid_purge_set(to_purge, true); + hpdata_purge_allowed_set(to_purge, false); + hpdata_disallow_hugify(to_purge); + /* + * Unlike with hugification (where concurrent + * allocations are allowed), concurrent allocation out + * of a hugepage being purged is unsafe; we might hand + * out an extent for an allocation and then purge it + * (clearing out user data). + */ + hpdata_alloc_allowed_set(to_purge, false); + psset_update_end(&shard->psset, to_purge); + + /* Gather all the metadata we'll need during the purge. */ + bool dehugify = hpdata_huge_get(to_purge); + hpdata_purge_state_t purge_state; + size_t num_to_purge = hpdata_purge_begin(to_purge, &purge_state); + + shard->npending_purge += num_to_purge; + + malloc_mutex_unlock(tsdn, &shard->mtx); + + /* Actually do the purging, now that the lock is dropped. */ + if (dehugify) { + shard->central->hooks.dehugify(hpdata_addr_get(to_purge), + HUGEPAGE); + } + size_t total_purged = 0; + uint64_t purges_this_pass = 0; + void *purge_addr; + size_t purge_size; + while (hpdata_purge_next(to_purge, &purge_state, &purge_addr, + &purge_size)) { + total_purged += purge_size; + assert(total_purged <= HUGEPAGE); + purges_this_pass++; + shard->central->hooks.purge(purge_addr, purge_size); + } + + malloc_mutex_lock(tsdn, &shard->mtx); + /* The shard updates */ + shard->npending_purge -= num_to_purge; + shard->stats.npurge_passes++; + shard->stats.npurges += purges_this_pass; + shard->central->hooks.curtime(&shard->last_purge, + /* first_reading */ false); + if (dehugify) { + shard->stats.ndehugifies++; + } + + /* The hpdata updates. */ + psset_update_begin(&shard->psset, to_purge); + if (dehugify) { + hpdata_dehugify(to_purge); + } + hpdata_purge_end(to_purge, &purge_state); + hpdata_mid_purge_set(to_purge, false); + + hpdata_alloc_allowed_set(to_purge, true); + hpa_update_purge_hugify_eligibility(tsdn, shard, to_purge); + + psset_update_end(&shard->psset, to_purge); + + return true; +} + +/* Returns whether or not we hugified anything. 
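hpa_try_purge above follows a mark-busy, drop-lock, syscall, relock shape so that the expensive purge work happens without the shard mutex held. A reduced pthreads sketch of that pattern (illustrative names, not the jemalloc API):

```c
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
static bool mid_purge = false;	/* stands in for hpdata_mid_purge_set() */
static size_t npurges = 0;

static void
expensive_purge_syscall(void) {
	/* e.g. madvise(addr, size, MADV_DONTNEED) in a real purge hook. */
}

static void
try_purge(void) {
	pthread_mutex_lock(&mtx);
	if (mid_purge) {
		/* Someone else is already purging; leave the page alone. */
		pthread_mutex_unlock(&mtx);
		return;
	}
	mid_purge = true;	/* publish "busy" while still locked */
	pthread_mutex_unlock(&mtx);

	expensive_purge_syscall();	/* no allocator lock held here */

	pthread_mutex_lock(&mtx);
	npurges++;		/* publish results back under the lock */
	mid_purge = false;
	pthread_mutex_unlock(&mtx);
}

int
main(void) {
	try_purge();
	printf("purge passes: %zu\n", npurges);
	return 0;
}
```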
*/ +static bool +hpa_try_hugify(tsdn_t *tsdn, hpa_shard_t *shard) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + + if (hpa_hugify_blocked_by_ndirty(tsdn, shard)) { + return false; + } + + hpdata_t *to_hugify = psset_pick_hugify(&shard->psset); + if (to_hugify == NULL) { + return false; + } + assert(hpdata_hugify_allowed_get(to_hugify)); + assert(!hpdata_changing_state_get(to_hugify)); + + /* Make sure that it's been hugifiable for long enough. */ + nstime_t time_hugify_allowed = hpdata_time_hugify_allowed(to_hugify); + uint64_t millis = shard->central->hooks.ms_since(&time_hugify_allowed); + if (millis < shard->opts.hugify_delay_ms) { + return false; + } + + /* + * Don't let anyone else purge or hugify this page while + * we're hugifying it (allocations and deallocations are + * OK). + */ + psset_update_begin(&shard->psset, to_hugify); + hpdata_mid_hugify_set(to_hugify, true); + hpdata_purge_allowed_set(to_hugify, false); + hpdata_disallow_hugify(to_hugify); + assert(hpdata_alloc_allowed_get(to_hugify)); + psset_update_end(&shard->psset, to_hugify); + + malloc_mutex_unlock(tsdn, &shard->mtx); + + shard->central->hooks.hugify(hpdata_addr_get(to_hugify), HUGEPAGE); + + malloc_mutex_lock(tsdn, &shard->mtx); + shard->stats.nhugifies++; + + psset_update_begin(&shard->psset, to_hugify); + hpdata_hugify(to_hugify); + hpdata_mid_hugify_set(to_hugify, false); + hpa_update_purge_hugify_eligibility(tsdn, shard, to_hugify); + psset_update_end(&shard->psset, to_hugify); + + return true; +} + +/* + * Execution of deferred work is forced if it's triggered by an explicit + * hpa_shard_do_deferred_work() call. + */ +static void +hpa_shard_maybe_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard, + bool forced) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + if (!forced && shard->opts.deferral_allowed) { + return; + } + /* + * If we're on a background thread, do work so long as there's work to + * be done. Otherwise, bound latency to not be *too* bad by doing at + * most a small fixed number of operations. + */ + bool hugified = false; + bool purged = false; + size_t max_ops = (forced ? (size_t)-1 : 16); + size_t nops = 0; + do { + /* + * Always purge before hugifying, to make sure we get some + * ability to hit our quiescence targets. + */ + purged = false; + while (hpa_should_purge(tsdn, shard) && nops < max_ops) { + purged = hpa_try_purge(tsdn, shard); + if (!purged) { + /* + * It is fine if we couldn't purge, as sometimes + * we try to purge just to unblock + * hugification, but there may be no dirty + * pages at all at the moment.
+ */ + break; + } + nops++; + } + hugified = hpa_try_hugify(tsdn, shard); + if (hugified) { + nops++; + } + malloc_mutex_assert_owner(tsdn, &shard->mtx); + malloc_mutex_assert_owner(tsdn, &shard->mtx); + } while ((hugified || purged) && nops < max_ops); +} + +static edata_t * +hpa_try_alloc_one_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size, + bool *oom) { + bool err; + edata_t *edata = edata_cache_fast_get(tsdn, &shard->ecf); + if (edata == NULL) { + *oom = true; + return NULL; + } + + hpdata_t *ps = psset_pick_alloc(&shard->psset, size); + if (ps == NULL) { + edata_cache_fast_put(tsdn, &shard->ecf, edata); + return NULL; + } + + psset_update_begin(&shard->psset, ps); + + if (hpdata_empty(ps)) { + /* + * If the pageslab used to be empty, treat it as though it's + * brand new for fragmentation-avoidance purposes; what we're + * trying to approximate is the age of the allocations *in* that + * pageslab, and the allocations in the new pageslab are + * definitionally the youngest in this hpa shard. + */ + hpdata_age_set(ps, shard->age_counter++); + } + + void *addr = hpdata_reserve_alloc(ps, size); + edata_init(edata, shard->ind, addr, size, /* slab */ false, + SC_NSIZES, /* sn */ hpdata_age_get(ps), extent_state_active, + /* zeroed */ false, /* committed */ true, EXTENT_PAI_HPA, + EXTENT_NOT_HEAD); + edata_ps_set(edata, ps); + + /* + * This could theoretically be moved outside of the critical section, + * but that introduces the potential for a race. Without the lock, the + * (initially nonempty, since this is the reuse pathway) pageslab we + * allocated out of could become otherwise empty while the lock is + * dropped. This would force us to deal with a pageslab eviction down + * the error pathway, which is a pain. + */ + err = emap_register_boundary(tsdn, shard->emap, edata, + SC_NSIZES, /* slab */ false); + if (err) { + hpdata_unreserve(ps, edata_addr_get(edata), + edata_size_get(edata)); + /* + * We should arguably reset dirty state here, but this would + * require some sort of prepare + commit functionality that's a + * little much to deal with for now. + * + * We don't have a do_deferred_work down this pathway, on the + * principle that we didn't *really* affect shard state (we + * tweaked the stats, but our tweaks weren't really accurate). 
+ */ + psset_update_end(&shard->psset, ps); + edata_cache_fast_put(tsdn, &shard->ecf, edata); + *oom = true; + return NULL; + } + + hpa_update_purge_hugify_eligibility(tsdn, shard, ps); + psset_update_end(&shard->psset, ps); + return edata; +} + +static size_t +hpa_try_alloc_batch_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size, + bool *oom, size_t nallocs, edata_list_active_t *results, + bool *deferred_work_generated) { + malloc_mutex_lock(tsdn, &shard->mtx); + size_t nsuccess = 0; + for (; nsuccess < nallocs; nsuccess++) { + edata_t *edata = hpa_try_alloc_one_no_grow(tsdn, shard, size, + oom); + if (edata == NULL) { + break; + } + edata_list_active_append(results, edata); + } + + hpa_shard_maybe_do_deferred_work(tsdn, shard, /* forced */ false); + *deferred_work_generated = hpa_shard_has_deferred_work(tsdn, shard); + malloc_mutex_unlock(tsdn, &shard->mtx); + return nsuccess; +} + +static size_t +hpa_alloc_batch_psset(tsdn_t *tsdn, hpa_shard_t *shard, size_t size, + size_t nallocs, edata_list_active_t *results, + bool *deferred_work_generated) { + assert(size <= HUGEPAGE); + assert(size <= shard->opts.slab_max_alloc || + size == sz_index2size(sz_size2index(size))); + bool oom = false; + + size_t nsuccess = hpa_try_alloc_batch_no_grow(tsdn, shard, size, &oom, + nallocs, results, deferred_work_generated); + + if (nsuccess == nallocs || oom) { + return nsuccess; + } + + /* + * We didn't OOM, but weren't able to fill everything requested of us; + * try to grow. + */ + malloc_mutex_lock(tsdn, &shard->grow_mtx); + /* + * Check for grow races; maybe some earlier thread expanded the psset + * in between when we dropped the main mutex and grabbed the grow mutex. + */ + nsuccess += hpa_try_alloc_batch_no_grow(tsdn, shard, size, &oom, + nallocs - nsuccess, results, deferred_work_generated); + if (nsuccess == nallocs || oom) { + malloc_mutex_unlock(tsdn, &shard->grow_mtx); + return nsuccess; + } + + /* + * Note that we don't hold shard->mtx here (while growing); + * deallocations (and allocations of smaller sizes) may still succeed + * while we're doing this potentially expensive system call. + */ + hpdata_t *ps = hpa_central_extract(tsdn, shard->central, size, &oom); + if (ps == NULL) { + malloc_mutex_unlock(tsdn, &shard->grow_mtx); + return nsuccess; + } + + /* + * We got the pageslab; allocate from it. This does an unlock followed + * by a lock on the same mutex, and holds the grow mutex while doing + * deferred work, but this is an uncommon path; the simplicity is worth + * it. + */ + malloc_mutex_lock(tsdn, &shard->mtx); + psset_insert(&shard->psset, ps); + malloc_mutex_unlock(tsdn, &shard->mtx); + + nsuccess += hpa_try_alloc_batch_no_grow(tsdn, shard, size, &oom, + nallocs - nsuccess, results, deferred_work_generated); + /* + * Drop grow_mtx before doing deferred work; other threads blocked on it + * should be allowed to proceed while we're working. 
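The lock dance above (fast path, then grow lock, then re-check before actually growing) is the classic grow-race pattern; a reduced sketch under assumed names:

```c
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t grow_mtx = PTHREAD_MUTEX_INITIALIZER;
static int capacity = 0;	/* stands in for the psset's free space */

static bool
try_alloc(void) {
	pthread_mutex_lock(&mtx);
	bool ok = capacity > 0;
	if (ok) {
		capacity--;
	}
	pthread_mutex_unlock(&mtx);
	return ok;
}

static bool
alloc_with_grow(void) {
	if (try_alloc()) {
		return true;
	}
	pthread_mutex_lock(&grow_mtx);
	/* Re-check: another thread may have grown while we waited. */
	if (try_alloc()) {
		pthread_mutex_unlock(&grow_mtx);
		return true;
	}
	/* The expensive grow step, done without holding mtx. */
	pthread_mutex_lock(&mtx);
	capacity += 8;
	pthread_mutex_unlock(&mtx);
	pthread_mutex_unlock(&grow_mtx);
	return try_alloc();
}

int
main(void) {
	printf("%s\n", alloc_with_grow() ? "allocated" : "failed");
	return 0;
}
```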
+ */ + malloc_mutex_unlock(tsdn, &shard->grow_mtx); + + return nsuccess; +} + +static hpa_shard_t * +hpa_from_pai(pai_t *self) { + assert(self->alloc == &hpa_alloc); + assert(self->expand == &hpa_expand); + assert(self->shrink == &hpa_shrink); + assert(self->dalloc == &hpa_dalloc); + return (hpa_shard_t *)self; +} + +static size_t +hpa_alloc_batch(tsdn_t *tsdn, pai_t *self, size_t size, size_t nallocs, + edata_list_active_t *results, bool frequent_reuse, + bool *deferred_work_generated) { + assert(nallocs > 0); + assert((size & PAGE_MASK) == 0); + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + hpa_shard_t *shard = hpa_from_pai(self); + + /* + * frequent_reuse here indicates that this request comes from the arena + * bins, in which case it will be split into slabs, and therefore there + * is no intrinsic slack in the allocation (the entire range of + * allocated size will be accessed). + * + * In this case, bypass the slab_max_alloc limit (if still within the + * huge page size). Internal fragmentation within huge pages is not a + * concern for these requests (again, the full size will be used). + */ + if (!(frequent_reuse && size <= HUGEPAGE) && + (size > shard->opts.slab_max_alloc)) { + return 0; + } + + size_t nsuccess = hpa_alloc_batch_psset(tsdn, shard, size, nallocs, + results, deferred_work_generated); + + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + + /* + * Guard the sanity checks with config_debug because the loop cannot be + * proven non-circular by the compiler, even if everything within the + * loop is optimized away. + */ + if (config_debug) { + edata_t *edata; + ql_foreach(edata, &results->head, ql_link_active) { + emap_assert_mapped(tsdn, shard->emap, edata); + assert(edata_pai_get(edata) == EXTENT_PAI_HPA); + assert(edata_state_get(edata) == extent_state_active); + assert(edata_arena_ind_get(edata) == shard->ind); + assert(edata_szind_get_maybe_invalid(edata) == + SC_NSIZES); + assert(!edata_slab_get(edata)); + assert(edata_committed_get(edata)); + assert(edata_base_get(edata) == edata_addr_get(edata)); + assert(edata_base_get(edata) != NULL); + } + } + return nsuccess; +} + +static edata_t * +hpa_alloc(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment, bool zero, + bool guarded, bool frequent_reuse, bool *deferred_work_generated) { + assert((size & PAGE_MASK) == 0); + assert(!guarded); + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + + /* We don't handle alignment or zeroing for now. */ + if (alignment > PAGE || zero) { + return NULL; + } + /* + * An alloc with alignment == PAGE and zero == false is equivalent to a + * batch alloc of 1. Just do that, so we can share code. + */ + edata_list_active_t results; + edata_list_active_init(&results); + size_t nallocs = hpa_alloc_batch(tsdn, self, size, /* nallocs */ 1, + &results, frequent_reuse, deferred_work_generated); + assert(nallocs == 0 || nallocs == 1); + edata_t *edata = edata_list_active_first(&results); + return edata; +} + +static bool +hpa_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size, + size_t new_size, bool zero, bool *deferred_work_generated) { + /* Expand not yet supported. */ + return true; +} + +static bool +hpa_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata, + size_t old_size, size_t new_size, bool *deferred_work_generated) { + /* Shrink not yet supported.
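The admission check in hpa_alloc_batch above, evaluated on a few sample requests (the slab_max_alloc value is an assumed shard setting):

```c
#include <stdbool.h>
#include <stdio.h>

int
main(void) {
	size_t hugepage = 2u * 1024 * 1024;
	size_t slab_max_alloc = 64 * 1024;	/* assumed shard option */

	struct { size_t size; bool frequent_reuse; } reqs[] = {
		{ 32 * 1024, false },	/* under the limit: served */
		{ 256 * 1024, false },	/* over the limit: rejected */
		{ 256 * 1024, true },	/* over the limit but slab-bound: served */
	};
	for (int i = 0; i < 3; i++) {
		bool reject = !(reqs[i].frequent_reuse &&
		    reqs[i].size <= hugepage) &&
		    reqs[i].size > slab_max_alloc;
		printf("size %6zu, frequent_reuse %d -> %s\n", reqs[i].size,
		    (int)reqs[i].frequent_reuse,
		    reject ? "rejected" : "served");
	}
	return 0;
}
```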
*/ + return true; +} + +static void +hpa_dalloc_prepare_unlocked(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *edata) { + malloc_mutex_assert_not_owner(tsdn, &shard->mtx); + + assert(edata_pai_get(edata) == EXTENT_PAI_HPA); + assert(edata_state_get(edata) == extent_state_active); + assert(edata_arena_ind_get(edata) == shard->ind); + assert(edata_szind_get_maybe_invalid(edata) == SC_NSIZES); + assert(edata_committed_get(edata)); + assert(edata_base_get(edata) != NULL); + + /* + * Another thread shouldn't be trying to touch the metadata of an + * allocation being freed. The one exception is a merge attempt from a + * lower-addressed PAC extent; in this case we have a nominal race on + * the edata metadata bits, but in practice the fact that the PAI bits + * are different will prevent any further access. The race is bad, but + * benign in practice, and the long term plan is to track enough state + * in the rtree to prevent these merge attempts in the first place. + */ + edata_addr_set(edata, edata_base_get(edata)); + edata_zeroed_set(edata, false); + emap_deregister_boundary(tsdn, shard->emap, edata); +} + +static void +hpa_dalloc_locked(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *edata) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + + /* + * Release the metadata early, to avoid having to remember to do it + * while we're also doing tricky purging logic. First, we need to grab + * a few bits of metadata from it. + * + * Note that the shard mutex protects ps's metadata too; it wouldn't be + * correct to try to read most information out of it without the lock. + */ + hpdata_t *ps = edata_ps_get(edata); + /* Currently, all edatas come from pageslabs. */ + assert(ps != NULL); + void *unreserve_addr = edata_addr_get(edata); + size_t unreserve_size = edata_size_get(edata); + edata_cache_fast_put(tsdn, &shard->ecf, edata); + + psset_update_begin(&shard->psset, ps); + hpdata_unreserve(ps, unreserve_addr, unreserve_size); + hpa_update_purge_hugify_eligibility(tsdn, shard, ps); + psset_update_end(&shard->psset, ps); +} + +static void +hpa_dalloc_batch(tsdn_t *tsdn, pai_t *self, edata_list_active_t *list, + bool *deferred_work_generated) { + hpa_shard_t *shard = hpa_from_pai(self); + + edata_t *edata; + ql_foreach(edata, &list->head, ql_link_active) { + hpa_dalloc_prepare_unlocked(tsdn, shard, edata); + } + + malloc_mutex_lock(tsdn, &shard->mtx); + /* Now, remove from the list. */ + while ((edata = edata_list_active_first(list)) != NULL) { + edata_list_active_remove(list, edata); + hpa_dalloc_locked(tsdn, shard, edata); + } + hpa_shard_maybe_do_deferred_work(tsdn, shard, /* forced */ false); + *deferred_work_generated = + hpa_shard_has_deferred_work(tsdn, shard); + + malloc_mutex_unlock(tsdn, &shard->mtx); +} + +static void +hpa_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata, + bool *deferred_work_generated) { + assert(!edata_guarded_get(edata)); + /* Just a dalloc_batch of size 1; this lets us share logic. */ + edata_list_active_t dalloc_list; + edata_list_active_init(&dalloc_list); + edata_list_active_append(&dalloc_list, edata); + hpa_dalloc_batch(tsdn, self, &dalloc_list, deferred_work_generated); +} + +/* + * Calculate time until either purging or hugification ought to happen. + * Called by background threads. 
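hpa_time_until_deferred_work below hands the background thread a sleep time in nanoseconds; the remaining-delay arithmetic, in isolation (the delay value is illustrative):

```c
#include <stdint.h>
#include <stdio.h>

int
main(void) {
	uint64_t hugify_delay_ms = 10 * 1000;	/* assume a 10 s delay option */
	uint64_t since_allowed_ms = 7 * 1000;	/* 7 s already elapsed */

	/* Sleep for the remainder, converted from ms to ns. */
	uint64_t sleep_ns = (hugify_delay_ms - since_allowed_ms) * 1000 * 1000;
	printf("sleep %llu ns (3 s)\n", (unsigned long long)sleep_ns);
	return 0;
}
```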
+ */ +static uint64_t +hpa_time_until_deferred_work(tsdn_t *tsdn, pai_t *self) { + hpa_shard_t *shard = hpa_from_pai(self); + uint64_t time_ns = BACKGROUND_THREAD_DEFERRED_MAX; + + malloc_mutex_lock(tsdn, &shard->mtx); + + hpdata_t *to_hugify = psset_pick_hugify(&shard->psset); + if (to_hugify != NULL) { + nstime_t time_hugify_allowed = + hpdata_time_hugify_allowed(to_hugify); + uint64_t since_hugify_allowed_ms = + shard->central->hooks.ms_since(&time_hugify_allowed); + /* + * If not enough time has passed since hugification was allowed, + * sleep for the rest. + */ + if (since_hugify_allowed_ms < shard->opts.hugify_delay_ms) { + time_ns = shard->opts.hugify_delay_ms - + since_hugify_allowed_ms; + time_ns *= 1000 * 1000; + } else { + malloc_mutex_unlock(tsdn, &shard->mtx); + return BACKGROUND_THREAD_DEFERRED_MIN; + } + } + + if (hpa_should_purge(tsdn, shard)) { + /* + * If we haven't purged before, no need to check interval + * between purges. Simply purge as soon as possible. + */ + if (shard->stats.npurge_passes == 0) { + malloc_mutex_unlock(tsdn, &shard->mtx); + return BACKGROUND_THREAD_DEFERRED_MIN; + } + uint64_t since_last_purge_ms = shard->central->hooks.ms_since( + &shard->last_purge); + + if (since_last_purge_ms < shard->opts.min_purge_interval_ms) { + uint64_t until_purge_ns; + until_purge_ns = shard->opts.min_purge_interval_ms - + since_last_purge_ms; + until_purge_ns *= 1000 * 1000; + + if (until_purge_ns < time_ns) { + time_ns = until_purge_ns; + } + } else { + time_ns = BACKGROUND_THREAD_DEFERRED_MIN; + } + } + malloc_mutex_unlock(tsdn, &shard->mtx); + return time_ns; +} + +void +hpa_shard_disable(tsdn_t *tsdn, hpa_shard_t *shard) { + hpa_do_consistency_checks(shard); + + malloc_mutex_lock(tsdn, &shard->mtx); + edata_cache_fast_disable(tsdn, &shard->ecf); + malloc_mutex_unlock(tsdn, &shard->mtx); +} + +static void +hpa_shard_assert_stats_empty(psset_bin_stats_t *bin_stats) { + assert(bin_stats->npageslabs == 0); + assert(bin_stats->nactive == 0); +} + +static void +hpa_assert_empty(tsdn_t *tsdn, hpa_shard_t *shard, psset_t *psset) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + for (int huge = 0; huge <= 1; huge++) { + hpa_shard_assert_stats_empty(&psset->stats.full_slabs[huge]); + for (pszind_t i = 0; i < PSSET_NPSIZES; i++) { + hpa_shard_assert_stats_empty( + &psset->stats.nonfull_slabs[i][huge]); + } + } +} + +void +hpa_shard_destroy(tsdn_t *tsdn, hpa_shard_t *shard) { + hpa_do_consistency_checks(shard); + /* + * By the time we're here, the arena code should have dalloc'd all the + * active extents, which means we should have eventually evicted + * everything from the psset, so it shouldn't be able to serve even a + * 1-page allocation. + */ + if (config_debug) { + malloc_mutex_lock(tsdn, &shard->mtx); + hpa_assert_empty(tsdn, shard, &shard->psset); + malloc_mutex_unlock(tsdn, &shard->mtx); + } + hpdata_t *ps; + while ((ps = psset_pick_alloc(&shard->psset, PAGE)) != NULL) { + /* There should be no allocations anywhere. 
*/ + assert(hpdata_empty(ps)); + psset_remove(&shard->psset, ps); + shard->central->hooks.unmap(hpdata_addr_get(ps), HUGEPAGE); + } +} + +void +hpa_shard_set_deferral_allowed(tsdn_t *tsdn, hpa_shard_t *shard, + bool deferral_allowed) { + hpa_do_consistency_checks(shard); + + malloc_mutex_lock(tsdn, &shard->mtx); + bool deferral_previously_allowed = shard->opts.deferral_allowed; + shard->opts.deferral_allowed = deferral_allowed; + if (deferral_previously_allowed && !deferral_allowed) { + hpa_shard_maybe_do_deferred_work(tsdn, shard, + /* forced */ true); + } + malloc_mutex_unlock(tsdn, &shard->mtx); +} + +void +hpa_shard_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard) { + hpa_do_consistency_checks(shard); + + malloc_mutex_lock(tsdn, &shard->mtx); + hpa_shard_maybe_do_deferred_work(tsdn, shard, /* forced */ true); + malloc_mutex_unlock(tsdn, &shard->mtx); +} + +void +hpa_shard_prefork3(tsdn_t *tsdn, hpa_shard_t *shard) { + hpa_do_consistency_checks(shard); + + malloc_mutex_prefork(tsdn, &shard->grow_mtx); +} + +void +hpa_shard_prefork4(tsdn_t *tsdn, hpa_shard_t *shard) { + hpa_do_consistency_checks(shard); + + malloc_mutex_prefork(tsdn, &shard->mtx); +} + +void +hpa_shard_postfork_parent(tsdn_t *tsdn, hpa_shard_t *shard) { + hpa_do_consistency_checks(shard); + + malloc_mutex_postfork_parent(tsdn, &shard->grow_mtx); + malloc_mutex_postfork_parent(tsdn, &shard->mtx); +} + +void +hpa_shard_postfork_child(tsdn_t *tsdn, hpa_shard_t *shard) { + hpa_do_consistency_checks(shard); + + malloc_mutex_postfork_child(tsdn, &shard->grow_mtx); + malloc_mutex_postfork_child(tsdn, &shard->mtx); +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/hpa_hooks.c b/src/duckdb/extension/jemalloc/jemalloc/src/hpa_hooks.c new file mode 100644 index 000000000..6048f3821 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/hpa_hooks.c @@ -0,0 +1,63 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/hpa_hooks.h" + +static void *hpa_hooks_map(size_t size); +static void hpa_hooks_unmap(void *ptr, size_t size); +static void hpa_hooks_purge(void *ptr, size_t size); +static void hpa_hooks_hugify(void *ptr, size_t size); +static void hpa_hooks_dehugify(void *ptr, size_t size); +static void hpa_hooks_curtime(nstime_t *r_nstime, bool first_reading); +static uint64_t hpa_hooks_ms_since(nstime_t *past_nstime); + +const hpa_hooks_t hpa_hooks_default = { + &hpa_hooks_map, + &hpa_hooks_unmap, + &hpa_hooks_purge, + &hpa_hooks_hugify, + &hpa_hooks_dehugify, + &hpa_hooks_curtime, + &hpa_hooks_ms_since +}; + +static void * +hpa_hooks_map(size_t size) { + bool commit = true; + return pages_map(NULL, size, HUGEPAGE, &commit); +} + +static void +hpa_hooks_unmap(void *ptr, size_t size) { + pages_unmap(ptr, size); +} + +static void +hpa_hooks_purge(void *ptr, size_t size) { + pages_purge_forced(ptr, size); +} + +static void +hpa_hooks_hugify(void *ptr, size_t size) { + bool err = pages_huge(ptr, size); + (void)err; +} + +static void +hpa_hooks_dehugify(void *ptr, size_t size) { + bool err = pages_nohuge(ptr, size); + (void)err; +} + +static void +hpa_hooks_curtime(nstime_t *r_nstime, bool first_reading) { + if (first_reading) { + nstime_init_zero(r_nstime); + } + nstime_update(r_nstime); +} + +static uint64_t +hpa_hooks_ms_since(nstime_t *past_nstime) { + return nstime_ns_since(past_nstime) / 1000 / 1000; +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/hpdata.c b/src/duckdb/extension/jemalloc/jemalloc/src/hpdata.c new 
file mode 100644 index 000000000..3058eafe8 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/hpdata.c @@ -0,0 +1,325 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/hpdata.h" + +static int +hpdata_age_comp(const hpdata_t *a, const hpdata_t *b) { + uint64_t a_age = hpdata_age_get(a); + uint64_t b_age = hpdata_age_get(b); + /* + * hpdata ages are operation counts in the psset; no two should be the + * same. + */ + assert(a_age != b_age); + return (a_age > b_age) - (a_age < b_age); +} + +ph_gen(, hpdata_age_heap, hpdata_t, age_link, hpdata_age_comp) + +void +hpdata_init(hpdata_t *hpdata, void *addr, uint64_t age) { + hpdata_addr_set(hpdata, addr); + hpdata_age_set(hpdata, age); + hpdata->h_huge = false; + hpdata->h_alloc_allowed = true; + hpdata->h_in_psset_alloc_container = false; + hpdata->h_purge_allowed = false; + hpdata->h_hugify_allowed = false; + hpdata->h_in_psset_hugify_container = false; + hpdata->h_mid_purge = false; + hpdata->h_mid_hugify = false; + hpdata->h_updating = false; + hpdata->h_in_psset = false; + hpdata_longest_free_range_set(hpdata, HUGEPAGE_PAGES); + hpdata->h_nactive = 0; + fb_init(hpdata->active_pages, HUGEPAGE_PAGES); + hpdata->h_ntouched = 0; + fb_init(hpdata->touched_pages, HUGEPAGE_PAGES); + + hpdata_assert_consistent(hpdata); +} + +void * +hpdata_reserve_alloc(hpdata_t *hpdata, size_t sz) { + hpdata_assert_consistent(hpdata); + /* + * This is a metadata change; the hpdata should therefore either not be + * in the psset, or should have explicitly marked itself as being + * mid-update. + */ + assert(!hpdata->h_in_psset || hpdata->h_updating); + assert(hpdata->h_alloc_allowed); + assert((sz & PAGE_MASK) == 0); + size_t npages = sz >> LG_PAGE; + assert(npages <= hpdata_longest_free_range_get(hpdata)); + + size_t result; + + size_t start = 0; + /* + * These are dead stores, but the compiler will issue warnings on them + * since it can't tell statically that found is always true below. + */ + size_t begin = 0; + size_t len = 0; + + size_t largest_unchosen_range = 0; + while (true) { + bool found = fb_urange_iter(hpdata->active_pages, + HUGEPAGE_PAGES, start, &begin, &len); + /* + * A precondition to this function is that hpdata must be able + * to serve the allocation. + */ + assert(found); + assert(len <= hpdata_longest_free_range_get(hpdata)); + if (len >= npages) { + /* + * We use first-fit within the page slabs; this gives + * bounded worst-case fragmentation within a slab. It's + * not necessarily right; we could experiment with + * various other options. + */ + break; + } + if (len > largest_unchosen_range) { + largest_unchosen_range = len; + } + start = begin + len; + } + /* We found a range; remember it. */ + result = begin; + fb_set_range(hpdata->active_pages, HUGEPAGE_PAGES, begin, npages); + hpdata->h_nactive += npages; + + /* + * We might be about to dirty some memory for the first time; update our + * count if so. + */ + size_t new_dirty = fb_ucount(hpdata->touched_pages, HUGEPAGE_PAGES, + result, npages); + fb_set_range(hpdata->touched_pages, HUGEPAGE_PAGES, result, npages); + hpdata->h_ntouched += new_dirty; + + /* + * If we allocated out of a range that was the longest in the hpdata, it + * might be the only one of that size and we'll have to adjust the + * metadata. 
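The scan above is a first-fit search over the active-page bitmap; the same idea on a toy 16-page slab, with plain arrays standing in for jemalloc's fb_* bitmaps:

```c
#include <stdio.h>

int
main(void) {
	/* 1 = active (allocated) page, 0 = free. */
	int active[16] = { 1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0 };
	size_t npages = 3;	/* want a 3-page run */
	size_t begin = 0;

	for (size_t i = 0; i < 16; i++) {
		if (active[i]) {
			begin = i + 1;	/* run broken; restart after i */
		} else if (i - begin + 1 == npages) {
			/* Pages 2-3 are too short; the first fit is page 5. */
			printf("first fit at page %zu\n", begin);
			return 0;
		}
	}
	printf("no fit\n");
	return 0;
}
```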
+ */ + if (len == hpdata_longest_free_range_get(hpdata)) { + start = begin + npages; + while (start < HUGEPAGE_PAGES) { + bool found = fb_urange_iter(hpdata->active_pages, + HUGEPAGE_PAGES, start, &begin, &len); + if (!found) { + break; + } + assert(len <= hpdata_longest_free_range_get(hpdata)); + if (len == hpdata_longest_free_range_get(hpdata)) { + largest_unchosen_range = len; + break; + } + if (len > largest_unchosen_range) { + largest_unchosen_range = len; + } + start = begin + len; + } + hpdata_longest_free_range_set(hpdata, largest_unchosen_range); + } + + hpdata_assert_consistent(hpdata); + return (void *)( + (byte_t *)hpdata_addr_get(hpdata) + (result << LG_PAGE)); +} + +void +hpdata_unreserve(hpdata_t *hpdata, void *addr, size_t sz) { + hpdata_assert_consistent(hpdata); + /* See the comment in reserve. */ + assert(!hpdata->h_in_psset || hpdata->h_updating); + assert(((uintptr_t)addr & PAGE_MASK) == 0); + assert((sz & PAGE_MASK) == 0); + size_t begin = ((uintptr_t)addr - (uintptr_t)hpdata_addr_get(hpdata)) + >> LG_PAGE; + assert(begin < HUGEPAGE_PAGES); + size_t npages = sz >> LG_PAGE; + size_t old_longest_range = hpdata_longest_free_range_get(hpdata); + + fb_unset_range(hpdata->active_pages, HUGEPAGE_PAGES, begin, npages); + /* We might have just created a new, larger range. */ + size_t new_begin = (fb_fls(hpdata->active_pages, HUGEPAGE_PAGES, + begin) + 1); + size_t new_end = fb_ffs(hpdata->active_pages, HUGEPAGE_PAGES, + begin + npages - 1); + size_t new_range_len = new_end - new_begin; + + if (new_range_len > old_longest_range) { + hpdata_longest_free_range_set(hpdata, new_range_len); + } + + hpdata->h_nactive -= npages; + + hpdata_assert_consistent(hpdata); +} + +size_t +hpdata_purge_begin(hpdata_t *hpdata, hpdata_purge_state_t *purge_state) { + hpdata_assert_consistent(hpdata); + /* + * See the comment below; we might purge any inactive extent, so it's + * unsafe for any other thread to turn any inactive extent active while + * we're operating on it. + */ + assert(!hpdata_alloc_allowed_get(hpdata)); + + purge_state->npurged = 0; + purge_state->next_purge_search_begin = 0; + + /* + * Initialize to_purge. + * + * It's possible to end up in situations where two dirty extents are + * separated by a retained extent: + * - 1 page allocated. + * - 1 page allocated. + * - 1 page allocated. + * + * If the middle page is freed and purged, and then the first and third + * pages are freed, and then another purge pass happens, the hpdata + * looks like this: + * - 1 page dirty. + * - 1 page retained. + * - 1 page dirty. + * + * But it's safe to do a single 3-page purge. + * + * We do this by first computing the dirty pages, and then filling in + * any gaps by extending each range in the dirty bitmap until + * the next active page. This purges more pages, but the expensive part + * of purging is the TLB shootdowns, rather than the kernel state + * tracking; doing a little bit more of the latter is fine if it saves + * us from doing some of the former. + */ + + /* + * The dirty pages are those that are touched but not active. Note that + * in a normal-ish case, HUGEPAGE_PAGES is something like 512 and the + * fb_group_t is 64 bits, so this is 64 bytes, spread across 8 + * fb_group_ts.
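The bitmap algebra just described boils down to dirty = touched & ~active; a single-word sketch (illustrative; the vendored code below operates on multi-word bitmaps via the fb_* helpers):

#include <stdint.h>

/* For one 64-page chunk: dirty pages are those touched but no longer active. */
static uint64_t
dirty_mask(uint64_t touched, uint64_t active) {
	return touched & ~active;
}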
+ */ + fb_group_t dirty_pages[FB_NGROUPS(HUGEPAGE_PAGES)]; + fb_init(dirty_pages, HUGEPAGE_PAGES); + fb_bit_not(dirty_pages, hpdata->active_pages, HUGEPAGE_PAGES); + fb_bit_and(dirty_pages, dirty_pages, hpdata->touched_pages, + HUGEPAGE_PAGES); + + fb_init(purge_state->to_purge, HUGEPAGE_PAGES); + size_t next_bit = 0; + while (next_bit < HUGEPAGE_PAGES) { + size_t next_dirty = fb_ffs(dirty_pages, HUGEPAGE_PAGES, + next_bit); + /* Recall that fb_ffs returns nbits if no set bit is found. */ + if (next_dirty == HUGEPAGE_PAGES) { + break; + } + size_t next_active = fb_ffs(hpdata->active_pages, + HUGEPAGE_PAGES, next_dirty); + /* + * Don't purge past the end of the dirty extent, into retained + * pages. This helps the kernel a tiny bit, but honestly it's + * mostly helpful for testing (where we tend to write test cases + * that think in terms of the dirty ranges). + */ + ssize_t last_dirty = fb_fls(dirty_pages, HUGEPAGE_PAGES, + next_active - 1); + assert(last_dirty >= 0); + assert((size_t)last_dirty >= next_dirty); + assert((size_t)last_dirty - next_dirty + 1 <= HUGEPAGE_PAGES); + + fb_set_range(purge_state->to_purge, HUGEPAGE_PAGES, next_dirty, + last_dirty - next_dirty + 1); + next_bit = next_active + 1; + } + + /* We should purge, at least, everything dirty. */ + size_t ndirty = hpdata->h_ntouched - hpdata->h_nactive; + purge_state->ndirty_to_purge = ndirty; + assert(ndirty <= fb_scount( + purge_state->to_purge, HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES)); + assert(ndirty == fb_scount(dirty_pages, HUGEPAGE_PAGES, 0, + HUGEPAGE_PAGES)); + + hpdata_assert_consistent(hpdata); + + return ndirty; +} + +bool +hpdata_purge_next(hpdata_t *hpdata, hpdata_purge_state_t *purge_state, + void **r_purge_addr, size_t *r_purge_size) { + /* + * Note that we don't have a consistency check here; we're accessing + * hpdata without synchronization, and therefore have no right to expect + * a consistent state. + */ + assert(!hpdata_alloc_allowed_get(hpdata)); + + if (purge_state->next_purge_search_begin == HUGEPAGE_PAGES) { + return false; + } + size_t purge_begin; + size_t purge_len; + bool found_range = fb_srange_iter(purge_state->to_purge, HUGEPAGE_PAGES, + purge_state->next_purge_search_begin, &purge_begin, &purge_len); + if (!found_range) { + return false; + } + + *r_purge_addr = (void *)( + (byte_t *)hpdata_addr_get(hpdata) + purge_begin * PAGE); + *r_purge_size = purge_len * PAGE; + + purge_state->next_purge_search_begin = purge_begin + purge_len; + purge_state->npurged += purge_len; + assert(purge_state->npurged <= HUGEPAGE_PAGES); + + return true; +} + +void +hpdata_purge_end(hpdata_t *hpdata, hpdata_purge_state_t *purge_state) { + assert(!hpdata_alloc_allowed_get(hpdata)); + hpdata_assert_consistent(hpdata); + /* See the comment in reserve. 
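Taken together, hpdata_purge_{begin,next,end} form a three-phase protocol; a hypothetical driver might look as follows (the real callers live in hpa.c and hold the shard mutex around the metadata phases; the .purge hook member name is an assumption here, mirroring the hooks.unmap call seen earlier):

/* Sketch of a purge driver; illustrative, not part of the diff. */
static void
purge_all(hpdata_t *ps, const hpa_hooks_t *hooks) {
	hpdata_purge_state_t st;
	void *addr;
	size_t size;

	hpdata_purge_begin(ps, &st);
	while (hpdata_purge_next(ps, &st, &addr, &size)) {
		hooks->purge(addr, size); /* e.g. pages_purge_forced() */
	}
	hpdata_purge_end(ps, &st);
}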
*/ + assert(!hpdata->h_in_psset || hpdata->h_updating); + + assert(purge_state->npurged == fb_scount(purge_state->to_purge, + HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES)); + assert(purge_state->npurged >= purge_state->ndirty_to_purge); + + fb_bit_not(purge_state->to_purge, purge_state->to_purge, + HUGEPAGE_PAGES); + fb_bit_and(hpdata->touched_pages, hpdata->touched_pages, + purge_state->to_purge, HUGEPAGE_PAGES); + assert(hpdata->h_ntouched >= purge_state->ndirty_to_purge); + hpdata->h_ntouched -= purge_state->ndirty_to_purge; + + hpdata_assert_consistent(hpdata); +} + +void +hpdata_hugify(hpdata_t *hpdata) { + hpdata_assert_consistent(hpdata); + hpdata->h_huge = true; + fb_set_range(hpdata->touched_pages, HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES); + hpdata->h_ntouched = HUGEPAGE_PAGES; + hpdata_assert_consistent(hpdata); +} + +void +hpdata_dehugify(hpdata_t *hpdata) { + hpdata_assert_consistent(hpdata); + hpdata->h_huge = false; + hpdata_assert_consistent(hpdata); +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/inspect.c b/src/duckdb/extension/jemalloc/jemalloc/src/inspect.c new file mode 100644 index 000000000..2575b5c1f --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/inspect.c @@ -0,0 +1,78 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" +#include "jemalloc/internal/inspect.h" + +void +inspect_extent_util_stats_get(tsdn_t *tsdn, const void *ptr, size_t *nfree, + size_t *nregs, size_t *size) { + assert(ptr != NULL && nfree != NULL && nregs != NULL && size != NULL); + + const edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global, ptr); + if (unlikely(edata == NULL)) { + *nfree = *nregs = *size = 0; + return; + } + + *size = edata_size_get(edata); + if (!edata_slab_get(edata)) { + *nfree = 0; + *nregs = 1; + } else { + *nfree = edata_nfree_get(edata); + *nregs = bin_infos[edata_szind_get(edata)].nregs; + assert(*nfree <= *nregs); + assert(*nfree * edata_usize_get(edata) <= *size); + } +} + +void +inspect_extent_util_stats_verbose_get(tsdn_t *tsdn, const void *ptr, + size_t *nfree, size_t *nregs, size_t *size, size_t *bin_nfree, + size_t *bin_nregs, void **slabcur_addr) { + assert(ptr != NULL && nfree != NULL && nregs != NULL && size != NULL + && bin_nfree != NULL && bin_nregs != NULL && slabcur_addr != NULL); + + const edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global, ptr); + if (unlikely(edata == NULL)) { + *nfree = *nregs = *size = *bin_nfree = *bin_nregs = 0; + *slabcur_addr = NULL; + return; + } + + *size = edata_size_get(edata); + if (!edata_slab_get(edata)) { + *nfree = *bin_nfree = *bin_nregs = 0; + *nregs = 1; + *slabcur_addr = NULL; + return; + } + + *nfree = edata_nfree_get(edata); + const szind_t szind = edata_szind_get(edata); + *nregs = bin_infos[szind].nregs; + assert(*nfree <= *nregs); + assert(*nfree * edata_usize_get(edata) <= *size); + + arena_t *arena = (arena_t *)atomic_load_p( + &arenas[edata_arena_ind_get(edata)], ATOMIC_RELAXED); + assert(arena != NULL); + const unsigned binshard = edata_binshard_get(edata); + bin_t *bin = arena_get_bin(arena, szind, binshard); + + malloc_mutex_lock(tsdn, &bin->lock); + if (config_stats) { + *bin_nregs = *nregs * bin->stats.curslabs; + assert(*bin_nregs >= bin->stats.curregs); + *bin_nfree = *bin_nregs - bin->stats.curregs; + } else { + *bin_nfree = *bin_nregs = 0; + } + edata_t *slab; + if (bin->slabcur != NULL) { + slab = bin->slabcur; + } else { + slab = edata_heap_first(&bin->slabs_nonfull); + } + *slabcur_addr = slab != NULL ? 
edata_addr_get(slab) : NULL; + malloc_mutex_unlock(tsdn, &bin->lock); +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/jemalloc.c b/src/duckdb/extension/jemalloc/jemalloc/src/jemalloc.c new file mode 100644 index 000000000..292d553aa --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/jemalloc.c @@ -0,0 +1,4444 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/buf_writer.h" +#include "jemalloc/internal/ctl.h" +#include "jemalloc/internal/emap.h" +#include "jemalloc/internal/extent_dss.h" +#include "jemalloc/internal/extent_mmap.h" +#include "jemalloc/internal/fxp.h" +#include "jemalloc/internal/san.h" +#include "jemalloc/internal/hook.h" +#include "jemalloc/internal/jemalloc_internal_types.h" +#include "jemalloc/internal/log.h" +#include "jemalloc/internal/malloc_io.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/nstime.h" +#include "jemalloc/internal/rtree.h" +#include "jemalloc/internal/safety_check.h" +#include "jemalloc/internal/sc.h" +#include "jemalloc/internal/spin.h" +#include "jemalloc/internal/sz.h" +#include "jemalloc/internal/ticker.h" +#include "jemalloc/internal/thread_event.h" +#include "jemalloc/internal/util.h" + +#include "malloc_ncpus.h" + +/******************************************************************************/ +/* Data. */ + +/* Runtime configuration options. */ +#define JE_MALLOC_CONF_BUFFER_SIZE 200 +char JE_MALLOC_CONF_BUFFER[JE_MALLOC_CONF_BUFFER_SIZE]; + +const char *je_malloc_conf +#ifndef _WIN32 + JEMALLOC_ATTR(weak) +#endif + ; +/* + * The usual rule is that the closer to runtime you are, the higher priority + * your configuration settings are (so the jemalloc config options get lower + * priority than the per-binary setting, which gets lower priority than the /etc + * setting, which gets lower priority than the environment settings). + * + * But it's a fairly common use case in some testing environments for a user to + * be able to control the binary, but nothing else (e.g. a performance canary + * uses the production OS and environment variables, but can run any binary in + * those circumstances). For these use cases, it's handy to have an in-binary + * mechanism for overriding environment variable settings, with the idea that if + * the results are positive they get promoted to the official settings, and + * moved from the binary to the environment variable. + * + * We don't actually want this to be widespread, so we'll give it a silly name + * and not mention it in headers or documentation. + */ +const char *je_malloc_conf_2_conf_harder +#ifndef _WIN32 + JEMALLOC_ATTR(weak) +#endif + ; + +const char *opt_malloc_conf_symlink = NULL; +const char *opt_malloc_conf_env_var = NULL; + +bool opt_abort = +#ifdef JEMALLOC_DEBUG + true +#else + false +#endif + ; +bool opt_abort_conf = +#ifdef JEMALLOC_DEBUG + true +#else + false +#endif + ; +/* Intentionally default off, even with debug builds.
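As a usage sketch of the per-binary mechanism described above (option names taken from the handlers later in this file; the je_ symbol prefix is this build's default):

/* In the embedding application, not in jemalloc: compiled-in defaults. */
const char *je_malloc_conf = "abort_conf:true,narenas:2";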
*/ +bool opt_confirm_conf = false; +const char *opt_junk = +#if (defined(JEMALLOC_DEBUG) && defined(JEMALLOC_FILL)) + "true" +#else + "false" +#endif + ; +bool opt_junk_alloc = +#if (defined(JEMALLOC_DEBUG) && defined(JEMALLOC_FILL)) + true +#else + false +#endif + ; +bool opt_junk_free = +#if (defined(JEMALLOC_DEBUG) && defined(JEMALLOC_FILL)) + true +#else + false +#endif + ; +bool opt_trust_madvise = +#ifdef JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS + false +#else + true +#endif + ; + +bool opt_cache_oblivious = +#ifdef JEMALLOC_CACHE_OBLIVIOUS + true +#else + false +#endif + ; + +zero_realloc_action_t opt_zero_realloc_action = +#ifdef JEMALLOC_ZERO_REALLOC_DEFAULT_FREE + zero_realloc_action_free +#else + zero_realloc_action_alloc +#endif + ; + +atomic_zu_t zero_realloc_count = ATOMIC_INIT(0); + +const char *const zero_realloc_mode_names[] = { + "alloc", + "free", + "abort", +}; + +/* + * These are the documented values for junk fill debugging facilities -- see the + * man page. + */ +static const uint8_t junk_alloc_byte = 0xa5; +static const uint8_t junk_free_byte = 0x5a; + +static void default_junk_alloc(void *ptr, size_t usize) { + memset(ptr, junk_alloc_byte, usize); +} + +static void default_junk_free(void *ptr, size_t usize) { + memset(ptr, junk_free_byte, usize); +} + +void (*JET_MUTABLE junk_alloc_callback)(void *ptr, size_t size) = &default_junk_alloc; +void (*JET_MUTABLE junk_free_callback)(void *ptr, size_t size) = &default_junk_free; +void (*JET_MUTABLE invalid_conf_abort)(void) = &abort; + +bool opt_utrace = false; +bool opt_xmalloc = false; +bool opt_experimental_infallible_new = false; +bool opt_zero = false; +unsigned opt_narenas = 0; +static fxp_t opt_narenas_ratio = FXP_INIT_INT(4); + +unsigned ncpus; + +unsigned opt_debug_double_free_max_scan = + SAFETY_CHECK_DOUBLE_FREE_MAX_SCAN_DEFAULT; + +size_t opt_calloc_madvise_threshold = 0; + +/* Protects arenas initialization. */ +static malloc_mutex_t arenas_lock; + +/* The global hpa, and whether it's on. */ +bool opt_hpa = false; +hpa_shard_opts_t opt_hpa_opts = HPA_SHARD_OPTS_DEFAULT; +sec_opts_t opt_hpa_sec_opts = SEC_OPTS_DEFAULT; + +/* + * Arenas that are used to service external requests. Not all elements of the + * arenas array are necessarily used; arenas are created lazily as needed. + * + * arenas[0..narenas_auto) are used for automatic multiplexing of threads and + * arenas. arenas[narenas_auto..narenas_total) are only used if the application + * takes some action to create them and allocate from them. + * + * Points to an arena_t. + */ +JEMALLOC_ALIGNED(CACHELINE) +atomic_p_t arenas[MALLOCX_ARENA_LIMIT]; +static atomic_u_t narenas_total; /* Use narenas_total_*(). */ +/* Below three are read-only after initialization. */ +static arena_t *a0; /* arenas[0]. */ +unsigned narenas_auto; +unsigned manual_arena_base; + +malloc_init_t malloc_init_state = malloc_init_uninitialized; + +/* False should be the common case. Set to true to trigger initialization. */ +bool malloc_slow = true; + +/* When malloc_slow is true, set the corresponding bits for sanity check. */ +enum { + flag_opt_junk_alloc = (1U), + flag_opt_junk_free = (1U << 1), + flag_opt_zero = (1U << 2), + flag_opt_utrace = (1U << 3), + flag_opt_xmalloc = (1U << 4) +}; +static uint8_t malloc_slow_flags; + +#ifdef JEMALLOC_THREADED_INIT +/* Used to let the initializing thread recursively allocate. 
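The two fill bytes above make allocator bugs recognizable at a glance in a debugger; for example (illustrative, assuming junk filling is enabled; inspecting freed memory is undefined behavior and shown only as a debugging observation):

#include <stdlib.h>

static void
junk_demo(void) {
	unsigned char *p = malloc(16);	/* with opt_junk_alloc: p[0] == 0xa5 */
	free(p);			/* with opt_junk_free: bytes become 0x5a */
}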
*/ +# define NO_INITIALIZER ((unsigned long)0) +# define INITIALIZER pthread_self() +# define IS_INITIALIZER (malloc_initializer == pthread_self()) +static pthread_t malloc_initializer = NO_INITIALIZER; +#else +# define NO_INITIALIZER false +# define INITIALIZER true +# define IS_INITIALIZER malloc_initializer +static bool malloc_initializer = NO_INITIALIZER; +#endif + +/* Used to avoid initialization races. */ +#ifdef _WIN32 +#if _WIN32_WINNT >= 0x0600 +static malloc_mutex_t init_lock = SRWLOCK_INIT; +#else +static malloc_mutex_t init_lock; +static bool init_lock_initialized = false; + +JEMALLOC_ATTR(constructor) +static void WINAPI +_init_init_lock(void) { + /* + * If another constructor in the same binary is using mallctl to e.g. + * set up extent hooks, it may end up running before this one, and + * malloc_init_hard will crash trying to lock the uninitialized lock. So + * we force an initialization of the lock in malloc_init_hard as well. + * We don't try to care about atomicity of the accesses to the + * init_lock_initialized boolean, since it really only matters early in + * the process creation, before any separate thread normally starts + * doing anything. + */ + if (!init_lock_initialized) { + malloc_mutex_init(&init_lock, "init", WITNESS_RANK_INIT, + malloc_mutex_rank_exclusive); + } + init_lock_initialized = true; +} + +#ifdef _MSC_VER +# pragma section(".CRT$XCU", read) +JEMALLOC_SECTION(".CRT$XCU") JEMALLOC_ATTR(used) +static const void (WINAPI *init_init_lock)(void) = _init_init_lock; +#endif +#endif +#else +static malloc_mutex_t init_lock = MALLOC_MUTEX_INITIALIZER; +#endif + +typedef struct { + void *p; /* Input pointer (as in realloc(p, s)). */ + size_t s; /* Request size. */ + void *r; /* Result pointer. */ +} malloc_utrace_t; + +#ifdef JEMALLOC_UTRACE +# define UTRACE(a, b, c) do { \ + if (unlikely(opt_utrace)) { \ + int utrace_serrno = errno; \ + malloc_utrace_t ut; \ + ut.p = (a); \ + ut.s = (b); \ + ut.r = (c); \ + UTRACE_CALL(&ut, sizeof(ut)); \ + errno = utrace_serrno; \ + } \ +} while (0) +#else +# define UTRACE(a, b, c) +#endif + +/* Whether we encountered any invalid config options. */ +static bool had_conf_error = false; + +/******************************************************************************/ +/* + * Function prototypes for static functions that are referenced prior to + * definition. + */ + +static bool malloc_init_hard_a0(void); +static bool malloc_init_hard(void); + +/******************************************************************************/ +/* + * Begin miscellaneous support functions. + */ + +JEMALLOC_ALWAYS_INLINE bool +malloc_init_a0(void) { + if (unlikely(malloc_init_state == malloc_init_uninitialized)) { + return malloc_init_hard_a0(); + } + return false; +} + +JEMALLOC_ALWAYS_INLINE bool +malloc_init(void) { + if (unlikely(!malloc_initialized()) && malloc_init_hard()) { + return true; + } + return false; +} + +/* + * The a0*() functions are used instead of i{d,}alloc() in situations that + * cannot tolerate TLS variable access.
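A usage sketch of that bootstrap path (hypothetical caller, e.g. early constructor code that runs before TLS is usable):

/* Illustrative: a0malloc may bootstrap arena 0 via malloc_init_hard_a0(). */
static void
early_boot_example(void) {
	void *p = a0malloc(128);
	if (p != NULL) {
		a0dalloc(p);
	}
}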
+ */ + +static void * +a0ialloc(size_t size, bool zero, bool is_internal) { + if (unlikely(malloc_init_a0())) { + return NULL; + } + + return iallocztm(TSDN_NULL, size, sz_size2index(size), zero, NULL, + is_internal, arena_get(TSDN_NULL, 0, true), true); +} + +static void +a0idalloc(void *ptr, bool is_internal) { + idalloctm(TSDN_NULL, ptr, NULL, NULL, is_internal, true); +} + +void * +a0malloc(size_t size) { + return a0ialloc(size, false, true); +} + +void +a0dalloc(void *ptr) { + a0idalloc(ptr, true); +} + +/* + * FreeBSD's libc uses the bootstrap_*() functions in bootstrap-sensitive + * situations that cannot tolerate TLS variable access (TLS allocation and very + * early internal data structure initialization). + */ + +void * +bootstrap_malloc(size_t size) { + if (unlikely(size == 0)) { + size = 1; + } + + return a0ialloc(size, false, false); +} + +void * +bootstrap_calloc(size_t num, size_t size) { + size_t num_size; + + num_size = num * size; + if (unlikely(num_size == 0)) { + assert(num == 0 || size == 0); + num_size = 1; + } + + return a0ialloc(num_size, true, false); +} + +void +bootstrap_free(void *ptr) { + if (unlikely(ptr == NULL)) { + return; + } + + a0idalloc(ptr, false); +} + +void +arena_set(unsigned ind, arena_t *arena) { + atomic_store_p(&arenas[ind], arena, ATOMIC_RELEASE); +} + +static void +narenas_total_set(unsigned narenas) { + atomic_store_u(&narenas_total, narenas, ATOMIC_RELEASE); +} + +static void +narenas_total_inc(void) { + atomic_fetch_add_u(&narenas_total, 1, ATOMIC_RELEASE); +} + +unsigned +narenas_total_get(void) { + return atomic_load_u(&narenas_total, ATOMIC_ACQUIRE); +} + +/* Create a new arena and insert it into the arenas array at index ind. */ +static arena_t * +arena_init_locked(tsdn_t *tsdn, unsigned ind, const arena_config_t *config) { + arena_t *arena; + + assert(ind <= narenas_total_get()); + if (ind >= MALLOCX_ARENA_LIMIT) { + return NULL; + } + if (ind == narenas_total_get()) { + narenas_total_inc(); + } + + /* + * Another thread may have already initialized arenas[ind] if it's an + * auto arena. + */ + arena = arena_get(tsdn, ind, false); + if (arena != NULL) { + assert(arena_is_auto(arena)); + return arena; + } + + /* Actually initialize the arena. */ + arena = arena_new(tsdn, ind, config); + + return arena; +} + +static void +arena_new_create_background_thread(tsdn_t *tsdn, unsigned ind) { + if (ind == 0) { + return; + } + + if (have_background_thread) { + if (background_thread_create(tsdn_tsd(tsdn), ind)) { + malloc_printf(": error in background thread " + "creation for arena %u. Abort.\n", ind); + abort(); + } + } +} + +arena_t * +arena_init(tsdn_t *tsdn, unsigned ind, const arena_config_t *config) { + arena_t *arena; + + malloc_mutex_lock(tsdn, &arenas_lock); + arena = arena_init_locked(tsdn, ind, config); + malloc_mutex_unlock(tsdn, &arenas_lock); + + arena_new_create_background_thread(tsdn, ind); + + return arena; +} + +static void +arena_bind(tsd_t *tsd, unsigned ind, bool internal) { + arena_t *arena = arena_get(tsd_tsdn(tsd), ind, false); + arena_nthreads_inc(arena, internal); + + if (internal) { + tsd_iarena_set(tsd, arena); + } else { + tsd_arena_set(tsd, arena); + /* + * While shard acts as a random seed, the cast below should + * not make much difference. 
+ */ + uint8_t shard = (uint8_t)atomic_fetch_add_u( + &arena->binshard_next, 1, ATOMIC_RELAXED); + tsd_binshards_t *bins = tsd_binshardsp_get(tsd); + for (unsigned i = 0; i < SC_NBINS; i++) { + assert(bin_infos[i].n_shards > 0 && + bin_infos[i].n_shards <= BIN_SHARDS_MAX); + bins->binshard[i] = shard % bin_infos[i].n_shards; + } + } +} + +void +arena_migrate(tsd_t *tsd, arena_t *oldarena, arena_t *newarena) { + assert(oldarena != NULL); + assert(newarena != NULL); + + arena_nthreads_dec(oldarena, false); + arena_nthreads_inc(newarena, false); + tsd_arena_set(tsd, newarena); + + if (arena_nthreads_get(oldarena, false) == 0) { + /* Purge if the old arena has no associated threads anymore. */ + arena_decay(tsd_tsdn(tsd), oldarena, + /* is_background_thread */ false, /* all */ true); + } +} + +static void +arena_unbind(tsd_t *tsd, unsigned ind, bool internal) { + arena_t *arena; + + arena = arena_get(tsd_tsdn(tsd), ind, false); + arena_nthreads_dec(arena, internal); + + if (internal) { + tsd_iarena_set(tsd, NULL); + } else { + tsd_arena_set(tsd, NULL); + } +} + +/* Slow path, called only by arena_choose(). */ +arena_t * +arena_choose_hard(tsd_t *tsd, bool internal) { + arena_t *ret JEMALLOC_CC_SILENCE_INIT(NULL); + + if (have_percpu_arena && PERCPU_ARENA_ENABLED(opt_percpu_arena)) { + unsigned choose = percpu_arena_choose(); + ret = arena_get(tsd_tsdn(tsd), choose, true); + assert(ret != NULL); + arena_bind(tsd, arena_ind_get(ret), false); + arena_bind(tsd, arena_ind_get(ret), true); + + return ret; + } + + if (narenas_auto > 1) { + unsigned i, j, choose[2], first_null; + bool is_new_arena[2]; + + /* + * Determine binding for both non-internal and internal + * allocation. + * + * choose[0]: For application allocation. + * choose[1]: For internal metadata allocation. + */ + + for (j = 0; j < 2; j++) { + choose[j] = 0; + is_new_arena[j] = false; + } + + first_null = narenas_auto; + malloc_mutex_lock(tsd_tsdn(tsd), &arenas_lock); + assert(arena_get(tsd_tsdn(tsd), 0, false) != NULL); + for (i = 1; i < narenas_auto; i++) { + if (arena_get(tsd_tsdn(tsd), i, false) != NULL) { + /* + * Choose the first arena that has the lowest + * number of threads assigned to it. + */ + for (j = 0; j < 2; j++) { + if (arena_nthreads_get(arena_get( + tsd_tsdn(tsd), i, false), !!j) < + arena_nthreads_get(arena_get( + tsd_tsdn(tsd), choose[j], false), + !!j)) { + choose[j] = i; + } + } + } else if (first_null == narenas_auto) { + /* + * Record the index of the first uninitialized + * arena, in case all extant arenas are in use. + * + * NB: It is possible for there to be + * discontinuities in terms of initialized + * versus uninitialized arenas, due to the + * "thread.arena" mallctl. + */ + first_null = i; + } + } + + for (j = 0; j < 2; j++) { + if (arena_nthreads_get(arena_get(tsd_tsdn(tsd), + choose[j], false), !!j) == 0 || first_null == + narenas_auto) { + /* + * Use an unloaded arena, or the least loaded + * arena if all arenas are already initialized. + */ + if (!!j == internal) { + ret = arena_get(tsd_tsdn(tsd), + choose[j], false); + } + } else { + arena_t *arena; + + /* Initialize a new arena. 
*/ + choose[j] = first_null; + arena = arena_init_locked(tsd_tsdn(tsd), + choose[j], &arena_config_default); + if (arena == NULL) { + malloc_mutex_unlock(tsd_tsdn(tsd), + &arenas_lock); + return NULL; + } + is_new_arena[j] = true; + if (!!j == internal) { + ret = arena; + } + } + arena_bind(tsd, choose[j], !!j); + } + malloc_mutex_unlock(tsd_tsdn(tsd), &arenas_lock); + + for (j = 0; j < 2; j++) { + if (is_new_arena[j]) { + assert(choose[j] > 0); + arena_new_create_background_thread( + tsd_tsdn(tsd), choose[j]); + } + } + + } else { + ret = arena_get(tsd_tsdn(tsd), 0, false); + arena_bind(tsd, 0, false); + arena_bind(tsd, 0, true); + } + + return ret; +} + +void +iarena_cleanup(tsd_t *tsd) { + arena_t *iarena; + + iarena = tsd_iarena_get(tsd); + if (iarena != NULL) { + arena_unbind(tsd, arena_ind_get(iarena), true); + } +} + +void +arena_cleanup(tsd_t *tsd) { + arena_t *arena; + + arena = tsd_arena_get(tsd); + if (arena != NULL) { + arena_unbind(tsd, arena_ind_get(arena), false); + } +} + +static void +stats_print_atexit(void) { + if (config_stats) { + tsdn_t *tsdn; + unsigned narenas, i; + + tsdn = tsdn_fetch(); + + /* + * Merge stats from extant threads. This is racy, since + * individual threads do not lock when recording tcache stats + * events. As a consequence, the final stats may be slightly + * out of date by the time they are reported, if other threads + * continue to allocate. + */ + for (i = 0, narenas = narenas_total_get(); i < narenas; i++) { + arena_t *arena = arena_get(tsdn, i, false); + if (arena != NULL) { + tcache_slow_t *tcache_slow; + + malloc_mutex_lock(tsdn, &arena->tcache_ql_mtx); + ql_foreach(tcache_slow, &arena->tcache_ql, + link) { + tcache_stats_merge(tsdn, + tcache_slow->tcache, arena); + } + malloc_mutex_unlock(tsdn, + &arena->tcache_ql_mtx); + } + } + } + je_malloc_stats_print(NULL, NULL, opt_stats_print_opts); +} + +/* + * Ensure that we don't hold any locks upon entry to or exit from allocator + * code (in a "broad" sense that doesn't count a reentrant allocation as an + * entrance or exit). + */ +JEMALLOC_ALWAYS_INLINE void +check_entry_exit_locking(tsdn_t *tsdn) { + if (!config_debug) { + return; + } + if (tsdn_null(tsdn)) { + return; + } + tsd_t *tsd = tsdn_tsd(tsdn); + /* + * It's possible we hold locks at entry/exit if we're in a nested + * allocation. + */ + int8_t reentrancy_level = tsd_reentrancy_level_get(tsd); + if (reentrancy_level != 0) { + return; + } + witness_assert_lockless(tsdn_witness_tsdp_get(tsdn)); +} + +/* + * End miscellaneous support functions. + */ +/******************************************************************************/ +/* + * Begin initialization functions. + */ + +static char * +jemalloc_getenv(const char *name) { +#ifdef JEMALLOC_FORCE_GETENV + return getenv(name); +#else +# ifdef JEMALLOC_HAVE_SECURE_GETENV + return secure_getenv(name); +# else +# ifdef JEMALLOC_HAVE_ISSETUGID + if (issetugid() != 0) { + return NULL; + } +# endif + return getenv(name); +# endif +#endif +} + +static unsigned +malloc_ncpus(void) { + long result; + +#ifdef _WIN32 + SYSTEM_INFO si; + GetSystemInfo(&si); + result = si.dwNumberOfProcessors; +#elif defined(CPU_COUNT) + /* + * glibc >= 2.6 has the CPU_COUNT macro. + * + * glibc's sysconf() uses isspace(). glibc allocates for the first time + * *before* setting up the isspace tables. Therefore we need a + * different method to get the number of CPUs. + * + * The getaffinity approach is also preferred when only a subset of CPUs + * is available, to avoid using more arenas than necessary. 
+ */ + { +# if defined(__FreeBSD__) || defined(__DragonFly__) + cpuset_t set; +# else + cpu_set_t set; +# endif +# if defined(JEMALLOC_HAVE_SCHED_SETAFFINITY) + sched_getaffinity(0, sizeof(set), &set); +# else + pthread_getaffinity_np(pthread_self(), sizeof(set), &set); +# endif + result = CPU_COUNT(&set); + } +#else + result = sysconf(_SC_NPROCESSORS_ONLN); +#endif + return ((result == -1) ? 1 : (unsigned)result); +} + +/* + * Ensure that the number of CPUs is deterministic, i.e. it is the same based on: + * - sched_getaffinity() + * - _SC_NPROCESSORS_ONLN + * - _SC_NPROCESSORS_CONF + * Otherwise, tricky things are possible with percpu arenas in use. + */ +static bool +malloc_cpu_count_is_deterministic(void) +{ +#ifdef _WIN32 + return true; +#else + long cpu_onln = sysconf(_SC_NPROCESSORS_ONLN); + long cpu_conf = sysconf(_SC_NPROCESSORS_CONF); + if (cpu_onln != cpu_conf) { + return false; + } +# if defined(CPU_COUNT) +# if defined(__FreeBSD__) || defined(__DragonFly__) + cpuset_t set; +# else + cpu_set_t set; +# endif /* __FreeBSD__ */ +# if defined(JEMALLOC_HAVE_SCHED_SETAFFINITY) + sched_getaffinity(0, sizeof(set), &set); +# else /* !JEMALLOC_HAVE_SCHED_SETAFFINITY */ + pthread_getaffinity_np(pthread_self(), sizeof(set), &set); +# endif /* JEMALLOC_HAVE_SCHED_SETAFFINITY */ + long cpu_affinity = CPU_COUNT(&set); + if (cpu_affinity != cpu_conf) { + return false; + } +# endif /* CPU_COUNT */ + return true; +#endif +} + +static void +init_opt_stats_opts(const char *v, size_t vlen, char *dest) { + size_t opts_len = strlen(dest); + assert(opts_len <= stats_print_tot_num_options); + + for (size_t i = 0; i < vlen; i++) { + switch (v[i]) { +#define OPTION(o, v, d, s) case o: break; + STATS_PRINT_OPTIONS +#undef OPTION + default: continue; + } + + if (strchr(dest, v[i]) != NULL) { + /* Ignore repeated. */ + continue; + } + + dest[opts_len++] = v[i]; + dest[opts_len] = '\0'; + assert(opts_len <= stats_print_tot_num_options); + } + assert(opts_len == strlen(dest)); +} + +static void +malloc_conf_format_error(const char *msg, const char *begin, const char *end) { + size_t len = end - begin + 1; + len = len > BUFERROR_BUF ?
BUFERROR_BUF : len; + + malloc_printf(": %s -- %.*s\n", msg, (int)len, begin); +} + +static bool +malloc_conf_next(char const **opts_p, char const **k_p, size_t *klen_p, + char const **v_p, size_t *vlen_p) { + bool accept; + const char *opts = *opts_p; + + *k_p = opts; + + for (accept = false; !accept;) { + switch (*opts) { + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': + case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': + case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': + case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': + case 'Y': case 'Z': + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': + case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': + case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': + case 's': case 't': case 'u': case 'v': case 'w': case 'x': + case 'y': case 'z': + case '0': case '1': case '2': case '3': case '4': case '5': + case '6': case '7': case '8': case '9': + case '_': + opts++; + break; + case ':': + opts++; + *klen_p = (uintptr_t)opts - 1 - (uintptr_t)*k_p; + *v_p = opts; + accept = true; + break; + case '\0': + if (opts != *opts_p) { + malloc_conf_format_error( + "Conf string ends with key", + *opts_p, opts - 1); + had_conf_error = true; + } + return true; + default: + malloc_conf_format_error( + "Malformed conf string", *opts_p, opts); + had_conf_error = true; + return true; + } + } + + for (accept = false; !accept;) { + switch (*opts) { + case ',': + opts++; + /* + * Look ahead one character here, because the next time + * this function is called, it will assume that end of + * input has been cleanly reached if no input remains, + * but we have optimistically already consumed the + * comma if one exists. + */ + if (*opts == '\0') { + malloc_conf_format_error( + "Conf string ends with comma", + *opts_p, opts - 1); + had_conf_error = true; + } + *vlen_p = (uintptr_t)opts - 1 - (uintptr_t)*v_p; + accept = true; + break; + case '\0': + *vlen_p = (uintptr_t)opts - (uintptr_t)*v_p; + accept = true; + break; + default: + opts++; + break; + } + } + + *opts_p = opts; + return false; +} + +static void +malloc_abort_invalid_conf(void) { + assert(opt_abort_conf); + malloc_printf(": Abort (abort_conf:true) on invalid conf " + "value (see above).\n"); + invalid_conf_abort(); +} + +static void +malloc_conf_error(const char *msg, const char *k, size_t klen, const char *v, + size_t vlen) { + malloc_printf(": %s: %.*s:%.*s\n", msg, (int)klen, k, + (int)vlen, v); + /* If abort_conf is set, error out after processing all options. */ + const char *experimental = "experimental_"; + if (strncmp(k, experimental, strlen(experimental)) == 0) { + /* However, tolerate experimental features. */ + return; + } + had_conf_error = true; +} + +static void +malloc_slow_flag_init(void) { + /* + * Combine the runtime options into malloc_slow for fast path. Called + * after processing all the options. + */ + malloc_slow_flags |= (opt_junk_alloc ? flag_opt_junk_alloc : 0) + | (opt_junk_free ? flag_opt_junk_free : 0) + | (opt_zero ? flag_opt_zero : 0) + | (opt_utrace ? flag_opt_utrace : 0) + | (opt_xmalloc ? flag_opt_xmalloc : 0); + + malloc_slow = (malloc_slow_flags != 0); +} + +/* Number of sources for initializing malloc_conf */ +#define MALLOC_CONF_NSOURCES 5 + +static const char * +obtain_malloc_conf(unsigned which_source, char readlink_buf[PATH_MAX + 1]) { + if (config_debug) { + static unsigned read_source = 0; + /* + * Each source should only be read once, to minimize # of + * syscalls on init. 
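For orientation, these are the five sources read below, in parse order (later sources win for a duplicated key, matching the priority comment near the top of this file; summarized here as an editorial note):

/*
 * 0: --with-malloc-conf               (baked in at configure time)
 * 1: je_malloc_conf                   (global variable in the binary)
 * 2: /etc/malloc.conf                 (the symlink's target name is the string)
 * 3: MALLOC_CONF                      (environment variable)
 * 4: je_malloc_conf_2_conf_harder     (binary-level override, parsed last)
 */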
+ */ + assert(read_source == which_source); + read_source++; + } + assert(which_source < MALLOC_CONF_NSOURCES); + + const char *ret; + switch (which_source) { + case 0: + ret = config_malloc_conf; + break; + case 1: + if (je_malloc_conf != NULL) { + /* Use options that were compiled into the program. */ + ret = je_malloc_conf; + } else { + /* No configuration specified. */ + ret = NULL; + } + break; + case 2: { + ssize_t linklen = 0; +#ifndef _WIN32 + int saved_errno = errno; + const char *linkname = +# ifdef JEMALLOC_PREFIX + "/etc/"JEMALLOC_PREFIX"malloc.conf" +# else + "/etc/malloc.conf" +# endif + ; + + /* + * Try to use the contents of the "/etc/malloc.conf" symbolic + * link's name. + */ +#ifndef JEMALLOC_READLINKAT + linklen = readlink(linkname, readlink_buf, PATH_MAX); +#else + linklen = readlinkat(AT_FDCWD, linkname, readlink_buf, PATH_MAX); +#endif + if (linklen == -1) { + /* No configuration specified. */ + linklen = 0; + /* Restore errno. */ + set_errno(saved_errno); + } +#endif + readlink_buf[linklen] = '\0'; + ret = readlink_buf; + break; + } case 3: { + const char *envname = +#ifdef JEMALLOC_PREFIX + JEMALLOC_CPREFIX"MALLOC_CONF" +#else + "MALLOC_CONF" +#endif + ; + + if ((ret = jemalloc_getenv(envname)) != NULL) { + opt_malloc_conf_env_var = ret; + } else { + /* No configuration specified. */ + ret = NULL; + } + break; + } case 4: { + ret = je_malloc_conf_2_conf_harder; + break; + } default: + not_reached(); + ret = NULL; + } + return ret; +} + +static void +validate_hpa_settings(void) { + if (!hpa_supported() || !opt_hpa || opt_hpa_opts.dirty_mult == (fxp_t)-1) { + return; + } + size_t hpa_threshold = fxp_mul_frac(HUGEPAGE, opt_hpa_opts.dirty_mult) + + opt_hpa_opts.hugification_threshold; + if (hpa_threshold > HUGEPAGE) { + return; + } + + had_conf_error = true; + char hpa_dirty_mult[FXP_BUF_SIZE]; + char hugification_threshold[FXP_BUF_SIZE]; + char normalization_message[256] = {0}; + fxp_print(opt_hpa_opts.dirty_mult, hpa_dirty_mult); + fxp_print(fxp_div(FXP_INIT_INT((unsigned) + (opt_hpa_opts.hugification_threshold >> LG_PAGE)), + FXP_INIT_INT(HUGEPAGE_PAGES)), hugification_threshold); + if (!opt_abort_conf) { + char normalized_hugification_threshold[FXP_BUF_SIZE]; + opt_hpa_opts.hugification_threshold += + HUGEPAGE - hpa_threshold; + fxp_print(fxp_div(FXP_INIT_INT((unsigned) + (opt_hpa_opts.hugification_threshold >> LG_PAGE)), + FXP_INIT_INT(HUGEPAGE_PAGES)), + normalized_hugification_threshold); + malloc_snprintf(normalization_message, + sizeof(normalization_message), ": Normalizing " + "HPA settings to avoid pathological behavior, setting " + "hpa_hugification_threshold_ratio: to %s.\n", + normalized_hugification_threshold); + } + malloc_printf( + ": Invalid combination of options " + "hpa_hugification_threshold_ratio: %s and hpa_dirty_mult: %s. 
" + "These values should sum to > 1.0.\n%s", hugification_threshold, + hpa_dirty_mult, normalization_message); +} + +static void +malloc_conf_init_helper(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS], + bool initial_call, const char *opts_cache[MALLOC_CONF_NSOURCES], + char readlink_buf[PATH_MAX + 1]) { + static const char *opts_explain[MALLOC_CONF_NSOURCES] = { + "string specified via --with-malloc-conf", + "string pointed to by the global variable malloc_conf", + "\"name\" of the file referenced by the symbolic link named " + "/etc/malloc.conf", + "value of the environment variable MALLOC_CONF", + "string pointed to by the global variable " + "malloc_conf_2_conf_harder", + }; + unsigned i; + const char *opts, *k, *v; + size_t klen, vlen; + + for (i = 0; i < MALLOC_CONF_NSOURCES; i++) { + /* Get runtime configuration. */ + if (initial_call) { + opts_cache[i] = obtain_malloc_conf(i, readlink_buf); + } + opts = opts_cache[i]; + if (!initial_call && opt_confirm_conf) { + malloc_printf( + ": malloc_conf #%u (%s): \"%s\"\n", + i + 1, opts_explain[i], opts != NULL ? opts : ""); + } + if (opts == NULL) { + continue; + } + + while (*opts != '\0' && !malloc_conf_next(&opts, &k, &klen, &v, + &vlen)) { + +#define CONF_ERROR(msg, k, klen, v, vlen) \ + if (!initial_call) { \ + malloc_conf_error( \ + msg, k, klen, v, vlen); \ + cur_opt_valid = false; \ + } +#define CONF_CONTINUE { \ + if (!initial_call && opt_confirm_conf \ + && cur_opt_valid) { \ + malloc_printf(": -- " \ + "Set conf value: %.*s:%.*s" \ + "\n", (int)klen, k, \ + (int)vlen, v); \ + } \ + continue; \ + } +#define CONF_MATCH(n) \ + (sizeof(n)-1 == klen && strncmp(n, k, klen) == 0) +#define CONF_MATCH_VALUE(n) \ + (sizeof(n)-1 == vlen && strncmp(n, v, vlen) == 0) +#define CONF_HANDLE_BOOL(o, n) \ + if (CONF_MATCH(n)) { \ + if (CONF_MATCH_VALUE("true")) { \ + o = true; \ + } else if (CONF_MATCH_VALUE("false")) { \ + o = false; \ + } else { \ + CONF_ERROR("Invalid conf value",\ + k, klen, v, vlen); \ + } \ + CONF_CONTINUE; \ + } + /* + * One of the CONF_MIN macros below expands, in one of the use points, + * to "unsigned integer < 0", which is always false, triggering the + * GCC -Wtype-limits warning, which we disable here and re-enable below. 
+ */ + JEMALLOC_DIAGNOSTIC_PUSH + JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS + +#define CONF_DONT_CHECK_MIN(um, min) false +#define CONF_CHECK_MIN(um, min) ((um) < (min)) +#define CONF_DONT_CHECK_MAX(um, max) false +#define CONF_CHECK_MAX(um, max) ((um) > (max)) + +#define CONF_VALUE_READ(max_t, result) \ + char *end; \ + set_errno(0); \ + result = (max_t)malloc_strtoumax(v, &end, 0); +#define CONF_VALUE_READ_FAIL() \ + (get_errno() != 0 || (uintptr_t)end - (uintptr_t)v != vlen) + +#define CONF_HANDLE_T(t, max_t, o, n, min, max, check_min, check_max, clip) \ + if (CONF_MATCH(n)) { \ + max_t mv; \ + CONF_VALUE_READ(max_t, mv) \ + if (CONF_VALUE_READ_FAIL()) { \ + CONF_ERROR("Invalid conf value",\ + k, klen, v, vlen); \ + } else if (clip) { \ + if (check_min(mv, (t)(min))) { \ + o = (t)(min); \ + } else if ( \ + check_max(mv, (t)(max))) { \ + o = (t)(max); \ + } else { \ + o = (t)mv; \ + } \ + } else { \ + if (check_min(mv, (t)(min)) || \ + check_max(mv, (t)(max))) { \ + CONF_ERROR( \ + "Out-of-range " \ + "conf value", \ + k, klen, v, vlen); \ + } else { \ + o = (t)mv; \ + } \ + } \ + CONF_CONTINUE; \ + } +#define CONF_HANDLE_T_U(t, o, n, min, max, check_min, check_max, clip) \ + CONF_HANDLE_T(t, uintmax_t, o, n, min, max, check_min, \ + check_max, clip) +#define CONF_HANDLE_T_SIGNED(t, o, n, min, max, check_min, check_max, clip)\ + CONF_HANDLE_T(t, intmax_t, o, n, min, max, check_min, \ + check_max, clip) + +#define CONF_HANDLE_UNSIGNED(o, n, min, max, check_min, check_max, \ + clip) \ + CONF_HANDLE_T_U(unsigned, o, n, min, max, \ + check_min, check_max, clip) +#define CONF_HANDLE_SIZE_T(o, n, min, max, check_min, check_max, clip) \ + CONF_HANDLE_T_U(size_t, o, n, min, max, \ + check_min, check_max, clip) +#define CONF_HANDLE_INT64_T(o, n, min, max, check_min, check_max, clip) \ + CONF_HANDLE_T_SIGNED(int64_t, o, n, min, max, \ + check_min, check_max, clip) +#define CONF_HANDLE_UINT64_T(o, n, min, max, check_min, check_max, clip)\ + CONF_HANDLE_T_U(uint64_t, o, n, min, max, \ + check_min, check_max, clip) +#define CONF_HANDLE_SSIZE_T(o, n, min, max) \ + CONF_HANDLE_T_SIGNED(ssize_t, o, n, min, max, \ + CONF_CHECK_MIN, CONF_CHECK_MAX, false) +#define CONF_HANDLE_CHAR_P(o, n, d) \ + if (CONF_MATCH(n)) { \ + size_t cpylen = (vlen <= \ + sizeof(o)-1) ? 
vlen : \ + sizeof(o)-1; \ + strncpy(o, v, cpylen); \ + o[cpylen] = '\0'; \ + CONF_CONTINUE; \ + } + + bool cur_opt_valid = true; + + CONF_HANDLE_BOOL(opt_confirm_conf, "confirm_conf") + if (initial_call) { + continue; + } + + CONF_HANDLE_BOOL(opt_abort, "abort") + CONF_HANDLE_BOOL(opt_abort_conf, "abort_conf") + CONF_HANDLE_BOOL(opt_cache_oblivious, "cache_oblivious") + CONF_HANDLE_BOOL(opt_trust_madvise, "trust_madvise") + if (strncmp("metadata_thp", k, klen) == 0) { + int m; + bool match = false; + for (m = 0; m < metadata_thp_mode_limit; m++) { + if (strncmp(metadata_thp_mode_names[m], + v, vlen) == 0) { + opt_metadata_thp = m; + match = true; + break; + } + } + if (!match) { + CONF_ERROR("Invalid conf value", + k, klen, v, vlen); + } + CONF_CONTINUE; + } + CONF_HANDLE_BOOL(opt_retain, "retain") + if (strncmp("dss", k, klen) == 0) { + int m; + bool match = false; + for (m = 0; m < dss_prec_limit; m++) { + if (strncmp(dss_prec_names[m], v, vlen) + == 0) { + if (extent_dss_prec_set(m)) { + CONF_ERROR( + "Error setting dss", + k, klen, v, vlen); + } else { + opt_dss = + dss_prec_names[m]; + match = true; + break; + } + } + } + if (!match) { + CONF_ERROR("Invalid conf value", + k, klen, v, vlen); + } + CONF_CONTINUE; + } + if (CONF_MATCH("narenas")) { + if (CONF_MATCH_VALUE("default")) { + opt_narenas = 0; + CONF_CONTINUE; + } else { + CONF_HANDLE_UNSIGNED(opt_narenas, + "narenas", 1, UINT_MAX, + CONF_CHECK_MIN, CONF_DONT_CHECK_MAX, + /* clip */ false) + } + } + if (CONF_MATCH("narenas_ratio")) { + char *end; + bool err = fxp_parse(&opt_narenas_ratio, v, + &end); + if (err || (size_t)(end - v) != vlen) { + CONF_ERROR("Invalid conf value", + k, klen, v, vlen); + } + CONF_CONTINUE; + } + if (CONF_MATCH("bin_shards")) { + const char *bin_shards_segment_cur = v; + size_t vlen_left = vlen; + do { + size_t size_start; + size_t size_end; + size_t nshards; + bool err = multi_setting_parse_next( + &bin_shards_segment_cur, &vlen_left, + &size_start, &size_end, &nshards); + if (err || bin_update_shard_size( + bin_shard_sizes, size_start, + size_end, nshards)) { + CONF_ERROR( + "Invalid settings for " + "bin_shards", k, klen, v, + vlen); + break; + } + } while (vlen_left > 0); + CONF_CONTINUE; + } + CONF_HANDLE_SIZE_T(opt_bin_info_max_batched_size, + "max_batched_size", 0, SIZE_T_MAX, + CONF_DONT_CHECK_MIN, CONF_DONT_CHECK_MAX, + /* clip */ true) + CONF_HANDLE_SIZE_T(opt_bin_info_remote_free_max_batch, + "remote_free_max_batch", 0, + BIN_REMOTE_FREE_ELEMS_MAX, + CONF_DONT_CHECK_MIN, CONF_CHECK_MAX, + /* clip */ true) + CONF_HANDLE_SIZE_T(opt_bin_info_remote_free_max, + "remote_free_max", 0, + BIN_REMOTE_FREE_ELEMS_MAX, + CONF_DONT_CHECK_MIN, CONF_CHECK_MAX, + /* clip */ true) + + if (CONF_MATCH("tcache_ncached_max")) { + bool err = tcache_bin_info_default_init( + v, vlen); + if (err) { + CONF_ERROR("Invalid settings for " + "tcache_ncached_max", k, klen, v, + vlen); + } + CONF_CONTINUE; + } + CONF_HANDLE_INT64_T(opt_mutex_max_spin, + "mutex_max_spin", -1, INT64_MAX, CONF_CHECK_MIN, + CONF_DONT_CHECK_MAX, false); + CONF_HANDLE_SSIZE_T(opt_dirty_decay_ms, + "dirty_decay_ms", -1, NSTIME_SEC_MAX * KQU(1000) < + QU(SSIZE_MAX) ? NSTIME_SEC_MAX * KQU(1000) : + SSIZE_MAX); + CONF_HANDLE_SSIZE_T(opt_muzzy_decay_ms, + "muzzy_decay_ms", -1, NSTIME_SEC_MAX * KQU(1000) < + QU(SSIZE_MAX) ? 
NSTIME_SEC_MAX * KQU(1000) : + SSIZE_MAX); + CONF_HANDLE_BOOL(opt_stats_print, "stats_print") + if (CONF_MATCH("stats_print_opts")) { + init_opt_stats_opts(v, vlen, + opt_stats_print_opts); + CONF_CONTINUE; + } + CONF_HANDLE_INT64_T(opt_stats_interval, + "stats_interval", -1, INT64_MAX, + CONF_CHECK_MIN, CONF_DONT_CHECK_MAX, false) + if (CONF_MATCH("stats_interval_opts")) { + init_opt_stats_opts(v, vlen, + opt_stats_interval_opts); + CONF_CONTINUE; + } + if (config_fill) { + if (CONF_MATCH("junk")) { + if (CONF_MATCH_VALUE("true")) { + opt_junk = "true"; + opt_junk_alloc = opt_junk_free = + true; + } else if (CONF_MATCH_VALUE("false")) { + opt_junk = "false"; + opt_junk_alloc = opt_junk_free = + false; + } else if (CONF_MATCH_VALUE("alloc")) { + opt_junk = "alloc"; + opt_junk_alloc = true; + opt_junk_free = false; + } else if (CONF_MATCH_VALUE("free")) { + opt_junk = "free"; + opt_junk_alloc = false; + opt_junk_free = true; + } else { + CONF_ERROR( + "Invalid conf value", + k, klen, v, vlen); + } + CONF_CONTINUE; + } + CONF_HANDLE_BOOL(opt_zero, "zero") + } + if (config_utrace) { + CONF_HANDLE_BOOL(opt_utrace, "utrace") + } + if (config_xmalloc) { + CONF_HANDLE_BOOL(opt_xmalloc, "xmalloc") + } + if (config_enable_cxx) { + CONF_HANDLE_BOOL( + opt_experimental_infallible_new, + "experimental_infallible_new") + } + + CONF_HANDLE_BOOL(opt_tcache, "tcache") + CONF_HANDLE_SIZE_T(opt_tcache_max, "tcache_max", + 0, TCACHE_MAXCLASS_LIMIT, CONF_DONT_CHECK_MIN, + CONF_CHECK_MAX, /* clip */ true) + if (CONF_MATCH("lg_tcache_max")) { + size_t m; + CONF_VALUE_READ(size_t, m) + if (CONF_VALUE_READ_FAIL()) { + CONF_ERROR("Invalid conf value", + k, klen, v, vlen); + } else { + /* clip if necessary */ + if (m > TCACHE_LG_MAXCLASS_LIMIT) { + m = TCACHE_LG_MAXCLASS_LIMIT; + } + opt_tcache_max = (size_t)1 << m; + } + CONF_CONTINUE; + } + /* + * Anyone trying to set a value outside -16 to 16 is + * deeply confused. + */ + CONF_HANDLE_SSIZE_T(opt_lg_tcache_nslots_mul, + "lg_tcache_nslots_mul", -16, 16) + /* Ditto with values past 2048. */ + CONF_HANDLE_UNSIGNED(opt_tcache_nslots_small_min, + "tcache_nslots_small_min", 1, 2048, + CONF_CHECK_MIN, CONF_CHECK_MAX, /* clip */ true) + CONF_HANDLE_UNSIGNED(opt_tcache_nslots_small_max, + "tcache_nslots_small_max", 1, 2048, + CONF_CHECK_MIN, CONF_CHECK_MAX, /* clip */ true) + CONF_HANDLE_UNSIGNED(opt_tcache_nslots_large, + "tcache_nslots_large", 1, 2048, + CONF_CHECK_MIN, CONF_CHECK_MAX, /* clip */ true) + CONF_HANDLE_SIZE_T(opt_tcache_gc_incr_bytes, + "tcache_gc_incr_bytes", 1024, SIZE_T_MAX, + CONF_CHECK_MIN, CONF_DONT_CHECK_MAX, + /* clip */ true) + CONF_HANDLE_SIZE_T(opt_tcache_gc_delay_bytes, + "tcache_gc_delay_bytes", 0, SIZE_T_MAX, + CONF_DONT_CHECK_MIN, CONF_DONT_CHECK_MAX, + /* clip */ false) + CONF_HANDLE_UNSIGNED(opt_lg_tcache_flush_small_div, + "lg_tcache_flush_small_div", 1, 16, + CONF_CHECK_MIN, CONF_CHECK_MAX, /* clip */ true) + CONF_HANDLE_UNSIGNED(opt_lg_tcache_flush_large_div, + "lg_tcache_flush_large_div", 1, 16, + CONF_CHECK_MIN, CONF_CHECK_MAX, /* clip */ true) + CONF_HANDLE_UNSIGNED(opt_debug_double_free_max_scan, + "debug_double_free_max_scan", 0, UINT_MAX, + CONF_DONT_CHECK_MIN, CONF_DONT_CHECK_MAX, + /* clip */ false) + CONF_HANDLE_SIZE_T(opt_calloc_madvise_threshold, + "calloc_madvise_threshold", 0, SC_LARGE_MAXCLASS, + CONF_DONT_CHECK_MIN, CONF_CHECK_MAX, /* clip */ false) + + /* + * The runtime option of oversize_threshold remains + * undocumented. It may be tweaked in the next major + * release (6.0). 
The default value 8M is rather + * conservative / safe. Tuning it further down may + * improve fragmentation a bit more, but may also cause + * contention on the huge arena. + */ + CONF_HANDLE_SIZE_T(opt_oversize_threshold, + "oversize_threshold", 0, SC_LARGE_MAXCLASS, + CONF_DONT_CHECK_MIN, CONF_CHECK_MAX, false) + CONF_HANDLE_SIZE_T(opt_lg_extent_max_active_fit, + "lg_extent_max_active_fit", 0, + (sizeof(size_t) << 3), CONF_DONT_CHECK_MIN, + CONF_CHECK_MAX, false) + + if (strncmp("percpu_arena", k, klen) == 0) { + bool match = false; + for (int m = percpu_arena_mode_names_base; m < + percpu_arena_mode_names_limit; m++) { + if (strncmp(percpu_arena_mode_names[m], + v, vlen) == 0) { + if (!have_percpu_arena) { + CONF_ERROR( + "No getcpu support", + k, klen, v, vlen); + } + opt_percpu_arena = m; + match = true; + break; + } + } + if (!match) { + CONF_ERROR("Invalid conf value", + k, klen, v, vlen); + } + CONF_CONTINUE; + } + CONF_HANDLE_BOOL(opt_background_thread, + "background_thread"); + CONF_HANDLE_SIZE_T(opt_max_background_threads, + "max_background_threads", 1, + opt_max_background_threads, + CONF_CHECK_MIN, CONF_CHECK_MAX, + true); + CONF_HANDLE_BOOL(opt_hpa, "hpa") + CONF_HANDLE_SIZE_T(opt_hpa_opts.slab_max_alloc, + "hpa_slab_max_alloc", PAGE, HUGEPAGE, + CONF_CHECK_MIN, CONF_CHECK_MAX, true); + + /* + * Accept either a ratio-based or an exact hugification + * threshold. + */ + CONF_HANDLE_SIZE_T(opt_hpa_opts.hugification_threshold, + "hpa_hugification_threshold", PAGE, HUGEPAGE, + CONF_CHECK_MIN, CONF_CHECK_MAX, true); + if (CONF_MATCH("hpa_hugification_threshold_ratio")) { + fxp_t ratio; + char *end; + bool err = fxp_parse(&ratio, v, + &end); + if (err || (size_t)(end - v) != vlen + || ratio > FXP_INIT_INT(1)) { + CONF_ERROR("Invalid conf value", + k, klen, v, vlen); + } else { + opt_hpa_opts.hugification_threshold = + fxp_mul_frac(HUGEPAGE, ratio); + } + CONF_CONTINUE; + } + + CONF_HANDLE_UINT64_T( + opt_hpa_opts.hugify_delay_ms, "hpa_hugify_delay_ms", + 0, 0, CONF_DONT_CHECK_MIN, CONF_DONT_CHECK_MAX, + false); + + CONF_HANDLE_UINT64_T( + opt_hpa_opts.min_purge_interval_ms, + "hpa_min_purge_interval_ms", 0, 0, + CONF_DONT_CHECK_MIN, CONF_DONT_CHECK_MAX, false); + + CONF_HANDLE_BOOL( + opt_hpa_opts.strict_min_purge_interval, + "hpa_strict_min_purge_interval"); + + if (CONF_MATCH("hpa_dirty_mult")) { + if (CONF_MATCH_VALUE("-1")) { + opt_hpa_opts.dirty_mult = (fxp_t)-1; + CONF_CONTINUE; + } + fxp_t ratio; + char *end; + bool err = fxp_parse(&ratio, v, + &end); + if (err || (size_t)(end - v) != vlen) { + CONF_ERROR("Invalid conf value", + k, klen, v, vlen); + } else { + opt_hpa_opts.dirty_mult = ratio; + } + CONF_CONTINUE; + } + + CONF_HANDLE_SIZE_T(opt_hpa_sec_opts.nshards, + "hpa_sec_nshards", 0, 0, CONF_CHECK_MIN, + CONF_DONT_CHECK_MAX, true); + CONF_HANDLE_SIZE_T(opt_hpa_sec_opts.max_alloc, + "hpa_sec_max_alloc", PAGE, 0, CONF_CHECK_MIN, + CONF_DONT_CHECK_MAX, true); + CONF_HANDLE_SIZE_T(opt_hpa_sec_opts.max_bytes, + "hpa_sec_max_bytes", PAGE, 0, CONF_CHECK_MIN, + CONF_DONT_CHECK_MAX, true); + CONF_HANDLE_SIZE_T(opt_hpa_sec_opts.bytes_after_flush, + "hpa_sec_bytes_after_flush", PAGE, 0, + CONF_CHECK_MIN, CONF_DONT_CHECK_MAX, true); + CONF_HANDLE_SIZE_T(opt_hpa_sec_opts.batch_fill_extra, + "hpa_sec_batch_fill_extra", 0, HUGEPAGE_PAGES, + CONF_CHECK_MIN, CONF_CHECK_MAX, true); + + if (CONF_MATCH("slab_sizes")) { + if (CONF_MATCH_VALUE("default")) { + sc_data_init(sc_data); + CONF_CONTINUE; + } + bool err; + const char *slab_size_segment_cur = v; + size_t vlen_left = vlen; + do { + 
size_t slab_start; + size_t slab_end; + size_t pgs; + err = multi_setting_parse_next( + &slab_size_segment_cur, + &vlen_left, &slab_start, &slab_end, + &pgs); + if (!err) { + sc_data_update_slab_size( + sc_data, slab_start, + slab_end, (int)pgs); + } else { + CONF_ERROR("Invalid settings " + "for slab_sizes", + k, klen, v, vlen); + } + } while (!err && vlen_left > 0); + CONF_CONTINUE; + } + if (config_prof) { + CONF_HANDLE_BOOL(opt_prof, "prof") + CONF_HANDLE_CHAR_P(opt_prof_prefix, + "prof_prefix", "jeprof") + CONF_HANDLE_BOOL(opt_prof_active, "prof_active") + CONF_HANDLE_BOOL(opt_prof_thread_active_init, + "prof_thread_active_init") + CONF_HANDLE_SIZE_T(opt_lg_prof_sample, + "lg_prof_sample", 0, (sizeof(uint64_t) << 3) + - 1, CONF_DONT_CHECK_MIN, CONF_CHECK_MAX, + true) + CONF_HANDLE_BOOL(opt_prof_accum, "prof_accum") + CONF_HANDLE_UNSIGNED(opt_prof_bt_max, "prof_bt_max", + 1, PROF_BT_MAX_LIMIT, CONF_CHECK_MIN, CONF_CHECK_MAX, + /* clip */ true) + CONF_HANDLE_SSIZE_T(opt_lg_prof_interval, + "lg_prof_interval", -1, + (sizeof(uint64_t) << 3) - 1) + CONF_HANDLE_BOOL(opt_prof_gdump, "prof_gdump") + CONF_HANDLE_BOOL(opt_prof_final, "prof_final") + CONF_HANDLE_BOOL(opt_prof_leak, "prof_leak") + CONF_HANDLE_BOOL(opt_prof_leak_error, + "prof_leak_error") + CONF_HANDLE_BOOL(opt_prof_log, "prof_log") + CONF_HANDLE_BOOL(opt_prof_pid_namespace, "prof_pid_namespace") + CONF_HANDLE_SSIZE_T(opt_prof_recent_alloc_max, + "prof_recent_alloc_max", -1, SSIZE_MAX) + CONF_HANDLE_BOOL(opt_prof_stats, "prof_stats") + CONF_HANDLE_BOOL(opt_prof_sys_thread_name, + "prof_sys_thread_name") + if (CONF_MATCH("prof_time_resolution")) { + if (CONF_MATCH_VALUE("default")) { + opt_prof_time_res = + prof_time_res_default; + } else if (CONF_MATCH_VALUE("high")) { + if (!config_high_res_timer) { + CONF_ERROR( + "No high resolution" + " timer support", + k, klen, v, vlen); + } else { + opt_prof_time_res = + prof_time_res_high; + } + } else { + CONF_ERROR("Invalid conf value", + k, klen, v, vlen); + } + CONF_CONTINUE; + } + /* + * Undocumented. When set to false, don't + * correct for an unbiasing bug in jeprof + * attribution. This can be handy if you want + * to get consistent numbers from your binary + * across different jemalloc versions, even if + * those numbers are incorrect. The default is + * true. + */ + CONF_HANDLE_BOOL(opt_prof_unbias, "prof_unbias") + } + if (config_log) { + if (CONF_MATCH("log")) { + size_t cpylen = ( + vlen <= sizeof(log_var_names) ? 
+ vlen : sizeof(log_var_names) - 1); + strncpy(log_var_names, v, cpylen); + log_var_names[cpylen] = '\0'; + CONF_CONTINUE; + } + } + if (CONF_MATCH("thp")) { + bool match = false; + for (int m = 0; m < thp_mode_names_limit; m++) { + if (strncmp(thp_mode_names[m], v, vlen) + == 0) { + if (!have_madvise_huge && !have_memcntl) { + CONF_ERROR( + "No THP support", + k, klen, v, vlen); + } + opt_thp = m; + match = true; + break; + } + } + if (!match) { + CONF_ERROR("Invalid conf value", + k, klen, v, vlen); + } + CONF_CONTINUE; + } + if (CONF_MATCH("zero_realloc")) { + if (CONF_MATCH_VALUE("alloc")) { + opt_zero_realloc_action + = zero_realloc_action_alloc; + } else if (CONF_MATCH_VALUE("free")) { + opt_zero_realloc_action + = zero_realloc_action_free; + } else if (CONF_MATCH_VALUE("abort")) { + opt_zero_realloc_action + = zero_realloc_action_abort; + } else { + CONF_ERROR("Invalid conf value", + k, klen, v, vlen); + } + CONF_CONTINUE; + } + if (config_uaf_detection && + CONF_MATCH("lg_san_uaf_align")) { + ssize_t a; + CONF_VALUE_READ(ssize_t, a) + if (CONF_VALUE_READ_FAIL() || a < -1) { + CONF_ERROR("Invalid conf value", + k, klen, v, vlen); + } + if (a == -1) { + opt_lg_san_uaf_align = -1; + CONF_CONTINUE; + } + + /* clip if necessary */ + ssize_t max_allowed = (sizeof(size_t) << 3) - 1; + ssize_t min_allowed = LG_PAGE; + if (a > max_allowed) { + a = max_allowed; + } else if (a < min_allowed) { + a = min_allowed; + } + + opt_lg_san_uaf_align = a; + CONF_CONTINUE; + } + + CONF_HANDLE_SIZE_T(opt_san_guard_small, + "san_guard_small", 0, SIZE_T_MAX, + CONF_DONT_CHECK_MIN, CONF_DONT_CHECK_MAX, false) + CONF_HANDLE_SIZE_T(opt_san_guard_large, + "san_guard_large", 0, SIZE_T_MAX, + CONF_DONT_CHECK_MIN, CONF_DONT_CHECK_MAX, false) + + CONF_ERROR("Invalid conf pair", k, klen, v, vlen); +#undef CONF_ERROR +#undef CONF_CONTINUE +#undef CONF_MATCH +#undef CONF_MATCH_VALUE +#undef CONF_HANDLE_BOOL +#undef CONF_DONT_CHECK_MIN +#undef CONF_CHECK_MIN +#undef CONF_DONT_CHECK_MAX +#undef CONF_CHECK_MAX +#undef CONF_HANDLE_T +#undef CONF_HANDLE_T_U +#undef CONF_HANDLE_T_SIGNED +#undef CONF_HANDLE_UNSIGNED +#undef CONF_HANDLE_SIZE_T +#undef CONF_HANDLE_SSIZE_T +#undef CONF_HANDLE_CHAR_P + /* Re-enable diagnostic "-Wtype-limits" */ + JEMALLOC_DIAGNOSTIC_POP + } + validate_hpa_settings(); + if (opt_abort_conf && had_conf_error) { + malloc_abort_invalid_conf(); + } + } + atomic_store_b(&log_init_done, true, ATOMIC_RELEASE); +} + +static bool +malloc_conf_init_check_deps(void) { + if (opt_prof_leak_error && !opt_prof_final) { + malloc_printf(": prof_leak_error is set w/o " + "prof_final.\n"); + return true; + } + /* To emphasize in the stats output that opt is disabled when !debug. */ + if (!config_debug) { + opt_debug_double_free_max_scan = 0; + } + + return false; +} + +static void +malloc_conf_init(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS], + char readlink_buf[PATH_MAX + 1]) { + const char *opts_cache[MALLOC_CONF_NSOURCES] = {NULL, NULL, NULL, NULL, + NULL}; + + /* The first call only sets the confirm_conf option and fills opts_cache. */ + malloc_conf_init_helper(NULL, NULL, true, opts_cache, readlink_buf); + malloc_conf_init_helper(sc_data, bin_shard_sizes, false, opts_cache, + NULL); + if (malloc_conf_init_check_deps()) { + /* check_deps only emits warning messages; abort below if needed.
*/ + if (opt_abort_conf) { + malloc_abort_invalid_conf(); + } + } +} + +#undef MALLOC_CONF_NSOURCES + +static bool +malloc_init_hard_needed(void) { + if (malloc_initialized() || (IS_INITIALIZER && malloc_init_state == + malloc_init_recursible)) { + /* + * Another thread initialized the allocator before this one + * acquired init_lock, or this thread is the initializing + * thread, and it is recursively allocating. + */ + return false; + } +#ifdef JEMALLOC_THREADED_INIT + if (malloc_initializer != NO_INITIALIZER && !IS_INITIALIZER) { + /* Busy-wait until the initializing thread completes. */ + spin_t spinner = SPIN_INITIALIZER; + do { + malloc_mutex_unlock(TSDN_NULL, &init_lock); + spin_adaptive(&spinner); + malloc_mutex_lock(TSDN_NULL, &init_lock); + } while (!malloc_initialized()); + return false; + } +#endif + return true; +} + +static bool +malloc_init_hard_a0_locked(void) { + malloc_initializer = INITIALIZER; + + JEMALLOC_DIAGNOSTIC_PUSH + JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS + sc_data_t sc_data = {0}; + JEMALLOC_DIAGNOSTIC_POP + + /* + * Ordering here is somewhat tricky; we need sc_boot() first, since that + * determines what the size classes will be, and then + * malloc_conf_init(), since any slab size tweaking will need to be done + * before sz_boot and bin_info_boot, which assume that the values they + * read out of sc_data_global are final. + */ + sc_boot(&sc_data); + unsigned bin_shard_sizes[SC_NBINS]; + bin_shard_sizes_boot(bin_shard_sizes); + /* + * prof_boot0 only initializes opt_prof_prefix. We need to do it before + * we parse malloc_conf options, in case malloc_conf parsing overwrites + * it. + */ + if (config_prof) { + prof_boot0(); + } + char readlink_buf[PATH_MAX + 1]; + readlink_buf[0] = '\0'; + malloc_conf_init(&sc_data, bin_shard_sizes, readlink_buf); + san_init(opt_lg_san_uaf_align); + sz_boot(&sc_data, opt_cache_oblivious); + bin_info_boot(&sc_data, bin_shard_sizes); + + if (opt_stats_print) { + /* Print statistics at exit. */ + if (atexit(stats_print_atexit) != 0) { + malloc_write(": Error in atexit()\n"); + if (opt_abort) { + abort(); + } + } + } + + if (stats_boot()) { + return true; + } + if (pages_boot()) { + return true; + } + if (base_boot(TSDN_NULL)) { + return true; + } + /* emap_global is static, hence zeroed. */ + if (emap_init(&arena_emap_global, b0get(), /* zeroed */ true)) { + return true; + } + if (extent_boot()) { + return true; + } + if (ctl_boot()) { + return true; + } + if (config_prof) { + prof_boot1(); + } + if (opt_hpa && !hpa_supported()) { + malloc_printf(": HPA not supported in the current " + "configuration; %s.", + opt_abort_conf ? "aborting" : "disabling"); + if (opt_abort_conf) { + malloc_abort_invalid_conf(); + } else { + opt_hpa = false; + } + } + if (arena_boot(&sc_data, b0get(), opt_hpa)) { + return true; + } + if (tcache_boot(TSDN_NULL, b0get())) { + return true; + } + if (malloc_mutex_init(&arenas_lock, "arenas", WITNESS_RANK_ARENAS, + malloc_mutex_rank_exclusive)) { + return true; + } + hook_boot(); + /* + * Create enough scaffolding to allow recursive allocation in + * malloc_ncpus(). + */ + narenas_auto = 1; + manual_arena_base = narenas_auto + 1; + memset(arenas, 0, sizeof(arena_t *) * narenas_auto); + /* + * Initialize one arena here. The rest are lazily created in + * arena_choose_hard(). 
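The initialization gate in malloc_init_hard_needed() above boils down to: exactly one thread runs the bootstrap, recursive calls from that thread fall through, and every other thread waits. A portable sketch of that protocol, with illustrative names only (jemalloc itself uses an adaptive spin and the JEMALLOC_THREADED_INIT build flag rather than sched_yield):

```c
#include <pthread.h>
#include <sched.h>
#include <stdbool.h>

static pthread_mutex_t g_init_lock = PTHREAD_MUTEX_INITIALIZER;
static bool g_initialized = false;
static bool g_have_initializer = false;
static pthread_t g_initializer;

static void do_bootstrap(void) { /* may call ensure_init() recursively */ }

/* Returns true if this call performed the bootstrap. */
static bool
ensure_init(void) {
	pthread_mutex_lock(&g_init_lock);
	if (g_initialized || (g_have_initializer &&
	    pthread_equal(g_initializer, pthread_self()))) {
		/* Already done, or a recursive call from the initializer. */
		pthread_mutex_unlock(&g_init_lock);
		return false;
	}
	while (g_have_initializer) {
		/* Another thread is initializing: wait until it finishes. */
		pthread_mutex_unlock(&g_init_lock);
		sched_yield();
		pthread_mutex_lock(&g_init_lock);
		if (g_initialized) {
			pthread_mutex_unlock(&g_init_lock);
			return false;
		}
	}
	g_initializer = pthread_self();
	g_have_initializer = true;
	do_bootstrap();
	g_initialized = true;
	pthread_mutex_unlock(&g_init_lock);
	return true;
}

int main(void) { return ensure_init() ? 0 : 1; }
```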
+ */ + if (arena_init(TSDN_NULL, 0, &arena_config_default) == NULL) { + return true; + } + a0 = arena_get(TSDN_NULL, 0, false); + + if (opt_hpa && !hpa_supported()) { + malloc_printf(": HPA not supported in the current " + "configuration; %s.", + opt_abort_conf ? "aborting" : "disabling"); + if (opt_abort_conf) { + malloc_abort_invalid_conf(); + } else { + opt_hpa = false; + } + } else if (opt_hpa) { + hpa_shard_opts_t hpa_shard_opts = opt_hpa_opts; + hpa_shard_opts.deferral_allowed = background_thread_enabled(); + if (pa_shard_enable_hpa(TSDN_NULL, &a0->pa_shard, + &hpa_shard_opts, &opt_hpa_sec_opts)) { + return true; + } + } + + malloc_init_state = malloc_init_a0_initialized; + + size_t buf_len = strlen(readlink_buf); + if (buf_len > 0) { + void *readlink_allocated = a0ialloc(buf_len + 1, false, true); + if (readlink_allocated != NULL) { + memcpy(readlink_allocated, readlink_buf, buf_len + 1); + opt_malloc_conf_symlink = readlink_allocated; + } + } + + return false; +} + +static bool +malloc_init_hard_a0(void) { + bool ret; + + malloc_mutex_lock(TSDN_NULL, &init_lock); + ret = malloc_init_hard_a0_locked(); + malloc_mutex_unlock(TSDN_NULL, &init_lock); + return ret; +} + +/* Initialize data structures which may trigger recursive allocation. */ +static bool +malloc_init_hard_recursible(void) { + malloc_init_state = malloc_init_recursible; + + ncpus = malloc_ncpus(); + if (opt_percpu_arena != percpu_arena_disabled) { + bool cpu_count_is_deterministic = + malloc_cpu_count_is_deterministic(); + if (!cpu_count_is_deterministic) { + /* + * If the number of CPUs is not deterministic and + * narenas was not specified, disable the per-CPU + * arena, since CPU IDs may not be detected reliably. + */ + if (opt_narenas == 0) { + opt_percpu_arena = percpu_arena_disabled; + malloc_write(": Number of CPUs " + "detected is not deterministic. Per-CPU " + "arena disabled.\n"); + if (opt_abort_conf) { + malloc_abort_invalid_conf(); + } + if (opt_abort) { + abort(); + } + } + } + } + +#if (defined(JEMALLOC_HAVE_PTHREAD_ATFORK) && !defined(JEMALLOC_MUTEX_INIT_CB) \ + && !defined(JEMALLOC_ZONE) && !defined(_WIN32) && \ + !defined(__native_client__)) + /* LinuxThreads' pthread_atfork() allocates. */ + if (pthread_atfork(jemalloc_prefork, jemalloc_postfork_parent, + jemalloc_postfork_child) != 0) { + malloc_write(": Error in pthread_atfork()\n"); + if (opt_abort) { + abort(); + } + return true; + } +#endif + + if (background_thread_boot0()) { + return true; + } + + return false; +} + +static unsigned +malloc_narenas_default(void) { + assert(ncpus > 0); + /* + * For SMP systems, create more than one arena per CPU by + * default. + */ + if (ncpus > 1) { + fxp_t fxp_ncpus = FXP_INIT_INT(ncpus); + fxp_t goal = fxp_mul(fxp_ncpus, opt_narenas_ratio); + uint32_t int_goal = fxp_round_nearest(goal); + if (int_goal == 0) { + return 1; + } + return int_goal; + } else { + return 1; + } +} + +static percpu_arena_mode_t +percpu_arena_as_initialized(percpu_arena_mode_t mode) { + assert(!malloc_initialized()); + assert(mode <= percpu_arena_disabled); + + if (mode != percpu_arena_disabled) { + mode += percpu_arena_mode_enabled_base; + } + + return mode; +} + +static bool +malloc_init_narenas(void) { + assert(ncpus > 0); + + if (opt_percpu_arena != percpu_arena_disabled) { + if (!have_percpu_arena || malloc_getcpu() < 0) { + opt_percpu_arena = percpu_arena_disabled; + malloc_printf(": perCPU arena getcpu() not " + "available. Setting narenas to %u.\n", opt_narenas ?
+ opt_narenas : malloc_narenas_default()); + if (opt_abort) { + abort(); + } + } else { + if (ncpus >= MALLOCX_ARENA_LIMIT) { + malloc_printf(": narenas w/ percpu " + "arena beyond limit (%d)\n", ncpus); + if (opt_abort) { + abort(); + } + return true; + } + /* NB: opt_percpu_arena isn't fully initialized yet. */ + if (percpu_arena_as_initialized(opt_percpu_arena) == + per_phycpu_arena && ncpus % 2 != 0) { + malloc_printf(": invalid " + "configuration -- per physical CPU arena " + "with odd number (%u) of CPUs (no hyper " + "threading?).\n", ncpus); + if (opt_abort) { + abort(); + } + } + unsigned n = percpu_arena_ind_limit( + percpu_arena_as_initialized(opt_percpu_arena)); + if (opt_narenas < n) { + /* + * If narenas is specified with percpu_arena + * enabled, actual narenas is set as the greater + * of the two. percpu_arena_choose will be free + * to use any of the arenas based on CPU + * id. This is conservative (at a small cost) + * but ensures correctness. + * + * If for some reason the ncpus determined at + * boot is not the actual number (e.g. because + * of affinity setting from numactl), reserving + * narenas this way provides a workaround for + * percpu_arena. + */ + opt_narenas = n; + } + } + } + if (opt_narenas == 0) { + opt_narenas = malloc_narenas_default(); + } + assert(opt_narenas > 0); + + narenas_auto = opt_narenas; + /* + * Limit the number of arenas to the indexing range of MALLOCX_ARENA(). + */ + if (narenas_auto >= MALLOCX_ARENA_LIMIT) { + narenas_auto = MALLOCX_ARENA_LIMIT - 1; + malloc_printf(": Reducing narenas to limit (%d)\n", + narenas_auto); + } + narenas_total_set(narenas_auto); + if (arena_init_huge(a0)) { + narenas_total_inc(); + } + manual_arena_base = narenas_total_get(); + + return false; +} + +static void +malloc_init_percpu(void) { + opt_percpu_arena = percpu_arena_as_initialized(opt_percpu_arena); +} + +static bool +malloc_init_hard_finish(void) { + if (malloc_mutex_boot()) { + return true; + } + + malloc_init_state = malloc_init_initialized; + malloc_slow_flag_init(); + + return false; +} + +static void +malloc_init_hard_cleanup(tsdn_t *tsdn, bool reentrancy_set) { + malloc_mutex_assert_owner(tsdn, &init_lock); + malloc_mutex_unlock(tsdn, &init_lock); + if (reentrancy_set) { + assert(!tsdn_null(tsdn)); + tsd_t *tsd = tsdn_tsd(tsdn); + assert(tsd_reentrancy_level_get(tsd) > 0); + post_reentrancy(tsd); + } +} + +static bool +malloc_init_hard(void) { + tsd_t *tsd; + +#if defined(_WIN32) && _WIN32_WINNT < 0x0600 + _init_init_lock(); +#endif + malloc_mutex_lock(TSDN_NULL, &init_lock); + +#define UNLOCK_RETURN(tsdn, ret, reentrancy) \ + malloc_init_hard_cleanup(tsdn, reentrancy); \ + return ret; + + if (!malloc_init_hard_needed()) { + UNLOCK_RETURN(TSDN_NULL, false, false) + } + + if (malloc_init_state != malloc_init_a0_initialized && + malloc_init_hard_a0_locked()) { + UNLOCK_RETURN(TSDN_NULL, true, false) + } + + malloc_mutex_unlock(TSDN_NULL, &init_lock); + /* Recursive allocation relies on functional tsd. */ + tsd = malloc_tsd_boot0(); + if (tsd == NULL) { + return true; + } + if (malloc_init_hard_recursible()) { + return true; + } + + malloc_mutex_lock(tsd_tsdn(tsd), &init_lock); + /* Set reentrancy level to 1 during init. */ + pre_reentrancy(tsd, NULL); + /* Initialize narenas before prof_boot2 (for allocation).
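Taken together, malloc_narenas_default() and the clamping in malloc_init_narenas() amount to "round(ncpus * ratio), at least 1, strictly below the MALLOCX_ARENA() indexing limit". A floating-point sketch of that arithmetic, where RATIO stands in for opt_narenas_ratio and LIMIT for MALLOCX_ARENA_LIMIT (both concrete values here are assumptions; jemalloc does this in its fxp_t fixed-point type):

```c
#include <stdio.h>

#define RATIO 4.0	/* assumed default for opt_narenas_ratio */
#define LIMIT 4096u	/* assumed value of MALLOCX_ARENA_LIMIT */

static unsigned
narenas_default(unsigned ncpus) {
	if (ncpus <= 1) {
		return 1;
	}
	unsigned goal = (unsigned)(ncpus * RATIO + 0.5);	/* round */
	if (goal == 0) {
		goal = 1;
	}
	if (goal >= LIMIT) {	/* mirrors the MALLOCX_ARENA() clamp */
		goal = LIMIT - 1;
	}
	return goal;
}

int main(void) {
	for (unsigned n = 1; n <= 64; n *= 4) {
		printf("%2u cpus -> %3u arenas\n", n, narenas_default(n));
	}
	return 0;
}
```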
*/ + if (malloc_init_narenas() + || background_thread_boot1(tsd_tsdn(tsd), b0get())) { + UNLOCK_RETURN(tsd_tsdn(tsd), true, true) + } + if (config_prof && prof_boot2(tsd, b0get())) { + UNLOCK_RETURN(tsd_tsdn(tsd), true, true) + } + + malloc_init_percpu(); + + if (malloc_init_hard_finish()) { + UNLOCK_RETURN(tsd_tsdn(tsd), true, true) + } + post_reentrancy(tsd); + malloc_mutex_unlock(tsd_tsdn(tsd), &init_lock); + + witness_assert_lockless(witness_tsd_tsdn( + tsd_witness_tsdp_get_unsafe(tsd))); + malloc_tsd_boot1(); + /* Update TSD after tsd_boot1. */ + tsd = tsd_fetch(); + if (opt_background_thread) { + assert(have_background_thread); + /* + * Need to finish init & unlock first before creating background + * threads (pthread_create depends on malloc). ctl_init (which + * sets isthreaded) needs to be called without holding any lock. + */ + background_thread_ctl_init(tsd_tsdn(tsd)); + if (background_thread_create(tsd, 0)) { + return true; + } + } +#undef UNLOCK_RETURN + return false; +} + +/* + * End initialization functions. + */ +/******************************************************************************/ +/* + * Begin allocation-path internal functions and data structures. + */ + +/* + * Settings determined by the documented behavior of the allocation functions. + */ +typedef struct static_opts_s static_opts_t; +struct static_opts_s { + /* Whether or not allocation size may overflow. */ + bool may_overflow; + + /* + * Whether or not allocations (with alignment) of size 0 should be + * treated as size 1. + */ + bool bump_empty_aligned_alloc; + /* + * Whether to assert that allocations are not of size 0 (after any + * bumping). + */ + bool assert_nonempty_alloc; + + /* + * Whether or not to modify the 'result' argument to malloc in case of + * error. + */ + bool null_out_result_on_error; + /* Whether to set errno when we encounter an error condition. */ + bool set_errno_on_error; + + /* + * The minimum valid alignment for functions requesting aligned storage. + */ + size_t min_alignment; + + /* The error string to use if we oom. */ + const char *oom_string; + /* The error string to use if the passed-in alignment is invalid. */ + const char *invalid_alignment_string; + + /* + * False if we're configured to skip some time-consuming operations. + * + * This isn't really a malloc "behavior", but it acts as a useful + * summary of several other static (or at least, static after program + * initialization) options. + */ + bool slow; + /* + * Return size. 
+ */ + bool usize; +}; + +JEMALLOC_ALWAYS_INLINE void +static_opts_init(static_opts_t *static_opts) { + static_opts->may_overflow = false; + static_opts->bump_empty_aligned_alloc = false; + static_opts->assert_nonempty_alloc = false; + static_opts->null_out_result_on_error = false; + static_opts->set_errno_on_error = false; + static_opts->min_alignment = 0; + static_opts->oom_string = ""; + static_opts->invalid_alignment_string = ""; + static_opts->slow = false; + static_opts->usize = false; +} + +typedef struct dynamic_opts_s dynamic_opts_t; +struct dynamic_opts_s { + void **result; + size_t usize; + size_t num_items; + size_t item_size; + size_t alignment; + bool zero; + unsigned tcache_ind; + unsigned arena_ind; +}; + +JEMALLOC_ALWAYS_INLINE void +dynamic_opts_init(dynamic_opts_t *dynamic_opts) { + dynamic_opts->result = NULL; + dynamic_opts->usize = 0; + dynamic_opts->num_items = 0; + dynamic_opts->item_size = 0; + dynamic_opts->alignment = 0; + dynamic_opts->zero = false; + dynamic_opts->tcache_ind = TCACHE_IND_AUTOMATIC; + dynamic_opts->arena_ind = ARENA_IND_AUTOMATIC; +} + +/* + * ind parameter is optional and is only checked and filled if alignment == 0; + * return true if result is out of range. + */ +JEMALLOC_ALWAYS_INLINE bool +aligned_usize_get(size_t size, size_t alignment, size_t *usize, szind_t *ind, + bool bump_empty_aligned_alloc) { + assert(usize != NULL); + if (alignment == 0) { + if (ind != NULL) { + *ind = sz_size2index(size); + if (unlikely(*ind >= SC_NSIZES)) { + return true; + } + *usize = sz_index2size(*ind); + assert(*usize > 0 && *usize <= SC_LARGE_MAXCLASS); + return false; + } + *usize = sz_s2u(size); + } else { + if (bump_empty_aligned_alloc && unlikely(size == 0)) { + size = 1; + } + *usize = sz_sa2u(size, alignment); + } + if (unlikely(*usize == 0 || *usize > SC_LARGE_MAXCLASS)) { + return true; + } + return false; +} + +JEMALLOC_ALWAYS_INLINE bool +zero_get(bool guarantee, bool slow) { + if (config_fill && slow && unlikely(opt_zero)) { + return true; + } else { + return guarantee; + } +} + +/* Return true if a manual arena is specified and arena_get() OOMs. */ +JEMALLOC_ALWAYS_INLINE bool +arena_get_from_ind(tsd_t *tsd, unsigned arena_ind, arena_t **arena_p) { + if (arena_ind == ARENA_IND_AUTOMATIC) { + /* + * In case of automatic arena management, we defer arena + * computation until as late as we can, hoping to fill the + * allocation out of the tcache. + */ + *arena_p = NULL; + } else { + *arena_p = arena_get(tsd_tsdn(tsd), arena_ind, true); + if (unlikely(*arena_p == NULL) && arena_ind >= narenas_auto) { + return true; + } + } + return false; +} + +/* ind is ignored if dopts->alignment > 0. */ +JEMALLOC_ALWAYS_INLINE void * +imalloc_no_sample(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd, + size_t size, size_t usize, szind_t ind, bool slab) { + /* Fill in the tcache. */ + tcache_t *tcache = tcache_get_from_ind(tsd, dopts->tcache_ind, + sopts->slow, /* is_alloc */ true); + + /* Fill in the arena. 
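The size-class rounding that aligned_usize_get() performs is externally observable through the public nallocx() entry point, which returns the usable size a request would get, or 0 if the request is out of range. A minimal sketch, assuming a build where the public API is exported without the je_ prefix (the usual configuration):

```c
#include <stdio.h>
#include <jemalloc/jemalloc.h>

int main(void) {
	/* usable size for a plain 100-byte request */
	printf("nallocx(100, 0)         = %zu\n", nallocx(100, 0));
	/* same request, constrained to 64-byte alignment */
	printf("nallocx(100, ALIGN(64)) = %zu\n",
	    nallocx(100, MALLOCX_ALIGN(64)));
	/* out-of-range requests report 0 rather than a size */
	printf("nallocx(SIZE_MAX, 0)    = %zu\n", nallocx((size_t)-1, 0));
	return 0;
}
```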
*/ + arena_t *arena; + if (arena_get_from_ind(tsd, dopts->arena_ind, &arena)) { + return NULL; + } + + if (unlikely(dopts->alignment != 0)) { + return ipalloct_explicit_slab(tsd_tsdn(tsd), usize, + dopts->alignment, dopts->zero, slab, tcache, arena); + } + + return iallocztm_explicit_slab(tsd_tsdn(tsd), size, ind, dopts->zero, + slab, tcache, false, arena, sopts->slow); +} + +JEMALLOC_ALWAYS_INLINE void * +imalloc_sample(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd, + size_t usize, szind_t ind) { + void *ret; + + dopts->alignment = prof_sample_align(usize, dopts->alignment); + /* + * If the allocation is small enough that it would normally be allocated + * on a slab, we need to take additional steps to ensure that it gets + * its own extent instead. + */ + if (sz_can_use_slab(usize)) { + assert((dopts->alignment & PROF_SAMPLE_ALIGNMENT_MASK) == 0); + size_t bumped_usize = sz_sa2u(usize, dopts->alignment); + szind_t bumped_ind = sz_size2index(bumped_usize); + dopts->tcache_ind = TCACHE_IND_NONE; + ret = imalloc_no_sample(sopts, dopts, tsd, bumped_usize, + bumped_usize, bumped_ind, /* slab */ false); + if (unlikely(ret == NULL)) { + return NULL; + } + arena_prof_promote(tsd_tsdn(tsd), ret, usize, bumped_usize); + } else { + ret = imalloc_no_sample(sopts, dopts, tsd, usize, usize, ind, + /* slab */ false); + } + assert(prof_sample_aligned(ret)); + + return ret; +} + +/* + * Returns true if the allocation will overflow, and false otherwise. Sets + * *size to the product either way. + */ +JEMALLOC_ALWAYS_INLINE bool +compute_size_with_overflow(bool may_overflow, dynamic_opts_t *dopts, + size_t *size) { + /* + * This function is just num_items * item_size, except that we may have + * to check for overflow. + */ + + if (!may_overflow) { + assert(dopts->num_items == 1); + *size = dopts->item_size; + return false; + } + + /* A size_t with its high-half bits all set to 1. */ + static const size_t high_bits = SIZE_T_MAX << (sizeof(size_t) * 8 / 2); + + *size = dopts->item_size * dopts->num_items; + + if (unlikely(*size == 0)) { + return (dopts->num_items != 0 && dopts->item_size != 0); + } + + /* + * We got a non-zero size, but we don't know if we overflowed to get + * there. To avoid having to do a divide, we'll be clever and note that + * if both A and B can be represented in N/2 bits, then their product + * can be represented in N bits (without the possibility of overflow). + */ + if (likely((high_bits & (dopts->num_items | dopts->item_size)) == 0)) { + return false; + } + if (likely(*size / dopts->item_size == dopts->num_items)) { + return false; + } + return true; +} + +JEMALLOC_ALWAYS_INLINE int +imalloc_body(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd) { + /* Where the actual allocated memory will live. */ + void *allocation = NULL; + /* Filled in by compute_size_with_overflow below. */ + size_t size = 0; + /* + * The zero initialization for ind is actually dead store, in that its + * value is reset before any branch on its value is taken. Sometimes + * though, it's convenient to pass it as arguments before this point. + * To avoid undefined behavior then, we initialize it with dummy stores. + */ + szind_t ind = 0; + /* usize will always be properly initialized. */ + size_t usize; + + /* Reentrancy is only checked on slow path. */ + int8_t reentrancy_level; + + /* Compute the amount of memory the user wants. 
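The trick in compute_size_with_overflow() above deserves spelling out: if both factors fit in the low half of a size_t's bits, an N/2-bit times N/2-bit product fits in N bits, so no division is needed on the common path; only when a high bit is set does one division verify the product. A standalone version of the same test (using the standard SIZE_MAX where the code above uses jemalloc's SIZE_T_MAX macro):

```c
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Returns true (still storing the wrapped product) if num * size overflows. */
static bool
mul_overflows(size_t num, size_t size, size_t *prod) {
	/* A size_t with its high-half bits all set. */
	static const size_t high_bits = SIZE_MAX << (sizeof(size_t) * 8 / 2);

	*prod = num * size;
	if (*prod == 0) {
		/* A zero product only overflows if neither factor was 0. */
		return num != 0 && size != 0;
	}
	/* Both factors in the low half: the product cannot have wrapped. */
	if ((high_bits & (num | size)) == 0) {
		return false;
	}
	/* Slow path: one division settles it (size != 0 here). */
	return *prod / size != num;
}

int main(void) {
	size_t p;
	assert(!mul_overflows(1000, 1000, &p) && p == 1000000);
	assert(mul_overflows(SIZE_MAX / 2, 3, &p));
	return 0;
}
```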
*/ + if (unlikely(compute_size_with_overflow(sopts->may_overflow, dopts, + &size))) { + goto label_oom; + } + + if (unlikely(dopts->alignment < sopts->min_alignment + || (dopts->alignment & (dopts->alignment - 1)) != 0)) { + goto label_invalid_alignment; + } + + /* This is the beginning of the "core" algorithm. */ + dopts->zero = zero_get(dopts->zero, sopts->slow); + if (aligned_usize_get(size, dopts->alignment, &usize, &ind, + sopts->bump_empty_aligned_alloc)) { + goto label_oom; + } + dopts->usize = usize; + /* Validate the user input. */ + if (sopts->assert_nonempty_alloc) { + assert (size != 0); + } + + check_entry_exit_locking(tsd_tsdn(tsd)); + + /* + * If we need to handle reentrancy, we can do it out of a + * known-initialized arena (i.e. arena 0). + */ + reentrancy_level = tsd_reentrancy_level_get(tsd); + if (sopts->slow && unlikely(reentrancy_level > 0)) { + /* + * We should never specify particular arenas or tcaches from + * within our internal allocations. + */ + assert(dopts->tcache_ind == TCACHE_IND_AUTOMATIC || + dopts->tcache_ind == TCACHE_IND_NONE); + assert(dopts->arena_ind == ARENA_IND_AUTOMATIC); + dopts->tcache_ind = TCACHE_IND_NONE; + /* We know that arena 0 has already been initialized. */ + dopts->arena_ind = 0; + } + + /* + * If dopts->alignment > 0, then ind is still 0, but usize was computed + * in the previous if statement. Down the positive alignment path, + * imalloc_no_sample and imalloc_sample will ignore ind. + */ + + /* If profiling is on, get our profiling context. */ + if (config_prof && opt_prof) { + bool prof_active = prof_active_get_unlocked(); + bool sample_event = te_prof_sample_event_lookahead(tsd, usize); + prof_tctx_t *tctx = prof_alloc_prep(tsd, prof_active, + sample_event); + + emap_alloc_ctx_t alloc_ctx; + if (likely(tctx == PROF_TCTX_SENTINEL)) { + alloc_ctx.slab = sz_can_use_slab(usize); + allocation = imalloc_no_sample( + sopts, dopts, tsd, usize, usize, ind, + alloc_ctx.slab); + } else if (tctx != NULL) { + allocation = imalloc_sample( + sopts, dopts, tsd, usize, ind); + alloc_ctx.slab = false; + } else { + allocation = NULL; + } + + if (unlikely(allocation == NULL)) { + prof_alloc_rollback(tsd, tctx); + goto label_oom; + } + prof_malloc(tsd, allocation, size, usize, &alloc_ctx, tctx); + } else { + assert(!opt_prof); + allocation = imalloc_no_sample(sopts, dopts, tsd, size, usize, + ind, sz_can_use_slab(usize)); + if (unlikely(allocation == NULL)) { + goto label_oom; + } + } + + /* + * Allocation has been done at this point. We still have some + * post-allocation work to do though. + */ + + thread_alloc_event(tsd, usize); + + assert(dopts->alignment == 0 + || ((uintptr_t)allocation & (dopts->alignment - 1)) == ZU(0)); + + assert(usize == isalloc(tsd_tsdn(tsd), allocation)); + + if (config_fill && sopts->slow && !dopts->zero + && unlikely(opt_junk_alloc)) { + junk_alloc_callback(allocation, usize); + } + + if (sopts->slow) { + UTRACE(0, size, allocation); + } + + /* Success! 
*/ + check_entry_exit_locking(tsd_tsdn(tsd)); + *dopts->result = allocation; + return 0; + +label_oom: + if (unlikely(sopts->slow) && config_xmalloc && unlikely(opt_xmalloc)) { + malloc_write(sopts->oom_string); + abort(); + } + + if (sopts->slow) { + UTRACE(NULL, size, NULL); + } + + check_entry_exit_locking(tsd_tsdn(tsd)); + + if (sopts->set_errno_on_error) { + set_errno(ENOMEM); + } + + if (sopts->null_out_result_on_error) { + *dopts->result = NULL; + } + + return ENOMEM; + + /* + * This label is only jumped to by one goto; we move it out of line + * anyway to avoid obscuring the non-error paths, and for symmetry with + * the oom case. + */ +label_invalid_alignment: + if (config_xmalloc && unlikely(opt_xmalloc)) { + malloc_write(sopts->invalid_alignment_string); + abort(); + } + + if (sopts->set_errno_on_error) { + set_errno(EINVAL); + } + + if (sopts->slow) { + UTRACE(NULL, size, NULL); + } + + check_entry_exit_locking(tsd_tsdn(tsd)); + + if (sopts->null_out_result_on_error) { + *dopts->result = NULL; + } + + return EINVAL; +} + +JEMALLOC_ALWAYS_INLINE bool +imalloc_init_check(static_opts_t *sopts, dynamic_opts_t *dopts) { + if (unlikely(!malloc_initialized()) && unlikely(malloc_init())) { + if (config_xmalloc && unlikely(opt_xmalloc)) { + malloc_write(sopts->oom_string); + abort(); + } + UTRACE(NULL, dopts->num_items * dopts->item_size, NULL); + set_errno(ENOMEM); + *dopts->result = NULL; + + return false; + } + + return true; +} + +/* Returns the errno-style error code of the allocation. */ +JEMALLOC_ALWAYS_INLINE int +imalloc(static_opts_t *sopts, dynamic_opts_t *dopts) { + if (tsd_get_allocates() && !imalloc_init_check(sopts, dopts)) { + return ENOMEM; + } + + /* We always need the tsd. Let's grab it right away. */ + tsd_t *tsd = tsd_fetch(); + assert(tsd); + if (likely(tsd_fast(tsd))) { + /* Fast and common path. */ + tsd_assert_fast(tsd); + sopts->slow = false; + return imalloc_body(sopts, dopts, tsd); + } else { + if (!tsd_get_allocates() && !imalloc_init_check(sopts, dopts)) { + return ENOMEM; + } + + sopts->slow = true; + return imalloc_body(sopts, dopts, tsd); + } +} + +JEMALLOC_NOINLINE +void * +malloc_default(size_t size) { + void *ret; + static_opts_t sopts; + dynamic_opts_t dopts; + + /* + * This variant has a logging hook on exit but not on entry. It's called + * only by je_malloc, below, which emits the entry one for us (and, if + * it calls us, does so only via tail call). + */ + + static_opts_init(&sopts); + dynamic_opts_init(&dopts); + + sopts.null_out_result_on_error = true; + sopts.set_errno_on_error = true; + sopts.oom_string = ": Error in malloc(): out of memory\n"; + + dopts.result = &ret; + dopts.num_items = 1; + dopts.item_size = size; + + imalloc(&sopts, &dopts); + /* + * Note that this branch gets optimized away -- it immediately follows + * the check on tsd_fast that sets sopts.slow. + */ + if (sopts.slow) { + uintptr_t args[3] = {size}; + hook_invoke_alloc(hook_alloc_malloc, ret, (uintptr_t)ret, args); + } + + return ret; +} + +/******************************************************************************/ +/* + * Begin malloc(3)-compatible functions.
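The static_opts fields above encode a per-entry-point failure contract: which functions set errno, which null out the result, and which report the error through the return value. From the caller's side that contract looks like this (posix_memalign() does not set null_out_result_on_error above, so *memptr is left untouched on error):

```c
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
	void *p = NULL;

	/* alignment 3 is neither a power of two nor >= sizeof(void *):
	 * the error comes back as the return value, not via errno. */
	int err = posix_memalign(&p, 3, 64);
	printf("posix_memalign(.., 3, 64) = %d (EINVAL = %d)\n", err, EINVAL);

	/* an impossible request: NULL result plus errno == ENOMEM */
	errno = 0;
	void *q = malloc((size_t)-1);
	printf("malloc(SIZE_MAX) = %p, errno==ENOMEM: %d\n",
	    q, errno == ENOMEM);
	free(q);	/* free(NULL) is a no-op */
	return 0;
}
```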
+ */ + +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN +void JEMALLOC_NOTHROW * +JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1) +je_malloc(size_t size) { + LOG("core.malloc.entry", "size: %zu", size); + + void * ret = imalloc_fastpath(size, &malloc_default); + + LOG("core.malloc.exit", "result: %p", ret); + return ret; +} + +JEMALLOC_EXPORT int JEMALLOC_NOTHROW +JEMALLOC_ATTR(nonnull(1)) +je_posix_memalign(void **memptr, size_t alignment, size_t size) { + int ret; + static_opts_t sopts; + dynamic_opts_t dopts; + + LOG("core.posix_memalign.entry", "mem ptr: %p, alignment: %zu, " + "size: %zu", memptr, alignment, size); + + static_opts_init(&sopts); + dynamic_opts_init(&dopts); + + sopts.bump_empty_aligned_alloc = true; + sopts.min_alignment = sizeof(void *); + sopts.oom_string = + ": Error allocating aligned memory: out of memory\n"; + sopts.invalid_alignment_string = + ": Error allocating aligned memory: invalid alignment\n"; + + dopts.result = memptr; + dopts.num_items = 1; + dopts.item_size = size; + dopts.alignment = alignment; + + ret = imalloc(&sopts, &dopts); + if (sopts.slow) { + uintptr_t args[3] = {(uintptr_t)memptr, (uintptr_t)alignment, + (uintptr_t)size}; + hook_invoke_alloc(hook_alloc_posix_memalign, *memptr, + (uintptr_t)ret, args); + } + + LOG("core.posix_memalign.exit", "result: %d, alloc ptr: %p", ret, + *memptr); + + return ret; +} + +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN +void JEMALLOC_NOTHROW * +JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(2) +je_aligned_alloc(size_t alignment, size_t size) { + void *ret; + + static_opts_t sopts; + dynamic_opts_t dopts; + + LOG("core.aligned_alloc.entry", "alignment: %zu, size: %zu\n", + alignment, size); + + static_opts_init(&sopts); + dynamic_opts_init(&dopts); + + sopts.bump_empty_aligned_alloc = true; + sopts.null_out_result_on_error = true; + sopts.set_errno_on_error = true; + sopts.min_alignment = 1; + sopts.oom_string = + ": Error allocating aligned memory: out of memory\n"; + sopts.invalid_alignment_string = + ": Error allocating aligned memory: invalid alignment\n"; + + dopts.result = &ret; + dopts.num_items = 1; + dopts.item_size = size; + dopts.alignment = alignment; + + imalloc(&sopts, &dopts); + if (sopts.slow) { + uintptr_t args[3] = {(uintptr_t)alignment, (uintptr_t)size}; + hook_invoke_alloc(hook_alloc_aligned_alloc, ret, + (uintptr_t)ret, args); + } + + LOG("core.aligned_alloc.exit", "result: %p", ret); + + return ret; +} + +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN +void JEMALLOC_NOTHROW * +JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE2(1, 2) +je_calloc(size_t num, size_t size) { + void *ret; + static_opts_t sopts; + dynamic_opts_t dopts; + + LOG("core.calloc.entry", "num: %zu, size: %zu", num, size); + + static_opts_init(&sopts); + dynamic_opts_init(&dopts); + + sopts.may_overflow = true; + sopts.null_out_result_on_error = true; + sopts.set_errno_on_error = true; + sopts.oom_string = ": Error in calloc(): out of memory\n"; + + dopts.result = &ret; + dopts.num_items = num; + dopts.item_size = size; + dopts.zero = true; + + imalloc(&sopts, &dopts); + if (sopts.slow) { + uintptr_t args[3] = {(uintptr_t)num, (uintptr_t)size}; + hook_invoke_alloc(hook_alloc_calloc, ret, (uintptr_t)ret, args); + } + + LOG("core.calloc.exit", "result: %p", ret); + + return ret; +} + +JEMALLOC_ALWAYS_INLINE void +ifree(tsd_t *tsd, void *ptr, tcache_t *tcache, bool slow_path) { + if (!slow_path) { + tsd_assert_fast(tsd); + } + check_entry_exit_locking(tsd_tsdn(tsd)); + if 
(tsd_reentrancy_level_get(tsd) != 0) { + assert(slow_path); + } + + assert(ptr != NULL); + assert(malloc_initialized() || IS_INITIALIZER); + + emap_alloc_ctx_t alloc_ctx; + emap_alloc_ctx_lookup(tsd_tsdn(tsd), &arena_emap_global, ptr, + &alloc_ctx); + assert(alloc_ctx.szind != SC_NSIZES); + + size_t usize = sz_index2size(alloc_ctx.szind); + if (config_prof && opt_prof) { + prof_free(tsd, ptr, usize, &alloc_ctx); + } + + if (likely(!slow_path)) { + idalloctm(tsd_tsdn(tsd), ptr, tcache, &alloc_ctx, false, + false); + } else { + if (config_fill && slow_path && opt_junk_free) { + junk_free_callback(ptr, usize); + } + idalloctm(tsd_tsdn(tsd), ptr, tcache, &alloc_ctx, false, + true); + } + thread_dalloc_event(tsd, usize); +} + +JEMALLOC_ALWAYS_INLINE void +isfree(tsd_t *tsd, void *ptr, size_t usize, tcache_t *tcache, bool slow_path) { + if (!slow_path) { + tsd_assert_fast(tsd); + } + check_entry_exit_locking(tsd_tsdn(tsd)); + if (tsd_reentrancy_level_get(tsd) != 0) { + assert(slow_path); + } + + assert(ptr != NULL); + assert(malloc_initialized() || IS_INITIALIZER); + + emap_alloc_ctx_t alloc_ctx; + if (!config_prof) { + alloc_ctx.szind = sz_size2index(usize); + alloc_ctx.slab = (alloc_ctx.szind < SC_NBINS); + } else { + if (likely(!prof_sample_aligned(ptr))) { + /* + * When the ptr is not page aligned, it was not sampled. + * usize can be trusted to determine szind and slab. + */ + alloc_ctx.szind = sz_size2index(usize); + alloc_ctx.slab = (alloc_ctx.szind < SC_NBINS); + } else if (opt_prof) { + emap_alloc_ctx_lookup(tsd_tsdn(tsd), &arena_emap_global, + ptr, &alloc_ctx); + + if (config_opt_safety_checks) { + /* Small alloc may have !slab (sampled). */ + if (unlikely(alloc_ctx.szind != + sz_size2index(usize))) { + safety_check_fail_sized_dealloc( + /* current_dealloc */ true, ptr, + /* true_size */ sz_index2size( + alloc_ctx.szind), + /* input_size */ usize); + } + } + } else { + alloc_ctx.szind = sz_size2index(usize); + alloc_ctx.slab = (alloc_ctx.szind < SC_NBINS); + } + } + bool fail = maybe_check_alloc_ctx(tsd, ptr, &alloc_ctx); + if (fail) { + /* + * This is a heap corruption bug. In real life we'll crash; for + * the unit test we just want to avoid breaking anything too + * badly to get a test result out. Let's leak instead of trying + * to free. + */ + return; + } + + if (config_prof && opt_prof) { + prof_free(tsd, ptr, usize, &alloc_ctx); + } + if (likely(!slow_path)) { + isdalloct(tsd_tsdn(tsd), ptr, usize, tcache, &alloc_ctx, + false); + } else { + if (config_fill && slow_path && opt_junk_free) { + junk_free_callback(ptr, usize); + } + isdalloct(tsd_tsdn(tsd), ptr, usize, tcache, &alloc_ctx, + true); + } + thread_dalloc_event(tsd, usize); +} + +JEMALLOC_NOINLINE +void +free_default(void *ptr) { + UTRACE(ptr, 0, 0); + if (likely(ptr != NULL)) { + /* + * We avoid setting up tsd fully (e.g. tcache, arena binding) + * based on only free() calls -- other activities trigger the + * minimal to full transition. This is because free() may + * happen during thread shutdown after tls deallocation: if a + * thread never had any malloc activities until then, a + * fully-setup tsd won't be destructed properly. 
+ */ + tsd_t *tsd = tsd_fetch_min(); + check_entry_exit_locking(tsd_tsdn(tsd)); + + if (likely(tsd_fast(tsd))) { + tcache_t *tcache = tcache_get_from_ind(tsd, + TCACHE_IND_AUTOMATIC, /* slow */ false, + /* is_alloc */ false); + ifree(tsd, ptr, tcache, /* slow */ false); + } else { + tcache_t *tcache = tcache_get_from_ind(tsd, + TCACHE_IND_AUTOMATIC, /* slow */ true, + /* is_alloc */ false); + uintptr_t args_raw[3] = {(uintptr_t)ptr}; + hook_invoke_dalloc(hook_dalloc_free, ptr, args_raw); + ifree(tsd, ptr, tcache, /* slow */ true); + } + + check_entry_exit_locking(tsd_tsdn(tsd)); + } +} + +JEMALLOC_EXPORT void JEMALLOC_NOTHROW +je_free(void *ptr) { + LOG("core.free.entry", "ptr: %p", ptr); + + je_free_impl(ptr); + + LOG("core.free.exit", ""); +} + +JEMALLOC_EXPORT void JEMALLOC_NOTHROW +je_free_sized(void *ptr, size_t size) { + LOG("core.free_sized.entry", "ptr: %p, size: %zu", ptr, size); + + je_sdallocx_noflags(ptr, size); + + LOG("core.free_sized.exit", ""); +} + +JEMALLOC_EXPORT void JEMALLOC_NOTHROW +je_free_aligned_sized(void *ptr, size_t alignment, size_t size) { + return je_sdallocx(ptr, size, /* flags */ MALLOCX_ALIGN(alignment)); +} + +/* + * End malloc(3)-compatible functions. + */ +/******************************************************************************/ +/* + * Begin non-standard override functions. + */ + +#ifdef JEMALLOC_OVERRIDE_MEMALIGN +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN +void JEMALLOC_NOTHROW * +JEMALLOC_ATTR(malloc) +je_memalign(size_t alignment, size_t size) { + void *ret; + static_opts_t sopts; + dynamic_opts_t dopts; + + LOG("core.memalign.entry", "alignment: %zu, size: %zu\n", alignment, + size); + + static_opts_init(&sopts); + dynamic_opts_init(&dopts); + + sopts.bump_empty_aligned_alloc = true; + sopts.min_alignment = 1; + sopts.oom_string = + ": Error allocating aligned memory: out of memory\n"; + sopts.invalid_alignment_string = + ": Error allocating aligned memory: invalid alignment\n"; + sopts.null_out_result_on_error = true; + + dopts.result = &ret; + dopts.num_items = 1; + dopts.item_size = size; + dopts.alignment = alignment; + + imalloc(&sopts, &dopts); + if (sopts.slow) { + uintptr_t args[3] = {alignment, size}; + hook_invoke_alloc(hook_alloc_memalign, ret, (uintptr_t)ret, + args); + } + + LOG("core.memalign.exit", "result: %p", ret); + return ret; +} +#endif + +#ifdef JEMALLOC_OVERRIDE_VALLOC +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN +void JEMALLOC_NOTHROW * +JEMALLOC_ATTR(malloc) +je_valloc(size_t size) { + void *ret; + + static_opts_t sopts; + dynamic_opts_t dopts; + + LOG("core.valloc.entry", "size: %zu\n", size); + + static_opts_init(&sopts); + dynamic_opts_init(&dopts); + + sopts.null_out_result_on_error = true; + sopts.min_alignment = PAGE; + sopts.oom_string = + ": Error allocating aligned memory: out of memory\n"; + sopts.invalid_alignment_string = + ": Error allocating aligned memory: invalid alignment\n"; + + dopts.result = &ret; + dopts.num_items = 1; + dopts.item_size = size; + dopts.alignment = PAGE; + + imalloc(&sopts, &dopts); + if (sopts.slow) { + uintptr_t args[3] = {size}; + hook_invoke_alloc(hook_alloc_valloc, ret, (uintptr_t)ret, args); + } + + LOG("core.valloc.exit", "result: %p\n", ret); + return ret; +} +#endif + +#ifdef JEMALLOC_OVERRIDE_PVALLOC +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN +void JEMALLOC_NOTHROW * +JEMALLOC_ATTR(malloc) +je_pvalloc(size_t size) { + void *ret; + + static_opts_t sopts; + dynamic_opts_t dopts; + + LOG("core.pvalloc.entry", "size: 
%zu\n", size); + + static_opts_init(&sopts); + dynamic_opts_init(&dopts); + + sopts.null_out_result_on_error = true; + sopts.min_alignment = PAGE; + sopts.oom_string = + ": Error allocating aligned memory: out of memory\n"; + sopts.invalid_alignment_string = + ": Error allocating aligned memory: invalid alignment\n"; + + dopts.result = &ret; + dopts.num_items = 1; + /* + * This is the only difference from je_valloc - size is rounded up to + * a PAGE multiple. + */ + dopts.item_size = PAGE_CEILING(size); + dopts.alignment = PAGE; + + imalloc(&sopts, &dopts); + if (sopts.slow) { + uintptr_t args[3] = {size}; + hook_invoke_alloc(hook_alloc_pvalloc, ret, (uintptr_t)ret, + args); + } + + LOG("core.pvalloc.exit", "result: %p\n", ret); + return ret; +} +#endif + +#if defined(JEMALLOC_IS_MALLOC) && defined(JEMALLOC_GLIBC_MALLOC_HOOK) +/* + * glibc provides the RTLD_DEEPBIND flag for dlopen which can make it possible + * to inconsistently reference libc's malloc(3)-compatible functions + * (https://bugzilla.mozilla.org/show_bug.cgi?id=493541). + * + * These definitions interpose hooks in glibc. The functions are actually + * passed an extra argument for the caller return address, which will be + * ignored. + */ +#include // defines __GLIBC__ if we are compiling against glibc + +JEMALLOC_EXPORT void (*__free_hook)(void *ptr) = je_free; +JEMALLOC_EXPORT void *(*__malloc_hook)(size_t size) = je_malloc; +JEMALLOC_EXPORT void *(*__realloc_hook)(void *ptr, size_t size) = je_realloc; +# ifdef JEMALLOC_GLIBC_MEMALIGN_HOOK +JEMALLOC_EXPORT void *(*__memalign_hook)(size_t alignment, size_t size) = + je_memalign; +# endif + +# ifdef __GLIBC__ +/* + * To enable static linking with glibc, the libc specific malloc interface must + * be implemented also, so none of glibc's malloc.o functions are added to the + * link. + */ +# define ALIAS(je_fn) __attribute__((alias (#je_fn), used)) +/* To force macro expansion of je_ prefix before stringification. */ +# define PREALIAS(je_fn) ALIAS(je_fn) +# ifdef JEMALLOC_OVERRIDE___LIBC_CALLOC +void *__libc_calloc(size_t n, size_t size) PREALIAS(je_calloc); +# endif +# ifdef JEMALLOC_OVERRIDE___LIBC_FREE +void __libc_free(void* ptr) PREALIAS(je_free); +# endif +# ifdef JEMALLOC_OVERRIDE___LIBC_FREE_SIZED +void __libc_free_sized(void* ptr, size_t size) PREALIAS(je_free_sized); +# endif +# ifdef JEMALLOC_OVERRIDE___LIBC_FREE_ALIGNED_SIZED +void __libc_free_aligned_sized( + void* ptr, size_t alignment, size_t size) PREALIAS(je_free_aligned_sized); +# endif +# ifdef JEMALLOC_OVERRIDE___LIBC_MALLOC +void *__libc_malloc(size_t size) PREALIAS(je_malloc); +# endif +# ifdef JEMALLOC_OVERRIDE___LIBC_MEMALIGN +void *__libc_memalign(size_t align, size_t s) PREALIAS(je_memalign); +# endif +# ifdef JEMALLOC_OVERRIDE___LIBC_REALLOC +void *__libc_realloc(void* ptr, size_t size) PREALIAS(je_realloc); +# endif +# ifdef JEMALLOC_OVERRIDE___LIBC_VALLOC +void *__libc_valloc(size_t size) PREALIAS(je_valloc); +# endif +# ifdef JEMALLOC_OVERRIDE___LIBC_PVALLOC +void *__libc_pvalloc(size_t size) PREALIAS(je_pvalloc); +# endif +# ifdef JEMALLOC_OVERRIDE___POSIX_MEMALIGN +int __posix_memalign(void** r, size_t a, size_t s) PREALIAS(je_posix_memalign); +# endif +# undef PREALIAS +# undef ALIAS +# endif +#endif + +/* + * End non-standard override functions. + */ +/******************************************************************************/ +/* + * Begin non-standard functions. 
+ */ + +JEMALLOC_ALWAYS_INLINE unsigned +mallocx_tcache_get(int flags) { + if (likely((flags & MALLOCX_TCACHE_MASK) == 0)) { + return TCACHE_IND_AUTOMATIC; + } else if ((flags & MALLOCX_TCACHE_MASK) == MALLOCX_TCACHE_NONE) { + return TCACHE_IND_NONE; + } else { + return MALLOCX_TCACHE_GET(flags); + } +} + +JEMALLOC_ALWAYS_INLINE unsigned +mallocx_arena_get(int flags) { + if (unlikely((flags & MALLOCX_ARENA_MASK) != 0)) { + return MALLOCX_ARENA_GET(flags); + } else { + return ARENA_IND_AUTOMATIC; + } +} + +#ifdef JEMALLOC_EXPERIMENTAL_SMALLOCX_API + +#define JEMALLOC_SMALLOCX_CONCAT_HELPER(x, y) x ## y +#define JEMALLOC_SMALLOCX_CONCAT_HELPER2(x, y) \ + JEMALLOC_SMALLOCX_CONCAT_HELPER(x, y) + +typedef struct { + void *ptr; + size_t size; +} smallocx_return_t; + +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN +smallocx_return_t JEMALLOC_NOTHROW +/* + * The attribute JEMALLOC_ATTR(malloc) cannot be used due to: + * - https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86488 + */ +JEMALLOC_SMALLOCX_CONCAT_HELPER2(je_smallocx_, JEMALLOC_VERSION_GID_IDENT) + (size_t size, int flags) { + /* + * Note: the attribute JEMALLOC_ALLOC_SIZE(1) cannot be + * used here because it makes writing beyond the `size` + * of the `ptr` undefined behavior, but the objective + * of this function is to allow writing beyond `size` + * up to `smallocx_return_t::size`. + */ + smallocx_return_t ret; + static_opts_t sopts; + dynamic_opts_t dopts; + + LOG("core.smallocx.entry", "size: %zu, flags: %d", size, flags); + + static_opts_init(&sopts); + dynamic_opts_init(&dopts); + + sopts.assert_nonempty_alloc = true; + sopts.null_out_result_on_error = true; + sopts.oom_string = ": Error in mallocx(): out of memory\n"; + sopts.usize = true; + + dopts.result = &ret.ptr; + dopts.num_items = 1; + dopts.item_size = size; + if (unlikely(flags != 0)) { + dopts.alignment = MALLOCX_ALIGN_GET(flags); + dopts.zero = MALLOCX_ZERO_GET(flags); + dopts.tcache_ind = mallocx_tcache_get(flags); + dopts.arena_ind = mallocx_arena_get(flags); + } + + imalloc(&sopts, &dopts); + assert(dopts.usize == je_nallocx(size, flags)); + ret.size = dopts.usize; + + LOG("core.smallocx.exit", "result: %p, size: %zu", ret.ptr, ret.size); + return ret; +} +#undef JEMALLOC_SMALLOCX_CONCAT_HELPER +#undef JEMALLOC_SMALLOCX_CONCAT_HELPER2 +#endif + +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN +void JEMALLOC_NOTHROW * +JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1) +je_mallocx(size_t size, int flags) { + void *ret; + static_opts_t sopts; + dynamic_opts_t dopts; + + LOG("core.mallocx.entry", "size: %zu, flags: %d", size, flags); + + static_opts_init(&sopts); + dynamic_opts_init(&dopts); + + sopts.assert_nonempty_alloc = true; + sopts.null_out_result_on_error = true; + sopts.oom_string = ": Error in mallocx(): out of memory\n"; + + dopts.result = &ret; + dopts.num_items = 1; + dopts.item_size = size; + if (unlikely(flags != 0)) { + dopts.alignment = MALLOCX_ALIGN_GET(flags); + dopts.zero = MALLOCX_ZERO_GET(flags); + dopts.tcache_ind = mallocx_tcache_get(flags); + dopts.arena_ind = mallocx_arena_get(flags); + } + + imalloc(&sopts, &dopts); + if (sopts.slow) { + uintptr_t args[3] = {size, flags}; + hook_invoke_alloc(hook_alloc_mallocx, ret, (uintptr_t)ret, + args); + } + + LOG("core.mallocx.exit", "result: %p", ret); + return ret; +} + +static void * +irallocx_prof_sample(tsdn_t *tsdn, void *old_ptr, size_t old_usize, + size_t usize, size_t alignment, bool zero, tcache_t *tcache, arena_t *arena, + prof_tctx_t *tctx, hook_ralloc_args_t *hook_args) { + 
void *p; + + if (tctx == NULL) { + return NULL; + } + + alignment = prof_sample_align(usize, alignment); + /* + * If the allocation is small enough that it would normally be allocated + * on a slab, we need to take additional steps to ensure that it gets + * its own extent instead. + */ + if (sz_can_use_slab(usize)) { + size_t bumped_usize = sz_sa2u(usize, alignment); + p = iralloct_explicit_slab(tsdn, old_ptr, old_usize, + bumped_usize, alignment, zero, /* slab */ false, + tcache, arena, hook_args); + if (p == NULL) { + return NULL; + } + arena_prof_promote(tsdn, p, usize, bumped_usize); + } else { + p = iralloct_explicit_slab(tsdn, old_ptr, old_usize, usize, + alignment, zero, /* slab */ false, tcache, arena, + hook_args); + } + assert(prof_sample_aligned(p)); + + return p; +} + +JEMALLOC_ALWAYS_INLINE void * +irallocx_prof(tsd_t *tsd, void *old_ptr, size_t old_usize, size_t size, + size_t alignment, size_t usize, bool zero, tcache_t *tcache, + arena_t *arena, emap_alloc_ctx_t *alloc_ctx, + hook_ralloc_args_t *hook_args) { + prof_info_t old_prof_info; + prof_info_get_and_reset_recent(tsd, old_ptr, alloc_ctx, &old_prof_info); + bool prof_active = prof_active_get_unlocked(); + bool sample_event = te_prof_sample_event_lookahead(tsd, usize); + prof_tctx_t *tctx = prof_alloc_prep(tsd, prof_active, sample_event); + void *p; + if (unlikely(tctx != PROF_TCTX_SENTINEL)) { + p = irallocx_prof_sample(tsd_tsdn(tsd), old_ptr, old_usize, + usize, alignment, zero, tcache, arena, tctx, hook_args); + } else { + p = iralloct(tsd_tsdn(tsd), old_ptr, old_usize, size, alignment, + usize, zero, tcache, arena, hook_args); + } + if (unlikely(p == NULL)) { + prof_alloc_rollback(tsd, tctx); + return NULL; + } + assert(usize == isalloc(tsd_tsdn(tsd), p)); + prof_realloc(tsd, p, size, usize, tctx, prof_active, old_ptr, + old_usize, &old_prof_info, sample_event); + + return p; +} + +static void * +do_rallocx(void *ptr, size_t size, int flags, bool is_realloc) { + void *p; + tsd_t *tsd; + size_t usize; + size_t old_usize; + size_t alignment = MALLOCX_ALIGN_GET(flags); + arena_t *arena; + + assert(ptr != NULL); + assert(size != 0); + assert(malloc_initialized() || IS_INITIALIZER); + tsd = tsd_fetch(); + check_entry_exit_locking(tsd_tsdn(tsd)); + + bool zero = zero_get(MALLOCX_ZERO_GET(flags), /* slow */ true); + + unsigned arena_ind = mallocx_arena_get(flags); + if (arena_get_from_ind(tsd, arena_ind, &arena)) { + goto label_oom; + } + + unsigned tcache_ind = mallocx_tcache_get(flags); + tcache_t *tcache = tcache_get_from_ind(tsd, tcache_ind, + /* slow */ true, /* is_alloc */ true); + + emap_alloc_ctx_t alloc_ctx; + emap_alloc_ctx_lookup(tsd_tsdn(tsd), &arena_emap_global, ptr, + &alloc_ctx); + assert(alloc_ctx.szind != SC_NSIZES); + old_usize = sz_index2size(alloc_ctx.szind); + assert(old_usize == isalloc(tsd_tsdn(tsd), ptr)); + if (aligned_usize_get(size, alignment, &usize, NULL, false)) { + goto label_oom; + } + + hook_ralloc_args_t hook_args = {is_realloc, {(uintptr_t)ptr, size, + flags, 0}}; + if (config_prof && opt_prof) { + p = irallocx_prof(tsd, ptr, old_usize, size, alignment, usize, + zero, tcache, arena, &alloc_ctx, &hook_args); + if (unlikely(p == NULL)) { + goto label_oom; + } + } else { + p = iralloct(tsd_tsdn(tsd), ptr, old_usize, size, alignment, + usize, zero, tcache, arena, &hook_args); + if (unlikely(p == NULL)) { + goto label_oom; + } + assert(usize == isalloc(tsd_tsdn(tsd), p)); + } + assert(alignment == 0 || ((uintptr_t)p & (alignment - 1)) == ZU(0)); + thread_alloc_event(tsd, usize); + 
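do_rallocx(), continued below, backs both rallocx() and realloc(). A typical rallocx() call site looks like this: alignment and zeroing flags are passed again on resize so the properties are preserved, and the original pointer stays valid when the call fails.

```c
#include <jemalloc/jemalloc.h>

int main(void) {
	int flags = MALLOCX_ALIGN(64) | MALLOCX_ZERO;
	void *p = mallocx(256, flags);
	if (p == NULL) {
		return 1;
	}
	/* newly extended bytes are zeroed because MALLOCX_ZERO is passed */
	void *q = rallocx(p, 1024, flags);
	if (q == NULL) {	/* p is still valid on failure */
		dallocx(p, flags);
		return 1;
	}
	dallocx(q, flags);
	return 0;
}
```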
thread_dalloc_event(tsd, old_usize); + + UTRACE(ptr, size, p); + check_entry_exit_locking(tsd_tsdn(tsd)); + + if (config_fill && unlikely(opt_junk_alloc) && usize > old_usize + && !zero) { + size_t excess_len = usize - old_usize; + void *excess_start = (void *)((byte_t *)p + old_usize); + junk_alloc_callback(excess_start, excess_len); + } + + return p; +label_oom: + if (is_realloc) { + set_errno(ENOMEM); + } + if (config_xmalloc && unlikely(opt_xmalloc)) { + malloc_write(": Error in rallocx(): out of memory\n"); + abort(); + } + UTRACE(ptr, size, 0); + check_entry_exit_locking(tsd_tsdn(tsd)); + + return NULL; +} + +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN +void JEMALLOC_NOTHROW * +JEMALLOC_ALLOC_SIZE(2) +je_rallocx(void *ptr, size_t size, int flags) { + LOG("core.rallocx.entry", "ptr: %p, size: %zu, flags: %d", ptr, + size, flags); + void *ret = do_rallocx(ptr, size, flags, false); + LOG("core.rallocx.exit", "result: %p", ret); + return ret; +} + +static void * +do_realloc_nonnull_zero(void *ptr) { + if (config_stats) { + atomic_fetch_add_zu(&zero_realloc_count, 1, ATOMIC_RELAXED); + } + if (opt_zero_realloc_action == zero_realloc_action_alloc) { + /* + * The user might have gotten an alloc setting while expecting a + * free setting. If that's the case, we at least try to + * reduce the harm, and turn off the tcache while allocating, so + * that we'll get a true first fit. + */ + return do_rallocx(ptr, 1, MALLOCX_TCACHE_NONE, true); + } else if (opt_zero_realloc_action == zero_realloc_action_free) { + UTRACE(ptr, 0, 0); + tsd_t *tsd = tsd_fetch(); + check_entry_exit_locking(tsd_tsdn(tsd)); + + tcache_t *tcache = tcache_get_from_ind(tsd, + TCACHE_IND_AUTOMATIC, /* slow */ true, + /* is_alloc */ false); + uintptr_t args[3] = {(uintptr_t)ptr, 0}; + hook_invoke_dalloc(hook_dalloc_realloc, ptr, args); + ifree(tsd, ptr, tcache, true); + + check_entry_exit_locking(tsd_tsdn(tsd)); + return NULL; + } else { + safety_check_fail("Called realloc(non-null-ptr, 0) with " + "zero_realloc:abort set\n"); + /* In real code, this will never run; the safety check failure + * will call abort. In the unit test, we just want to bail out + * without corrupting internal state that the test needs to + * finish. + */ + return NULL; + } +} + +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN +void JEMALLOC_NOTHROW * +JEMALLOC_ALLOC_SIZE(2) +je_realloc(void *ptr, size_t size) { + LOG("core.realloc.entry", "ptr: %p, size: %zu\n", ptr, size); + + if (likely(ptr != NULL && size != 0)) { + void *ret = do_rallocx(ptr, size, 0, true); + LOG("core.realloc.exit", "result: %p", ret); + return ret; + } else if (ptr != NULL && size == 0) { + void *ret = do_realloc_nonnull_zero(ptr); + LOG("core.realloc.exit", "result: %p", ret); + return ret; + } else { + /* realloc(NULL, size) is equivalent to malloc(size). 
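do_realloc_nonnull_zero() above makes realloc(ptr, 0) behavior a configuration choice. What the program below observes therefore depends entirely on the zero_realloc setting, which is exactly why portable code should not rely on any one behavior:

```c
/*
 * Under "alloc", realloc(p, 0) returns a minimal allocation; under "free"
 * it frees p and returns NULL; under "abort" it terminates the process.
 */
#include <stdio.h>
#include <stdlib.h>

const char *malloc_conf = "zero_realloc:alloc";	/* pick a behavior */

int main(void) {
	void *p = malloc(16);
	void *q = realloc(p, 0);
	printf("realloc(p, 0) = %p\n", q);
	free(q);	/* free(NULL) is a no-op, so this is safe either way */
	return 0;
}
```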
*/ + void *ret; + + static_opts_t sopts; + dynamic_opts_t dopts; + + static_opts_init(&sopts); + dynamic_opts_init(&dopts); + + sopts.null_out_result_on_error = true; + sopts.set_errno_on_error = true; + sopts.oom_string = + ": Error in realloc(): out of memory\n"; + + dopts.result = &ret; + dopts.num_items = 1; + dopts.item_size = size; + + imalloc(&sopts, &dopts); + if (sopts.slow) { + uintptr_t args[3] = {(uintptr_t)ptr, size}; + hook_invoke_alloc(hook_alloc_realloc, ret, + (uintptr_t)ret, args); + } + LOG("core.realloc.exit", "result: %p", ret); + return ret; + } +} + +JEMALLOC_ALWAYS_INLINE size_t +ixallocx_helper(tsdn_t *tsdn, void *ptr, size_t old_usize, size_t size, + size_t extra, size_t alignment, bool zero) { + size_t newsize; + + if (ixalloc(tsdn, ptr, old_usize, size, extra, alignment, zero, + &newsize)) { + return old_usize; + } + + return newsize; +} + +static size_t +ixallocx_prof_sample(tsdn_t *tsdn, void *ptr, size_t old_usize, size_t size, + size_t extra, size_t alignment, bool zero, prof_tctx_t *tctx) { + /* Sampled allocation needs to be page aligned. */ + if (tctx == NULL || !prof_sample_aligned(ptr)) { + return old_usize; + } + + return ixallocx_helper(tsdn, ptr, old_usize, size, extra, alignment, + zero); +} + +JEMALLOC_ALWAYS_INLINE size_t +ixallocx_prof(tsd_t *tsd, void *ptr, size_t old_usize, size_t size, + size_t extra, size_t alignment, bool zero, emap_alloc_ctx_t *alloc_ctx) { + /* + * old_prof_info is only used for asserting that the profiling info + * isn't changed by the ixalloc() call. + */ + prof_info_t old_prof_info; + prof_info_get(tsd, ptr, alloc_ctx, &old_prof_info); + + /* + * usize isn't knowable before ixalloc() returns when extra is non-zero. + * Therefore, compute its maximum possible value and use that in + * prof_alloc_prep() to decide whether to capture a backtrace. + * prof_realloc() will use the actual usize to decide whether to sample. + */ + size_t usize_max; + if (aligned_usize_get(size + extra, alignment, &usize_max, NULL, + false)) { + /* + * usize_max is out of range, and chances are that allocation + * will fail, but use the maximum possible value and carry on + * with prof_alloc_prep(), just in case allocation succeeds. + */ + usize_max = SC_LARGE_MAXCLASS; + } + bool prof_active = prof_active_get_unlocked(); + bool sample_event = te_prof_sample_event_lookahead(tsd, usize_max); + prof_tctx_t *tctx = prof_alloc_prep(tsd, prof_active, sample_event); + + size_t usize; + if (unlikely(tctx != PROF_TCTX_SENTINEL)) { + usize = ixallocx_prof_sample(tsd_tsdn(tsd), ptr, old_usize, + size, extra, alignment, zero, tctx); + } else { + usize = ixallocx_helper(tsd_tsdn(tsd), ptr, old_usize, size, + extra, alignment, zero); + } + + /* + * At this point we can still safely get the original profiling + * information associated with the ptr, because (a) the edata_t object + * associated with the ptr still lives and (b) the profiling info + * fields are not touched. "(a)" is asserted in the outer je_xallocx() + * function, and "(b)" is indirectly verified below by checking that + * the alloc_tctx field is unchanged. 
+ */ + prof_info_t prof_info; + if (usize == old_usize) { + prof_info_get(tsd, ptr, alloc_ctx, &prof_info); + prof_alloc_rollback(tsd, tctx); + } else { + prof_info_get_and_reset_recent(tsd, ptr, alloc_ctx, &prof_info); + assert(usize <= usize_max); + sample_event = te_prof_sample_event_lookahead(tsd, usize); + prof_realloc(tsd, ptr, size, usize, tctx, prof_active, ptr, + old_usize, &prof_info, sample_event); + } + + assert(old_prof_info.alloc_tctx == prof_info.alloc_tctx); + return usize; +} + +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW +je_xallocx(void *ptr, size_t size, size_t extra, int flags) { + tsd_t *tsd; + size_t usize, old_usize; + size_t alignment = MALLOCX_ALIGN_GET(flags); + bool zero = zero_get(MALLOCX_ZERO_GET(flags), /* slow */ true); + + LOG("core.xallocx.entry", "ptr: %p, size: %zu, extra: %zu, " + "flags: %d", ptr, size, extra, flags); + + assert(ptr != NULL); + assert(size != 0); + assert(SIZE_T_MAX - size >= extra); + assert(malloc_initialized() || IS_INITIALIZER); + tsd = tsd_fetch(); + check_entry_exit_locking(tsd_tsdn(tsd)); + + /* + * old_edata is only for verifying that xallocx() keeps the edata_t + * object associated with the ptr (though the content of the edata_t + * object can be changed). + */ + edata_t *old_edata = emap_edata_lookup(tsd_tsdn(tsd), + &arena_emap_global, ptr); + + emap_alloc_ctx_t alloc_ctx; + emap_alloc_ctx_lookup(tsd_tsdn(tsd), &arena_emap_global, ptr, + &alloc_ctx); + assert(alloc_ctx.szind != SC_NSIZES); + old_usize = sz_index2size(alloc_ctx.szind); + assert(old_usize == isalloc(tsd_tsdn(tsd), ptr)); + /* + * The API explicitly absolves itself of protecting against (size + + * extra) numerical overflow, but we may need to clamp extra to avoid + * exceeding SC_LARGE_MAXCLASS. + * + * Ordinarily, size limit checking is handled deeper down, but here we + * have to check as part of (size + extra) clamping, since we need the + * clamped value in the above helper functions. + */ + if (unlikely(size > SC_LARGE_MAXCLASS)) { + usize = old_usize; + goto label_not_resized; + } + if (unlikely(SC_LARGE_MAXCLASS - size < extra)) { + extra = SC_LARGE_MAXCLASS - size; + } + + if (config_prof && opt_prof) { + usize = ixallocx_prof(tsd, ptr, old_usize, size, extra, + alignment, zero, &alloc_ctx); + } else { + usize = ixallocx_helper(tsd_tsdn(tsd), ptr, old_usize, size, + extra, alignment, zero); + } + + /* + * xallocx() should keep using the same edata_t object (though its + * content can be changed). 
+ */ + assert(emap_edata_lookup(tsd_tsdn(tsd), &arena_emap_global, ptr) + == old_edata); + + if (unlikely(usize == old_usize)) { + goto label_not_resized; + } + thread_alloc_event(tsd, usize); + thread_dalloc_event(tsd, old_usize); + + if (config_fill && unlikely(opt_junk_alloc) && usize > old_usize && + !zero) { + size_t excess_len = usize - old_usize; + void *excess_start = (void *)((byte_t *)ptr + old_usize); + junk_alloc_callback(excess_start, excess_len); + } +label_not_resized: + if (unlikely(!tsd_fast(tsd))) { + uintptr_t args[4] = {(uintptr_t)ptr, size, extra, flags}; + hook_invoke_expand(hook_expand_xallocx, ptr, old_usize, + usize, (uintptr_t)usize, args); + } + + UTRACE(ptr, size, ptr); + check_entry_exit_locking(tsd_tsdn(tsd)); + + LOG("core.xallocx.exit", "result: %zu", usize); + return usize; +} + +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW +JEMALLOC_ATTR(pure) +je_sallocx(const void *ptr, int flags) { + size_t usize; + tsdn_t *tsdn; + + LOG("core.sallocx.entry", "ptr: %p, flags: %d", ptr, flags); + + assert(malloc_initialized() || IS_INITIALIZER); + assert(ptr != NULL); + + tsdn = tsdn_fetch(); + check_entry_exit_locking(tsdn); + + if (config_debug || force_ivsalloc) { + usize = ivsalloc(tsdn, ptr); + assert(force_ivsalloc || usize != 0); + } else { + usize = isalloc(tsdn, ptr); + } + + check_entry_exit_locking(tsdn); + + LOG("core.sallocx.exit", "result: %zu", usize); + return usize; +} + +JEMALLOC_EXPORT void JEMALLOC_NOTHROW +je_dallocx(void *ptr, int flags) { + LOG("core.dallocx.entry", "ptr: %p, flags: %d", ptr, flags); + + assert(ptr != NULL); + assert(malloc_initialized() || IS_INITIALIZER); + + tsd_t *tsd = tsd_fetch_min(); + bool fast = tsd_fast(tsd); + check_entry_exit_locking(tsd_tsdn(tsd)); + + unsigned tcache_ind = mallocx_tcache_get(flags); + tcache_t *tcache = tcache_get_from_ind(tsd, tcache_ind, !fast, + /* is_alloc */ false); + + UTRACE(ptr, 0, 0); + if (likely(fast)) { + tsd_assert_fast(tsd); + ifree(tsd, ptr, tcache, false); + } else { + uintptr_t args_raw[3] = {(uintptr_t)ptr, flags}; + hook_invoke_dalloc(hook_dalloc_dallocx, ptr, args_raw); + ifree(tsd, ptr, tcache, true); + } + check_entry_exit_locking(tsd_tsdn(tsd)); + + LOG("core.dallocx.exit", ""); +} + +JEMALLOC_ALWAYS_INLINE size_t +inallocx(tsdn_t *tsdn, size_t size, int flags) { + check_entry_exit_locking(tsdn); + size_t usize; + /* In case of out of range, let the user see it rather than fail. 
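Because je_xallocx() never moves the allocation and simply reports the resulting usable size, the canonical caller pattern is "try to grow in place, fall back to a moving rallocx() only if that fails", with sallocx() available to confirm the outcome:

```c
#include <assert.h>
#include <jemalloc/jemalloc.h>

int main(void) {
	void *p = mallocx(4096, 0);
	if (p == NULL) {
		return 1;
	}
	size_t want = 2 * 4096;
	/* xallocx never moves p; it returns the resulting usable size. */
	if (xallocx(p, want, 0, 0) < want) {
		void *q = rallocx(p, want, 0);	/* may move */
		if (q == NULL) {
			dallocx(p, 0);
			return 1;
		}
		p = q;
	}
	assert(sallocx(p, 0) >= want);
	dallocx(p, 0);
	return 0;
}
```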
*/ + aligned_usize_get(size, MALLOCX_ALIGN_GET(flags), &usize, NULL, false); + check_entry_exit_locking(tsdn); + return usize; +} + +JEMALLOC_NOINLINE void +sdallocx_default(void *ptr, size_t size, int flags) { + assert(ptr != NULL); + assert(malloc_initialized() || IS_INITIALIZER); + + tsd_t *tsd = tsd_fetch_min(); + bool fast = tsd_fast(tsd); + size_t usize = inallocx(tsd_tsdn(tsd), size, flags); + check_entry_exit_locking(tsd_tsdn(tsd)); + + unsigned tcache_ind = mallocx_tcache_get(flags); + tcache_t *tcache = tcache_get_from_ind(tsd, tcache_ind, !fast, + /* is_alloc */ false); + + UTRACE(ptr, 0, 0); + if (likely(fast)) { + tsd_assert_fast(tsd); + isfree(tsd, ptr, usize, tcache, false); + } else { + uintptr_t args_raw[3] = {(uintptr_t)ptr, size, flags}; + hook_invoke_dalloc(hook_dalloc_sdallocx, ptr, args_raw); + isfree(tsd, ptr, usize, tcache, true); + } + check_entry_exit_locking(tsd_tsdn(tsd)); +} + +JEMALLOC_EXPORT void JEMALLOC_NOTHROW +je_sdallocx(void *ptr, size_t size, int flags) { + LOG("core.sdallocx.entry", "ptr: %p, size: %zu, flags: %d", ptr, + size, flags); + + je_sdallocx_impl(ptr, size, flags); + + LOG("core.sdallocx.exit", ""); +} + +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW +JEMALLOC_ATTR(pure) +je_nallocx(size_t size, int flags) { + size_t usize; + tsdn_t *tsdn; + + assert(size != 0); + + if (unlikely(malloc_init())) { + LOG("core.nallocx.exit", "result: %zu", ZU(0)); + return 0; + } + + tsdn = tsdn_fetch(); + check_entry_exit_locking(tsdn); + + usize = inallocx(tsdn, size, flags); + if (unlikely(usize > SC_LARGE_MAXCLASS)) { + LOG("core.nallocx.exit", "result: %zu", ZU(0)); + return 0; + } + + check_entry_exit_locking(tsdn); + LOG("core.nallocx.exit", "result: %zu", usize); + return usize; +} + +JEMALLOC_EXPORT int JEMALLOC_NOTHROW +je_mallctl(const char *name, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { + int ret; + tsd_t *tsd; + + LOG("core.mallctl.entry", "name: %s", name); + + if (unlikely(malloc_init())) { + LOG("core.mallctl.exit", "result: %d", EAGAIN); + return EAGAIN; + } + + tsd = tsd_fetch(); + check_entry_exit_locking(tsd_tsdn(tsd)); + ret = ctl_byname(tsd, name, oldp, oldlenp, newp, newlen); + check_entry_exit_locking(tsd_tsdn(tsd)); + + LOG("core.mallctl.exit", "result: %d", ret); + return ret; +} + +JEMALLOC_EXPORT int JEMALLOC_NOTHROW +je_mallctlnametomib(const char *name, size_t *mibp, size_t *miblenp) { + int ret; + + LOG("core.mallctlnametomib.entry", "name: %s", name); + + if (unlikely(malloc_init())) { + LOG("core.mallctlnametomib.exit", "result: %d", EAGAIN); + return EAGAIN; + } + + tsd_t *tsd = tsd_fetch(); + check_entry_exit_locking(tsd_tsdn(tsd)); + ret = ctl_nametomib(tsd, name, mibp, miblenp); + check_entry_exit_locking(tsd_tsdn(tsd)); + + LOG("core.mallctlnametomib.exit", "result: %d", ret); + return ret; +} + +JEMALLOC_EXPORT int JEMALLOC_NOTHROW +je_mallctlbymib(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, + void *newp, size_t newlen) { + int ret; + tsd_t *tsd; + + LOG("core.mallctlbymib.entry", ""); + + if (unlikely(malloc_init())) { + LOG("core.mallctlbymib.exit", "result: %d", EAGAIN); + return EAGAIN; + } + + tsd = tsd_fetch(); + check_entry_exit_locking(tsd_tsdn(tsd)); + ret = ctl_bymib(tsd, mib, miblen, oldp, oldlenp, newp, newlen); + check_entry_exit_locking(tsd_tsdn(tsd)); + LOG("core.mallctlbymib.exit", "result: %d", ret); + return ret; +} + +#define STATS_PRINT_BUFSIZE 65536 +JEMALLOC_EXPORT void JEMALLOC_NOTHROW +je_malloc_stats_print(void (*write_cb)(void *, const char *), void *cbopaque, + 
const char *opts) { + tsdn_t *tsdn; + + LOG("core.malloc_stats_print.entry", ""); + + tsdn = tsdn_fetch(); + check_entry_exit_locking(tsdn); + + if (config_debug) { + stats_print(write_cb, cbopaque, opts); + } else { + buf_writer_t buf_writer; + buf_writer_init(tsdn, &buf_writer, write_cb, cbopaque, NULL, + STATS_PRINT_BUFSIZE); + stats_print(buf_writer_cb, &buf_writer, opts); + buf_writer_terminate(tsdn, &buf_writer); + } + + check_entry_exit_locking(tsdn); + LOG("core.malloc_stats_print.exit", ""); +} +#undef STATS_PRINT_BUFSIZE + +JEMALLOC_ALWAYS_INLINE size_t +je_malloc_usable_size_impl(JEMALLOC_USABLE_SIZE_CONST void *ptr) { + assert(malloc_initialized() || IS_INITIALIZER); + + tsdn_t *tsdn = tsdn_fetch(); + check_entry_exit_locking(tsdn); + + size_t ret; + if (unlikely(ptr == NULL)) { + ret = 0; + } else { + if (config_debug || force_ivsalloc) { + ret = ivsalloc(tsdn, ptr); + assert(force_ivsalloc || ret != 0); + } else { + ret = isalloc(tsdn, ptr); + } + } + check_entry_exit_locking(tsdn); + + return ret; +} + +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW +je_malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void *ptr) { + LOG("core.malloc_usable_size.entry", "ptr: %p", ptr); + + size_t ret = je_malloc_usable_size_impl(ptr); + + LOG("core.malloc_usable_size.exit", "result: %zu", ret); + return ret; +} + +#ifdef JEMALLOC_HAVE_MALLOC_SIZE +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW +je_malloc_size(const void *ptr) { + LOG("core.malloc_size.entry", "ptr: %p", ptr); + + size_t ret = je_malloc_usable_size_impl(ptr); + + LOG("core.malloc_size.exit", "result: %zu", ret); + return ret; +} +#endif + +static void +batch_alloc_prof_sample_assert(tsd_t *tsd, size_t batch, size_t usize) { + assert(config_prof && opt_prof); + bool prof_sample_event = te_prof_sample_event_lookahead(tsd, + batch * usize); + assert(!prof_sample_event); + size_t surplus; + prof_sample_event = te_prof_sample_event_lookahead_surplus(tsd, + (batch + 1) * usize, &surplus); + assert(prof_sample_event); + assert(surplus < usize); +} + +size_t +batch_alloc(void **ptrs, size_t num, size_t size, int flags) { + LOG("core.batch_alloc.entry", + "ptrs: %p, num: %zu, size: %zu, flags: %d", ptrs, num, size, flags); + + tsd_t *tsd = tsd_fetch(); + check_entry_exit_locking(tsd_tsdn(tsd)); + + size_t filled = 0; + + if (unlikely(tsd == NULL || tsd_reentrancy_level_get(tsd) > 0)) { + goto label_done; + } + + size_t alignment = MALLOCX_ALIGN_GET(flags); + size_t usize; + if (aligned_usize_get(size, alignment, &usize, NULL, false)) { + goto label_done; + } + szind_t ind = sz_size2index(usize); + bool zero = zero_get(MALLOCX_ZERO_GET(flags), /* slow */ true); + + /* + * The cache bin and arena will be lazily initialized; it's hard to + * know in advance whether each of them needs to be initialized. + */ + cache_bin_t *bin = NULL; + arena_t *arena = NULL; + + size_t nregs = 0; + if (likely(ind < SC_NBINS)) { + nregs = bin_infos[ind].nregs; + assert(nregs > 0); + } + + while (filled < num) { + size_t batch = num - filled; + size_t surplus = SIZE_MAX; /* Dead store. */ + bool prof_sample_event = config_prof && opt_prof + && prof_active_get_unlocked() + && te_prof_sample_event_lookahead_surplus(tsd, + batch * usize, &surplus); + + if (prof_sample_event) { + /* + * Adjust so that the batch does not trigger prof + * sampling. 
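+ * Editor's worked example (assumed lookahead semantics, not upstream + * text): with usize = 16, batch = 64 and surplus = 40, the 1024-byte + * batch overshoots the sample threshold (984 bytes away) by 40, so + * batch -= 40 / 16 + 1 leaves 61 allocations (976 bytes); the batch no + * longer triggers a sample, while one more allocation (992 bytes) + * would, which is exactly what batch_alloc_prof_sample_assert() checks + * below.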
+ */ + batch -= surplus / usize + 1; + batch_alloc_prof_sample_assert(tsd, batch, usize); + } + + size_t progress = 0; + + if (likely(ind < SC_NBINS) && batch >= nregs) { + if (arena == NULL) { + unsigned arena_ind = mallocx_arena_get(flags); + if (arena_get_from_ind(tsd, arena_ind, + &arena)) { + goto label_done; + } + if (arena == NULL) { + arena = arena_choose(tsd, NULL); + } + if (unlikely(arena == NULL)) { + goto label_done; + } + } + size_t arena_batch = batch - batch % nregs; + size_t n = arena_fill_small_fresh(tsd_tsdn(tsd), arena, + ind, ptrs + filled, arena_batch, zero); + progress += n; + filled += n; + } + + unsigned tcache_ind = mallocx_tcache_get(flags); + tcache_t *tcache = tcache_get_from_ind(tsd, tcache_ind, + /* slow */ true, /* is_alloc */ true); + if (likely(tcache != NULL && + ind < tcache_nbins_get(tcache->tcache_slow) && + !tcache_bin_disabled(ind, &tcache->bins[ind], + tcache->tcache_slow)) && progress < batch) { + if (bin == NULL) { + bin = &tcache->bins[ind]; + } + /* + * If we don't have a tcache bin, we don't want to + * immediately give up, because there's the possibility + * that the user explicitly requested to bypass the + * tcache, or that the user explicitly turned off the + * tcache; in such cases, we go through the slow path, + * i.e. the mallocx() call at the end of the while loop. + */ + if (bin != NULL) { + size_t bin_batch = batch - progress; + /* + * n can be less than bin_batch, meaning that + * the cache bin does not have enough memory. + * In such cases, we rely on the slow path, + * i.e. the mallocx() call at the end of the + * while loop, to fill in the cache, and in the + * next iteration of the while loop, the tcache + * will contain a lot of memory, and we can + * harvest them here. Compared to the + * alternative approach where we directly go to + * the arena bins here, the overhead of our + * current approach should usually be minimal, + * since we never try to fetch more memory than + * what a slab contains via the tcache. An + * additional benefit is that the tcache will + * not be empty for the next allocation request. + */ + size_t n = cache_bin_alloc_batch(bin, bin_batch, + ptrs + filled); + if (config_stats) { + bin->tstats.nrequests += n; + } + if (zero) { + for (size_t i = 0; i < n; ++i) { + memset(ptrs[filled + i], 0, + usize); + } + } + if (config_prof && opt_prof + && unlikely(ind >= SC_NBINS)) { + for (size_t i = 0; i < n; ++i) { + prof_tctx_reset_sampled(tsd, + ptrs[filled + i]); + } + } + progress += n; + filled += n; + } + } + + /* + * For thread events other than prof sampling, trigger them as + * if there's a single allocation of size (n * usize). This is + * fine because: + * (a) these events do not alter the allocation itself, and + * (b) it's possible that some event would have been triggered + * multiple times, instead of only once, if the allocations + * were handled individually, but it would do no harm (or + * even be beneficial) to coalesce the triggerings. + */ + thread_alloc_event(tsd, progress * usize); + + if (progress < batch || prof_sample_event) { + void *p = je_mallocx(size, flags); + if (p == NULL) { /* OOM */ + break; + } + if (progress == batch) { + assert(prof_sampled(tsd, p)); + } + ptrs[filled++] = p; + } + } + +label_done: + check_entry_exit_locking(tsd_tsdn(tsd)); + LOG("core.batch_alloc.exit", "result: %zu", filled); + return filled; +} + +/* + * End non-standard functions. 
+ */ +/******************************************************************************/ +/* + * The following functions are used by threading libraries for protection of + * malloc during fork(). + */ + +/* + * If an application creates a thread before doing any allocation in the main + * thread, then calls fork(2) in the main thread followed by memory allocation + * in the child process, a race can occur that results in deadlock within the + * child: the main thread may have forked while the created thread had + * partially initialized the allocator. Ordinarily jemalloc prevents + * fork/malloc races via the following functions it registers during + * initialization using pthread_atfork(), but of course that does no good if + * the allocator isn't fully initialized at fork time. The following library + * constructor is a partial solution to this problem. It may still be possible + * to trigger the deadlock described above, but doing so would involve forking + * via a library constructor that runs before jemalloc's runs. + */ +#ifndef JEMALLOC_JET +JEMALLOC_ATTR(constructor) +static void +jemalloc_constructor(void) { + unsigned long long cpu_count = malloc_ncpus(); + if (cpu_count == 0) { + cpu_count = duckdb_malloc_ncpus(); + } + unsigned long long bgt_count = cpu_count / 16; + if (bgt_count == 0) { + bgt_count = 1; + } + // decay is in ms + unsigned long long decay = DUCKDB_JEMALLOC_DECAY * 1000; +#ifdef DEBUG + snprintf(JE_MALLOC_CONF_BUFFER, JE_MALLOC_CONF_BUFFER_SIZE, "junk:true,oversize_threshold:268435456,dirty_decay_ms:%llu,muzzy_decay_ms:%llu,narenas:%llu,max_background_threads:%llu", decay, decay, cpu_count / 2, bgt_count); +#else + snprintf(JE_MALLOC_CONF_BUFFER, JE_MALLOC_CONF_BUFFER_SIZE, "oversize_threshold:268435456,dirty_decay_ms:%llu,muzzy_decay_ms:%llu,narenas:%llu,max_background_threads:%llu", decay, decay, cpu_count / 2, bgt_count); +#endif + je_malloc_conf = JE_MALLOC_CONF_BUFFER; + malloc_init(); +} +#endif + +#ifndef JEMALLOC_MUTEX_INIT_CB +void +jemalloc_prefork(void) +#else +JEMALLOC_EXPORT void +_malloc_prefork(void) +#endif +{ + tsd_t *tsd; + unsigned i, j, narenas; + arena_t *arena; + +#ifdef JEMALLOC_MUTEX_INIT_CB + if (!malloc_initialized()) { + return; + } +#endif + assert(malloc_initialized()); + + tsd = tsd_fetch(); + + narenas = narenas_total_get(); + + witness_prefork(tsd_witness_tsdp_get(tsd)); + /* Acquire all mutexes in a safe order. */ + ctl_prefork(tsd_tsdn(tsd)); + tcache_prefork(tsd_tsdn(tsd)); + malloc_mutex_prefork(tsd_tsdn(tsd), &arenas_lock); + if (have_background_thread) { + background_thread_prefork0(tsd_tsdn(tsd)); + } + prof_prefork0(tsd_tsdn(tsd)); + if (have_background_thread) { + background_thread_prefork1(tsd_tsdn(tsd)); + } + /* Break arena prefork into stages to preserve lock order. 
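+ * Editor's note: every arena's stage-i mutexes are acquired before any + * arena's stage-(i+1) mutexes, keeping the fork-time acquisition order + * consistent with the runtime witness ranks, so the child cannot + * inherit a lock-order reversal.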
*/ + for (i = 0; i < 9; i++) { + for (j = 0; j < narenas; j++) { + if ((arena = arena_get(tsd_tsdn(tsd), j, false)) != + NULL) { + switch (i) { + case 0: + arena_prefork0(tsd_tsdn(tsd), arena); + break; + case 1: + arena_prefork1(tsd_tsdn(tsd), arena); + break; + case 2: + arena_prefork2(tsd_tsdn(tsd), arena); + break; + case 3: + arena_prefork3(tsd_tsdn(tsd), arena); + break; + case 4: + arena_prefork4(tsd_tsdn(tsd), arena); + break; + case 5: + arena_prefork5(tsd_tsdn(tsd), arena); + break; + case 6: + arena_prefork6(tsd_tsdn(tsd), arena); + break; + case 7: + arena_prefork7(tsd_tsdn(tsd), arena); + break; + case 8: + arena_prefork8(tsd_tsdn(tsd), arena); + break; + default: not_reached(); + } + } + } + + } + prof_prefork1(tsd_tsdn(tsd)); + stats_prefork(tsd_tsdn(tsd)); + tsd_prefork(tsd); +} + +#ifndef JEMALLOC_MUTEX_INIT_CB +void +jemalloc_postfork_parent(void) +#else +JEMALLOC_EXPORT void +_malloc_postfork(void) +#endif +{ + tsd_t *tsd; + unsigned i, narenas; + +#ifdef JEMALLOC_MUTEX_INIT_CB + if (!malloc_initialized()) { + return; + } +#endif + assert(malloc_initialized()); + + tsd = tsd_fetch(); + + tsd_postfork_parent(tsd); + + witness_postfork_parent(tsd_witness_tsdp_get(tsd)); + /* Release all mutexes, now that fork() has completed. */ + stats_postfork_parent(tsd_tsdn(tsd)); + for (i = 0, narenas = narenas_total_get(); i < narenas; i++) { + arena_t *arena; + + if ((arena = arena_get(tsd_tsdn(tsd), i, false)) != NULL) { + arena_postfork_parent(tsd_tsdn(tsd), arena); + } + } + prof_postfork_parent(tsd_tsdn(tsd)); + if (have_background_thread) { + background_thread_postfork_parent(tsd_tsdn(tsd)); + } + malloc_mutex_postfork_parent(tsd_tsdn(tsd), &arenas_lock); + tcache_postfork_parent(tsd_tsdn(tsd)); + ctl_postfork_parent(tsd_tsdn(tsd)); +} + +void +jemalloc_postfork_child(void) { + tsd_t *tsd; + unsigned i, narenas; + + assert(malloc_initialized()); + + tsd = tsd_fetch(); + + tsd_postfork_child(tsd); + + witness_postfork_child(tsd_witness_tsdp_get(tsd)); + /* Release all mutexes, now that fork() has completed. 
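+ * Editor's note: as in the parent path above, release proceeds in + * roughly the reverse of the prefork acquisition order (stats and + * arenas first, ctl last); on builds without JEMALLOC_MUTEX_INIT_CB the + * child re-initializes each mutex rather than unlocking it.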
*/ + stats_postfork_child(tsd_tsdn(tsd)); + for (i = 0, narenas = narenas_total_get(); i < narenas; i++) { + arena_t *arena; + + if ((arena = arena_get(tsd_tsdn(tsd), i, false)) != NULL) { + arena_postfork_child(tsd_tsdn(tsd), arena); + } + } + prof_postfork_child(tsd_tsdn(tsd)); + if (have_background_thread) { + background_thread_postfork_child(tsd_tsdn(tsd)); + } + malloc_mutex_postfork_child(tsd_tsdn(tsd), &arenas_lock); + tcache_postfork_child(tsd_tsdn(tsd)); + ctl_postfork_child(tsd_tsdn(tsd)); +} + +/******************************************************************************/ diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/large.c b/src/duckdb/extension/jemalloc/jemalloc/src/large.c new file mode 100644 index 000000000..d78085f03 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/large.c @@ -0,0 +1,322 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/emap.h" +#include "jemalloc/internal/extent_mmap.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/prof_recent.h" +#include "jemalloc/internal/util.h" + +/******************************************************************************/ + +void * +large_malloc(tsdn_t *tsdn, arena_t *arena, size_t usize, bool zero) { + assert(usize == sz_s2u(usize)); + + return large_palloc(tsdn, arena, usize, CACHELINE, zero); +} + +void * +large_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment, + bool zero) { + size_t ausize; + edata_t *edata; + UNUSED bool idump JEMALLOC_CC_SILENCE_INIT(false); + + assert(!tsdn_null(tsdn) || arena != NULL); + + ausize = sz_sa2u(usize, alignment); + if (unlikely(ausize == 0 || ausize > SC_LARGE_MAXCLASS)) { + return NULL; + } + + if (likely(!tsdn_null(tsdn))) { + arena = arena_choose_maybe_huge(tsdn_tsd(tsdn), arena, usize); + } + if (unlikely(arena == NULL) || (edata = arena_extent_alloc_large(tsdn, + arena, usize, alignment, zero)) == NULL) { + return NULL; + } + + /* See comments in arena_bin_slabs_full_insert(). */ + if (!arena_is_auto(arena)) { + /* Insert edata into large. 
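+ * Editor's note: the list is maintained only for manual arenas, where + * reset/destroy must be able to visit every live large allocation; + * automatic arenas skip both the list and the large_mtx round trip.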
*/ + malloc_mutex_lock(tsdn, &arena->large_mtx); + edata_list_active_append(&arena->large, edata); + malloc_mutex_unlock(tsdn, &arena->large_mtx); + } + + arena_decay_tick(tsdn, arena); + return edata_addr_get(edata); +} + +static bool +large_ralloc_no_move_shrink(tsdn_t *tsdn, edata_t *edata, size_t usize) { + arena_t *arena = arena_get_from_edata(edata); + ehooks_t *ehooks = arena_get_ehooks(arena); + size_t old_size = edata_size_get(edata); + size_t old_usize = edata_usize_get(edata); + + assert(old_usize > usize); + + if (ehooks_split_will_fail(ehooks)) { + return true; + } + + bool deferred_work_generated = false; + bool err = pa_shrink(tsdn, &arena->pa_shard, edata, old_size, + usize + sz_large_pad, sz_size2index(usize), + &deferred_work_generated); + if (err) { + return true; + } + if (deferred_work_generated) { + arena_handle_deferred_work(tsdn, arena); + } + arena_extent_ralloc_large_shrink(tsdn, arena, edata, old_usize); + + return false; +} + +static bool +large_ralloc_no_move_expand(tsdn_t *tsdn, edata_t *edata, size_t usize, + bool zero) { + arena_t *arena = arena_get_from_edata(edata); + + size_t old_size = edata_size_get(edata); + size_t old_usize = edata_usize_get(edata); + size_t new_size = usize + sz_large_pad; + + szind_t szind = sz_size2index(usize); + + bool deferred_work_generated = false; + bool err = pa_expand(tsdn, &arena->pa_shard, edata, old_size, new_size, + szind, zero, &deferred_work_generated); + + if (deferred_work_generated) { + arena_handle_deferred_work(tsdn, arena); + } + + if (err) { + return true; + } + + if (zero) { + if (opt_cache_oblivious) { + assert(sz_large_pad == PAGE); + /* + * Zero the trailing bytes of the original allocation's + * last page, since they are in an indeterminate state. + * There will always be trailing bytes, because ptr's + * offset from the beginning of the extent is a multiple + * of CACHELINE in [0 .. PAGE). + */ + void *zbase = (void *) + ((byte_t *)edata_addr_get(edata) + old_usize); + void *zpast = PAGE_ADDR2BASE((void *)((byte_t *)zbase + + PAGE)); + size_t nzero = (byte_t *)zpast - (byte_t *)zbase; + assert(nzero > 0); + memset(zbase, 0, nzero); + } + } + arena_extent_ralloc_large_expand(tsdn, arena, edata, old_usize); + + return false; +} + +bool +large_ralloc_no_move(tsdn_t *tsdn, edata_t *edata, size_t usize_min, + size_t usize_max, bool zero) { + size_t oldusize = edata_usize_get(edata); + + /* The following should have been caught by callers. */ + assert(usize_min > 0 && usize_max <= SC_LARGE_MAXCLASS); + /* Both allocation sizes must be large to avoid a move. */ + assert(oldusize >= SC_LARGE_MINCLASS + && usize_max >= SC_LARGE_MINCLASS); + + if (usize_max > oldusize) { + /* Attempt to expand the allocation in-place. */ + if (!large_ralloc_no_move_expand(tsdn, edata, usize_max, + zero)) { + arena_decay_tick(tsdn, arena_get_from_edata(edata)); + return false; + } + /* Try again, this time with usize_min. */ + if (usize_min < usize_max && usize_min > oldusize && + large_ralloc_no_move_expand(tsdn, edata, usize_min, zero)) { + arena_decay_tick(tsdn, arena_get_from_edata(edata)); + return false; + } + } + + /* + * Avoid moving the allocation if the existing extent size accommodates + * the new size. + */ + if (oldusize >= usize_min && oldusize <= usize_max) { + arena_decay_tick(tsdn, arena_get_from_edata(edata)); + return false; + } + + /* Attempt to shrink the allocation in-place. 
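+ * Editor's note: for an xallocx() caller, usize_min and usize_max + * derive from size and size + extra, so this branch is reached when + * even the padded maximum lies below the current extent size; if the + * split fails, returning true lets the caller fall back to a + * move-and-copy reallocation.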
*/ + if (oldusize > usize_max) { + if (!large_ralloc_no_move_shrink(tsdn, edata, usize_max)) { + arena_decay_tick(tsdn, arena_get_from_edata(edata)); + return false; + } + } + return true; +} + +static void * +large_ralloc_move_helper(tsdn_t *tsdn, arena_t *arena, size_t usize, + size_t alignment, bool zero) { + if (alignment <= CACHELINE) { + return large_malloc(tsdn, arena, usize, zero); + } + return large_palloc(tsdn, arena, usize, alignment, zero); +} + +void * +large_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t usize, + size_t alignment, bool zero, tcache_t *tcache, + hook_ralloc_args_t *hook_args) { + edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global, ptr); + + size_t oldusize = edata_usize_get(edata); + /* The following should have been caught by callers. */ + assert(usize > 0 && usize <= SC_LARGE_MAXCLASS); + /* Both allocation sizes must be large to avoid a move. */ + assert(oldusize >= SC_LARGE_MINCLASS + && usize >= SC_LARGE_MINCLASS); + + /* Try to avoid moving the allocation. */ + if (!large_ralloc_no_move(tsdn, edata, usize, usize, zero)) { + hook_invoke_expand(hook_args->is_realloc + ? hook_expand_realloc : hook_expand_rallocx, ptr, oldusize, + usize, (uintptr_t)ptr, hook_args->args); + return edata_addr_get(edata); + } + + /* + * usize and old size are different enough that we need to use a + * different size class. In that case, fall back to allocating new + * space and copying. + */ + void *ret = large_ralloc_move_helper(tsdn, arena, usize, alignment, + zero); + if (ret == NULL) { + return NULL; + } + + hook_invoke_alloc(hook_args->is_realloc + ? hook_alloc_realloc : hook_alloc_rallocx, ret, (uintptr_t)ret, + hook_args->args); + hook_invoke_dalloc(hook_args->is_realloc + ? hook_dalloc_realloc : hook_dalloc_rallocx, ptr, hook_args->args); + + size_t copysize = (usize < oldusize) ? usize : oldusize; + memcpy(ret, edata_addr_get(edata), copysize); + isdalloct(tsdn, edata_addr_get(edata), oldusize, tcache, NULL, true); + return ret; +} + +/* + * locked indicates whether the arena's large_mtx is currently held. + */ +static void +large_dalloc_prep_impl(tsdn_t *tsdn, arena_t *arena, edata_t *edata, + bool locked) { + if (!locked) { + /* See comments in arena_bin_slabs_full_insert(). */ + if (!arena_is_auto(arena)) { + malloc_mutex_lock(tsdn, &arena->large_mtx); + edata_list_active_remove(&arena->large, edata); + malloc_mutex_unlock(tsdn, &arena->large_mtx); + } + } else { + /* Only hold the large_mtx if necessary. 
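+ * Editor's note: in the locked path the caller already owns large_mtx + * (it is asserted rather than taken), e.g. when arena teardown walks + * the large list and preps each extent in turn.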
*/ + if (!arena_is_auto(arena)) { + malloc_mutex_assert_owner(tsdn, &arena->large_mtx); + edata_list_active_remove(&arena->large, edata); + } + } + arena_extent_dalloc_large_prep(tsdn, arena, edata); +} + +static void +large_dalloc_finish_impl(tsdn_t *tsdn, arena_t *arena, edata_t *edata) { + bool deferred_work_generated = false; + pa_dalloc(tsdn, &arena->pa_shard, edata, &deferred_work_generated); + if (deferred_work_generated) { + arena_handle_deferred_work(tsdn, arena); + } +} + +void +large_dalloc_prep_locked(tsdn_t *tsdn, edata_t *edata) { + large_dalloc_prep_impl(tsdn, arena_get_from_edata(edata), edata, true); +} + +void +large_dalloc_finish(tsdn_t *tsdn, edata_t *edata) { + large_dalloc_finish_impl(tsdn, arena_get_from_edata(edata), edata); +} + +void +large_dalloc(tsdn_t *tsdn, edata_t *edata) { + arena_t *arena = arena_get_from_edata(edata); + large_dalloc_prep_impl(tsdn, arena, edata, false); + large_dalloc_finish_impl(tsdn, arena, edata); + arena_decay_tick(tsdn, arena); +} + +size_t +large_salloc(tsdn_t *tsdn, const edata_t *edata) { + return edata_usize_get(edata); +} + +void +large_prof_info_get(tsd_t *tsd, edata_t *edata, prof_info_t *prof_info, + bool reset_recent) { + assert(prof_info != NULL); + + prof_tctx_t *alloc_tctx = edata_prof_tctx_get(edata); + prof_info->alloc_tctx = alloc_tctx; + + if (prof_tctx_is_valid(alloc_tctx)) { + nstime_copy(&prof_info->alloc_time, + edata_prof_alloc_time_get(edata)); + prof_info->alloc_size = edata_prof_alloc_size_get(edata); + if (reset_recent) { + /* + * Reset the pointer on the recent allocation record, + * so that this allocation is recorded as released. + */ + prof_recent_alloc_reset(tsd, edata); + } + } +} + +static void +large_prof_tctx_set(edata_t *edata, prof_tctx_t *tctx) { + edata_prof_tctx_set(edata, tctx); +} + +void +large_prof_tctx_reset(edata_t *edata) { + large_prof_tctx_set(edata, PROF_TCTX_SENTINEL); +} + +void +large_prof_info_set(edata_t *edata, prof_tctx_t *tctx, size_t size) { + nstime_t t; + nstime_prof_init_update(&t); + edata_prof_alloc_time_set(edata, &t); + edata_prof_alloc_size_set(edata, size); + edata_prof_recent_alloc_init(edata); + large_prof_tctx_set(edata, tctx); +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/log.c b/src/duckdb/extension/jemalloc/jemalloc/src/log.c new file mode 100644 index 000000000..778902fb9 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/log.c @@ -0,0 +1,78 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/log.h" + +char log_var_names[JEMALLOC_LOG_VAR_BUFSIZE]; +atomic_b_t log_init_done = ATOMIC_INIT(false); + +/* + * Returns a pointer to the first character past the current segment, i.e. + * the next '|' delimiter or the terminating NUL of the segment string. + */ +static const char * +log_var_extract_segment(const char* segment_begin) { + const char *end; + for (end = segment_begin; *end != '\0' && *end != '|'; end++) { + } + return end; +} + +static bool +log_var_matches_segment(const char *segment_begin, const char *segment_end, + const char *log_var_begin, const char *log_var_end) { + assert(segment_begin <= segment_end); + assert(log_var_begin < log_var_end); + + ptrdiff_t segment_len = segment_end - segment_begin; + ptrdiff_t log_var_len = log_var_end - log_var_begin; + /* The special '.' segment matches everything.
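+ * Editor's example (not upstream text): a conf segment "core.sdallocx" + * enables both "core.sdallocx.entry" and "core.sdallocx.exit" through + * the prefix-at-'.' rule below, while a lone "." enables every log + * variable.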
*/ + if (segment_len == 1 && *segment_begin == '.') { + return true; + } + if (segment_len == log_var_len) { + return strncmp(segment_begin, log_var_begin, segment_len) == 0; + } else if (segment_len < log_var_len) { + return strncmp(segment_begin, log_var_begin, segment_len) == 0 + && log_var_begin[segment_len] == '.'; + } else { + return false; + } +} + +unsigned +log_var_update_state(log_var_t *log_var) { + const char *log_var_begin = log_var->name; + const char *log_var_end = log_var->name + strlen(log_var->name); + + /* Pointer to one before the beginning of the current segment. */ + const char *segment_begin = log_var_names; + + /* + * If log_init_done is false, we haven't parsed the malloc conf yet. To + * avoid log-spew, we default to not displaying anything. + */ + if (!atomic_load_b(&log_init_done, ATOMIC_ACQUIRE)) { + return LOG_INITIALIZED_NOT_ENABLED; + } + + while (true) { + const char *segment_end = log_var_extract_segment( + segment_begin); + assert(segment_end < log_var_names + JEMALLOC_LOG_VAR_BUFSIZE); + if (log_var_matches_segment(segment_begin, segment_end, + log_var_begin, log_var_end)) { + atomic_store_u(&log_var->state, LOG_ENABLED, + ATOMIC_RELAXED); + return LOG_ENABLED; + } + if (*segment_end == '\0') { + /* Hit the end of the segment string with no match. */ + atomic_store_u(&log_var->state, + LOG_INITIALIZED_NOT_ENABLED, ATOMIC_RELAXED); + return LOG_INITIALIZED_NOT_ENABLED; + } + /* Otherwise, skip the delimiter and continue. */ + segment_begin = segment_end + 1; + } +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/malloc_io.c b/src/duckdb/extension/jemalloc/jemalloc/src/malloc_io.c new file mode 100644 index 000000000..d067bc8e4 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/malloc_io.c @@ -0,0 +1,696 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/malloc_io.h" +#include "jemalloc/internal/util.h" + +#ifdef assert +# undef assert +#endif +#ifdef not_reached +# undef not_reached +#endif +#ifdef not_implemented +# undef not_implemented +#endif +#ifdef assert_not_implemented +# undef assert_not_implemented +#endif + +/* + * Define simple versions of assertion macros that won't recurse in case + * of assertion failures in malloc_*printf(). + */ +#define assert(e) do { \ + if (config_debug && !(e)) { \ + malloc_write("<jemalloc>: Failed assertion\n"); \ + abort(); \ + } \ +} while (0) + +#define not_reached() do { \ + if (config_debug) { \ + malloc_write("<jemalloc>: Unreachable code reached\n"); \ + abort(); \ + } \ + unreachable(); \ +} while (0) + +#define not_implemented() do { \ + if (config_debug) { \ + malloc_write("<jemalloc>: Not implemented\n"); \ + abort(); \ + } \ +} while (0) + +#define assert_not_implemented(e) do { \ + if (unlikely(config_debug && !(e))) { \ + not_implemented(); \ + } \ +} while (0) + +/******************************************************************************/ +/* Function prototypes for non-inline static functions.
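+ * Editor's note on the buffer sizes below: U2S_BUFSIZE is + * (1U << (LG_SIZEOF_INTMAX_T + 3)) + 1, i.e. 65 when intmax_t is 8 + * bytes, enough for a 64-digit base-2 rendering plus NUL; D2S, O2S and + * X2S add one byte for a sign, one for a leading '0', and two for a + * "0x" prefix, respectively.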
*/ + +#define U2S_BUFSIZE ((1U << (LG_SIZEOF_INTMAX_T + 3)) + 1) +static char *u2s(uintmax_t x, unsigned base, bool uppercase, char *s, + size_t *slen_p); +#define D2S_BUFSIZE (1 + U2S_BUFSIZE) +static char *d2s(intmax_t x, char sign, char *s, size_t *slen_p); +#define O2S_BUFSIZE (1 + U2S_BUFSIZE) +static char *o2s(uintmax_t x, bool alt_form, char *s, size_t *slen_p); +#define X2S_BUFSIZE (2 + U2S_BUFSIZE) +static char *x2s(uintmax_t x, bool alt_form, bool uppercase, char *s, + size_t *slen_p); + +/******************************************************************************/ + +/* malloc_message() setup. */ +void +wrtmessage(void *cbopaque, const char *s) { + malloc_write_fd(STDERR_FILENO, s, strlen(s)); +} + +JEMALLOC_EXPORT void (*je_malloc_message)(void *, const char *s); + +/* + * Wrapper around malloc_message() that avoids the need for + * je_malloc_message(...) throughout the code. + */ +void +malloc_write(const char *s) { +#ifdef DEBUG + if (je_malloc_message != NULL) { + je_malloc_message(NULL, s); + } else { + wrtmessage(NULL, s); + } +#endif +} + +/* + * glibc provides a non-standard strerror_r() when _GNU_SOURCE is defined, so + * provide a wrapper. + */ +int +buferror(int err, char *buf, size_t buflen) { +#ifdef _WIN32 + FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM, NULL, err, 0, + (LPSTR)buf, (DWORD)buflen, NULL); + return 0; +#elif defined(JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE) && defined(_GNU_SOURCE) + char *b = strerror_r(err, buf, buflen); + if (b != buf) { + strncpy(buf, b, buflen); + buf[buflen-1] = '\0'; + } + return 0; +#else + return strerror_r(err, buf, buflen); +#endif +} + +uintmax_t +malloc_strtoumax(const char *restrict nptr, char **restrict endptr, int base) { + uintmax_t ret, digit; + unsigned b; + bool neg; + const char *p, *ns; + + p = nptr; + if (base < 0 || base == 1 || base > 36) { + ns = p; + set_errno(EINVAL); + ret = UINTMAX_MAX; + goto label_return; + } + b = base; + + /* Swallow leading whitespace and get sign, if any. */ + neg = false; + while (true) { + switch (*p) { + case '\t': case '\n': case '\v': case '\f': case '\r': case ' ': + p++; + break; + case '-': + neg = true; + JEMALLOC_FALLTHROUGH; + case '+': + p++; + JEMALLOC_FALLTHROUGH; + default: + goto label_prefix; + } + } + + /* Get prefix, if any. */ + label_prefix: + /* + * Note where the first non-whitespace/sign character is so that it is + * possible to tell whether any digits are consumed (e.g., " 0" vs. + * " -x"). + */ + ns = p; + if (*p == '0') { + switch (p[1]) { + case '0': case '1': case '2': case '3': case '4': case '5': + case '6': case '7': + if (b == 0) { + b = 8; + } + if (b == 8) { + p++; + } + break; + case 'X': case 'x': + switch (p[2]) { + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + case 'A': case 'B': case 'C': case 'D': case 'E': + case 'F': + case 'a': case 'b': case 'c': case 'd': case 'e': + case 'f': + if (b == 0) { + b = 16; + } + if (b == 16) { + p += 2; + } + break; + default: + break; + } + break; + default: + p++; + ret = 0; + goto label_return; + } + } + if (b == 0) { + b = 10; + } + + /* Convert. */ + ret = 0; + while ((*p >= '0' && *p <= '9' && (digit = *p - '0') < b) + || (*p >= 'A' && *p <= 'Z' && (digit = 10 + *p - 'A') < b) + || (*p >= 'a' && *p <= 'z' && (digit = 10 + *p - 'a') < b)) { + uintmax_t pret = ret; + ret *= b; + ret += digit; + if (ret < pret) { + /* Overflow. 
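+ * Editor's example: parsing "18446744073709551616" (2^64) in + * base 10 wraps on the final digit, ret drops below pret, and the + * function reports ERANGE with UINTMAX_MAX, matching strtoumax() + * behavior.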
*/ + set_errno(ERANGE); + ret = UINTMAX_MAX; + goto label_return; + } + p++; + } + if (neg) { + ret = (uintmax_t)(-((intmax_t)ret)); + } + + if (p == ns) { + /* No conversion performed. */ + set_errno(EINVAL); + ret = UINTMAX_MAX; + goto label_return; + } + +label_return: + if (endptr != NULL) { + if (p == ns) { + /* No characters were converted. */ + *endptr = (char *)nptr; + } else { + *endptr = (char *)p; + } + } + return ret; +} + +static char * +u2s(uintmax_t x, unsigned base, bool uppercase, char *s, size_t *slen_p) { + unsigned i; + + i = U2S_BUFSIZE - 1; + s[i] = '\0'; + switch (base) { + case 10: + do { + i--; + s[i] = "0123456789"[x % (uint64_t)10]; + x /= (uint64_t)10; + } while (x > 0); + break; + case 16: { + const char *digits = (uppercase) + ? "0123456789ABCDEF" + : "0123456789abcdef"; + + do { + i--; + s[i] = digits[x & 0xf]; + x >>= 4; + } while (x > 0); + break; + } default: { + const char *digits = (uppercase) + ? "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" + : "0123456789abcdefghijklmnopqrstuvwxyz"; + + assert(base >= 2 && base <= 36); + do { + i--; + s[i] = digits[x % (uint64_t)base]; + x /= (uint64_t)base; + } while (x > 0); + }} + + *slen_p = U2S_BUFSIZE - 1 - i; + return &s[i]; +} + +static char * +d2s(intmax_t x, char sign, char *s, size_t *slen_p) { + bool neg; + + if ((neg = (x < 0))) { + x = -x; + } + s = u2s(x, 10, false, s, slen_p); + if (neg) { + sign = '-'; + } + switch (sign) { + case '-': + if (!neg) { + break; + } + JEMALLOC_FALLTHROUGH; + case ' ': + case '+': + s--; + (*slen_p)++; + *s = sign; + break; + default: not_reached(); + } + return s; +} + +static char * +o2s(uintmax_t x, bool alt_form, char *s, size_t *slen_p) { + s = u2s(x, 8, false, s, slen_p); + if (alt_form && *s != '0') { + s--; + (*slen_p)++; + *s = '0'; + } + return s; +} + +static char * +x2s(uintmax_t x, bool alt_form, bool uppercase, char *s, size_t *slen_p) { + s = u2s(x, 16, uppercase, s, slen_p); + if (alt_form) { + s -= 2; + (*slen_p) += 2; + s[0] = '0'; + s[1] = uppercase ? 'X' : 'x'; + } + return s; +} + +JEMALLOC_COLD +size_t +malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap) { + size_t i; + const char *f; + +#define APPEND_C(c) do { \ + if (i < size) { \ + str[i] = (c); \ + } \ + i++; \ +} while (0) +#define APPEND_S(s, slen) do { \ + if (i < size) { \ + size_t cpylen = (slen <= size - i) ? slen : size - i; \ + memcpy(&str[i], s, cpylen); \ + } \ + i += slen; \ +} while (0) +#define APPEND_PADDED_S(s, slen, width, left_justify) do { \ + /* Left padding. */ \ + size_t pad_len = (width == -1) ? 0 : ((slen < (size_t)width) ? \ + (size_t)width - slen : 0); \ + if (!left_justify && pad_len != 0) { \ + size_t j; \ + for (j = 0; j < pad_len; j++) { \ + if (pad_zero) { \ + APPEND_C('0'); \ + } else { \ + APPEND_C(' '); \ + } \ + } \ + } \ + /* Value. */ \ + APPEND_S(s, slen); \ + /* Right padding. */ \ + if (left_justify && pad_len != 0) { \ + size_t j; \ + for (j = 0; j < pad_len; j++) { \ + APPEND_C(' '); \ + } \ + } \ +} while (0) +#define GET_ARG_NUMERIC(val, len) do { \ + switch ((unsigned char)len) { \ + case '?': \ + val = va_arg(ap, int); \ + break; \ + case '?' 
| 0x80: \ + val = va_arg(ap, unsigned int); \ + break; \ + case 'l': \ + val = va_arg(ap, long); \ + break; \ + case 'l' | 0x80: \ + val = va_arg(ap, unsigned long); \ + break; \ + case 'q': \ + val = va_arg(ap, long long); \ + break; \ + case 'q' | 0x80: \ + val = va_arg(ap, unsigned long long); \ + break; \ + case 'j': \ + val = va_arg(ap, intmax_t); \ + break; \ + case 'j' | 0x80: \ + val = va_arg(ap, uintmax_t); \ + break; \ + case 't': \ + val = va_arg(ap, ptrdiff_t); \ + break; \ + case 'z': \ + val = va_arg(ap, ssize_t); \ + break; \ + case 'z' | 0x80: \ + val = va_arg(ap, size_t); \ + break; \ + case 'p': /* Synthetic; used for %p. */ \ + val = va_arg(ap, uintptr_t); \ + break; \ + default: \ + not_reached(); \ + val = 0; \ + } \ +} while (0) + + i = 0; + f = format; + while (true) { + switch (*f) { + case '\0': goto label_out; + case '%': { + bool alt_form = false; + bool left_justify = false; + bool plus_space = false; + bool plus_plus = false; + int prec = -1; + int width = -1; + unsigned char len = '?'; + char *s; + size_t slen; + bool pad_zero = false; + + f++; + /* Flags. */ + while (true) { + switch (*f) { + case '#': + assert(!alt_form); + alt_form = true; + break; + case '-': + assert(!left_justify); + left_justify = true; + break; + case ' ': + assert(!plus_space); + plus_space = true; + break; + case '+': + assert(!plus_plus); + plus_plus = true; + break; + default: goto label_width; + } + f++; + } + /* Width. */ + label_width: + switch (*f) { + case '*': + width = va_arg(ap, int); + f++; + if (width < 0) { + left_justify = true; + width = -width; + } + break; + case '0': + pad_zero = true; + JEMALLOC_FALLTHROUGH; + case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': { + uintmax_t uwidth; + set_errno(0); + uwidth = malloc_strtoumax(f, (char **)&f, 10); + assert(uwidth != UINTMAX_MAX || get_errno() != + ERANGE); + width = (int)uwidth; + break; + } default: + break; + } + /* Width/precision separator. */ + if (*f == '.') { + f++; + } else { + goto label_length; + } + /* Precision. */ + switch (*f) { + case '*': + prec = va_arg(ap, int); + f++; + break; + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': { + uintmax_t uprec; + set_errno(0); + uprec = malloc_strtoumax(f, (char **)&f, 10); + assert(uprec != UINTMAX_MAX || get_errno() != + ERANGE); + prec = (int)uprec; + break; + } + default: break; + } + /* Length. */ + label_length: + switch (*f) { + case 'l': + f++; + if (*f == 'l') { + len = 'q'; + f++; + } else { + len = 'l'; + } + break; + case 'q': case 'j': case 't': case 'z': + len = *f; + f++; + break; + default: break; + } + /* Conversion specifier. */ + switch (*f) { + case '%': + /* %% */ + APPEND_C(*f); + f++; + break; + case 'd': case 'i': { + intmax_t val JEMALLOC_CC_SILENCE_INIT(0); + char buf[D2S_BUFSIZE]; + + /* + * Outputting negative, zero-padded numbers + * would require a nontrivial rework of the + * interaction between the width and padding + * (since 0 padding goes between the '-' and the + * number, while ' ' padding goes either before + * the - or after the number. Since we + * currently don't ever need 0-padded negative + * numbers, just don't bother supporting it. + */ + assert(!pad_zero); + + GET_ARG_NUMERIC(val, len); + s = d2s(val, (plus_plus ? '+' : (plus_space ? 
+ ' ' : '-')), buf, &slen); + APPEND_PADDED_S(s, slen, width, left_justify); + f++; + break; + } case 'o': { + uintmax_t val JEMALLOC_CC_SILENCE_INIT(0); + char buf[O2S_BUFSIZE]; + + GET_ARG_NUMERIC(val, len | 0x80); + s = o2s(val, alt_form, buf, &slen); + APPEND_PADDED_S(s, slen, width, left_justify); + f++; + break; + } case 'u': { + uintmax_t val JEMALLOC_CC_SILENCE_INIT(0); + char buf[U2S_BUFSIZE]; + + GET_ARG_NUMERIC(val, len | 0x80); + s = u2s(val, 10, false, buf, &slen); + APPEND_PADDED_S(s, slen, width, left_justify); + f++; + break; + } case 'x': case 'X': { + uintmax_t val JEMALLOC_CC_SILENCE_INIT(0); + char buf[X2S_BUFSIZE]; + + GET_ARG_NUMERIC(val, len | 0x80); + s = x2s(val, alt_form, *f == 'X', buf, &slen); + APPEND_PADDED_S(s, slen, width, left_justify); + f++; + break; + } case 'c': { + unsigned char val; + char buf[2]; + + assert(len == '?' || len == 'l'); + assert_not_implemented(len != 'l'); + val = va_arg(ap, int); + buf[0] = val; + buf[1] = '\0'; + APPEND_PADDED_S(buf, 1, width, left_justify); + f++; + break; + } case 's': + assert(len == '?' || len == 'l'); + assert_not_implemented(len != 'l'); + s = va_arg(ap, char *); + slen = (prec < 0) ? strlen(s) : (size_t)prec; + APPEND_PADDED_S(s, slen, width, left_justify); + f++; + break; + case 'p': { + uintmax_t val; + char buf[X2S_BUFSIZE]; + + GET_ARG_NUMERIC(val, 'p'); + s = x2s(val, true, false, buf, &slen); + APPEND_PADDED_S(s, slen, width, left_justify); + f++; + break; + } default: not_reached(); + } + break; + } default: { + APPEND_C(*f); + f++; + break; + }} + } + label_out: + if (i < size) { + str[i] = '\0'; + } else { + str[size - 1] = '\0'; + } + +#undef APPEND_C +#undef APPEND_S +#undef APPEND_PADDED_S +#undef GET_ARG_NUMERIC + return i; +} + +JEMALLOC_FORMAT_PRINTF(3, 4) +size_t +malloc_snprintf(char *str, size_t size, const char *format, ...) { + size_t ret; + va_list ap; + + va_start(ap, format); + ret = malloc_vsnprintf(str, size, format, ap); + va_end(ap); + + return ret; +} + +void +malloc_vcprintf(write_cb_t *write_cb, void *cbopaque, const char *format, + va_list ap) { + char buf[MALLOC_PRINTF_BUFSIZE]; + + if (write_cb == NULL) { + /* + * The caller did not provide an alternate write_cb callback + * function, so use the default one. malloc_write() is an + * inline function, so use malloc_message() directly here. + */ + write_cb = (je_malloc_message != NULL) ? je_malloc_message : + wrtmessage; + } + + malloc_vsnprintf(buf, sizeof(buf), format, ap); + write_cb(cbopaque, buf); +} + +/* + * Print to a callback function in such a way as to (hopefully) avoid memory + * allocation. + */ +JEMALLOC_FORMAT_PRINTF(3, 4) +void +malloc_cprintf(write_cb_t *write_cb, void *cbopaque, const char *format, ...) { + va_list ap; + + va_start(ap, format); + malloc_vcprintf(write_cb, cbopaque, format, ap); + va_end(ap); +} + +/* Print to stderr in such a way as to avoid memory allocation. */ +JEMALLOC_FORMAT_PRINTF(1, 2) +void +malloc_printf(const char *format, ...) { + va_list ap; + + va_start(ap, format); + malloc_vcprintf(NULL, NULL, format, ap); + va_end(ap); +} + +/* + * Restore normal assertion macros, in order to make it possible to compile all + * C files as a single concatenation. 
+ */ +#undef assert +#undef not_reached +#undef not_implemented +#undef assert_not_implemented +#include "jemalloc/internal/assert.h" diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/mutex.c b/src/duckdb/extension/jemalloc/jemalloc/src/mutex.c new file mode 100644 index 000000000..5655100de --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/mutex.c @@ -0,0 +1,228 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/malloc_io.h" +#include "jemalloc/internal/spin.h" + +#if defined(_WIN32) && !defined(_CRT_SPINCOUNT) +#define _CRT_SPINCOUNT 4000 +#endif + +/* + * Based on benchmark results, a fixed spin with this amount of retries works + * well for our critical sections. + */ +int64_t opt_mutex_max_spin = 600; + +/******************************************************************************/ +/* Data. */ + +#ifdef JEMALLOC_LAZY_LOCK +bool isthreaded = false; +#endif +#ifdef JEMALLOC_MUTEX_INIT_CB +static bool postpone_init = true; +static malloc_mutex_t *postponed_mutexes = NULL; +#endif + +/******************************************************************************/ +/* + * We intercept pthread_create() calls in order to toggle isthreaded if the + * process goes multi-threaded. + */ + +#if defined(JEMALLOC_LAZY_LOCK) && !defined(_WIN32) +JEMALLOC_EXPORT int +pthread_create(pthread_t *__restrict thread, + const pthread_attr_t *__restrict attr, void *(*start_routine)(void *), + void *__restrict arg) { + return pthread_create_wrapper(thread, attr, start_routine, arg); +} +#endif + +/******************************************************************************/ + +#ifdef JEMALLOC_MUTEX_INIT_CB +JEMALLOC_EXPORT int _pthread_mutex_init_calloc_cb(pthread_mutex_t *mutex, + void *(calloc_cb)(size_t, size_t)); +#endif + +void +malloc_mutex_lock_slow(malloc_mutex_t *mutex) { + mutex_prof_data_t *data = &mutex->prof_data; + nstime_t before; + + if (ncpus == 1) { + goto label_spin_done; + } + + int cnt = 0; + do { + spin_cpu_spinwait(); + if (!atomic_load_b(&mutex->locked, ATOMIC_RELAXED) + && !malloc_mutex_trylock_final(mutex)) { + data->n_spin_acquired++; + return; + } + } while (cnt++ < opt_mutex_max_spin || opt_mutex_max_spin == -1); + + if (!config_stats) { + /* Only spin is useful when stats is off. */ + malloc_mutex_lock_final(mutex); + return; + } +label_spin_done: + nstime_init_update(&before); + /* Copy before to after to avoid clock skews. */ + nstime_t after; + nstime_copy(&after, &before); + uint32_t n_thds = atomic_fetch_add_u32(&data->n_waiting_thds, 1, + ATOMIC_RELAXED) + 1; + /* One last try as above two calls may take quite some cycles. */ + if (!malloc_mutex_trylock_final(mutex)) { + atomic_fetch_sub_u32(&data->n_waiting_thds, 1, ATOMIC_RELAXED); + data->n_spin_acquired++; + return; + } + + /* True slow path. */ + malloc_mutex_lock_final(mutex); + /* Update more slow-path only counters. 
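+ * Editor's note: reaching this point means the lock had to block; the + * elapsed wait (after - before) feeds tot_wait_time/max_wait_time and + * the waiter high-water mark below, surfaced via the mutex profiling + * stats.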
*/ + atomic_fetch_sub_u32(&data->n_waiting_thds, 1, ATOMIC_RELAXED); + nstime_update(&after); + + nstime_t delta; + nstime_copy(&delta, &after); + nstime_subtract(&delta, &before); + + data->n_wait_times++; + nstime_add(&data->tot_wait_time, &delta); + if (nstime_compare(&data->max_wait_time, &delta) < 0) { + nstime_copy(&data->max_wait_time, &delta); + } + if (n_thds > data->max_n_thds) { + data->max_n_thds = n_thds; + } +} + +static void +mutex_prof_data_init(mutex_prof_data_t *data) { + memset(data, 0, sizeof(mutex_prof_data_t)); + nstime_init_zero(&data->max_wait_time); + nstime_init_zero(&data->tot_wait_time); + data->prev_owner = NULL; +} + +void +malloc_mutex_prof_data_reset(tsdn_t *tsdn, malloc_mutex_t *mutex) { + malloc_mutex_assert_owner(tsdn, mutex); + mutex_prof_data_init(&mutex->prof_data); +} + +static int +mutex_addr_comp(const witness_t *witness1, void *mutex1, + const witness_t *witness2, void *mutex2) { + assert(mutex1 != NULL); + assert(mutex2 != NULL); + uintptr_t mu1int = (uintptr_t)mutex1; + uintptr_t mu2int = (uintptr_t)mutex2; + if (mu1int < mu2int) { + return -1; + } else if (mu1int == mu2int) { + return 0; + } else { + return 1; + } +} + +bool +malloc_mutex_init(malloc_mutex_t *mutex, const char *name, + witness_rank_t rank, malloc_mutex_lock_order_t lock_order) { + mutex_prof_data_init(&mutex->prof_data); +#ifdef _WIN32 +# if _WIN32_WINNT >= 0x0600 + InitializeSRWLock(&mutex->lock); +# else + if (!InitializeCriticalSectionAndSpinCount(&mutex->lock, + _CRT_SPINCOUNT)) { + return true; + } +# endif +#elif (defined(JEMALLOC_OS_UNFAIR_LOCK)) + mutex->lock = OS_UNFAIR_LOCK_INIT; +#elif (defined(JEMALLOC_MUTEX_INIT_CB)) + if (postpone_init) { + mutex->postponed_next = postponed_mutexes; + postponed_mutexes = mutex; + } else { + if (_pthread_mutex_init_calloc_cb(&mutex->lock, + bootstrap_calloc) != 0) { + return true; + } + } +#else + pthread_mutexattr_t attr; + + if (pthread_mutexattr_init(&attr) != 0) { + return true; + } + pthread_mutexattr_settype(&attr, MALLOC_MUTEX_TYPE); + if (pthread_mutex_init(&mutex->lock, &attr) != 0) { + pthread_mutexattr_destroy(&attr); + return true; + } + pthread_mutexattr_destroy(&attr); +#endif + if (config_debug) { + mutex->lock_order = lock_order; + if (lock_order == malloc_mutex_address_ordered) { + witness_init(&mutex->witness, name, rank, + mutex_addr_comp, mutex); + } else { + witness_init(&mutex->witness, name, rank, NULL, NULL); + } + } + return false; +} + +void +malloc_mutex_prefork(tsdn_t *tsdn, malloc_mutex_t *mutex) { + malloc_mutex_lock(tsdn, mutex); +} + +void +malloc_mutex_postfork_parent(tsdn_t *tsdn, malloc_mutex_t *mutex) { + malloc_mutex_unlock(tsdn, mutex); +} + +void +malloc_mutex_postfork_child(tsdn_t *tsdn, malloc_mutex_t *mutex) { +#ifdef JEMALLOC_MUTEX_INIT_CB + malloc_mutex_unlock(tsdn, mutex); +#else + if (malloc_mutex_init(mutex, mutex->witness.name, + mutex->witness.rank, mutex->lock_order)) { + malloc_printf("<jemalloc>: Error re-initializing mutex in " + "child\n"); + if (opt_abort) { + abort(); + } + } +#endif +} + +bool +malloc_mutex_boot(void) { +#ifdef JEMALLOC_MUTEX_INIT_CB + postpone_init = false; + while (postponed_mutexes != NULL) { + if (_pthread_mutex_init_calloc_cb(&postponed_mutexes->lock, + bootstrap_calloc) != 0) { + return true; + } + postponed_mutexes = postponed_mutexes->postponed_next; + } +#endif + return false; +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/nstime.c b/src/duckdb/extension/jemalloc/jemalloc/src/nstime.c new file mode 100644 index 000000000..72f042272 --- /dev/null +++
b/src/duckdb/extension/jemalloc/jemalloc/src/nstime.c @@ -0,0 +1,289 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/nstime.h" + +#include "jemalloc/internal/assert.h" + +#define BILLION UINT64_C(1000000000) +#define MILLION UINT64_C(1000000) + +static void +nstime_set_initialized(nstime_t *time) { +#ifdef JEMALLOC_DEBUG + time->magic = NSTIME_MAGIC; +#endif +} + +static void +nstime_assert_initialized(const nstime_t *time) { +#ifdef JEMALLOC_DEBUG + /* + * Some parts (e.g. stats) rely on memset to zero initialize. Treat + * these as valid initialization. + */ + assert(time->magic == NSTIME_MAGIC || + (time->magic == 0 && time->ns == 0)); +#endif +} + +static void +nstime_pair_assert_initialized(const nstime_t *t1, const nstime_t *t2) { + nstime_assert_initialized(t1); + nstime_assert_initialized(t2); +} + +static void +nstime_initialize_operand(nstime_t *time) { + /* + * Operations like nstime_add may have the initial operand being zero + * initialized (covered by the assert below). Full-initialize needed + * before changing it to non-zero. + */ + nstime_assert_initialized(time); + nstime_set_initialized(time); +} + +void +nstime_init(nstime_t *time, uint64_t ns) { + nstime_set_initialized(time); + time->ns = ns; +} + +void +nstime_init2(nstime_t *time, uint64_t sec, uint64_t nsec) { + nstime_set_initialized(time); + time->ns = sec * BILLION + nsec; +} + +uint64_t +nstime_ns(const nstime_t *time) { + nstime_assert_initialized(time); + return time->ns; +} + +uint64_t +nstime_msec(const nstime_t *time) { + nstime_assert_initialized(time); + return time->ns / MILLION; +} + +uint64_t +nstime_sec(const nstime_t *time) { + nstime_assert_initialized(time); + return time->ns / BILLION; +} + +uint64_t +nstime_nsec(const nstime_t *time) { + nstime_assert_initialized(time); + return time->ns % BILLION; +} + +void +nstime_copy(nstime_t *time, const nstime_t *source) { + /* Source is required to be initialized. */ + nstime_assert_initialized(source); + *time = *source; + nstime_assert_initialized(time); +} + +int +nstime_compare(const nstime_t *a, const nstime_t *b) { + nstime_pair_assert_initialized(a, b); + return (a->ns > b->ns) - (a->ns < b->ns); +} + +void +nstime_add(nstime_t *time, const nstime_t *addend) { + nstime_pair_assert_initialized(time, addend); + assert(UINT64_MAX - time->ns >= addend->ns); + + nstime_initialize_operand(time); + time->ns += addend->ns; +} + +void +nstime_iadd(nstime_t *time, uint64_t addend) { + nstime_assert_initialized(time); + assert(UINT64_MAX - time->ns >= addend); + + nstime_initialize_operand(time); + time->ns += addend; +} + +void +nstime_subtract(nstime_t *time, const nstime_t *subtrahend) { + nstime_pair_assert_initialized(time, subtrahend); + assert(nstime_compare(time, subtrahend) >= 0); + + /* No initialize operand -- subtraction must be initialized. */ + time->ns -= subtrahend->ns; +} + +void +nstime_isubtract(nstime_t *time, uint64_t subtrahend) { + nstime_assert_initialized(time); + assert(time->ns >= subtrahend); + + /* No initialize operand -- subtraction must be initialized. 
*/ + time->ns -= subtrahend; +} + +void +nstime_imultiply(nstime_t *time, uint64_t multiplier) { + nstime_assert_initialized(time); + assert((((time->ns | multiplier) & (UINT64_MAX << (sizeof(uint64_t) << + 2))) == 0) || ((time->ns * multiplier) / multiplier == time->ns)); + + nstime_initialize_operand(time); + time->ns *= multiplier; +} + +void +nstime_idivide(nstime_t *time, uint64_t divisor) { + nstime_assert_initialized(time); + assert(divisor != 0); + + nstime_initialize_operand(time); + time->ns /= divisor; +} + +uint64_t +nstime_divide(const nstime_t *time, const nstime_t *divisor) { + nstime_pair_assert_initialized(time, divisor); + assert(divisor->ns != 0); + + /* No initialize operand -- *time itself remains unchanged. */ + return time->ns / divisor->ns; +} + +/* Returns time since *past, w/o updating *past. */ +uint64_t +nstime_ns_since(const nstime_t *past) { + nstime_assert_initialized(past); + + nstime_t now; + nstime_copy(&now, past); + nstime_update(&now); + + assert(nstime_compare(&now, past) >= 0); + return now.ns - past->ns; +} + +#ifdef _WIN32 +# define NSTIME_MONOTONIC false +static void +nstime_get(nstime_t *time) { + FILETIME ft; + uint64_t ticks_100ns; + + GetSystemTimeAsFileTime(&ft); + ticks_100ns = (((uint64_t)ft.dwHighDateTime) << 32) | ft.dwLowDateTime; + + nstime_init(time, ticks_100ns * 100); +} +#elif defined(JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE) +# define NSTIME_MONOTONIC true +static void +nstime_get(nstime_t *time) { + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC_COARSE, &ts); + nstime_init2(time, ts.tv_sec, ts.tv_nsec); +} +#elif defined(JEMALLOC_HAVE_CLOCK_MONOTONIC) +# define NSTIME_MONOTONIC true +static void +nstime_get(nstime_t *time) { + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); + nstime_init2(time, ts.tv_sec, ts.tv_nsec); +} +#elif defined(JEMALLOC_HAVE_MACH_ABSOLUTE_TIME) +# define NSTIME_MONOTONIC true +static void +nstime_get(nstime_t *time) { + nstime_init(time, mach_absolute_time()); +} +#else +# define NSTIME_MONOTONIC false +static void +nstime_get(nstime_t *time) { + struct timeval tv; + + gettimeofday(&tv, NULL); + nstime_init2(time, tv.tv_sec, tv.tv_usec * 1000); +} +#endif + +static bool +nstime_monotonic_impl(void) { + return NSTIME_MONOTONIC; +#undef NSTIME_MONOTONIC +} +nstime_monotonic_t *JET_MUTABLE nstime_monotonic = nstime_monotonic_impl; + +prof_time_res_t opt_prof_time_res = + prof_time_res_default; + +const char *const prof_time_res_mode_names[] = { + "default", + "high", +}; + + +static void +nstime_get_realtime(nstime_t *time) { +#if defined(JEMALLOC_HAVE_CLOCK_REALTIME) && !defined(_WIN32) + struct timespec ts; + + clock_gettime(CLOCK_REALTIME, &ts); + nstime_init2(time, ts.tv_sec, ts.tv_nsec); +#else + unreachable(); +#endif +} + +static void +nstime_prof_update_impl(nstime_t *time) { + nstime_t old_time; + + nstime_copy(&old_time, time); + + if (opt_prof_time_res == prof_time_res_high) { + nstime_get_realtime(time); + } else { + nstime_get(time); + } +} +nstime_prof_update_t *JET_MUTABLE nstime_prof_update = nstime_prof_update_impl; + +static void +nstime_update_impl(nstime_t *time) { + nstime_t old_time; + + nstime_copy(&old_time, time); + nstime_get(time); + + /* Handle non-monotonic clocks. 
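+ * Editor's note: e.g. the gettimeofday() fallback can step backwards + * under clock adjustment; keeping old_time makes nstime_update() + * non-decreasing, which nstime_ns_since() above relies on when it + * asserts now >= past.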
*/ + if (unlikely(nstime_compare(&old_time, time) > 0)) { + nstime_copy(time, &old_time); + } +} +nstime_update_t *JET_MUTABLE nstime_update = nstime_update_impl; + +void +nstime_init_update(nstime_t *time) { + nstime_init_zero(time); + nstime_update(time); +} + +void +nstime_prof_init_update(nstime_t *time) { + nstime_init_zero(time); + nstime_prof_update(time); +} + + diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/pa.c b/src/duckdb/extension/jemalloc/jemalloc/src/pa.c new file mode 100644 index 000000000..7a24ae65b --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/pa.c @@ -0,0 +1,270 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/san.h" +#include "jemalloc/internal/hpa.h" + +static void +pa_nactive_add(pa_shard_t *shard, size_t add_pages) { + atomic_fetch_add_zu(&shard->nactive, add_pages, ATOMIC_RELAXED); +} + +static void +pa_nactive_sub(pa_shard_t *shard, size_t sub_pages) { + assert(pa_shard_nactive(shard) >= sub_pages); + atomic_fetch_sub_zu(&shard->nactive, sub_pages, ATOMIC_RELAXED); +} + +bool +pa_central_init(pa_central_t *central, base_t *base, bool hpa, + const hpa_hooks_t *hpa_hooks) { + bool err; + if (hpa) { + err = hpa_central_init(¢ral->hpa, base, hpa_hooks); + if (err) { + return true; + } + } + return false; +} + +bool +pa_shard_init(tsdn_t *tsdn, pa_shard_t *shard, pa_central_t *central, + emap_t *emap, base_t *base, unsigned ind, pa_shard_stats_t *stats, + malloc_mutex_t *stats_mtx, nstime_t *cur_time, + size_t pac_oversize_threshold, ssize_t dirty_decay_ms, + ssize_t muzzy_decay_ms) { + /* This will change eventually, but for now it should hold. */ + assert(base_ind_get(base) == ind); + if (edata_cache_init(&shard->edata_cache, base)) { + return true; + } + + if (pac_init(tsdn, &shard->pac, base, emap, &shard->edata_cache, + cur_time, pac_oversize_threshold, dirty_decay_ms, muzzy_decay_ms, + &stats->pac_stats, stats_mtx)) { + return true; + } + + shard->ind = ind; + + shard->ever_used_hpa = false; + atomic_store_b(&shard->use_hpa, false, ATOMIC_RELAXED); + + atomic_store_zu(&shard->nactive, 0, ATOMIC_RELAXED); + + shard->stats_mtx = stats_mtx; + shard->stats = stats; + memset(shard->stats, 0, sizeof(*shard->stats)); + + shard->central = central; + shard->emap = emap; + shard->base = base; + + return false; +} + +bool +pa_shard_enable_hpa(tsdn_t *tsdn, pa_shard_t *shard, + const hpa_shard_opts_t *hpa_opts, const sec_opts_t *hpa_sec_opts) { + if (hpa_shard_init(&shard->hpa_shard, &shard->central->hpa, shard->emap, + shard->base, &shard->edata_cache, shard->ind, hpa_opts)) { + return true; + } + if (sec_init(tsdn, &shard->hpa_sec, shard->base, &shard->hpa_shard.pai, + hpa_sec_opts)) { + return true; + } + shard->ever_used_hpa = true; + atomic_store_b(&shard->use_hpa, true, ATOMIC_RELAXED); + + return false; +} + +void +pa_shard_disable_hpa(tsdn_t *tsdn, pa_shard_t *shard) { + atomic_store_b(&shard->use_hpa, false, ATOMIC_RELAXED); + if (shard->ever_used_hpa) { + sec_disable(tsdn, &shard->hpa_sec); + hpa_shard_disable(tsdn, &shard->hpa_shard); + } +} + +void +pa_shard_reset(tsdn_t *tsdn, pa_shard_t *shard) { + atomic_store_zu(&shard->nactive, 0, ATOMIC_RELAXED); + if (shard->ever_used_hpa) { + sec_flush(tsdn, &shard->hpa_sec); + } +} + +static bool +pa_shard_uses_hpa(pa_shard_t *shard) { + return atomic_load_b(&shard->use_hpa, ATOMIC_RELAXED); +} + +void +pa_shard_destroy(tsdn_t *tsdn, pa_shard_t *shard) { + pac_destroy(tsdn, &shard->pac); + if 
(shard->ever_used_hpa) { + sec_flush(tsdn, &shard->hpa_sec); + hpa_shard_destroy(tsdn, &shard->hpa_shard); + } +} + +static pai_t * +pa_get_pai(pa_shard_t *shard, edata_t *edata) { + return (edata_pai_get(edata) == EXTENT_PAI_PAC + ? &shard->pac.pai : &shard->hpa_sec.pai); +} + +edata_t * +pa_alloc(tsdn_t *tsdn, pa_shard_t *shard, size_t size, size_t alignment, + bool slab, szind_t szind, bool zero, bool guarded, + bool *deferred_work_generated) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + assert(!guarded || alignment <= PAGE); + + edata_t *edata = NULL; + if (!guarded && pa_shard_uses_hpa(shard)) { + edata = pai_alloc(tsdn, &shard->hpa_sec.pai, size, alignment, + zero, /* guarded */ false, slab, deferred_work_generated); + } + /* + * Fall back to the PAC if the HPA is off or couldn't serve the given + * allocation request. + */ + if (edata == NULL) { + edata = pai_alloc(tsdn, &shard->pac.pai, size, alignment, zero, + guarded, slab, deferred_work_generated); + } + if (edata != NULL) { + assert(edata_size_get(edata) == size); + pa_nactive_add(shard, size >> LG_PAGE); + emap_remap(tsdn, shard->emap, edata, szind, slab); + edata_szind_set(edata, szind); + edata_slab_set(edata, slab); + if (slab && (size > 2 * PAGE)) { + emap_register_interior(tsdn, shard->emap, edata, szind); + } + assert(edata_arena_ind_get(edata) == shard->ind); + } + return edata; +} + +bool +pa_expand(tsdn_t *tsdn, pa_shard_t *shard, edata_t *edata, size_t old_size, + size_t new_size, szind_t szind, bool zero, bool *deferred_work_generated) { + assert(new_size > old_size); + assert(edata_size_get(edata) == old_size); + assert((new_size & PAGE_MASK) == 0); + if (edata_guarded_get(edata)) { + return true; + } + size_t expand_amount = new_size - old_size; + + pai_t *pai = pa_get_pai(shard, edata); + + bool error = pai_expand(tsdn, pai, edata, old_size, new_size, zero, + deferred_work_generated); + if (error) { + return true; + } + + pa_nactive_add(shard, expand_amount >> LG_PAGE); + edata_szind_set(edata, szind); + emap_remap(tsdn, shard->emap, edata, szind, /* slab */ false); + return false; +} + +bool +pa_shrink(tsdn_t *tsdn, pa_shard_t *shard, edata_t *edata, size_t old_size, + size_t new_size, szind_t szind, bool *deferred_work_generated) { + assert(new_size < old_size); + assert(edata_size_get(edata) == old_size); + assert((new_size & PAGE_MASK) == 0); + if (edata_guarded_get(edata)) { + return true; + } + size_t shrink_amount = old_size - new_size; + + pai_t *pai = pa_get_pai(shard, edata); + bool error = pai_shrink(tsdn, pai, edata, old_size, new_size, + deferred_work_generated); + if (error) { + return true; + } + pa_nactive_sub(shard, shrink_amount >> LG_PAGE); + + edata_szind_set(edata, szind); + emap_remap(tsdn, shard->emap, edata, szind, /* slab */ false); + return false; +} + +void +pa_dalloc(tsdn_t *tsdn, pa_shard_t *shard, edata_t *edata, + bool *deferred_work_generated) { + emap_remap(tsdn, shard->emap, edata, SC_NSIZES, /* slab */ false); + if (edata_slab_get(edata)) { + emap_deregister_interior(tsdn, shard->emap, edata); + /* + * The slab state of the extent isn't cleared. It may be used + * by the pai implementation, e.g. to make caching decisions. 
+ */ + } + edata_addr_set(edata, edata_base_get(edata)); + edata_szind_set(edata, SC_NSIZES); + pa_nactive_sub(shard, edata_size_get(edata) >> LG_PAGE); + pai_t *pai = pa_get_pai(shard, edata); + pai_dalloc(tsdn, pai, edata, deferred_work_generated); +} + +bool +pa_decay_ms_set(tsdn_t *tsdn, pa_shard_t *shard, extent_state_t state, + ssize_t decay_ms, pac_purge_eagerness_t eagerness) { + return pac_decay_ms_set(tsdn, &shard->pac, state, decay_ms, eagerness); +} + +ssize_t +pa_decay_ms_get(pa_shard_t *shard, extent_state_t state) { + return pac_decay_ms_get(&shard->pac, state); +} + +void +pa_shard_set_deferral_allowed(tsdn_t *tsdn, pa_shard_t *shard, + bool deferral_allowed) { + if (pa_shard_uses_hpa(shard)) { + hpa_shard_set_deferral_allowed(tsdn, &shard->hpa_shard, + deferral_allowed); + } +} + +void +pa_shard_do_deferred_work(tsdn_t *tsdn, pa_shard_t *shard) { + if (pa_shard_uses_hpa(shard)) { + hpa_shard_do_deferred_work(tsdn, &shard->hpa_shard); + } +} + +/* + * Get time until next deferred work ought to happen. If there are multiple + * things that have been deferred, this function calculates the time until + * the soonest of those things. + */ +uint64_t +pa_shard_time_until_deferred_work(tsdn_t *tsdn, pa_shard_t *shard) { + uint64_t time = pai_time_until_deferred_work(tsdn, &shard->pac.pai); + if (time == BACKGROUND_THREAD_DEFERRED_MIN) { + return time; + } + + if (pa_shard_uses_hpa(shard)) { + uint64_t hpa = + pai_time_until_deferred_work(tsdn, &shard->hpa_shard.pai); + if (hpa < time) { + time = hpa; + } + } + return time; +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/pa_extra.c b/src/duckdb/extension/jemalloc/jemalloc/src/pa_extra.c new file mode 100644 index 000000000..765070397 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/pa_extra.c @@ -0,0 +1,210 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +/* + * This file is logically part of the PA module. While pa.c contains the core + * allocator functionality, this file contains boring integration functionality; + * things like the pre- and post- fork handlers, and stats merging for CTL + * refreshes. 
+ */ + +void +pa_shard_prefork0(tsdn_t *tsdn, pa_shard_t *shard) { + malloc_mutex_prefork(tsdn, &shard->pac.decay_dirty.mtx); + malloc_mutex_prefork(tsdn, &shard->pac.decay_muzzy.mtx); +} + +void +pa_shard_prefork2(tsdn_t *tsdn, pa_shard_t *shard) { + if (shard->ever_used_hpa) { + sec_prefork2(tsdn, &shard->hpa_sec); + } +} + +void +pa_shard_prefork3(tsdn_t *tsdn, pa_shard_t *shard) { + malloc_mutex_prefork(tsdn, &shard->pac.grow_mtx); + if (shard->ever_used_hpa) { + hpa_shard_prefork3(tsdn, &shard->hpa_shard); + } +} + +void +pa_shard_prefork4(tsdn_t *tsdn, pa_shard_t *shard) { + ecache_prefork(tsdn, &shard->pac.ecache_dirty); + ecache_prefork(tsdn, &shard->pac.ecache_muzzy); + ecache_prefork(tsdn, &shard->pac.ecache_retained); + if (shard->ever_used_hpa) { + hpa_shard_prefork4(tsdn, &shard->hpa_shard); + } +} + +void +pa_shard_prefork5(tsdn_t *tsdn, pa_shard_t *shard) { + edata_cache_prefork(tsdn, &shard->edata_cache); +} + +void +pa_shard_postfork_parent(tsdn_t *tsdn, pa_shard_t *shard) { + edata_cache_postfork_parent(tsdn, &shard->edata_cache); + ecache_postfork_parent(tsdn, &shard->pac.ecache_dirty); + ecache_postfork_parent(tsdn, &shard->pac.ecache_muzzy); + ecache_postfork_parent(tsdn, &shard->pac.ecache_retained); + malloc_mutex_postfork_parent(tsdn, &shard->pac.grow_mtx); + malloc_mutex_postfork_parent(tsdn, &shard->pac.decay_dirty.mtx); + malloc_mutex_postfork_parent(tsdn, &shard->pac.decay_muzzy.mtx); + if (shard->ever_used_hpa) { + sec_postfork_parent(tsdn, &shard->hpa_sec); + hpa_shard_postfork_parent(tsdn, &shard->hpa_shard); + } +} + +void +pa_shard_postfork_child(tsdn_t *tsdn, pa_shard_t *shard) { + edata_cache_postfork_child(tsdn, &shard->edata_cache); + ecache_postfork_child(tsdn, &shard->pac.ecache_dirty); + ecache_postfork_child(tsdn, &shard->pac.ecache_muzzy); + ecache_postfork_child(tsdn, &shard->pac.ecache_retained); + malloc_mutex_postfork_child(tsdn, &shard->pac.grow_mtx); + malloc_mutex_postfork_child(tsdn, &shard->pac.decay_dirty.mtx); + malloc_mutex_postfork_child(tsdn, &shard->pac.decay_muzzy.mtx); + if (shard->ever_used_hpa) { + sec_postfork_child(tsdn, &shard->hpa_sec); + hpa_shard_postfork_child(tsdn, &shard->hpa_shard); + } +} + +size_t +pa_shard_nactive(pa_shard_t *shard) { + return atomic_load_zu(&shard->nactive, ATOMIC_RELAXED); +} + +size_t +pa_shard_ndirty(pa_shard_t *shard) { + size_t ndirty = ecache_npages_get(&shard->pac.ecache_dirty); + if (shard->ever_used_hpa) { + ndirty += psset_ndirty(&shard->hpa_shard.psset); + } + return ndirty; +} + +size_t +pa_shard_nmuzzy(pa_shard_t *shard) { + return ecache_npages_get(&shard->pac.ecache_muzzy); +} + +void +pa_shard_basic_stats_merge(pa_shard_t *shard, size_t *nactive, size_t *ndirty, + size_t *nmuzzy) { + *nactive += pa_shard_nactive(shard); + *ndirty += pa_shard_ndirty(shard); + *nmuzzy += pa_shard_nmuzzy(shard); +} + +void +pa_shard_stats_merge(tsdn_t *tsdn, pa_shard_t *shard, + pa_shard_stats_t *pa_shard_stats_out, pac_estats_t *estats_out, + hpa_shard_stats_t *hpa_stats_out, sec_stats_t *sec_stats_out, + size_t *resident) { + cassert(config_stats); + + pa_shard_stats_out->pac_stats.retained += + ecache_npages_get(&shard->pac.ecache_retained) << LG_PAGE; + pa_shard_stats_out->edata_avail += atomic_load_zu( + &shard->edata_cache.count, ATOMIC_RELAXED); + + size_t resident_pgs = 0; + resident_pgs += pa_shard_nactive(shard); + resident_pgs += pa_shard_ndirty(shard); + *resident += (resident_pgs << LG_PAGE); + + /* Dirty decay stats */ + locked_inc_u64_unsynchronized( + 
&pa_shard_stats_out->pac_stats.decay_dirty.npurge, + locked_read_u64(tsdn, LOCKEDINT_MTX(*shard->stats_mtx), + &shard->pac.stats->decay_dirty.npurge)); + locked_inc_u64_unsynchronized( + &pa_shard_stats_out->pac_stats.decay_dirty.nmadvise, + locked_read_u64(tsdn, LOCKEDINT_MTX(*shard->stats_mtx), + &shard->pac.stats->decay_dirty.nmadvise)); + locked_inc_u64_unsynchronized( + &pa_shard_stats_out->pac_stats.decay_dirty.purged, + locked_read_u64(tsdn, LOCKEDINT_MTX(*shard->stats_mtx), + &shard->pac.stats->decay_dirty.purged)); + + /* Muzzy decay stats */ + locked_inc_u64_unsynchronized( + &pa_shard_stats_out->pac_stats.decay_muzzy.npurge, + locked_read_u64(tsdn, LOCKEDINT_MTX(*shard->stats_mtx), + &shard->pac.stats->decay_muzzy.npurge)); + locked_inc_u64_unsynchronized( + &pa_shard_stats_out->pac_stats.decay_muzzy.nmadvise, + locked_read_u64(tsdn, LOCKEDINT_MTX(*shard->stats_mtx), + &shard->pac.stats->decay_muzzy.nmadvise)); + locked_inc_u64_unsynchronized( + &pa_shard_stats_out->pac_stats.decay_muzzy.purged, + locked_read_u64(tsdn, LOCKEDINT_MTX(*shard->stats_mtx), + &shard->pac.stats->decay_muzzy.purged)); + + atomic_load_add_store_zu(&pa_shard_stats_out->pac_stats.abandoned_vm, + atomic_load_zu(&shard->pac.stats->abandoned_vm, ATOMIC_RELAXED)); + + for (pszind_t i = 0; i < SC_NPSIZES; i++) { + size_t dirty, muzzy, retained, dirty_bytes, muzzy_bytes, + retained_bytes; + dirty = ecache_nextents_get(&shard->pac.ecache_dirty, i); + muzzy = ecache_nextents_get(&shard->pac.ecache_muzzy, i); + retained = ecache_nextents_get(&shard->pac.ecache_retained, i); + dirty_bytes = ecache_nbytes_get(&shard->pac.ecache_dirty, i); + muzzy_bytes = ecache_nbytes_get(&shard->pac.ecache_muzzy, i); + retained_bytes = ecache_nbytes_get(&shard->pac.ecache_retained, + i); + + estats_out[i].ndirty = dirty; + estats_out[i].nmuzzy = muzzy; + estats_out[i].nretained = retained; + estats_out[i].dirty_bytes = dirty_bytes; + estats_out[i].muzzy_bytes = muzzy_bytes; + estats_out[i].retained_bytes = retained_bytes; + } + + if (shard->ever_used_hpa) { + hpa_shard_stats_merge(tsdn, &shard->hpa_shard, hpa_stats_out); + sec_stats_merge(tsdn, &shard->hpa_sec, sec_stats_out); + } +} + +static void +pa_shard_mtx_stats_read_single(tsdn_t *tsdn, mutex_prof_data_t *mutex_prof_data, + malloc_mutex_t *mtx, int ind) { + malloc_mutex_lock(tsdn, mtx); + malloc_mutex_prof_read(tsdn, &mutex_prof_data[ind], mtx); + malloc_mutex_unlock(tsdn, mtx); +} + +void +pa_shard_mtx_stats_read(tsdn_t *tsdn, pa_shard_t *shard, + mutex_prof_data_t mutex_prof_data[mutex_prof_num_arena_mutexes]) { + pa_shard_mtx_stats_read_single(tsdn, mutex_prof_data, + &shard->edata_cache.mtx, arena_prof_mutex_extent_avail); + pa_shard_mtx_stats_read_single(tsdn, mutex_prof_data, + &shard->pac.ecache_dirty.mtx, arena_prof_mutex_extents_dirty); + pa_shard_mtx_stats_read_single(tsdn, mutex_prof_data, + &shard->pac.ecache_muzzy.mtx, arena_prof_mutex_extents_muzzy); + pa_shard_mtx_stats_read_single(tsdn, mutex_prof_data, + &shard->pac.ecache_retained.mtx, arena_prof_mutex_extents_retained); + pa_shard_mtx_stats_read_single(tsdn, mutex_prof_data, + &shard->pac.decay_dirty.mtx, arena_prof_mutex_decay_dirty); + pa_shard_mtx_stats_read_single(tsdn, mutex_prof_data, + &shard->pac.decay_muzzy.mtx, arena_prof_mutex_decay_muzzy); + + if (shard->ever_used_hpa) { + pa_shard_mtx_stats_read_single(tsdn, mutex_prof_data, + &shard->hpa_shard.mtx, arena_prof_mutex_hpa_shard); + pa_shard_mtx_stats_read_single(tsdn, mutex_prof_data, + &shard->hpa_shard.grow_mtx, + 
arena_prof_mutex_hpa_shard_grow); + sec_mutex_stats_read(tsdn, &shard->hpa_sec, + &mutex_prof_data[arena_prof_mutex_hpa_sec]); + } +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/pac.c b/src/duckdb/extension/jemalloc/jemalloc/src/pac.c new file mode 100644 index 000000000..57a0c953d --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/pac.c @@ -0,0 +1,592 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/pac.h" +#include "jemalloc/internal/san.h" + +static edata_t *pac_alloc_impl(tsdn_t *tsdn, pai_t *self, size_t size, + size_t alignment, bool zero, bool guarded, bool frequent_reuse, + bool *deferred_work_generated); +static bool pac_expand_impl(tsdn_t *tsdn, pai_t *self, edata_t *edata, + size_t old_size, size_t new_size, bool zero, bool *deferred_work_generated); +static bool pac_shrink_impl(tsdn_t *tsdn, pai_t *self, edata_t *edata, + size_t old_size, size_t new_size, bool *deferred_work_generated); +static void pac_dalloc_impl(tsdn_t *tsdn, pai_t *self, edata_t *edata, + bool *deferred_work_generated); +static uint64_t pac_time_until_deferred_work(tsdn_t *tsdn, pai_t *self); + +static inline void +pac_decay_data_get(pac_t *pac, extent_state_t state, + decay_t **r_decay, pac_decay_stats_t **r_decay_stats, ecache_t **r_ecache) { + switch(state) { + case extent_state_dirty: + *r_decay = &pac->decay_dirty; + *r_decay_stats = &pac->stats->decay_dirty; + *r_ecache = &pac->ecache_dirty; + return; + case extent_state_muzzy: + *r_decay = &pac->decay_muzzy; + *r_decay_stats = &pac->stats->decay_muzzy; + *r_ecache = &pac->ecache_muzzy; + return; + case extent_state_active: + case extent_state_retained: + case extent_state_transition: + case extent_state_merging: + default: + unreachable(); + } +} + +bool +pac_init(tsdn_t *tsdn, pac_t *pac, base_t *base, emap_t *emap, + edata_cache_t *edata_cache, nstime_t *cur_time, + size_t pac_oversize_threshold, ssize_t dirty_decay_ms, + ssize_t muzzy_decay_ms, pac_stats_t *pac_stats, malloc_mutex_t *stats_mtx) { + unsigned ind = base_ind_get(base); + /* + * Delay coalescing for dirty extents despite the disruptive effect on + * memory layout for best-fit extent allocation, since cached extents + * are likely to be reused soon after deallocation, and the cost of + * merging/splitting extents is non-trivial. + */ + if (ecache_init(tsdn, &pac->ecache_dirty, extent_state_dirty, ind, + /* delay_coalesce */ true)) { + return true; + } + /* + * Coalesce muzzy extents immediately, because operations on them are in + * the critical path much less often than for dirty extents. + */ + if (ecache_init(tsdn, &pac->ecache_muzzy, extent_state_muzzy, ind, + /* delay_coalesce */ false)) { + return true; + } + /* + * Coalesce retained extents immediately, in part because they will + * never be evicted (and therefore there's no opportunity for delayed + * coalescing), but also because operations on retained extents are not + * in the critical path. 
+ */ + if (ecache_init(tsdn, &pac->ecache_retained, extent_state_retained, + ind, /* delay_coalesce */ false)) { + return true; + } + exp_grow_init(&pac->exp_grow); + if (malloc_mutex_init(&pac->grow_mtx, "extent_grow", + WITNESS_RANK_EXTENT_GROW, malloc_mutex_rank_exclusive)) { + return true; + } + atomic_store_zu(&pac->oversize_threshold, pac_oversize_threshold, + ATOMIC_RELAXED); + if (decay_init(&pac->decay_dirty, cur_time, dirty_decay_ms)) { + return true; + } + if (decay_init(&pac->decay_muzzy, cur_time, muzzy_decay_ms)) { + return true; + } + if (san_bump_alloc_init(&pac->sba)) { + return true; + } + + pac->base = base; + pac->emap = emap; + pac->edata_cache = edata_cache; + pac->stats = pac_stats; + pac->stats_mtx = stats_mtx; + atomic_store_zu(&pac->extent_sn_next, 0, ATOMIC_RELAXED); + + pac->pai.alloc = &pac_alloc_impl; + pac->pai.alloc_batch = &pai_alloc_batch_default; + pac->pai.expand = &pac_expand_impl; + pac->pai.shrink = &pac_shrink_impl; + pac->pai.dalloc = &pac_dalloc_impl; + pac->pai.dalloc_batch = &pai_dalloc_batch_default; + pac->pai.time_until_deferred_work = &pac_time_until_deferred_work; + + return false; +} + +static inline bool +pac_may_have_muzzy(pac_t *pac) { + return pac_decay_ms_get(pac, extent_state_muzzy) != 0; +} + +static edata_t * +pac_alloc_real(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, size_t size, + size_t alignment, bool zero, bool guarded) { + assert(!guarded || alignment <= PAGE); + + edata_t *edata = ecache_alloc(tsdn, pac, ehooks, &pac->ecache_dirty, + NULL, size, alignment, zero, guarded); + + if (edata == NULL && pac_may_have_muzzy(pac)) { + edata = ecache_alloc(tsdn, pac, ehooks, &pac->ecache_muzzy, + NULL, size, alignment, zero, guarded); + } + if (edata == NULL) { + edata = ecache_alloc_grow(tsdn, pac, ehooks, + &pac->ecache_retained, NULL, size, alignment, zero, + guarded); + if (config_stats && edata != NULL) { + atomic_fetch_add_zu(&pac->stats->pac_mapped, size, + ATOMIC_RELAXED); + } + } + + return edata; +} + +static edata_t * +pac_alloc_new_guarded(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, size_t size, + size_t alignment, bool zero, bool frequent_reuse) { + assert(alignment <= PAGE); + + edata_t *edata; + if (san_bump_enabled() && frequent_reuse) { + edata = san_bump_alloc(tsdn, &pac->sba, pac, ehooks, size, + zero); + } else { + size_t size_with_guards = san_two_side_guarded_sz(size); + /* Alloc a non-guarded extent first.*/ + edata = pac_alloc_real(tsdn, pac, ehooks, size_with_guards, + /* alignment */ PAGE, zero, /* guarded */ false); + if (edata != NULL) { + /* Add guards around it. */ + assert(edata_size_get(edata) == size_with_guards); + san_guard_pages_two_sided(tsdn, ehooks, edata, + pac->emap, true); + } + } + assert(edata == NULL || (edata_guarded_get(edata) && + edata_size_get(edata) == size)); + + return edata; +} + +static edata_t * +pac_alloc_impl(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment, + bool zero, bool guarded, bool frequent_reuse, + bool *deferred_work_generated) { + pac_t *pac = (pac_t *)self; + ehooks_t *ehooks = pac_ehooks_get(pac); + + edata_t *edata = NULL; + /* + * The condition is an optimization - not frequently reused guarded + * allocations are never put in the ecache. pac_alloc_real also + * doesn't grow retained for guarded allocations. So pac_alloc_real + * for such allocations would always return NULL. 
+ * */ + if (!guarded || frequent_reuse) { + edata = pac_alloc_real(tsdn, pac, ehooks, size, alignment, + zero, guarded); + } + if (edata == NULL && guarded) { + /* No cached guarded extents; creating a new one. */ + edata = pac_alloc_new_guarded(tsdn, pac, ehooks, size, + alignment, zero, frequent_reuse); + } + + return edata; +} + +static bool +pac_expand_impl(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size, + size_t new_size, bool zero, bool *deferred_work_generated) { + pac_t *pac = (pac_t *)self; + ehooks_t *ehooks = pac_ehooks_get(pac); + + size_t mapped_add = 0; + size_t expand_amount = new_size - old_size; + + if (ehooks_merge_will_fail(ehooks)) { + return true; + } + edata_t *trail = ecache_alloc(tsdn, pac, ehooks, &pac->ecache_dirty, + edata, expand_amount, PAGE, zero, /* guarded*/ false); + if (trail == NULL) { + trail = ecache_alloc(tsdn, pac, ehooks, &pac->ecache_muzzy, + edata, expand_amount, PAGE, zero, /* guarded*/ false); + } + if (trail == NULL) { + trail = ecache_alloc_grow(tsdn, pac, ehooks, + &pac->ecache_retained, edata, expand_amount, PAGE, zero, + /* guarded */ false); + mapped_add = expand_amount; + } + if (trail == NULL) { + return true; + } + if (extent_merge_wrapper(tsdn, pac, ehooks, edata, trail)) { + extent_dalloc_wrapper(tsdn, pac, ehooks, trail); + return true; + } + if (config_stats && mapped_add > 0) { + atomic_fetch_add_zu(&pac->stats->pac_mapped, mapped_add, + ATOMIC_RELAXED); + } + return false; +} + +static bool +pac_shrink_impl(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size, + size_t new_size, bool *deferred_work_generated) { + pac_t *pac = (pac_t *)self; + ehooks_t *ehooks = pac_ehooks_get(pac); + + size_t shrink_amount = old_size - new_size; + + if (ehooks_split_will_fail(ehooks)) { + return true; + } + + edata_t *trail = extent_split_wrapper(tsdn, pac, ehooks, edata, + new_size, shrink_amount, /* holding_core_locks */ false); + if (trail == NULL) { + return true; + } + ecache_dalloc(tsdn, pac, ehooks, &pac->ecache_dirty, trail); + *deferred_work_generated = true; + return false; +} + +static void +pac_dalloc_impl(tsdn_t *tsdn, pai_t *self, edata_t *edata, + bool *deferred_work_generated) { + pac_t *pac = (pac_t *)self; + ehooks_t *ehooks = pac_ehooks_get(pac); + + if (edata_guarded_get(edata)) { + /* + * Because cached guarded extents do exact fit only, large + * guarded extents are restored on dalloc eagerly (otherwise + * they will not be reused efficiently). Slab sizes have a + * limited number of size classes, and tend to cycle faster. + * + * In the case where coalesce is restrained (VirtualFree on + * Windows), guarded extents are also not cached -- otherwise + * during arena destroy / reset, the retained extents would not + * be whole regions (i.e. they are split between regular and + * guarded). + */ + if (!edata_slab_get(edata) || !maps_coalesce) { + assert(edata_size_get(edata) >= SC_LARGE_MINCLASS || + !maps_coalesce); + san_unguard_pages_two_sided(tsdn, ehooks, edata, + pac->emap); + } + } + + ecache_dalloc(tsdn, pac, ehooks, &pac->ecache_dirty, edata); + /* Purging of deallocated pages is deferred */ + *deferred_work_generated = true; +} + +static inline uint64_t +pac_ns_until_purge(tsdn_t *tsdn, decay_t *decay, size_t npages) { + if (malloc_mutex_trylock(tsdn, &decay->mtx)) { + /* Use minimal interval if decay is contended. 
*/ + return BACKGROUND_THREAD_DEFERRED_MIN; + } + uint64_t result = decay_ns_until_purge(decay, npages, + ARENA_DEFERRED_PURGE_NPAGES_THRESHOLD); + + malloc_mutex_unlock(tsdn, &decay->mtx); + return result; +} + +static uint64_t +pac_time_until_deferred_work(tsdn_t *tsdn, pai_t *self) { + uint64_t time; + pac_t *pac = (pac_t *)self; + + time = pac_ns_until_purge(tsdn, + &pac->decay_dirty, + ecache_npages_get(&pac->ecache_dirty)); + if (time == BACKGROUND_THREAD_DEFERRED_MIN) { + return time; + } + + uint64_t muzzy = pac_ns_until_purge(tsdn, + &pac->decay_muzzy, + ecache_npages_get(&pac->ecache_muzzy)); + if (muzzy < time) { + time = muzzy; + } + return time; +} + +bool +pac_retain_grow_limit_get_set(tsdn_t *tsdn, pac_t *pac, size_t *old_limit, + size_t *new_limit) { + pszind_t new_ind JEMALLOC_CC_SILENCE_INIT(0); + if (new_limit != NULL) { + size_t limit = *new_limit; + /* Grow no more than the new limit. */ + if ((new_ind = sz_psz2ind(limit + 1) - 1) >= SC_NPSIZES) { + return true; + } + } + + malloc_mutex_lock(tsdn, &pac->grow_mtx); + if (old_limit != NULL) { + *old_limit = sz_pind2sz(pac->exp_grow.limit); + } + if (new_limit != NULL) { + pac->exp_grow.limit = new_ind; + } + malloc_mutex_unlock(tsdn, &pac->grow_mtx); + + return false; +} + +static size_t +pac_stash_decayed(tsdn_t *tsdn, pac_t *pac, ecache_t *ecache, + size_t npages_limit, size_t npages_decay_max, + edata_list_inactive_t *result) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + ehooks_t *ehooks = pac_ehooks_get(pac); + + /* Stash extents according to npages_limit. */ + size_t nstashed = 0; + while (nstashed < npages_decay_max) { + edata_t *edata = ecache_evict(tsdn, pac, ehooks, ecache, + npages_limit); + if (edata == NULL) { + break; + } + edata_list_inactive_append(result, edata); + nstashed += edata_size_get(edata) >> LG_PAGE; + } + return nstashed; +} + +static size_t +pac_decay_stashed(tsdn_t *tsdn, pac_t *pac, decay_t *decay, + pac_decay_stats_t *decay_stats, ecache_t *ecache, bool fully_decay, + edata_list_inactive_t *decay_extents) { + bool err; + + size_t nmadvise = 0; + size_t nunmapped = 0; + size_t npurged = 0; + + ehooks_t *ehooks = pac_ehooks_get(pac); + + bool try_muzzy = !fully_decay + && pac_decay_ms_get(pac, extent_state_muzzy) != 0; + + for (edata_t *edata = edata_list_inactive_first(decay_extents); edata != + NULL; edata = edata_list_inactive_first(decay_extents)) { + edata_list_inactive_remove(decay_extents, edata); + + size_t size = edata_size_get(edata); + size_t npages = size >> LG_PAGE; + + nmadvise++; + npurged += npages; + + switch (ecache->state) { + case extent_state_dirty: + if (try_muzzy) { + err = extent_purge_lazy_wrapper(tsdn, ehooks, + edata, /* offset */ 0, size); + if (!err) { + ecache_dalloc(tsdn, pac, ehooks, + &pac->ecache_muzzy, edata); + break; + } + } + JEMALLOC_FALLTHROUGH; + case extent_state_muzzy: + extent_dalloc_wrapper(tsdn, pac, ehooks, edata); + nunmapped += npages; + break; + case extent_state_active: + case extent_state_retained: + case extent_state_transition: + case extent_state_merging: + default: + not_reached(); + } + } + + if (config_stats) { + LOCKEDINT_MTX_LOCK(tsdn, *pac->stats_mtx); + locked_inc_u64(tsdn, LOCKEDINT_MTX(*pac->stats_mtx), + &decay_stats->npurge, 1); + locked_inc_u64(tsdn, LOCKEDINT_MTX(*pac->stats_mtx), + &decay_stats->nmadvise, nmadvise); + locked_inc_u64(tsdn, LOCKEDINT_MTX(*pac->stats_mtx), + &decay_stats->purged, npurged); + LOCKEDINT_MTX_UNLOCK(tsdn, *pac->stats_mtx); + 
atomic_fetch_sub_zu(&pac->stats->pac_mapped, + nunmapped << LG_PAGE, ATOMIC_RELAXED); + } + + return npurged; +} + +/* + * npages_limit: Decay at most npages_decay_max pages without violating the + * invariant: (ecache_npages_get(ecache) >= npages_limit). We need an upper + * bound on number of pages in order to prevent unbounded growth (namely in + * stashed), otherwise unbounded new pages could be added to extents during the + * current decay run, so that the purging thread never finishes. + */ +static void +pac_decay_to_limit(tsdn_t *tsdn, pac_t *pac, decay_t *decay, + pac_decay_stats_t *decay_stats, ecache_t *ecache, bool fully_decay, + size_t npages_limit, size_t npages_decay_max) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 1); + + if (decay->purging || npages_decay_max == 0) { + return; + } + decay->purging = true; + malloc_mutex_unlock(tsdn, &decay->mtx); + + edata_list_inactive_t decay_extents; + edata_list_inactive_init(&decay_extents); + size_t npurge = pac_stash_decayed(tsdn, pac, ecache, npages_limit, + npages_decay_max, &decay_extents); + if (npurge != 0) { + size_t npurged = pac_decay_stashed(tsdn, pac, decay, + decay_stats, ecache, fully_decay, &decay_extents); + assert(npurged == npurge); + } + + malloc_mutex_lock(tsdn, &decay->mtx); + decay->purging = false; +} + +void +pac_decay_all(tsdn_t *tsdn, pac_t *pac, decay_t *decay, + pac_decay_stats_t *decay_stats, ecache_t *ecache, bool fully_decay) { + malloc_mutex_assert_owner(tsdn, &decay->mtx); + pac_decay_to_limit(tsdn, pac, decay, decay_stats, ecache, fully_decay, + /* npages_limit */ 0, ecache_npages_get(ecache)); +} + +static void +pac_decay_try_purge(tsdn_t *tsdn, pac_t *pac, decay_t *decay, + pac_decay_stats_t *decay_stats, ecache_t *ecache, + size_t current_npages, size_t npages_limit) { + if (current_npages > npages_limit) { + pac_decay_to_limit(tsdn, pac, decay, decay_stats, ecache, + /* fully_decay */ false, npages_limit, + current_npages - npages_limit); + } +} + +bool +pac_maybe_decay_purge(tsdn_t *tsdn, pac_t *pac, decay_t *decay, + pac_decay_stats_t *decay_stats, ecache_t *ecache, + pac_purge_eagerness_t eagerness) { + malloc_mutex_assert_owner(tsdn, &decay->mtx); + + /* Purge all or nothing if the option is disabled. */ + ssize_t decay_ms = decay_ms_read(decay); + if (decay_ms <= 0) { + if (decay_ms == 0) { + pac_decay_to_limit(tsdn, pac, decay, decay_stats, + ecache, /* fully_decay */ false, + /* npages_limit */ 0, ecache_npages_get(ecache)); + } + return false; + } + + /* + * If the deadline has been reached, advance to the current epoch and + * purge to the new limit if necessary. Note that dirty pages created + * during the current epoch are not subject to purge until a future + * epoch, so as a result purging only happens during epoch advances, or + * being triggered by background threads (scheduled event). 
+ */ + nstime_t time; + nstime_init_update(&time); + size_t npages_current = ecache_npages_get(ecache); + bool epoch_advanced = decay_maybe_advance_epoch(decay, &time, + npages_current); + if (eagerness == PAC_PURGE_ALWAYS + || (epoch_advanced && eagerness == PAC_PURGE_ON_EPOCH_ADVANCE)) { + size_t npages_limit = decay_npages_limit_get(decay); + pac_decay_try_purge(tsdn, pac, decay, decay_stats, ecache, + npages_current, npages_limit); + } + + return epoch_advanced; +} + +bool +pac_decay_ms_set(tsdn_t *tsdn, pac_t *pac, extent_state_t state, + ssize_t decay_ms, pac_purge_eagerness_t eagerness) { + decay_t *decay; + pac_decay_stats_t *decay_stats; + ecache_t *ecache; + pac_decay_data_get(pac, state, &decay, &decay_stats, &ecache); + + if (!decay_ms_valid(decay_ms)) { + return true; + } + + malloc_mutex_lock(tsdn, &decay->mtx); + /* + * Restart decay backlog from scratch, which may cause many dirty pages + * to be immediately purged. It would conceptually be possible to map + * the old backlog onto the new backlog, but there is no justification + * for such complexity since decay_ms changes are intended to be + * infrequent, either between the {-1, 0, >0} states, or a one-time + * arbitrary change during initial arena configuration. + */ + nstime_t cur_time; + nstime_init_update(&cur_time); + decay_reinit(decay, &cur_time, decay_ms); + pac_maybe_decay_purge(tsdn, pac, decay, decay_stats, ecache, eagerness); + malloc_mutex_unlock(tsdn, &decay->mtx); + + return false; +} + +ssize_t +pac_decay_ms_get(pac_t *pac, extent_state_t state) { + decay_t *decay; + pac_decay_stats_t *decay_stats; + ecache_t *ecache; + pac_decay_data_get(pac, state, &decay, &decay_stats, &ecache); + return decay_ms_read(decay); +} + +void +pac_reset(tsdn_t *tsdn, pac_t *pac) { + /* + * No-op for now; purging is still done at the arena-level. It should + * get moved in here, though. + */ + (void)tsdn; + (void)pac; +} + +void +pac_destroy(tsdn_t *tsdn, pac_t *pac) { + assert(ecache_npages_get(&pac->ecache_dirty) == 0); + assert(ecache_npages_get(&pac->ecache_muzzy) == 0); + /* + * Iterate over the retained extents and destroy them. This gives the + * extent allocator underlying the extent hooks an opportunity to unmap + * all retained memory without having to keep its own metadata + * structures. In practice, virtual memory for dss-allocated extents is + * leaked here, so best practice is to avoid dss for arenas to be + * destroyed, or provide custom extent hooks that track retained + * dss-based extents for later reuse. 
+ */ + ehooks_t *ehooks = pac_ehooks_get(pac); + edata_t *edata; + while ((edata = ecache_evict(tsdn, pac, ehooks, + &pac->ecache_retained, 0)) != NULL) { + extent_destroy_wrapper(tsdn, pac, ehooks, edata); + } +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/pages.c b/src/duckdb/extension/jemalloc/jemalloc/src/pages.c new file mode 100644 index 000000000..1657fa811 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/pages.c @@ -0,0 +1,853 @@ +#include "jemalloc/internal/jemalloc_preamble.h" + +#include "jemalloc/internal/pages.h" + +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/malloc_io.h" + +#ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT +#include <sys/sysctl.h> +#ifdef __FreeBSD__ +#include <vm/vm_param.h> +#endif +#endif +#ifdef __NetBSD__ +#include <sys/bitops.h> /* ilog2 */ +#endif +#ifdef JEMALLOC_HAVE_VM_MAKE_TAG +#define PAGES_FD_TAG VM_MAKE_TAG(254U) +#else +#define PAGES_FD_TAG -1 +#endif +#if defined(JEMALLOC_HAVE_PRCTL) && defined(JEMALLOC_PAGEID) +#include <sys/prctl.h> +#ifndef PR_SET_VMA +#define PR_SET_VMA 0x53564d41 +#define PR_SET_VMA_ANON_NAME 0 +#endif +#endif + +/******************************************************************************/ +/* Data. */ + +/* Actual operating system page size, detected during bootstrap, <= PAGE. */ +size_t os_page; + +#ifndef _WIN32 +# define PAGES_PROT_COMMIT (PROT_READ | PROT_WRITE) +# define PAGES_PROT_DECOMMIT (PROT_NONE) +static int mmap_flags; +#endif +static bool os_overcommits; + +const char *const thp_mode_names[] = { + "default", + "always", + "never", + "not supported" +}; +thp_mode_t opt_thp = THP_MODE_DEFAULT; +thp_mode_t init_system_thp_mode; + +/* Runtime support for lazy purge. Irrelevant when !pages_can_purge_lazy. */ +static bool pages_can_purge_lazy_runtime = true; + +#ifdef JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS +static int madvise_dont_need_zeros_is_faulty = -1; +/** + * Check that MADV_DONTNEED will actually zero pages on subsequent access. + * + * Since qemu does not support this yet [1], you can hit a very tricky + * assert if you run a program that uses jemalloc under qemu: + * + * <jemalloc>: ../contrib/jemalloc/src/extent.c:1195: Failed assertion: "p[i] == 0" + * + * [1]: https://patchwork.kernel.org/patch/10576637/ + */ +static int madvise_MADV_DONTNEED_zeroes_pages(void) +{ + size_t size = PAGE; + + void * addr = mmap(NULL, size, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + + if (addr == MAP_FAILED) { + malloc_write("<jemalloc>: Cannot allocate memory for " + "MADV_DONTNEED check\n"); + if (opt_abort) { + abort(); + } + } + + memset(addr, 'A', size); + int works; + if (madvise(addr, size, MADV_DONTNEED) == 0) { + works = memchr(addr, 'A', size) == NULL; + } else { + /* + * If madvise() does not support MADV_DONTNEED, then we can + * call it anyway, and use its return code.
+ */ + works = 1; + } + + if (munmap(addr, size) != 0) { + malloc_write("<jemalloc>: Cannot deallocate memory for " + "MADV_DONTNEED check\n"); + if (opt_abort) { + abort(); + } + } + + return works; +} +#endif + +#ifdef JEMALLOC_PAGEID +static int os_page_id(void *addr, size_t size, const char *name) +{ +#ifdef JEMALLOC_HAVE_PRCTL + /* + * While parsing the `/proc/<pid>/maps` file, the block could appear as + * `7f4836000000-7f4836800000 rw-p 00000000 00:00 0 [anon:jemalloc_pg_overcommit]` + */ + return prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, (uintptr_t)addr, size, + (uintptr_t)name); +#else + return 0; +#endif +} +#endif + +/******************************************************************************/ +/* + * Function prototypes for static functions that are referenced prior to + * definition. + */ + +static void os_pages_unmap(void *addr, size_t size); + +/******************************************************************************/ + +static void * +os_pages_map(void *addr, size_t size, size_t alignment, bool *commit) { + assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr); + assert(ALIGNMENT_CEILING(size, os_page) == size); + assert(size != 0); + + if (os_overcommits) { + *commit = true; + } + + void *ret; +#ifdef _WIN32 + /* + * If VirtualAlloc can't allocate at the given address when one is + * given, it fails and returns NULL. + */ + ret = VirtualAlloc(addr, size, MEM_RESERVE | (*commit ? MEM_COMMIT : 0), + PAGE_READWRITE); +#else + /* + * We don't use MAP_FIXED here, because it can cause the *replacement* + * of existing mappings, and we only want to create new mappings. + */ + { + int flags = mmap_flags; +#ifdef __NetBSD__ + /* + * On NetBSD PAGE for a platform is defined to the + * maximum page size of all machine architectures + * for that platform, so that we can use the same + * binaries across all machine architectures. + */ + if (alignment > os_page || PAGE > os_page) { + unsigned int a = ilog2(MAX(alignment, PAGE)); + flags |= MAP_ALIGNED(a); + } +#endif + int prot = *commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT; + + ret = mmap(addr, size, prot, flags, PAGES_FD_TAG, 0); + } + assert(ret != NULL); + + if (ret == MAP_FAILED) { + ret = NULL; + } else if (addr != NULL && ret != addr) { + /* + * We succeeded in mapping memory, but not in the right place. + */ + os_pages_unmap(ret, size); + ret = NULL; + } +#endif + assert(ret == NULL || (addr == NULL && ret != addr) || (addr != NULL && + ret == addr)); +#ifdef JEMALLOC_PAGEID + int n = os_page_id(ret, size, + os_overcommits ?
"jemalloc_pg_overcommit" : "jemalloc_pg"); + assert(n == 0 || (n == -1 && get_errno() == EINVAL)); +#endif + return ret; +} + +static void * +os_pages_trim(void *addr, size_t alloc_size, size_t leadsize, size_t size, + bool *commit) { + void *ret = (void *)((byte_t *)addr + leadsize); + + assert(alloc_size >= leadsize + size); +#ifdef _WIN32 + os_pages_unmap(addr, alloc_size); + void *new_addr = os_pages_map(ret, size, PAGE, commit); + if (new_addr == ret) { + return ret; + } + if (new_addr != NULL) { + os_pages_unmap(new_addr, size); + } + return NULL; +#else + size_t trailsize = alloc_size - leadsize - size; + + if (leadsize != 0) { + os_pages_unmap(addr, leadsize); + } + if (trailsize != 0) { + os_pages_unmap((void *)((byte_t *)ret + size), trailsize); + } + return ret; +#endif +} + +static void +os_pages_unmap(void *addr, size_t size) { + assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr); + assert(ALIGNMENT_CEILING(size, os_page) == size); + +#ifdef _WIN32 + if (VirtualFree(addr, 0, MEM_RELEASE) == 0) +#else + if (munmap(addr, size) == -1) +#endif + { + char buf[BUFERROR_BUF]; + + buferror(get_errno(), buf, sizeof(buf)); + malloc_printf(": Error in " +#ifdef _WIN32 + "VirtualFree" +#else + "munmap" +#endif + "(): %s\n", buf); + if (opt_abort) { + abort(); + } + } +} + +static void * +pages_map_slow(size_t size, size_t alignment, bool *commit) { + size_t alloc_size = size + alignment - os_page; + /* Beware size_t wrap-around. */ + if (alloc_size < size) { + return NULL; + } + + void *ret; + do { + void *pages = os_pages_map(NULL, alloc_size, alignment, commit); + if (pages == NULL) { + return NULL; + } + size_t leadsize = ALIGNMENT_CEILING((uintptr_t)pages, alignment) + - (uintptr_t)pages; + ret = os_pages_trim(pages, alloc_size, leadsize, size, commit); + } while (ret == NULL); + + assert(ret != NULL); + assert(PAGE_ADDR2BASE(ret) == ret); + return ret; +} + +void * +pages_map(void *addr, size_t size, size_t alignment, bool *commit) { + assert(alignment >= PAGE); + assert(ALIGNMENT_ADDR2BASE(addr, alignment) == addr); + +#if defined(__FreeBSD__) && defined(MAP_EXCL) + /* + * FreeBSD has mechanisms both to mmap at specific address without + * touching existing mappings, and to mmap with specific alignment. + */ + { + if (os_overcommits) { + *commit = true; + } + + int prot = *commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT; + int flags = mmap_flags; + + if (addr != NULL) { + flags |= MAP_FIXED | MAP_EXCL; + } else { + unsigned alignment_bits = ffs_zu(alignment); + assert(alignment_bits > 0); + flags |= MAP_ALIGNED(alignment_bits); + } + + void *ret = mmap(addr, size, prot, flags, -1, 0); + if (ret == MAP_FAILED) { + ret = NULL; + } + + return ret; + } +#endif + /* + * Ideally, there would be a way to specify alignment to mmap() (like + * NetBSD has), but in the absence of such a feature, we have to work + * hard to efficiently create aligned mappings. The reliable, but + * slow method is to create a mapping that is over-sized, then trim the + * excess. However, that always results in one or two calls to + * os_pages_unmap(), and it can leave holes in the process's virtual + * memory map if memory grows downward. + * + * Optimistically try mapping precisely the right amount before falling + * back to the slow method, with the expectation that the optimistic + * approach works most of the time. 
+ */ + + void *ret = os_pages_map(addr, size, os_page, commit); + if (ret == NULL || ret == addr) { + return ret; + } + assert(addr == NULL); + if (ALIGNMENT_ADDR2OFFSET(ret, alignment) != 0) { + os_pages_unmap(ret, size); + return pages_map_slow(size, alignment, commit); + } + + assert(PAGE_ADDR2BASE(ret) == ret); + return ret; +} + +void +pages_unmap(void *addr, size_t size) { + assert(PAGE_ADDR2BASE(addr) == addr); + assert(PAGE_CEILING(size) == size); + + os_pages_unmap(addr, size); +} + +static bool +os_pages_commit(void *addr, size_t size, bool commit) { + assert(PAGE_ADDR2BASE(addr) == addr); + assert(PAGE_CEILING(size) == size); + +#ifdef _WIN32 + return (commit ? (addr != VirtualAlloc(addr, size, MEM_COMMIT, + PAGE_READWRITE)) : (!VirtualFree(addr, size, MEM_DECOMMIT))); +#else + { + int prot = commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT; + void *result = mmap(addr, size, prot, mmap_flags | MAP_FIXED, + PAGES_FD_TAG, 0); + if (result == MAP_FAILED) { + return true; + } + if (result != addr) { + /* + * We succeeded in mapping memory, but not in the right + * place. + */ + os_pages_unmap(result, size); + return true; + } + return false; + } +#endif +} + +static bool +pages_commit_impl(void *addr, size_t size, bool commit) { + if (os_overcommits) { + return true; + } + + return os_pages_commit(addr, size, commit); +} + +bool +pages_commit(void *addr, size_t size) { + return pages_commit_impl(addr, size, true); +} + +bool +pages_decommit(void *addr, size_t size) { + return pages_commit_impl(addr, size, false); +} + +void +pages_mark_guards(void *head, void *tail) { + assert(head != NULL || tail != NULL); + assert(head == NULL || tail == NULL || + (uintptr_t)head < (uintptr_t)tail); +#ifdef JEMALLOC_HAVE_MPROTECT + if (head != NULL) { + mprotect(head, PAGE, PROT_NONE); + } + if (tail != NULL) { + mprotect(tail, PAGE, PROT_NONE); + } +#else + /* Decommit sets to PROT_NONE / MEM_DECOMMIT. */ + if (head != NULL) { + os_pages_commit(head, PAGE, false); + } + if (tail != NULL) { + os_pages_commit(tail, PAGE, false); + } +#endif +} + +void +pages_unmark_guards(void *head, void *tail) { + assert(head != NULL || tail != NULL); + assert(head == NULL || tail == NULL || + (uintptr_t)head < (uintptr_t)tail); +#ifdef JEMALLOC_HAVE_MPROTECT + bool head_and_tail = (head != NULL) && (tail != NULL); + size_t range = head_and_tail ? + (uintptr_t)tail - (uintptr_t)head + PAGE : + SIZE_T_MAX; + /* + * The amount of work that the kernel does in mprotect depends on the + * range argument. SC_LARGE_MINCLASS is an arbitrary threshold chosen + * to prevent kernel from doing too much work that would outweigh the + * savings of performing one less system call. + */ + bool ranged_mprotect = head_and_tail && range <= SC_LARGE_MINCLASS; + if (ranged_mprotect) { + mprotect(head, range, PROT_READ | PROT_WRITE); + } else { + if (head != NULL) { + mprotect(head, PAGE, PROT_READ | PROT_WRITE); + } + if (tail != NULL) { + mprotect(tail, PAGE, PROT_READ | PROT_WRITE); + } + } +#else + if (head != NULL) { + os_pages_commit(head, PAGE, true); + } + if (tail != NULL) { + os_pages_commit(tail, PAGE, true); + } +#endif +} + +bool +pages_purge_lazy(void *addr, size_t size) { + assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr); + assert(PAGE_CEILING(size) == size); + + if (!pages_can_purge_lazy) { + return true; + } + if (!pages_can_purge_lazy_runtime) { + /* + * Built with lazy purge enabled, but detected it was not + * supported on the current system. 
+ */ + return true; + } + +#ifdef _WIN32 + VirtualAlloc(addr, size, MEM_RESET, PAGE_READWRITE); + return false; +#elif defined(JEMALLOC_PURGE_MADVISE_FREE) + return (madvise(addr, size, +# ifdef MADV_FREE + MADV_FREE +# else + JEMALLOC_MADV_FREE +# endif + ) != 0); +#elif defined(JEMALLOC_PURGE_MADVISE_DONTNEED) && \ + !defined(JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS) + return (madvise(addr, size, MADV_DONTNEED) != 0); +#elif defined(JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED) && \ + !defined(JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED_ZEROS) + return (posix_madvise(addr, size, POSIX_MADV_DONTNEED) != 0); +#else + not_reached(); +#endif +} + +bool +pages_purge_forced(void *addr, size_t size) { + assert(PAGE_ADDR2BASE(addr) == addr); + assert(PAGE_CEILING(size) == size); + + if (!pages_can_purge_forced) { + return true; + } + +#if defined(JEMALLOC_PURGE_MADVISE_DONTNEED) && \ + defined(JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS) + return (unlikely(madvise_dont_need_zeros_is_faulty) || + madvise(addr, size, MADV_DONTNEED) != 0); +#elif defined(JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED) && \ + defined(JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED_ZEROS) + return (unlikely(madvise_dont_need_zeros_is_faulty) || + posix_madvise(addr, size, POSIX_MADV_DONTNEED) != 0); +#elif defined(JEMALLOC_MAPS_COALESCE) + /* Try to overlay a new demand-zeroed mapping. */ + return pages_commit(addr, size); +#else + not_reached(); +#endif +} + +static bool +pages_huge_impl(void *addr, size_t size, bool aligned) { + if (aligned) { + assert(HUGEPAGE_ADDR2BASE(addr) == addr); + assert(HUGEPAGE_CEILING(size) == size); + } +#if defined(JEMALLOC_HAVE_MADVISE_HUGE) + return (madvise(addr, size, MADV_HUGEPAGE) != 0); +#elif defined(JEMALLOC_HAVE_MEMCNTL) + struct memcntl_mha m = {0}; + m.mha_cmd = MHA_MAPSIZE_VA; + m.mha_pagesize = HUGEPAGE; + return (memcntl(addr, size, MC_HAT_ADVISE, (caddr_t)&m, 0, 0) == 0); +#else + return true; +#endif +} + +bool +pages_huge(void *addr, size_t size) { + return pages_huge_impl(addr, size, true); +} + +static bool +pages_huge_unaligned(void *addr, size_t size) { + return pages_huge_impl(addr, size, false); +} + +static bool +pages_nohuge_impl(void *addr, size_t size, bool aligned) { + if (aligned) { + assert(HUGEPAGE_ADDR2BASE(addr) == addr); + assert(HUGEPAGE_CEILING(size) == size); + } + +#ifdef JEMALLOC_HAVE_MADVISE_HUGE + return (madvise(addr, size, MADV_NOHUGEPAGE) != 0); +#else + return false; +#endif +} + +bool +pages_nohuge(void *addr, size_t size) { + return pages_nohuge_impl(addr, size, true); +} + +static bool +pages_nohuge_unaligned(void *addr, size_t size) { + return pages_nohuge_impl(addr, size, false); +} + +bool +pages_dontdump(void *addr, size_t size) { + assert(PAGE_ADDR2BASE(addr) == addr); + assert(PAGE_CEILING(size) == size); +#if defined(JEMALLOC_MADVISE_DONTDUMP) + return madvise(addr, size, MADV_DONTDUMP) != 0; +#elif defined(JEMALLOC_MADVISE_NOCORE) + return madvise(addr, size, MADV_NOCORE) != 0; +#else + return false; +#endif +} + +bool +pages_dodump(void *addr, size_t size) { + assert(PAGE_ADDR2BASE(addr) == addr); + assert(PAGE_CEILING(size) == size); +#if defined(JEMALLOC_MADVISE_DONTDUMP) + return madvise(addr, size, MADV_DODUMP) != 0; +#elif defined(JEMALLOC_MADVISE_NOCORE) + return madvise(addr, size, MADV_CORE) != 0; +#else + return false; +#endif +} + + +static size_t +os_page_detect(void) { +#ifdef _WIN32 + SYSTEM_INFO si; + GetSystemInfo(&si); + return si.dwPageSize; +#elif defined(__FreeBSD__) + /* + * This returns the value obtained from + * the auxv vector, avoiding a 
syscall. + */ + return getpagesize(); +#else + long result = sysconf(_SC_PAGESIZE); + if (result == -1) { + return LG_PAGE; + } + return (size_t)result; +#endif +} + +#ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT +static bool +os_overcommits_sysctl(void) { + int vm_overcommit; + size_t sz; + + sz = sizeof(vm_overcommit); +#if defined(__FreeBSD__) && defined(VM_OVERCOMMIT) + int mib[2]; + + mib[0] = CTL_VM; + mib[1] = VM_OVERCOMMIT; + if (sysctl(mib, 2, &vm_overcommit, &sz, NULL, 0) != 0) { + return false; /* Error. */ + } +#else + if (sysctlbyname("vm.overcommit", &vm_overcommit, &sz, NULL, 0) != 0) { + return false; /* Error. */ + } +#endif + + return ((vm_overcommit & 0x3) == 0); +} +#endif + +#ifdef JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY +/* + * Use syscall(2) rather than {open,read,close}(2) when possible to avoid + * reentry during bootstrapping if another library has interposed system call + * wrappers. + */ +static bool +os_overcommits_proc(void) { + int fd; + char buf[1] = {'0'}; + +#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_open) + #if defined(O_CLOEXEC) + fd = (int)syscall(SYS_open, "/proc/sys/vm/overcommit_memory", O_RDONLY | + O_CLOEXEC); + #else + fd = (int)syscall(SYS_open, "/proc/sys/vm/overcommit_memory", O_RDONLY); + if (fd != -1) { + fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC); + } + #endif +#elif defined(JEMALLOC_USE_SYSCALL) && defined(SYS_openat) + #if defined(O_CLOEXEC) + fd = (int)syscall(SYS_openat, + AT_FDCWD, "/proc/sys/vm/overcommit_memory", O_RDONLY | O_CLOEXEC); + #else + fd = (int)syscall(SYS_openat, + AT_FDCWD, "/proc/sys/vm/overcommit_memory", O_RDONLY); + if (fd != -1) { + fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC); + } + #endif +#else + #if defined(O_CLOEXEC) + fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY | O_CLOEXEC); + #else + fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY); + if (fd != -1) { + fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC); + } + #endif +#endif + + if (fd == -1) { + return false; /* Error. */ + } + + ssize_t nread = malloc_read_fd(fd, &buf, sizeof(buf)); +#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_close) + syscall(SYS_close, fd); +#else + close(fd); +#endif + + if (nread < 1) { + return false; /* Error. */ + } + /* + * /proc/sys/vm/overcommit_memory meanings: + * 0: Heuristic overcommit. + * 1: Always overcommit. + * 2: Never overcommit. 
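Outside of the bootstrap constraint that motivates the raw syscalls above, the same probe could be written with plain stdio. A hypothetical sketch applying the 0/1/2 meanings from the comment:

#include <stdio.h>

/* Returns 1 when the kernel heuristically or always overcommits. */
static int
kernel_overcommits(void) {
	FILE *f = fopen("/proc/sys/vm/overcommit_memory", "r");
	if (f == NULL) {
		return 0; /* Treat read errors as "no overcommit". */
	}
	int c = fgetc(f);
	fclose(f);
	return c == '0' || c == '1';
}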
+ */ + return (buf[0] == '0' || buf[0] == '1'); +} +#endif + +void +pages_set_thp_state(void *ptr, size_t size) { + if (opt_thp == thp_mode_default || opt_thp == init_system_thp_mode) { + return; + } + assert(opt_thp != thp_mode_not_supported && + init_system_thp_mode != thp_mode_not_supported); + + if (opt_thp == thp_mode_always + && init_system_thp_mode != thp_mode_never) { + assert(init_system_thp_mode == thp_mode_default); + pages_huge_unaligned(ptr, size); + } else if (opt_thp == thp_mode_never) { + assert(init_system_thp_mode == thp_mode_default || + init_system_thp_mode == thp_mode_always); + pages_nohuge_unaligned(ptr, size); + } +} + +static void +init_thp_state(void) { + if (!have_madvise_huge && !have_memcntl) { + if (metadata_thp_enabled() && opt_abort) { + malloc_write("<jemalloc>: no MADV_HUGEPAGE support\n"); + abort(); + } + goto label_error; + } +#if defined(JEMALLOC_HAVE_MADVISE_HUGE) + static const char sys_state_madvise[] = "always [madvise] never\n"; + static const char sys_state_always[] = "[always] madvise never\n"; + static const char sys_state_never[] = "always madvise [never]\n"; + char buf[sizeof(sys_state_madvise)]; + +#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_open) + int fd = (int)syscall(SYS_open, + "/sys/kernel/mm/transparent_hugepage/enabled", O_RDONLY); +#elif defined(JEMALLOC_USE_SYSCALL) && defined(SYS_openat) + int fd = (int)syscall(SYS_openat, + AT_FDCWD, "/sys/kernel/mm/transparent_hugepage/enabled", O_RDONLY); +#else + int fd = open("/sys/kernel/mm/transparent_hugepage/enabled", O_RDONLY); +#endif + if (fd == -1) { + goto label_error; + } + + ssize_t nread = malloc_read_fd(fd, &buf, sizeof(buf)); +#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_close) + syscall(SYS_close, fd); +#else + close(fd); +#endif + + if (nread < 0) { + goto label_error; + } + + if (strncmp(buf, sys_state_madvise, (size_t)nread) == 0) { + init_system_thp_mode = thp_mode_default; + } else if (strncmp(buf, sys_state_always, (size_t)nread) == 0) { + init_system_thp_mode = thp_mode_always; + } else if (strncmp(buf, sys_state_never, (size_t)nread) == 0) { + init_system_thp_mode = thp_mode_never; + } else { + goto label_error; + } + return; +#elif defined(JEMALLOC_HAVE_MEMCNTL) + init_system_thp_mode = thp_mode_default; + return; +#endif +label_error: + opt_thp = init_system_thp_mode = thp_mode_not_supported; +} + +bool +pages_boot(void) { + os_page = os_page_detect(); + if (os_page > PAGE) { + malloc_write("<jemalloc>: Unsupported system page size\n"); + if (opt_abort) { + abort(); + } + return true; + } + +#ifdef JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS + if (!opt_trust_madvise) { + madvise_dont_need_zeros_is_faulty = !madvise_MADV_DONTNEED_zeroes_pages(); + if (madvise_dont_need_zeros_is_faulty) { + malloc_write("<jemalloc>: MADV_DONTNEED does not work (memset will be used instead)\n"); + malloc_write("<jemalloc>: (This is the expected behaviour if you are running under QEMU)\n"); + } + } else { + /* In case opt_trust_madvise is enabled, + * do not do the runtime check. */ + madvise_dont_need_zeros_is_faulty = 0; + } +#endif + +#ifndef _WIN32 + mmap_flags = MAP_PRIVATE | MAP_ANON; +#endif + +#ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT + os_overcommits = os_overcommits_sysctl(); +#elif defined(JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY) + os_overcommits = os_overcommits_proc(); +# ifdef MAP_NORESERVE + if (os_overcommits) { + mmap_flags |= MAP_NORESERVE; + } +# endif +#elif defined(__NetBSD__) + os_overcommits = true; +#else + os_overcommits = false; +#endif + + init_thp_state(); + +#ifdef __FreeBSD__ + /* + * FreeBSD doesn't need
the check; madvise(2) is known to work. + */ +#else + /* Detect lazy purge runtime support. */ + if (pages_can_purge_lazy) { + bool committed = false; + void *madv_free_page = os_pages_map(NULL, PAGE, PAGE, &committed); + if (madv_free_page == NULL) { + return true; + } + assert(pages_can_purge_lazy_runtime); + if (pages_purge_lazy(madv_free_page, PAGE)) { + pages_can_purge_lazy_runtime = false; + } + os_pages_unmap(madv_free_page, PAGE); + } +#endif + + return false; +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/pai.c b/src/duckdb/extension/jemalloc/jemalloc/src/pai.c new file mode 100644 index 000000000..e8cddfc3b --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/pai.c @@ -0,0 +1,32 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +size_t +pai_alloc_batch_default(tsdn_t *tsdn, pai_t *self, size_t size, size_t nallocs, + edata_list_active_t *results, bool frequent_reuse, + bool *deferred_work_generated) { + for (size_t i = 0; i < nallocs; i++) { + bool deferred_by_alloc = false; + edata_t *edata = pai_alloc(tsdn, self, size, PAGE, + /* zero */ false, /* guarded */ false, frequent_reuse, + &deferred_by_alloc); + *deferred_work_generated |= deferred_by_alloc; + if (edata == NULL) { + return i; + } + edata_list_active_append(results, edata); + } + return nallocs; +} + +void +pai_dalloc_batch_default(tsdn_t *tsdn, pai_t *self, + edata_list_active_t *list, bool *deferred_work_generated) { + edata_t *edata; + while ((edata = edata_list_active_first(list)) != NULL) { + bool deferred_by_dalloc = false; + edata_list_active_remove(list, edata); + pai_dalloc(tsdn, self, edata, &deferred_by_dalloc); + *deferred_work_generated |= deferred_by_dalloc; + } +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/peak_event.c b/src/duckdb/extension/jemalloc/jemalloc/src/peak_event.c new file mode 100644 index 000000000..4093fbcc6 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/peak_event.c @@ -0,0 +1,82 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/peak_event.h" + +#include "jemalloc/internal/activity_callback.h" +#include "jemalloc/internal/peak.h" + +/* + * Update every 64K by default. We're not exposing this as a configuration + * option for now; we don't want to bind ourselves too tightly to any particular + * performance requirements for small values, or guarantee that we'll even be + * able to provide fine-grained accuracy. + */ +#define PEAK_EVENT_WAIT (64 * 1024) + +/* Update the peak with current tsd state. */ +void +peak_event_update(tsd_t *tsd) { + uint64_t alloc = tsd_thread_allocated_get(tsd); + uint64_t dalloc = tsd_thread_deallocated_get(tsd); + peak_t *peak = tsd_peakp_get(tsd); + peak_update(peak, alloc, dalloc); +} + +static void +peak_event_activity_callback(tsd_t *tsd) { + activity_callback_thunk_t *thunk = tsd_activity_callback_thunkp_get( + tsd); + uint64_t alloc = tsd_thread_allocated_get(tsd); + uint64_t dalloc = tsd_thread_deallocated_get(tsd); + if (thunk->callback != NULL) { + thunk->callback(thunk->uctx, alloc, dalloc); + } +} + +/* Set current state to zero. 
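peak_update (declared elsewhere in jemalloc) conceptually records the high-water mark of alloc - dalloc; a standalone version of that bookkeeping might look like the following (struct and names are illustrative only):

#include <stdint.h>

typedef struct {
	uint64_t max_net; /* Highest net allocation observed so far. */
} peak_sketch_t;

/* alloc and dalloc are cumulative counters, so alloc >= dalloc always. */
static void
peak_sketch_update(peak_sketch_t *p, uint64_t alloc, uint64_t dalloc) {
	uint64_t net = alloc - dalloc;
	if (net > p->max_net) {
		p->max_net = net;
	}
}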
*/ +void +peak_event_zero(tsd_t *tsd) { + uint64_t alloc = tsd_thread_allocated_get(tsd); + uint64_t dalloc = tsd_thread_deallocated_get(tsd); + peak_t *peak = tsd_peakp_get(tsd); + peak_set_zero(peak, alloc, dalloc); +} + +uint64_t +peak_event_max(tsd_t *tsd) { + peak_t *peak = tsd_peakp_get(tsd); + return peak_max(peak); +} + +uint64_t +peak_alloc_new_event_wait(tsd_t *tsd) { + return PEAK_EVENT_WAIT; +} + +uint64_t +peak_alloc_postponed_event_wait(tsd_t *tsd) { + return TE_MIN_START_WAIT; +} + +void +peak_alloc_event_handler(tsd_t *tsd, uint64_t elapsed) { + peak_event_update(tsd); + peak_event_activity_callback(tsd); +} + +uint64_t +peak_dalloc_new_event_wait(tsd_t *tsd) { + return PEAK_EVENT_WAIT; +} + +uint64_t +peak_dalloc_postponed_event_wait(tsd_t *tsd) { + return TE_MIN_START_WAIT; +} + +void +peak_dalloc_event_handler(tsd_t *tsd, uint64_t elapsed) { + peak_event_update(tsd); + peak_event_activity_callback(tsd); +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/prof.c b/src/duckdb/extension/jemalloc/jemalloc/src/prof.c new file mode 100644 index 000000000..8fdc6f71a --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/prof.c @@ -0,0 +1,851 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/ctl.h" +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/counter.h" +#include "jemalloc/internal/prof_data.h" +#include "jemalloc/internal/prof_log.h" +#include "jemalloc/internal/prof_recent.h" +#include "jemalloc/internal/prof_stats.h" +#include "jemalloc/internal/prof_sys.h" +#include "jemalloc/internal/prof_hook.h" +#include "jemalloc/internal/thread_event.h" + +/* + * This file implements the profiling "APIs" needed by other parts of jemalloc, + * and also manages the relevant "operational" data, mainly options and mutexes; + * the core profiling data structures are encapsulated in prof_data.c. + */ + +/******************************************************************************/ + +/* Data. */ + +bool opt_prof = false; +bool opt_prof_active = true; +bool opt_prof_thread_active_init = true; +unsigned opt_prof_bt_max = PROF_BT_MAX_DEFAULT; +size_t opt_lg_prof_sample = LG_PROF_SAMPLE_DEFAULT; +ssize_t opt_lg_prof_interval = LG_PROF_INTERVAL_DEFAULT; +bool opt_prof_gdump = false; +bool opt_prof_final = false; +bool opt_prof_leak = false; +bool opt_prof_leak_error = false; +bool opt_prof_accum = false; +bool opt_prof_pid_namespace = false; +char opt_prof_prefix[PROF_DUMP_FILENAME_LEN]; +bool opt_prof_sys_thread_name = false; +bool opt_prof_unbias = true; + +/* Accessed via prof_sample_event_handler(). */ +static counter_accum_t prof_idump_accumulated; + +/* + * Initialized as opt_prof_active, and accessed via + * prof_active_[gs]et{_unlocked,}(). + */ +bool prof_active_state; +static malloc_mutex_t prof_active_mtx; + +/* + * Initialized as opt_prof_thread_active_init, and accessed via + * prof_thread_active_init_[gs]et(). + */ +static bool prof_thread_active_init; +static malloc_mutex_t prof_thread_active_init_mtx; + +/* + * Initialized as opt_prof_gdump, and accessed via + * prof_gdump_[gs]et{_unlocked,}(). + */ +bool prof_gdump_val; +static malloc_mutex_t prof_gdump_mtx; + +uint64_t prof_interval = 0; + +size_t lg_prof_sample; + +static uint64_t next_thr_uid; +static malloc_mutex_t next_thr_uid_mtx; + +/* Do not dump any profiles until bootstrapping is complete. 
*/ +bool prof_booted = false; + +/* Logically a prof_backtrace_hook_t. */ +static atomic_p_t prof_backtrace_hook; + +/* Logically a prof_dump_hook_t. */ +static atomic_p_t prof_dump_hook; + +/* Logically a prof_sample_hook_t. */ +static atomic_p_t prof_sample_hook; + +/* Logically a prof_sample_free_hook_t. */ +static atomic_p_t prof_sample_free_hook; + +/******************************************************************************/ + +void +prof_alloc_rollback(tsd_t *tsd, prof_tctx_t *tctx) { + cassert(config_prof); + + if (tsd_reentrancy_level_get(tsd) > 0) { + assert(tctx == PROF_TCTX_SENTINEL); + return; + } + + if (prof_tctx_is_valid(tctx)) { + /* + * This `assert` really shouldn't be necessary. It's here + * because there's a bug in the clang static analyzer; it + * somehow does not realize that by `prof_tctx_is_valid(tctx)` + * being true that we've already ensured that `tctx` is not + * `NULL`. + */ + assert(tctx != NULL); + malloc_mutex_lock(tsd_tsdn(tsd), tctx->tdata->lock); + tctx->prepared = false; + prof_tctx_try_destroy(tsd, tctx); + } +} + +void +prof_malloc_sample_object(tsd_t *tsd, const void *ptr, size_t size, + size_t usize, prof_tctx_t *tctx) { + cassert(config_prof); + + if (opt_prof_sys_thread_name) { + prof_sys_thread_name_fetch(tsd); + } + + edata_t *edata = emap_edata_lookup(tsd_tsdn(tsd), &arena_emap_global, + ptr); + prof_info_set(tsd, edata, tctx, size); + + szind_t szind = sz_size2index(usize); + + malloc_mutex_lock(tsd_tsdn(tsd), tctx->tdata->lock); + /* + * We need to do these map lookups while holding the lock, to avoid the + * possibility of races with prof_reset calls, which update the map and + * then acquire the lock. This actually still leaves a data race on the + * contents of the unbias map, but we have not yet gone through and + * atomic-ified the prof module, and compilers are not yet causing us + * issues. The key thing is to make sure that, if we read garbage data, + * the prof_reset call is about to mark our tctx as expired before any + * dumping of our corrupted output is attempted. + */ + size_t shifted_unbiased_cnt = prof_shifted_unbiased_cnt[szind]; + size_t unbiased_bytes = prof_unbiased_sz[szind]; + tctx->cnts.curobjs++; + tctx->cnts.curobjs_shifted_unbiased += shifted_unbiased_cnt; + tctx->cnts.curbytes += usize; + tctx->cnts.curbytes_unbiased += unbiased_bytes; + if (opt_prof_accum) { + tctx->cnts.accumobjs++; + tctx->cnts.accumobjs_shifted_unbiased += shifted_unbiased_cnt; + tctx->cnts.accumbytes += usize; + tctx->cnts.accumbytes_unbiased += unbiased_bytes; + } + bool record_recent = prof_recent_alloc_prepare(tsd, tctx); + tctx->prepared = false; + malloc_mutex_unlock(tsd_tsdn(tsd), tctx->tdata->lock); + if (record_recent) { + assert(tctx == edata_prof_tctx_get(edata)); + prof_recent_alloc(tsd, edata, size, usize); + } + + if (opt_prof_stats) { + prof_stats_inc(tsd, szind, size); + } + + /* Sample hook. */ + prof_sample_hook_t prof_sample_hook = prof_sample_hook_get(); + if (prof_sample_hook != NULL) { + prof_bt_t *bt = &tctx->gctx->bt; + pre_reentrancy(tsd, NULL); + prof_sample_hook(ptr, size, bt->vec, bt->len, usize); + post_reentrancy(tsd); + } +} + +void +prof_free_sampled_object(tsd_t *tsd, const void *ptr, size_t usize, + prof_info_t *prof_info) { + cassert(config_prof); + + assert(prof_info != NULL); + prof_tctx_t *tctx = prof_info->alloc_tctx; + assert(prof_tctx_is_valid(tctx)); + + szind_t szind = sz_size2index(usize); + + /* Unsample hook. 
*/ + prof_sample_free_hook_t prof_sample_free_hook = + prof_sample_free_hook_get(); + if (prof_sample_free_hook != NULL) { + pre_reentrancy(tsd, NULL); + prof_sample_free_hook(ptr, usize); + post_reentrancy(tsd); + } + + malloc_mutex_lock(tsd_tsdn(tsd), tctx->tdata->lock); + + assert(tctx->cnts.curobjs > 0); + assert(tctx->cnts.curbytes >= usize); + /* + * It's not correct to do equivalent asserts for unbiased bytes, because + * of the potential for races with prof.reset calls. The map contents + * should really be atomic, but we have not atomic-ified the prof module + * yet. + */ + tctx->cnts.curobjs--; + tctx->cnts.curobjs_shifted_unbiased -= prof_shifted_unbiased_cnt[szind]; + tctx->cnts.curbytes -= usize; + tctx->cnts.curbytes_unbiased -= prof_unbiased_sz[szind]; + + prof_try_log(tsd, usize, prof_info); + + prof_tctx_try_destroy(tsd, tctx); + + if (opt_prof_stats) { + prof_stats_dec(tsd, szind, prof_info->alloc_size); + } +} + +prof_tctx_t * +prof_tctx_create(tsd_t *tsd) { + if (!tsd_nominal(tsd) || tsd_reentrancy_level_get(tsd) > 0) { + return NULL; + } + + prof_tdata_t *tdata = prof_tdata_get(tsd, true); + if (tdata == NULL) { + return NULL; + } + + prof_bt_t bt; + bt_init(&bt, tdata->vec); + prof_backtrace(tsd, &bt); + return prof_lookup(tsd, &bt); +} + +/* + * The bodies of this function and prof_leakcheck() are compiled out unless heap + * profiling is enabled, so that it is possible to compile jemalloc with + * floating point support completely disabled. Avoiding floating point code is + * important on memory-constrained systems, but it also enables a workaround for + * versions of glibc that don't properly save/restore floating point registers + * during dynamic lazy symbol loading (which internally calls into whatever + * malloc implementation happens to be integrated into the application). Note + * that some compilers (e.g. gcc 4.8) may use floating point registers for fast + * memory moves, so jemalloc must be compiled with such optimizations disabled + * (e.g. + * -mno-sse) in order for the workaround to be complete. + */ +uint64_t +prof_sample_new_event_wait(tsd_t *tsd) { +#ifdef JEMALLOC_PROF + if (lg_prof_sample == 0) { + return TE_MIN_START_WAIT; + } + + /* + * Compute sample interval as a geometrically distributed random + * variable with mean (2^lg_prof_sample). + * + * __ __ + * | log(u) | 1 + * bytes_until_sample = | -------- |, where p = --------------- + * | log(1-p) | lg_prof_sample + * 2 + * + * For more information on the math, see: + * + * Non-Uniform Random Variate Generation + * Luc Devroye + * Springer-Verlag, New York, 1986 + * pp 500 + * (http://luc.devroye.org/rnbookindex.html) + * + * In the actual computation, there's a non-zero probability that our + * pseudo random number generator generates an exact 0, and to avoid + * log(0), we set u to 1.0 in case r is 0. Therefore u effectively is + * uniformly distributed in (0, 1] instead of [0, 1). Further, rather + * than taking the ceiling, we take the floor and then add 1, since + * otherwise bytes_until_sample would be 0 if u is exactly 1.0. + */ + uint64_t r = prng_lg_range_u64(tsd_prng_statep_get(tsd), 53); + double u = (r == 0U) ? 
1.0 : (double)((long double)r * + (1.0L/9007199254740992.0L)); + return (uint64_t)(log(u) / + log(1.0 - (1.0 / (double)((uint64_t)1U << lg_prof_sample)))) + + (uint64_t)1U; +#else + not_reached(); + return TE_MAX_START_WAIT; +#endif +} + +uint64_t +prof_sample_postponed_event_wait(tsd_t *tsd) { + /* + * The postponed wait time for prof sample event is computed as if we + * want a new wait time (i.e. as if the event were triggered). If we + * instead postpone to the immediate next allocation, like how we're + * handling the other events, then we can have sampling bias, if e.g. + * the allocation immediately following a reentrancy always comes from + * the same stack trace. + */ + return prof_sample_new_event_wait(tsd); +} + +void +prof_sample_event_handler(tsd_t *tsd, uint64_t elapsed) { + cassert(config_prof); + assert(elapsed > 0 && elapsed != TE_INVALID_ELAPSED); + if (prof_interval == 0 || !prof_active_get_unlocked()) { + return; + } + if (counter_accum(tsd_tsdn(tsd), &prof_idump_accumulated, elapsed)) { + prof_idump(tsd_tsdn(tsd)); + } +} + +static void +prof_fdump(void) { + tsd_t *tsd; + + cassert(config_prof); + assert(opt_prof_final); + + if (!prof_booted) { + return; + } + tsd = tsd_fetch(); + assert(tsd_reentrancy_level_get(tsd) == 0); + + prof_fdump_impl(tsd); +} + +static bool +prof_idump_accum_init(void) { + cassert(config_prof); + + return counter_accum_init(&prof_idump_accumulated, prof_interval); +} + +void +prof_idump(tsdn_t *tsdn) { + tsd_t *tsd; + prof_tdata_t *tdata; + + cassert(config_prof); + + if (!prof_booted || tsdn_null(tsdn) || !prof_active_get_unlocked()) { + return; + } + tsd = tsdn_tsd(tsdn); + if (tsd_reentrancy_level_get(tsd) > 0) { + return; + } + + tdata = prof_tdata_get(tsd, true); + if (tdata == NULL) { + return; + } + if (tdata->enq) { + tdata->enq_idump = true; + return; + } + + prof_idump_impl(tsd); +} + +bool +prof_mdump(tsd_t *tsd, const char *filename) { + cassert(config_prof); + assert(tsd_reentrancy_level_get(tsd) == 0); + + if (!opt_prof || !prof_booted) { + return true; + } + + return prof_mdump_impl(tsd, filename); +} + +void +prof_gdump(tsdn_t *tsdn) { + tsd_t *tsd; + prof_tdata_t *tdata; + + cassert(config_prof); + + if (!prof_booted || tsdn_null(tsdn) || !prof_active_get_unlocked()) { + return; + } + tsd = tsdn_tsd(tsdn); + if (tsd_reentrancy_level_get(tsd) > 0) { + return; + } + + tdata = prof_tdata_get(tsd, false); + if (tdata == NULL) { + return; + } + if (tdata->enq) { + tdata->enq_gdump = true; + return; + } + + prof_gdump_impl(tsd); +} + +static uint64_t +prof_thr_uid_alloc(tsdn_t *tsdn) { + uint64_t thr_uid; + + malloc_mutex_lock(tsdn, &next_thr_uid_mtx); + thr_uid = next_thr_uid; + next_thr_uid++; + malloc_mutex_unlock(tsdn, &next_thr_uid_mtx); + + return thr_uid; +} + +prof_tdata_t * +prof_tdata_init(tsd_t *tsd) { + return prof_tdata_init_impl(tsd, prof_thr_uid_alloc(tsd_tsdn(tsd)), 0, + NULL, prof_thread_active_init_get(tsd_tsdn(tsd))); +} + +prof_tdata_t * +prof_tdata_reinit(tsd_t *tsd, prof_tdata_t *tdata) { + uint64_t thr_uid = tdata->thr_uid; + uint64_t thr_discrim = tdata->thr_discrim + 1; + bool active = tdata->active; + + /* Keep a local copy of the thread name, before detaching. 
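The geometric draw derived in the comment above can be exercised in isolation. A stand-alone sketch, assuming a plain libc uniform draw in place of jemalloc's per-thread 53-bit PRNG (coarser tail resolution, but the same distribution and the same floor-plus-one convention):

```c
#include <math.h>
#include <stdint.h>
#include <stdlib.h>

static uint64_t
toy_bytes_until_sample(unsigned lg_sample) {
	/* u is uniform in (0, 1], matching the comment's convention. */
	double u = ((double)rand() + 1.0) / ((double)RAND_MAX + 1.0);
	double p = 1.0 / (double)((uint64_t)1 << lg_sample);
	/* Geometric with mean ~2^lg_sample: floor(log(u)/log(1-p)) + 1. */
	return (uint64_t)(log(u) / log(1.0 - p)) + 1;
}
```

Because u can be exactly 1.0 (log(u) == 0), taking the floor and adding 1 guarantees a strictly positive interval, exactly as the comment explains.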
*/ + prof_thread_name_assert(tdata); + char thread_name[PROF_THREAD_NAME_MAX_LEN]; + strncpy(thread_name, tdata->thread_name, PROF_THREAD_NAME_MAX_LEN); + prof_tdata_detach(tsd, tdata); + + return prof_tdata_init_impl(tsd, thr_uid, thr_discrim, thread_name, + active); +} + +void +prof_tdata_cleanup(tsd_t *tsd) { + prof_tdata_t *tdata; + + if (!config_prof) { + return; + } + + tdata = tsd_prof_tdata_get(tsd); + if (tdata != NULL) { + prof_tdata_detach(tsd, tdata); + } +} + +bool +prof_active_get(tsdn_t *tsdn) { + bool prof_active_current; + + prof_active_assert(); + malloc_mutex_lock(tsdn, &prof_active_mtx); + prof_active_current = prof_active_state; + malloc_mutex_unlock(tsdn, &prof_active_mtx); + return prof_active_current; +} + +bool +prof_active_set(tsdn_t *tsdn, bool active) { + bool prof_active_old; + + prof_active_assert(); + malloc_mutex_lock(tsdn, &prof_active_mtx); + prof_active_old = prof_active_state; + prof_active_state = active; + malloc_mutex_unlock(tsdn, &prof_active_mtx); + prof_active_assert(); + return prof_active_old; +} + +const char * +prof_thread_name_get(tsd_t *tsd) { + static const char *prof_thread_name_dummy = ""; + + assert(tsd_reentrancy_level_get(tsd) == 0); + prof_tdata_t *tdata = prof_tdata_get(tsd, true); + if (tdata == NULL) { + return prof_thread_name_dummy; + } + + return tdata->thread_name; +} + +int +prof_thread_name_set(tsd_t *tsd, const char *thread_name) { + if (opt_prof_sys_thread_name) { + return ENOENT; + } else { + return prof_thread_name_set_impl(tsd, thread_name); + } +} + +bool +prof_thread_active_get(tsd_t *tsd) { + assert(tsd_reentrancy_level_get(tsd) == 0); + + prof_tdata_t *tdata; + + tdata = prof_tdata_get(tsd, true); + if (tdata == NULL) { + return false; + } + return tdata->active; +} + +bool +prof_thread_active_set(tsd_t *tsd, bool active) { + assert(tsd_reentrancy_level_get(tsd) == 0); + + prof_tdata_t *tdata; + + tdata = prof_tdata_get(tsd, true); + if (tdata == NULL) { + return true; + } + tdata->active = active; + return false; +} + +bool +prof_thread_active_init_get(tsdn_t *tsdn) { + bool active_init; + + malloc_mutex_lock(tsdn, &prof_thread_active_init_mtx); + active_init = prof_thread_active_init; + malloc_mutex_unlock(tsdn, &prof_thread_active_init_mtx); + return active_init; +} + +bool +prof_thread_active_init_set(tsdn_t *tsdn, bool active_init) { + bool active_init_old; + + malloc_mutex_lock(tsdn, &prof_thread_active_init_mtx); + active_init_old = prof_thread_active_init; + prof_thread_active_init = active_init; + malloc_mutex_unlock(tsdn, &prof_thread_active_init_mtx); + return active_init_old; +} + +bool +prof_gdump_get(tsdn_t *tsdn) { + bool prof_gdump_current; + + malloc_mutex_lock(tsdn, &prof_gdump_mtx); + prof_gdump_current = prof_gdump_val; + malloc_mutex_unlock(tsdn, &prof_gdump_mtx); + return prof_gdump_current; +} + +bool +prof_gdump_set(tsdn_t *tsdn, bool gdump) { + bool prof_gdump_old; + + malloc_mutex_lock(tsdn, &prof_gdump_mtx); + prof_gdump_old = prof_gdump_val; + prof_gdump_val = gdump; + malloc_mutex_unlock(tsdn, &prof_gdump_mtx); + return prof_gdump_old; +} + +void +prof_backtrace_hook_set(prof_backtrace_hook_t hook) { + atomic_store_p(&prof_backtrace_hook, hook, ATOMIC_RELEASE); +} + +prof_backtrace_hook_t +prof_backtrace_hook_get(void) { + return (prof_backtrace_hook_t)atomic_load_p(&prof_backtrace_hook, + ATOMIC_ACQUIRE); +} + +void +prof_dump_hook_set(prof_dump_hook_t hook) { + atomic_store_p(&prof_dump_hook, hook, ATOMIC_RELEASE); +} + +prof_dump_hook_t +prof_dump_hook_get(void) { + return 
(prof_dump_hook_t)atomic_load_p(&prof_dump_hook, + ATOMIC_ACQUIRE); +} + +void +prof_sample_hook_set(prof_sample_hook_t hook) { + atomic_store_p(&prof_sample_hook, hook, ATOMIC_RELEASE); +} + +prof_sample_hook_t +prof_sample_hook_get(void) { + return (prof_sample_hook_t)atomic_load_p(&prof_sample_hook, + ATOMIC_ACQUIRE); +} + +void +prof_sample_free_hook_set(prof_sample_free_hook_t hook) { + atomic_store_p(&prof_sample_free_hook, hook, ATOMIC_RELEASE); +} + +prof_sample_free_hook_t +prof_sample_free_hook_get(void) { + return (prof_sample_free_hook_t)atomic_load_p(&prof_sample_free_hook, + ATOMIC_ACQUIRE); +} + +void +prof_boot0(void) { + cassert(config_prof); + + memcpy(opt_prof_prefix, PROF_PREFIX_DEFAULT, + sizeof(PROF_PREFIX_DEFAULT)); +} + +void +prof_boot1(void) { + cassert(config_prof); + + /* + * opt_prof must be in its final state before any arenas are + * initialized, so this function must be executed early. + */ + if (opt_prof_leak_error && !opt_prof_leak) { + opt_prof_leak = true; + } + + if (opt_prof_leak && !opt_prof) { + /* + * Enable opt_prof, but in such a way that profiles are never + * automatically dumped. + */ + opt_prof = true; + opt_prof_gdump = false; + } else if (opt_prof) { + if (opt_lg_prof_interval >= 0) { + prof_interval = (((uint64_t)1U) << + opt_lg_prof_interval); + } + } +} + +bool +prof_boot2(tsd_t *tsd, base_t *base) { + cassert(config_prof); + + /* + * Initialize the global mutexes unconditionally to maintain correct + * stats when opt_prof is false. + */ + if (malloc_mutex_init(&prof_active_mtx, "prof_active", + WITNESS_RANK_PROF_ACTIVE, malloc_mutex_rank_exclusive)) { + return true; + } + if (malloc_mutex_init(&prof_gdump_mtx, "prof_gdump", + WITNESS_RANK_PROF_GDUMP, malloc_mutex_rank_exclusive)) { + return true; + } + if (malloc_mutex_init(&prof_thread_active_init_mtx, + "prof_thread_active_init", WITNESS_RANK_PROF_THREAD_ACTIVE_INIT, + malloc_mutex_rank_exclusive)) { + return true; + } + if (malloc_mutex_init(&bt2gctx_mtx, "prof_bt2gctx", + WITNESS_RANK_PROF_BT2GCTX, malloc_mutex_rank_exclusive)) { + return true; + } + if (malloc_mutex_init(&tdatas_mtx, "prof_tdatas", + WITNESS_RANK_PROF_TDATAS, malloc_mutex_rank_exclusive)) { + return true; + } + if (malloc_mutex_init(&next_thr_uid_mtx, "prof_next_thr_uid", + WITNESS_RANK_PROF_NEXT_THR_UID, malloc_mutex_rank_exclusive)) { + return true; + } + if (malloc_mutex_init(&prof_stats_mtx, "prof_stats", + WITNESS_RANK_PROF_STATS, malloc_mutex_rank_exclusive)) { + return true; + } + if (malloc_mutex_init(&prof_dump_filename_mtx, + "prof_dump_filename", WITNESS_RANK_PROF_DUMP_FILENAME, + malloc_mutex_rank_exclusive)) { + return true; + } + if (malloc_mutex_init(&prof_dump_mtx, "prof_dump", + WITNESS_RANK_PROF_DUMP, malloc_mutex_rank_exclusive)) { + return true; + } + + if (opt_prof) { + lg_prof_sample = opt_lg_prof_sample; + prof_unbias_map_init(); + prof_active_state = opt_prof_active; + prof_gdump_val = opt_prof_gdump; + prof_thread_active_init = opt_prof_thread_active_init; + + if (prof_data_init(tsd)) { + return true; + } + + next_thr_uid = 0; + if (prof_idump_accum_init()) { + return true; + } + + if (opt_prof_final && opt_prof_prefix[0] != '\0' && + atexit(prof_fdump) != 0) { + malloc_write(": Error in atexit()\n"); + if (opt_abort) { + abort(); + } + } + + if (prof_log_init(tsd)) { + return true; + } + + if (prof_recent_init()) { + return true; + } + + prof_base = base; + + gctx_locks = (malloc_mutex_t *)base_alloc(tsd_tsdn(tsd), base, + PROF_NCTX_LOCKS * sizeof(malloc_mutex_t), CACHELINE); + if 
(gctx_locks == NULL) { + return true; + } + for (unsigned i = 0; i < PROF_NCTX_LOCKS; i++) { + if (malloc_mutex_init(&gctx_locks[i], "prof_gctx", + WITNESS_RANK_PROF_GCTX, + malloc_mutex_rank_exclusive)) { + return true; + } + } + + tdata_locks = (malloc_mutex_t *)base_alloc(tsd_tsdn(tsd), base, + PROF_NTDATA_LOCKS * sizeof(malloc_mutex_t), CACHELINE); + if (tdata_locks == NULL) { + return true; + } + for (unsigned i = 0; i < PROF_NTDATA_LOCKS; i++) { + if (malloc_mutex_init(&tdata_locks[i], "prof_tdata", + WITNESS_RANK_PROF_TDATA, + malloc_mutex_rank_exclusive)) { + return true; + } + } + + prof_unwind_init(); + prof_hooks_init(); + } + prof_booted = true; + + return false; +} + +void +prof_prefork0(tsdn_t *tsdn) { + if (config_prof && opt_prof) { + unsigned i; + + malloc_mutex_prefork(tsdn, &prof_dump_mtx); + malloc_mutex_prefork(tsdn, &bt2gctx_mtx); + malloc_mutex_prefork(tsdn, &tdatas_mtx); + for (i = 0; i < PROF_NTDATA_LOCKS; i++) { + malloc_mutex_prefork(tsdn, &tdata_locks[i]); + } + malloc_mutex_prefork(tsdn, &log_mtx); + for (i = 0; i < PROF_NCTX_LOCKS; i++) { + malloc_mutex_prefork(tsdn, &gctx_locks[i]); + } + malloc_mutex_prefork(tsdn, &prof_recent_dump_mtx); + } +} + +void +prof_prefork1(tsdn_t *tsdn) { + if (config_prof && opt_prof) { + counter_prefork(tsdn, &prof_idump_accumulated); + malloc_mutex_prefork(tsdn, &prof_active_mtx); + malloc_mutex_prefork(tsdn, &prof_dump_filename_mtx); + malloc_mutex_prefork(tsdn, &prof_gdump_mtx); + malloc_mutex_prefork(tsdn, &prof_recent_alloc_mtx); + malloc_mutex_prefork(tsdn, &prof_stats_mtx); + malloc_mutex_prefork(tsdn, &next_thr_uid_mtx); + malloc_mutex_prefork(tsdn, &prof_thread_active_init_mtx); + } +} + +void +prof_postfork_parent(tsdn_t *tsdn) { + if (config_prof && opt_prof) { + unsigned i; + + malloc_mutex_postfork_parent(tsdn, + &prof_thread_active_init_mtx); + malloc_mutex_postfork_parent(tsdn, &next_thr_uid_mtx); + malloc_mutex_postfork_parent(tsdn, &prof_stats_mtx); + malloc_mutex_postfork_parent(tsdn, &prof_recent_alloc_mtx); + malloc_mutex_postfork_parent(tsdn, &prof_gdump_mtx); + malloc_mutex_postfork_parent(tsdn, &prof_dump_filename_mtx); + malloc_mutex_postfork_parent(tsdn, &prof_active_mtx); + counter_postfork_parent(tsdn, &prof_idump_accumulated); + malloc_mutex_postfork_parent(tsdn, &prof_recent_dump_mtx); + for (i = 0; i < PROF_NCTX_LOCKS; i++) { + malloc_mutex_postfork_parent(tsdn, &gctx_locks[i]); + } + malloc_mutex_postfork_parent(tsdn, &log_mtx); + for (i = 0; i < PROF_NTDATA_LOCKS; i++) { + malloc_mutex_postfork_parent(tsdn, &tdata_locks[i]); + } + malloc_mutex_postfork_parent(tsdn, &tdatas_mtx); + malloc_mutex_postfork_parent(tsdn, &bt2gctx_mtx); + malloc_mutex_postfork_parent(tsdn, &prof_dump_mtx); + } +} + +void +prof_postfork_child(tsdn_t *tsdn) { + if (config_prof && opt_prof) { + unsigned i; + + malloc_mutex_postfork_child(tsdn, &prof_thread_active_init_mtx); + malloc_mutex_postfork_child(tsdn, &next_thr_uid_mtx); + malloc_mutex_postfork_child(tsdn, &prof_stats_mtx); + malloc_mutex_postfork_child(tsdn, &prof_recent_alloc_mtx); + malloc_mutex_postfork_child(tsdn, &prof_gdump_mtx); + malloc_mutex_postfork_child(tsdn, &prof_dump_filename_mtx); + malloc_mutex_postfork_child(tsdn, &prof_active_mtx); + counter_postfork_child(tsdn, &prof_idump_accumulated); + malloc_mutex_postfork_child(tsdn, &prof_recent_dump_mtx); + for (i = 0; i < PROF_NCTX_LOCKS; i++) { + malloc_mutex_postfork_child(tsdn, &gctx_locks[i]); + } + malloc_mutex_postfork_child(tsdn, &log_mtx); + for (i = 0; i < PROF_NTDATA_LOCKS; i++) { + 
malloc_mutex_postfork_child(tsdn, &tdata_locks[i]); + } + malloc_mutex_postfork_child(tsdn, &tdatas_mtx); + malloc_mutex_postfork_child(tsdn, &bt2gctx_mtx); + malloc_mutex_postfork_child(tsdn, &prof_dump_mtx); + } +} + +/******************************************************************************/ diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/prof_data.c b/src/duckdb/extension/jemalloc/jemalloc/src/prof_data.c new file mode 100644 index 000000000..39af0c90a --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/prof_data.c @@ -0,0 +1,1425 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/ckh.h" +#include "jemalloc/internal/hash.h" +#include "jemalloc/internal/malloc_io.h" +#include "jemalloc/internal/prof_data.h" + +/* + * This file defines and manages the core profiling data structures. + * + * Conceptually, profiling data can be imagined as a table with three columns: + * thread, stack trace, and current allocation size. (When prof_accum is on, + * there's one additional column which is the cumulative allocation size.) + * + * Implementation wise, each thread maintains a hash recording the stack trace + * to allocation size correspondences, which are basically the individual rows + * in the table. In addition, two global "indices" are built to make data + * aggregation efficient (for dumping): bt2gctx and tdatas, which are basically + * the "grouped by stack trace" and "grouped by thread" views of the same table, + * respectively. Note that the allocation size is only aggregated to the two + * indices at dumping time, so as to optimize for performance. + */ + +/******************************************************************************/ + +malloc_mutex_t bt2gctx_mtx; +malloc_mutex_t tdatas_mtx; +malloc_mutex_t prof_dump_mtx; + +/* + * Table of mutexes that are shared among gctx's. These are leaf locks, so + * there is no problem with using them for more than one gctx at the same time. + * The primary motivation for this sharing though is that gctx's are ephemeral, + * and destroying mutexes causes complications for systems that allocate when + * creating/destroying mutexes. + */ +malloc_mutex_t *gctx_locks; +static atomic_u_t cum_gctxs; /* Atomic counter. */ + +/* + * Table of mutexes that are shared among tdata's. No operations require + * holding multiple tdata locks, so there is no problem with using them for more + * than one tdata at the same time, even though a gctx lock may be acquired + * while holding a tdata lock. + */ +malloc_mutex_t *tdata_locks; + +/* + * Global hash of (prof_bt_t *)-->(prof_gctx_t *). This is the master data + * structure that knows about all backtraces currently captured. + */ +static ckh_t bt2gctx; + +/* + * Tree of all extant prof_tdata_t structures, regardless of state, + * {attached,detached,expired}. + */ +static prof_tdata_tree_t tdatas; + +size_t prof_unbiased_sz[PROF_SC_NSIZES]; +size_t prof_shifted_unbiased_cnt[PROF_SC_NSIZES]; + +/******************************************************************************/ +/* Red-black trees. 
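Rendering the "three-column table" from the file comment above as a literal struct may help fix the picture; `bt2gctx` and `tdatas` are then indices over these conceptual rows, keyed by the stack-trace and thread columns respectively (illustrative layout only, not jemalloc's):

```c
#include <stddef.h>
#include <stdint.h>

typedef struct {
	void **vec;   /* return addresses */
	unsigned len;
} toy_bt_t;

typedef struct {
	uint64_t thr_uid;   /* "thread" column; grouped by tdatas */
	toy_bt_t bt;        /* "stack trace" column; grouped by bt2gctx */
	size_t cur_bytes;   /* "current allocation size" column */
} toy_row_t;
```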
*/ + +static int +prof_tctx_comp(const prof_tctx_t *a, const prof_tctx_t *b) { + uint64_t a_thr_uid = a->thr_uid; + uint64_t b_thr_uid = b->thr_uid; + int ret = (a_thr_uid > b_thr_uid) - (a_thr_uid < b_thr_uid); + if (ret == 0) { + uint64_t a_thr_discrim = a->thr_discrim; + uint64_t b_thr_discrim = b->thr_discrim; + ret = (a_thr_discrim > b_thr_discrim) - (a_thr_discrim < + b_thr_discrim); + if (ret == 0) { + uint64_t a_tctx_uid = a->tctx_uid; + uint64_t b_tctx_uid = b->tctx_uid; + ret = (a_tctx_uid > b_tctx_uid) - (a_tctx_uid < + b_tctx_uid); + } + } + return ret; +} + +/* NOLINTBEGIN(performance-no-int-to-ptr) */ +rb_gen(static UNUSED, tctx_tree_, prof_tctx_tree_t, prof_tctx_t, + tctx_link, prof_tctx_comp) +/* NOLINTEND(performance-no-int-to-ptr) */ + +static int +prof_gctx_comp(const prof_gctx_t *a, const prof_gctx_t *b) { + unsigned a_len = a->bt.len; + unsigned b_len = b->bt.len; + unsigned comp_len = (a_len < b_len) ? a_len : b_len; + int ret = memcmp(a->bt.vec, b->bt.vec, comp_len * sizeof(void *)); + if (ret == 0) { + ret = (a_len > b_len) - (a_len < b_len); + } + return ret; +} + +/* NOLINTBEGIN(performance-no-int-to-ptr) */ +rb_gen(static UNUSED, gctx_tree_, prof_gctx_tree_t, prof_gctx_t, dump_link, + prof_gctx_comp) +/* NOLINTEND(performance-no-int-to-ptr) */ + +static int +prof_tdata_comp(const prof_tdata_t *a, const prof_tdata_t *b) { + int ret; + uint64_t a_uid = a->thr_uid; + uint64_t b_uid = b->thr_uid; + + ret = ((a_uid > b_uid) - (a_uid < b_uid)); + if (ret == 0) { + uint64_t a_discrim = a->thr_discrim; + uint64_t b_discrim = b->thr_discrim; + + ret = ((a_discrim > b_discrim) - (a_discrim < b_discrim)); + } + return ret; +} + +/* NOLINTBEGIN(performance-no-int-to-ptr) */ +rb_gen(static UNUSED, tdata_tree_, prof_tdata_tree_t, prof_tdata_t, tdata_link, + prof_tdata_comp) +/* NOLINTEND(performance-no-int-to-ptr) */ + +/******************************************************************************/ + +static malloc_mutex_t * +prof_gctx_mutex_choose(void) { + unsigned ngctxs = atomic_fetch_add_u(&cum_gctxs, 1, ATOMIC_RELAXED); + + return &gctx_locks[(ngctxs - 1) % PROF_NCTX_LOCKS]; +} + +static malloc_mutex_t * +prof_tdata_mutex_choose(uint64_t thr_uid) { + return &tdata_locks[thr_uid % PROF_NTDATA_LOCKS]; +} + +bool +prof_data_init(tsd_t *tsd) { + tdata_tree_new(&tdatas); + return ckh_new(tsd, &bt2gctx, PROF_CKH_MINITEMS, + prof_bt_hash, prof_bt_keycomp); +} + +static void +prof_enter(tsd_t *tsd, prof_tdata_t *tdata) { + cassert(config_prof); + assert(tdata == prof_tdata_get(tsd, false)); + + if (tdata != NULL) { + assert(!tdata->enq); + tdata->enq = true; + } + + malloc_mutex_lock(tsd_tsdn(tsd), &bt2gctx_mtx); +} + +static void +prof_leave(tsd_t *tsd, prof_tdata_t *tdata) { + cassert(config_prof); + assert(tdata == prof_tdata_get(tsd, false)); + + malloc_mutex_unlock(tsd_tsdn(tsd), &bt2gctx_mtx); + + if (tdata != NULL) { + bool idump, gdump; + + assert(tdata->enq); + tdata->enq = false; + idump = tdata->enq_idump; + tdata->enq_idump = false; + gdump = tdata->enq_gdump; + tdata->enq_gdump = false; + + if (idump) { + prof_idump(tsd_tsdn(tsd)); + } + if (gdump) { + prof_gdump(tsd_tsdn(tsd)); + } + } +} + +static prof_gctx_t * +prof_gctx_create(tsdn_t *tsdn, prof_bt_t *bt) { + /* + * Create a single allocation that has space for vec of length bt->len. 
+ */ + size_t size = offsetof(prof_gctx_t, vec) + (bt->len * sizeof(void *)); + prof_gctx_t *gctx = (prof_gctx_t *)iallocztm(tsdn, size, + sz_size2index(size), false, NULL, true, arena_get(TSDN_NULL, 0, true), + true); + if (gctx == NULL) { + return NULL; + } + gctx->lock = prof_gctx_mutex_choose(); + /* + * Set nlimbo to 1, in order to avoid a race condition with + * prof_tctx_destroy()/prof_gctx_try_destroy(). + */ + gctx->nlimbo = 1; + tctx_tree_new(&gctx->tctxs); + /* Duplicate bt. */ + memcpy(gctx->vec, bt->vec, bt->len * sizeof(void *)); + gctx->bt.vec = gctx->vec; + gctx->bt.len = bt->len; + return gctx; +} + +static void +prof_gctx_try_destroy(tsd_t *tsd, prof_tdata_t *tdata_self, + prof_gctx_t *gctx) { + cassert(config_prof); + + /* + * Check that gctx is still unused by any thread cache before destroying + * it. prof_lookup() increments gctx->nlimbo in order to avoid a race + * condition with this function, as does prof_tctx_destroy() in order to + * avoid a race between the main body of prof_tctx_destroy() and entry + * into this function. + */ + prof_enter(tsd, tdata_self); + malloc_mutex_lock(tsd_tsdn(tsd), gctx->lock); + assert(gctx->nlimbo != 0); + if (tctx_tree_empty(&gctx->tctxs) && gctx->nlimbo == 1) { + /* Remove gctx from bt2gctx. */ + if (ckh_remove(tsd, &bt2gctx, &gctx->bt, NULL, NULL)) { + not_reached(); + } + prof_leave(tsd, tdata_self); + /* Destroy gctx. */ + malloc_mutex_unlock(tsd_tsdn(tsd), gctx->lock); + idalloctm(tsd_tsdn(tsd), gctx, NULL, NULL, true, true); + } else { + /* + * Compensate for increment in prof_tctx_destroy() or + * prof_lookup(). + */ + gctx->nlimbo--; + malloc_mutex_unlock(tsd_tsdn(tsd), gctx->lock); + prof_leave(tsd, tdata_self); + } +} + +static bool +prof_gctx_should_destroy(prof_gctx_t *gctx) { + if (opt_prof_accum) { + return false; + } + if (!tctx_tree_empty(&gctx->tctxs)) { + return false; + } + if (gctx->nlimbo != 0) { + return false; + } + return true; +} + +static bool +prof_lookup_global(tsd_t *tsd, prof_bt_t *bt, prof_tdata_t *tdata, + void **p_btkey, prof_gctx_t **p_gctx, bool *p_new_gctx) { + union { + prof_gctx_t *p; + void *v; + } gctx, tgctx; + union { + prof_bt_t *p; + void *v; + } btkey; + bool new_gctx; + + prof_enter(tsd, tdata); + if (ckh_search(&bt2gctx, bt, &btkey.v, &gctx.v)) { + /* bt has never been seen before. Insert it. */ + prof_leave(tsd, tdata); + tgctx.p = prof_gctx_create(tsd_tsdn(tsd), bt); + if (tgctx.v == NULL) { + return true; + } + prof_enter(tsd, tdata); + if (ckh_search(&bt2gctx, bt, &btkey.v, &gctx.v)) { + gctx.p = tgctx.p; + btkey.p = &gctx.p->bt; + if (ckh_insert(tsd, &bt2gctx, btkey.v, gctx.v)) { + /* OOM. */ + prof_leave(tsd, tdata); + idalloctm(tsd_tsdn(tsd), gctx.v, NULL, NULL, + true, true); + return true; + } + new_gctx = true; + } else { + new_gctx = false; + } + } else { + tgctx.v = NULL; + new_gctx = false; + } + + if (!new_gctx) { + /* + * Increment nlimbo, in order to avoid a race condition with + * prof_tctx_destroy()/prof_gctx_try_destroy(). + */ + malloc_mutex_lock(tsd_tsdn(tsd), gctx.p->lock); + gctx.p->nlimbo++; + malloc_mutex_unlock(tsd_tsdn(tsd), gctx.p->lock); + new_gctx = false; + + if (tgctx.v != NULL) { + /* Lost race to insert. 
*/ + idalloctm(tsd_tsdn(tsd), tgctx.v, NULL, NULL, true, + true); + } + } + prof_leave(tsd, tdata); + + *p_btkey = btkey.v; + *p_gctx = gctx.p; + *p_new_gctx = new_gctx; + return false; +} + +prof_tctx_t * +prof_lookup(tsd_t *tsd, prof_bt_t *bt) { + union { + prof_tctx_t *p; + void *v; + } ret; + prof_tdata_t *tdata; + bool not_found; + + cassert(config_prof); + + tdata = prof_tdata_get(tsd, false); + assert(tdata != NULL); + + malloc_mutex_lock(tsd_tsdn(tsd), tdata->lock); + not_found = ckh_search(&tdata->bt2tctx, bt, NULL, &ret.v); + if (!not_found) { /* Note double negative! */ + ret.p->prepared = true; + } + malloc_mutex_unlock(tsd_tsdn(tsd), tdata->lock); + if (not_found) { + void *btkey; + prof_gctx_t *gctx; + bool new_gctx, error; + + /* + * This thread's cache lacks bt. Look for it in the global + * cache. + */ + if (prof_lookup_global(tsd, bt, tdata, &btkey, &gctx, + &new_gctx)) { + return NULL; + } + + /* Link a prof_tctx_t into gctx for this thread. */ + ret.v = iallocztm(tsd_tsdn(tsd), sizeof(prof_tctx_t), + sz_size2index(sizeof(prof_tctx_t)), false, NULL, true, + arena_ichoose(tsd, NULL), true); + if (ret.p == NULL) { + if (new_gctx) { + prof_gctx_try_destroy(tsd, tdata, gctx); + } + return NULL; + } + ret.p->tdata = tdata; + ret.p->thr_uid = tdata->thr_uid; + ret.p->thr_discrim = tdata->thr_discrim; + ret.p->recent_count = 0; + memset(&ret.p->cnts, 0, sizeof(prof_cnt_t)); + ret.p->gctx = gctx; + ret.p->tctx_uid = tdata->tctx_uid_next++; + ret.p->prepared = true; + ret.p->state = prof_tctx_state_initializing; + malloc_mutex_lock(tsd_tsdn(tsd), tdata->lock); + error = ckh_insert(tsd, &tdata->bt2tctx, btkey, ret.v); + malloc_mutex_unlock(tsd_tsdn(tsd), tdata->lock); + if (error) { + if (new_gctx) { + prof_gctx_try_destroy(tsd, tdata, gctx); + } + idalloctm(tsd_tsdn(tsd), ret.v, NULL, NULL, true, true); + return NULL; + } + malloc_mutex_lock(tsd_tsdn(tsd), gctx->lock); + ret.p->state = prof_tctx_state_nominal; + tctx_tree_insert(&gctx->tctxs, ret.p); + gctx->nlimbo--; + malloc_mutex_unlock(tsd_tsdn(tsd), gctx->lock); + } + + return ret.p; +} + +/* Used in unit tests. */ +static prof_tdata_t * +prof_tdata_count_iter(prof_tdata_tree_t *tdatas_ptr, prof_tdata_t *tdata, + void *arg) { + size_t *tdata_count = (size_t *)arg; + + (*tdata_count)++; + + return NULL; +} + +/* Used in unit tests. */ +size_t +prof_tdata_count(void) { + size_t tdata_count = 0; + tsdn_t *tsdn; + + tsdn = tsdn_fetch(); + malloc_mutex_lock(tsdn, &tdatas_mtx); + tdata_tree_iter(&tdatas, NULL, prof_tdata_count_iter, + (void *)&tdata_count); + malloc_mutex_unlock(tsdn, &tdatas_mtx); + + return tdata_count; +} + +/* Used in unit tests. 
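prof_lookup_global() above is an instance of a common intern pattern: search under the lock, allocate with the lock dropped (allocation can recurse into the allocator), re-search, and discard the new node if another thread won the race. A self-contained sketch of just that pattern, with a linked list standing in for the cuckoo hash and no nlimbo-style refcounting:

```c
#include <pthread.h>
#include <stdlib.h>
#include <string.h>

typedef struct node { struct node *next; char key[32]; } node_t;

static pthread_mutex_t map_mtx = PTHREAD_MUTEX_INITIALIZER;
static node_t *map_head;

static node_t *
find_locked(const char *key) {
	for (node_t *n = map_head; n != NULL; n = n->next) {
		if (strcmp(n->key, key) == 0) return n;
	}
	return NULL;
}

static node_t *
intern(const char *key) {
	pthread_mutex_lock(&map_mtx);
	node_t *n = find_locked(key);
	pthread_mutex_unlock(&map_mtx);   /* never allocate under the lock */
	if (n != NULL) return n;

	node_t *fresh = calloc(1, sizeof(*fresh));
	if (fresh == NULL) return NULL;
	strncpy(fresh->key, key, sizeof(fresh->key) - 1);

	pthread_mutex_lock(&map_mtx);
	n = find_locked(key);             /* re-check: another thread may have won */
	if (n == NULL) {
		fresh->next = map_head;
		map_head = fresh;
		n = fresh;
		fresh = NULL;
	}
	pthread_mutex_unlock(&map_mtx);
	free(fresh);                      /* lost the race: discard our copy */
	return n;
}
```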
*/ +size_t +prof_bt_count(void) { + size_t bt_count; + tsd_t *tsd; + prof_tdata_t *tdata; + + tsd = tsd_fetch(); + tdata = prof_tdata_get(tsd, false); + if (tdata == NULL) { + return 0; + } + + malloc_mutex_lock(tsd_tsdn(tsd), &bt2gctx_mtx); + bt_count = ckh_count(&bt2gctx); + malloc_mutex_unlock(tsd_tsdn(tsd), &bt2gctx_mtx); + + return bt_count; +} + +static void +prof_thread_name_write_tdata(prof_tdata_t *tdata, const char *thread_name) { + strncpy(tdata->thread_name, thread_name, PROF_THREAD_NAME_MAX_LEN); + tdata->thread_name[PROF_THREAD_NAME_MAX_LEN - 1] = '\0'; +} + +int +prof_thread_name_set_impl(tsd_t *tsd, const char *thread_name) { + assert(tsd_reentrancy_level_get(tsd) == 0); + assert(thread_name != NULL); + + for (unsigned i = 0; thread_name[i] != '\0'; i++) { + char c = thread_name[i]; + if (!isgraph(c) && !isblank(c)) { + return EINVAL; + } + } + + prof_tdata_t *tdata = prof_tdata_get(tsd, true); + if (tdata == NULL) { + return ENOMEM; + } + + prof_thread_name_write_tdata(tdata, thread_name); + + return 0; +} + +JEMALLOC_FORMAT_PRINTF(3, 4) +static void +prof_dump_printf(write_cb_t *prof_dump_write, void *cbopaque, + const char *format, ...) { + va_list ap; + char buf[PROF_PRINTF_BUFSIZE]; + + va_start(ap, format); + malloc_vsnprintf(buf, sizeof(buf), format, ap); + va_end(ap); + prof_dump_write(cbopaque, buf); +} + +/* + * Casting a double to a uint64_t may not necessarily be in range; this can be + * UB. I don't think this is practically possible with the cur counters, but + * plausibly could be with the accum counters. + */ +#ifdef JEMALLOC_PROF +static uint64_t +prof_double_uint64_cast(double d) { + /* + * Note: UINT64_MAX + 1 is exactly representable as a double on all + * reasonable platforms (certainly those we'll support). Writing this + * as !(a < b) instead of (a >= b) means that we're NaN-safe. + */ + double rounded = round(d); + if (!(rounded < (double)UINT64_MAX)) { + return UINT64_MAX; + } + return (uint64_t)rounded; +} +#endif + +void prof_unbias_map_init(void) { + /* See the comment in prof_sample_new_event_wait */ +#ifdef JEMALLOC_PROF + for (szind_t i = 0; i < SC_NSIZES; i++) { + double sz = (double)sz_index2size(i); + double rate = (double)(ZU(1) << lg_prof_sample); + double div_val = 1.0 - exp(-sz / rate); + double unbiased_sz = sz / div_val; + /* + * The "true" right value for the unbiased count is + * 1.0/(1 - exp(-sz/rate)). The problem is, we keep the counts + * as integers (for a variety of reasons -- rounding errors + * could trigger asserts, and not all libcs can properly handle + * floating point arithmetic during malloc calls inside libc). + * Rounding to an integer, though, can lead to rounding errors + * of over 30% for sizes close to the sampling rate. So + * instead, we multiply by a constant, dividing the maximum + * possible roundoff error by that constant. To avoid overflow + * in summing up size_t values, the largest safe constant we can + * pick is the size of the smallest allocation. + */ + double cnt_shift = (double)(ZU(1) << SC_LG_TINY_MIN); + double shifted_unbiased_cnt = cnt_shift / div_val; + prof_unbiased_sz[i] = (size_t)round(unbiased_sz); + prof_shifted_unbiased_cnt[i] = (size_t)round( + shifted_unbiased_cnt); + } +#else + unreachable(); +#endif +} + +/* + * The unbiasing story is long. The jeprof unbiasing logic was copied from + * pprof. Both shared an issue: they unbiased using the average size of the + * allocations at a particular stack trace. 
This can work out OK if allocations + * are mostly of the same size given some stack, but not otherwise. We now + * internally track what the unbiased results ought to be. We can't just report + * them as they are though; they'll still go through the jeprof unbiasing + * process. Instead, we figure out what values we can feed *into* jeprof's + * unbiasing mechanism that will lead to getting the right values out. + * + * It'll unbias count and aggregate size as: + * + * c_out = c_in * 1/(1-exp(-s_in/c_in/R) + * s_out = s_in * 1/(1-exp(-s_in/c_in/R) + * + * We want to solve for the values of c_in and s_in that will + * give the c_out and s_out that we've computed internally. + * + * Let's do a change of variables (both to make the math easier and to make it + * easier to write): + * x = s_in / c_in + * y = s_in + * k = 1/R. + * + * Then + * c_out = y/x * 1/(1-exp(-k*x)) + * s_out = y * 1/(1-exp(-k*x)) + * + * The first equation gives: + * y = x * c_out * (1-exp(-k*x)) + * The second gives: + * y = s_out * (1-exp(-k*x)) + * So we have + * x = s_out / c_out. + * And all the other values fall out from that. + * + * This is all a fair bit of work. The thing we get out of it is that we don't + * break backwards compatibility with jeprof (and the various tools that have + * copied its unbiasing logic). Eventually, we anticipate a v3 heap profile + * dump format based on JSON, at which point I think much of this logic can get + * cleaned up (since we'll be taking a compatibility break there anyways). + */ +static void +prof_do_unbias(uint64_t c_out_shifted_i, uint64_t s_out_i, uint64_t *r_c_in, + uint64_t *r_s_in) { +#ifdef JEMALLOC_PROF + if (c_out_shifted_i == 0 || s_out_i == 0) { + *r_c_in = 0; + *r_s_in = 0; + return; + } + /* + * See the note in prof_unbias_map_init() to see why we take c_out in a + * shifted form. 
+ */ + double c_out = (double)c_out_shifted_i + / (double)(ZU(1) << SC_LG_TINY_MIN); + double s_out = (double)s_out_i; + double R = (double)(ZU(1) << lg_prof_sample); + + double x = s_out / c_out; + double y = s_out * (1.0 - exp(-x / R)); + + double c_in = y / x; + double s_in = y; + + *r_c_in = prof_double_uint64_cast(c_in); + *r_s_in = prof_double_uint64_cast(s_in); +#else + unreachable(); +#endif +} + +static void +prof_dump_print_cnts(write_cb_t *prof_dump_write, void *cbopaque, + const prof_cnt_t *cnts) { + uint64_t curobjs; + uint64_t curbytes; + uint64_t accumobjs; + uint64_t accumbytes; + if (opt_prof_unbias) { + prof_do_unbias(cnts->curobjs_shifted_unbiased, + cnts->curbytes_unbiased, &curobjs, &curbytes); + prof_do_unbias(cnts->accumobjs_shifted_unbiased, + cnts->accumbytes_unbiased, &accumobjs, &accumbytes); + } else { + curobjs = cnts->curobjs; + curbytes = cnts->curbytes; + accumobjs = cnts->accumobjs; + accumbytes = cnts->accumbytes; + } + prof_dump_printf(prof_dump_write, cbopaque, + "%"FMTu64": %"FMTu64" [%"FMTu64": %"FMTu64"]", + curobjs, curbytes, accumobjs, accumbytes); +} + +static void +prof_tctx_merge_tdata(tsdn_t *tsdn, prof_tctx_t *tctx, prof_tdata_t *tdata) { + malloc_mutex_assert_owner(tsdn, tctx->tdata->lock); + + malloc_mutex_lock(tsdn, tctx->gctx->lock); + + switch (tctx->state) { + case prof_tctx_state_initializing: + malloc_mutex_unlock(tsdn, tctx->gctx->lock); + return; + case prof_tctx_state_nominal: + tctx->state = prof_tctx_state_dumping; + malloc_mutex_unlock(tsdn, tctx->gctx->lock); + + memcpy(&tctx->dump_cnts, &tctx->cnts, sizeof(prof_cnt_t)); + + tdata->cnt_summed.curobjs += tctx->dump_cnts.curobjs; + tdata->cnt_summed.curobjs_shifted_unbiased + += tctx->dump_cnts.curobjs_shifted_unbiased; + tdata->cnt_summed.curbytes += tctx->dump_cnts.curbytes; + tdata->cnt_summed.curbytes_unbiased + += tctx->dump_cnts.curbytes_unbiased; + if (opt_prof_accum) { + tdata->cnt_summed.accumobjs += + tctx->dump_cnts.accumobjs; + tdata->cnt_summed.accumobjs_shifted_unbiased += + tctx->dump_cnts.accumobjs_shifted_unbiased; + tdata->cnt_summed.accumbytes += + tctx->dump_cnts.accumbytes; + tdata->cnt_summed.accumbytes_unbiased += + tctx->dump_cnts.accumbytes_unbiased; + } + break; + case prof_tctx_state_dumping: + case prof_tctx_state_purgatory: + not_reached(); + } +} + +static void +prof_tctx_merge_gctx(tsdn_t *tsdn, prof_tctx_t *tctx, prof_gctx_t *gctx) { + malloc_mutex_assert_owner(tsdn, gctx->lock); + + gctx->cnt_summed.curobjs += tctx->dump_cnts.curobjs; + gctx->cnt_summed.curobjs_shifted_unbiased + += tctx->dump_cnts.curobjs_shifted_unbiased; + gctx->cnt_summed.curbytes += tctx->dump_cnts.curbytes; + gctx->cnt_summed.curbytes_unbiased += tctx->dump_cnts.curbytes_unbiased; + if (opt_prof_accum) { + gctx->cnt_summed.accumobjs += tctx->dump_cnts.accumobjs; + gctx->cnt_summed.accumobjs_shifted_unbiased + += tctx->dump_cnts.accumobjs_shifted_unbiased; + gctx->cnt_summed.accumbytes += tctx->dump_cnts.accumbytes; + gctx->cnt_summed.accumbytes_unbiased + += tctx->dump_cnts.accumbytes_unbiased; + } +} + +static prof_tctx_t * +prof_tctx_merge_iter(prof_tctx_tree_t *tctxs, prof_tctx_t *tctx, void *arg) { + tsdn_t *tsdn = (tsdn_t *)arg; + + malloc_mutex_assert_owner(tsdn, tctx->gctx->lock); + + switch (tctx->state) { + case prof_tctx_state_nominal: + /* New since dumping started; ignore. 
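The change of variables in prof_do_unbias() can be sanity-checked by running its outputs back through the forward model from the long comment above (c_out = c_in / (1 - e^(-s_in/c_in/R)), and likewise for s_out). A round-trip sketch with made-up inputs; lg_prof_sample = 19 is assumed here purely for illustration:

```c
#include <math.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
	double R = (double)((uint64_t)1 << 19);  /* 2^lg_prof_sample */
	double c_out = 40.0, s_out = 1e6;        /* internally tracked values */

	/* Inverse step, as in prof_do_unbias(): x = s_out/c_out. */
	double x = s_out / c_out;
	double y = s_out * (1.0 - exp(-x / R));
	double c_in = y / x, s_in = y;

	/* Forward (jeprof's) unbiasing should recover the outputs. */
	double scale = 1.0 / (1.0 - exp(-(s_in / c_in) / R));
	printf("c: %f ~ %f\n", c_in * scale, c_out);
	printf("s: %f ~ %f\n", s_in * scale, s_out);
	return 0;
}
```

Since s_in/c_in collapses back to x, the scale factor is exactly the one the inverse step divided out, so both counters round-trip to the internally tracked values.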
*/ + break; + case prof_tctx_state_dumping: + case prof_tctx_state_purgatory: + prof_tctx_merge_gctx(tsdn, tctx, tctx->gctx); + break; + case prof_tctx_state_initializing: + default: + not_reached(); + } + + return NULL; +} + +typedef struct prof_dump_iter_arg_s prof_dump_iter_arg_t; +struct prof_dump_iter_arg_s { + tsdn_t *tsdn; + write_cb_t *prof_dump_write; + void *cbopaque; +}; + +static prof_tctx_t * +prof_tctx_dump_iter(prof_tctx_tree_t *tctxs, prof_tctx_t *tctx, void *opaque) { + prof_dump_iter_arg_t *arg = (prof_dump_iter_arg_t *)opaque; + malloc_mutex_assert_owner(arg->tsdn, tctx->gctx->lock); + + switch (tctx->state) { + case prof_tctx_state_initializing: + case prof_tctx_state_nominal: + /* Not captured by this dump. */ + break; + case prof_tctx_state_dumping: + case prof_tctx_state_purgatory: + prof_dump_printf(arg->prof_dump_write, arg->cbopaque, + " t%"FMTu64": ", tctx->thr_uid); + prof_dump_print_cnts(arg->prof_dump_write, arg->cbopaque, + &tctx->dump_cnts); + arg->prof_dump_write(arg->cbopaque, "\n"); + break; + default: + not_reached(); + } + return NULL; +} + +static prof_tctx_t * +prof_tctx_finish_iter(prof_tctx_tree_t *tctxs, prof_tctx_t *tctx, void *arg) { + tsdn_t *tsdn = (tsdn_t *)arg; + prof_tctx_t *ret; + + malloc_mutex_assert_owner(tsdn, tctx->gctx->lock); + + switch (tctx->state) { + case prof_tctx_state_nominal: + /* New since dumping started; ignore. */ + break; + case prof_tctx_state_dumping: + tctx->state = prof_tctx_state_nominal; + break; + case prof_tctx_state_purgatory: + ret = tctx; + goto label_return; + case prof_tctx_state_initializing: + default: + not_reached(); + } + + ret = NULL; +label_return: + return ret; +} + +static void +prof_dump_gctx_prep(tsdn_t *tsdn, prof_gctx_t *gctx, prof_gctx_tree_t *gctxs) { + cassert(config_prof); + + malloc_mutex_lock(tsdn, gctx->lock); + + /* + * Increment nlimbo so that gctx won't go away before dump. + * Additionally, link gctx into the dump list so that it is included in + * prof_dump()'s second pass. + */ + gctx->nlimbo++; + gctx_tree_insert(gctxs, gctx); + + memset(&gctx->cnt_summed, 0, sizeof(prof_cnt_t)); + + malloc_mutex_unlock(tsdn, gctx->lock); +} + +typedef struct prof_gctx_merge_iter_arg_s prof_gctx_merge_iter_arg_t; +struct prof_gctx_merge_iter_arg_s { + tsdn_t *tsdn; + size_t *leak_ngctx; +}; + +static prof_gctx_t * +prof_gctx_merge_iter(prof_gctx_tree_t *gctxs, prof_gctx_t *gctx, void *opaque) { + prof_gctx_merge_iter_arg_t *arg = (prof_gctx_merge_iter_arg_t *)opaque; + + malloc_mutex_lock(arg->tsdn, gctx->lock); + tctx_tree_iter(&gctx->tctxs, NULL, prof_tctx_merge_iter, + (void *)arg->tsdn); + if (gctx->cnt_summed.curobjs != 0) { + (*arg->leak_ngctx)++; + } + malloc_mutex_unlock(arg->tsdn, gctx->lock); + + return NULL; +} + +static void +prof_gctx_finish(tsd_t *tsd, prof_gctx_tree_t *gctxs) { + prof_tdata_t *tdata = prof_tdata_get(tsd, false); + prof_gctx_t *gctx; + + /* + * Standard tree iteration won't work here, because as soon as we + * decrement gctx->nlimbo and unlock gctx, another thread can + * concurrently destroy it, which will corrupt the tree. Therefore, + * tear down the tree one node at a time during iteration. 
+ */ + while ((gctx = gctx_tree_first(gctxs)) != NULL) { + gctx_tree_remove(gctxs, gctx); + malloc_mutex_lock(tsd_tsdn(tsd), gctx->lock); + { + prof_tctx_t *next; + + next = NULL; + do { + prof_tctx_t *to_destroy = + tctx_tree_iter(&gctx->tctxs, next, + prof_tctx_finish_iter, + (void *)tsd_tsdn(tsd)); + if (to_destroy != NULL) { + next = tctx_tree_next(&gctx->tctxs, + to_destroy); + tctx_tree_remove(&gctx->tctxs, + to_destroy); + idalloctm(tsd_tsdn(tsd), to_destroy, + NULL, NULL, true, true); + } else { + next = NULL; + } + } while (next != NULL); + } + gctx->nlimbo--; + if (prof_gctx_should_destroy(gctx)) { + gctx->nlimbo++; + malloc_mutex_unlock(tsd_tsdn(tsd), gctx->lock); + prof_gctx_try_destroy(tsd, tdata, gctx); + } else { + malloc_mutex_unlock(tsd_tsdn(tsd), gctx->lock); + } + } +} + +typedef struct prof_tdata_merge_iter_arg_s prof_tdata_merge_iter_arg_t; +struct prof_tdata_merge_iter_arg_s { + tsdn_t *tsdn; + prof_cnt_t *cnt_all; +}; + +static prof_tdata_t * +prof_tdata_merge_iter(prof_tdata_tree_t *tdatas_ptr, prof_tdata_t *tdata, + void *opaque) { + prof_tdata_merge_iter_arg_t *arg = + (prof_tdata_merge_iter_arg_t *)opaque; + + malloc_mutex_lock(arg->tsdn, tdata->lock); + if (!tdata->expired) { + size_t tabind; + union { + prof_tctx_t *p; + void *v; + } tctx; + + tdata->dumping = true; + memset(&tdata->cnt_summed, 0, sizeof(prof_cnt_t)); + for (tabind = 0; !ckh_iter(&tdata->bt2tctx, &tabind, NULL, + &tctx.v);) { + prof_tctx_merge_tdata(arg->tsdn, tctx.p, tdata); + } + + arg->cnt_all->curobjs += tdata->cnt_summed.curobjs; + arg->cnt_all->curobjs_shifted_unbiased + += tdata->cnt_summed.curobjs_shifted_unbiased; + arg->cnt_all->curbytes += tdata->cnt_summed.curbytes; + arg->cnt_all->curbytes_unbiased + += tdata->cnt_summed.curbytes_unbiased; + if (opt_prof_accum) { + arg->cnt_all->accumobjs += tdata->cnt_summed.accumobjs; + arg->cnt_all->accumobjs_shifted_unbiased + += tdata->cnt_summed.accumobjs_shifted_unbiased; + arg->cnt_all->accumbytes += + tdata->cnt_summed.accumbytes; + arg->cnt_all->accumbytes_unbiased += + tdata->cnt_summed.accumbytes_unbiased; + } + } else { + tdata->dumping = false; + } + malloc_mutex_unlock(arg->tsdn, tdata->lock); + + return NULL; +} + +static prof_tdata_t * +prof_tdata_dump_iter(prof_tdata_tree_t *tdatas_ptr, prof_tdata_t *tdata, + void *opaque) { + if (!tdata->dumping) { + return NULL; + } + + prof_dump_iter_arg_t *arg = (prof_dump_iter_arg_t *)opaque; + prof_dump_printf(arg->prof_dump_write, arg->cbopaque, " t%"FMTu64": ", + tdata->thr_uid); + prof_dump_print_cnts(arg->prof_dump_write, arg->cbopaque, + &tdata->cnt_summed); + if (!prof_thread_name_empty(tdata)) { + arg->prof_dump_write(arg->cbopaque, " "); + arg->prof_dump_write(arg->cbopaque, tdata->thread_name); + } + arg->prof_dump_write(arg->cbopaque, "\n"); + return NULL; +} + +static void +prof_dump_header(prof_dump_iter_arg_t *arg, const prof_cnt_t *cnt_all) { + prof_dump_printf(arg->prof_dump_write, arg->cbopaque, + "heap_v2/%"FMTu64"\n t*: ", ((uint64_t)1U << lg_prof_sample)); + prof_dump_print_cnts(arg->prof_dump_write, arg->cbopaque, cnt_all); + arg->prof_dump_write(arg->cbopaque, "\n"); + + malloc_mutex_lock(arg->tsdn, &tdatas_mtx); + tdata_tree_iter(&tdatas, NULL, prof_tdata_dump_iter, arg); + malloc_mutex_unlock(arg->tsdn, &tdatas_mtx); +} + +static void +prof_dump_gctx(prof_dump_iter_arg_t *arg, prof_gctx_t *gctx, + const prof_bt_t *bt, prof_gctx_tree_t *gctxs) { + cassert(config_prof); + malloc_mutex_assert_owner(arg->tsdn, gctx->lock); + + /* Avoid dumping such gctx's that have no 
useful data. */ + if ((!opt_prof_accum && gctx->cnt_summed.curobjs == 0) || + (opt_prof_accum && gctx->cnt_summed.accumobjs == 0)) { + assert(gctx->cnt_summed.curobjs == 0); + assert(gctx->cnt_summed.curbytes == 0); + /* + * These asserts would not be correct -- see the comment on races + * in prof.c + * assert(gctx->cnt_summed.curobjs_unbiased == 0); + * assert(gctx->cnt_summed.curbytes_unbiased == 0); + */ + assert(gctx->cnt_summed.accumobjs == 0); + assert(gctx->cnt_summed.accumobjs_shifted_unbiased == 0); + assert(gctx->cnt_summed.accumbytes == 0); + assert(gctx->cnt_summed.accumbytes_unbiased == 0); + return; + } + + arg->prof_dump_write(arg->cbopaque, "@"); + for (unsigned i = 0; i < bt->len; i++) { + prof_dump_printf(arg->prof_dump_write, arg->cbopaque, + " %#"FMTxPTR, (uintptr_t)bt->vec[i]); + } + + arg->prof_dump_write(arg->cbopaque, "\n t*: "); + prof_dump_print_cnts(arg->prof_dump_write, arg->cbopaque, + &gctx->cnt_summed); + arg->prof_dump_write(arg->cbopaque, "\n"); + + tctx_tree_iter(&gctx->tctxs, NULL, prof_tctx_dump_iter, arg); +} + +/* + * See prof_sample_new_event_wait() comment for why the body of this function + * is conditionally compiled. + */ +static void +prof_leakcheck(const prof_cnt_t *cnt_all, size_t leak_ngctx) { +#ifdef JEMALLOC_PROF + /* + * Scaling is equivalent AdjustSamples() in jeprof, but the result may + * differ slightly from what jeprof reports, because here we scale the + * summary values, whereas jeprof scales each context individually and + * reports the sums of the scaled values. + */ + if (cnt_all->curbytes != 0) { + double sample_period = (double)((uint64_t)1 << lg_prof_sample); + double ratio = (((double)cnt_all->curbytes) / + (double)cnt_all->curobjs) / sample_period; + double scale_factor = 1.0 / (1.0 - exp(-ratio)); + uint64_t curbytes = (uint64_t)round(((double)cnt_all->curbytes) + * scale_factor); + uint64_t curobjs = (uint64_t)round(((double)cnt_all->curobjs) * + scale_factor); + + malloc_printf(": Leak approximation summary: ~%"FMTu64 + " byte%s, ~%"FMTu64" object%s, >= %zu context%s\n", + curbytes, (curbytes != 1) ? "s" : "", curobjs, (curobjs != + 1) ? "s" : "", leak_ngctx, (leak_ngctx != 1) ? "s" : ""); + malloc_printf( + ": Run jeprof on dump output for leak detail\n"); + if (opt_prof_leak_error) { + malloc_printf( + ": Exiting with error code because memory" + " leaks were detected\n"); + /* + * Use _exit() with underscore to avoid calling atexit() + * and entering endless cycle. + */ + _exit(1); + } + } +#endif +} + +static prof_gctx_t * +prof_gctx_dump_iter(prof_gctx_tree_t *gctxs, prof_gctx_t *gctx, void *opaque) { + prof_dump_iter_arg_t *arg = (prof_dump_iter_arg_t *)opaque; + malloc_mutex_lock(arg->tsdn, gctx->lock); + prof_dump_gctx(arg, gctx, &gctx->bt, gctxs); + malloc_mutex_unlock(arg->tsdn, gctx->lock); + return NULL; +} + +static void +prof_dump_prep(tsd_t *tsd, prof_tdata_t *tdata, prof_cnt_t *cnt_all, + size_t *leak_ngctx, prof_gctx_tree_t *gctxs) { + size_t tabind; + union { + prof_gctx_t *p; + void *v; + } gctx; + + prof_enter(tsd, tdata); + + /* + * Put gctx's in limbo and clear their counters in preparation for + * summing. + */ + gctx_tree_new(gctxs); + for (tabind = 0; !ckh_iter(&bt2gctx, &tabind, NULL, &gctx.v);) { + prof_dump_gctx_prep(tsd_tsdn(tsd), gctx.p, gctxs); + } + + /* + * Iterate over tdatas, and for the non-expired ones snapshot their tctx + * stats and merge them into the associated gctx's. 
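To make the leak-approximation scaling in prof_leakcheck() concrete: assuming the usual 2^19-byte sampling period and, say, 4 sampled objects totalling 3 MiB, the mean sampled object spans 1.5 sampling periods, giving a scale factor of about 1.29 (a worked instance with illustrative numbers, not output from a real run):

```c
#include <math.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
	double sample_period = (double)((uint64_t)1 << 19);
	double curbytes = 3.0 * 1024 * 1024, curobjs = 4.0;
	double ratio = (curbytes / curobjs) / sample_period;
	double scale = 1.0 / (1.0 - exp(-ratio));  /* ~1.29 when ratio = 1.5 */
	printf("~%.0f bytes, ~%.0f objects\n",
	    curbytes * scale, curobjs * scale);
	return 0;
}
```

The smaller the mean object relative to the sampling period, the more the sampler under-counts it, which is why the scale factor grows as ratio shrinks.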
+ */ + memset(cnt_all, 0, sizeof(prof_cnt_t)); + prof_tdata_merge_iter_arg_t prof_tdata_merge_iter_arg = {tsd_tsdn(tsd), + cnt_all}; + malloc_mutex_lock(tsd_tsdn(tsd), &tdatas_mtx); + tdata_tree_iter(&tdatas, NULL, prof_tdata_merge_iter, + &prof_tdata_merge_iter_arg); + malloc_mutex_unlock(tsd_tsdn(tsd), &tdatas_mtx); + + /* Merge tctx stats into gctx's. */ + *leak_ngctx = 0; + prof_gctx_merge_iter_arg_t prof_gctx_merge_iter_arg = {tsd_tsdn(tsd), + leak_ngctx}; + gctx_tree_iter(gctxs, NULL, prof_gctx_merge_iter, + &prof_gctx_merge_iter_arg); + + prof_leave(tsd, tdata); +} + +void +prof_dump_impl(tsd_t *tsd, write_cb_t *prof_dump_write, void *cbopaque, + prof_tdata_t *tdata, bool leakcheck) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_dump_mtx); + prof_cnt_t cnt_all; + size_t leak_ngctx; + prof_gctx_tree_t gctxs; + prof_dump_prep(tsd, tdata, &cnt_all, &leak_ngctx, &gctxs); + prof_dump_iter_arg_t prof_dump_iter_arg = {tsd_tsdn(tsd), + prof_dump_write, cbopaque}; + prof_dump_header(&prof_dump_iter_arg, &cnt_all); + gctx_tree_iter(&gctxs, NULL, prof_gctx_dump_iter, &prof_dump_iter_arg); + prof_gctx_finish(tsd, &gctxs); + if (leakcheck) { + prof_leakcheck(&cnt_all, leak_ngctx); + } +} + +/* Used in unit tests. */ +void +prof_cnt_all(prof_cnt_t *cnt_all) { + tsd_t *tsd = tsd_fetch(); + prof_tdata_t *tdata = prof_tdata_get(tsd, false); + if (tdata == NULL) { + memset(cnt_all, 0, sizeof(prof_cnt_t)); + } else { + size_t leak_ngctx; + prof_gctx_tree_t gctxs; + prof_dump_prep(tsd, tdata, cnt_all, &leak_ngctx, &gctxs); + prof_gctx_finish(tsd, &gctxs); + } +} + +void +prof_bt_hash(const void *key, size_t r_hash[2]) { + prof_bt_t *bt = (prof_bt_t *)key; + + cassert(config_prof); + + hash(bt->vec, bt->len * sizeof(void *), 0x94122f33U, r_hash); +} + +bool +prof_bt_keycomp(const void *k1, const void *k2) { + const prof_bt_t *bt1 = (prof_bt_t *)k1; + const prof_bt_t *bt2 = (prof_bt_t *)k2; + + cassert(config_prof); + + if (bt1->len != bt2->len) { + return false; + } + return (memcmp(bt1->vec, bt2->vec, bt1->len * sizeof(void *)) == 0); +} + +prof_tdata_t * +prof_tdata_init_impl(tsd_t *tsd, uint64_t thr_uid, uint64_t thr_discrim, + char *thread_name, bool active) { + assert(tsd_reentrancy_level_get(tsd) == 0); + + prof_tdata_t *tdata; + + cassert(config_prof); + + /* Initialize an empty cache for this thread. 
*/ + size_t tdata_sz = ALIGNMENT_CEILING(sizeof(prof_tdata_t), QUANTUM); + size_t total_sz = tdata_sz + sizeof(void *) * opt_prof_bt_max; + tdata = (prof_tdata_t *)iallocztm(tsd_tsdn(tsd), + total_sz, sz_size2index(total_sz), false, NULL, true, + arena_get(TSDN_NULL, 0, true), true); + if (tdata == NULL) { + return NULL; + } + + tdata->vec = (void **)((byte_t *)tdata + tdata_sz); + tdata->lock = prof_tdata_mutex_choose(thr_uid); + tdata->thr_uid = thr_uid; + tdata->thr_discrim = thr_discrim; + tdata->attached = true; + tdata->expired = false; + tdata->tctx_uid_next = 0; + if (thread_name == NULL) { + prof_thread_name_clear(tdata); + } else { + prof_thread_name_write_tdata(tdata, thread_name); + } + prof_thread_name_assert(tdata); + + if (ckh_new(tsd, &tdata->bt2tctx, PROF_CKH_MINITEMS, prof_bt_hash, + prof_bt_keycomp)) { + idalloctm(tsd_tsdn(tsd), tdata, NULL, NULL, true, true); + return NULL; + } + + tdata->enq = false; + tdata->enq_idump = false; + tdata->enq_gdump = false; + + tdata->dumping = false; + tdata->active = active; + + malloc_mutex_lock(tsd_tsdn(tsd), &tdatas_mtx); + tdata_tree_insert(&tdatas, tdata); + malloc_mutex_unlock(tsd_tsdn(tsd), &tdatas_mtx); + + return tdata; +} + +static bool +prof_tdata_should_destroy_unlocked(prof_tdata_t *tdata, bool even_if_attached) { + if (tdata->attached && !even_if_attached) { + return false; + } + if (ckh_count(&tdata->bt2tctx) != 0) { + return false; + } + return true; +} + +static bool +prof_tdata_should_destroy(tsdn_t *tsdn, prof_tdata_t *tdata, + bool even_if_attached) { + malloc_mutex_assert_owner(tsdn, tdata->lock); + + return prof_tdata_should_destroy_unlocked(tdata, even_if_attached); +} + +static void +prof_tdata_destroy_locked(tsd_t *tsd, prof_tdata_t *tdata, + bool even_if_attached) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &tdatas_mtx); + malloc_mutex_assert_not_owner(tsd_tsdn(tsd), tdata->lock); + + tdata_tree_remove(&tdatas, tdata); + assert(prof_tdata_should_destroy_unlocked(tdata, even_if_attached)); + + ckh_delete(tsd, &tdata->bt2tctx); + idalloctm(tsd_tsdn(tsd), tdata, NULL, NULL, true, true); +} + +static void +prof_tdata_destroy(tsd_t *tsd, prof_tdata_t *tdata, bool even_if_attached) { + malloc_mutex_lock(tsd_tsdn(tsd), &tdatas_mtx); + prof_tdata_destroy_locked(tsd, tdata, even_if_attached); + malloc_mutex_unlock(tsd_tsdn(tsd), &tdatas_mtx); +} + +void +prof_tdata_detach(tsd_t *tsd, prof_tdata_t *tdata) { + bool destroy_tdata; + + malloc_mutex_lock(tsd_tsdn(tsd), tdata->lock); + if (tdata->attached) { + destroy_tdata = prof_tdata_should_destroy(tsd_tsdn(tsd), tdata, + true); + /* + * Only detach if !destroy_tdata, because detaching would allow + * another thread to win the race to destroy tdata. + */ + if (!destroy_tdata) { + tdata->attached = false; + } + tsd_prof_tdata_set(tsd, NULL); + } else { + destroy_tdata = false; + } + malloc_mutex_unlock(tsd_tsdn(tsd), tdata->lock); + if (destroy_tdata) { + prof_tdata_destroy(tsd, tdata, true); + } +} + +static bool +prof_tdata_expire(tsdn_t *tsdn, prof_tdata_t *tdata) { + bool destroy_tdata; + + malloc_mutex_lock(tsdn, tdata->lock); + if (!tdata->expired) { + tdata->expired = true; + destroy_tdata = prof_tdata_should_destroy(tsdn, tdata, false); + } else { + destroy_tdata = false; + } + malloc_mutex_unlock(tsdn, tdata->lock); + + return destroy_tdata; +} + +static prof_tdata_t * +prof_tdata_reset_iter(prof_tdata_tree_t *tdatas_ptr, prof_tdata_t *tdata, + void *arg) { + tsdn_t *tsdn = (tsdn_t *)arg; + + return (prof_tdata_expire(tsdn, tdata) ? 
tdata : NULL); +} + +void +prof_reset(tsd_t *tsd, size_t lg_sample) { + prof_tdata_t *next; + + assert(lg_sample < (sizeof(uint64_t) << 3)); + + malloc_mutex_lock(tsd_tsdn(tsd), &prof_dump_mtx); + malloc_mutex_lock(tsd_tsdn(tsd), &tdatas_mtx); + + lg_prof_sample = lg_sample; + prof_unbias_map_init(); + + next = NULL; + do { + prof_tdata_t *to_destroy = tdata_tree_iter(&tdatas, next, + prof_tdata_reset_iter, (void *)tsd); + if (to_destroy != NULL) { + next = tdata_tree_next(&tdatas, to_destroy); + prof_tdata_destroy_locked(tsd, to_destroy, false); + } else { + next = NULL; + } + } while (next != NULL); + + malloc_mutex_unlock(tsd_tsdn(tsd), &tdatas_mtx); + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_dump_mtx); +} + +static bool +prof_tctx_should_destroy(tsd_t *tsd, prof_tctx_t *tctx) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), tctx->tdata->lock); + + if (opt_prof_accum) { + return false; + } + if (tctx->cnts.curobjs != 0) { + return false; + } + if (tctx->prepared) { + return false; + } + if (tctx->recent_count != 0) { + return false; + } + return true; +} + +static void +prof_tctx_destroy(tsd_t *tsd, prof_tctx_t *tctx) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), tctx->tdata->lock); + + assert(tctx->cnts.curobjs == 0); + assert(tctx->cnts.curbytes == 0); + /* + * These asserts are not correct -- see the comment about races in + * prof.c + * + * assert(tctx->cnts.curobjs_shifted_unbiased == 0); + * assert(tctx->cnts.curbytes_unbiased == 0); + */ + assert(!opt_prof_accum); + assert(tctx->cnts.accumobjs == 0); + assert(tctx->cnts.accumbytes == 0); + /* + * These ones are, since accumbyte counts never go down. Either + * prof_accum is off (in which case these should never have changed from + * their initial value of zero), or it's on (in which case we shouldn't + * be destroying this tctx). + */ + assert(tctx->cnts.accumobjs_shifted_unbiased == 0); + assert(tctx->cnts.accumbytes_unbiased == 0); + + prof_gctx_t *gctx = tctx->gctx; + + { + prof_tdata_t *tdata = tctx->tdata; + tctx->tdata = NULL; + ckh_remove(tsd, &tdata->bt2tctx, &gctx->bt, NULL, NULL); + bool destroy_tdata = prof_tdata_should_destroy(tsd_tsdn(tsd), + tdata, false); + malloc_mutex_unlock(tsd_tsdn(tsd), tdata->lock); + if (destroy_tdata) { + prof_tdata_destroy(tsd, tdata, false); + } + } + + bool destroy_tctx, destroy_gctx; + + malloc_mutex_lock(tsd_tsdn(tsd), gctx->lock); + switch (tctx->state) { + case prof_tctx_state_nominal: + tctx_tree_remove(&gctx->tctxs, tctx); + destroy_tctx = true; + if (prof_gctx_should_destroy(gctx)) { + /* + * Increment gctx->nlimbo in order to keep another + * thread from winning the race to destroy gctx while + * this one has gctx->lock dropped. Without this, it + * would be possible for another thread to: + * + * 1) Sample an allocation associated with gctx. + * 2) Deallocate the sampled object. + * 3) Successfully prof_gctx_try_destroy(gctx). + * + * The result would be that gctx no longer exists by the + * time this thread accesses it in + * prof_gctx_try_destroy(). + */ + gctx->nlimbo++; + destroy_gctx = true; + } else { + destroy_gctx = false; + } + break; + case prof_tctx_state_dumping: + /* + * A dumping thread needs tctx to remain valid until dumping + * has finished. Change state such that the dumping thread will + * complete destruction during a late dump iteration phase. 
+ */ + tctx->state = prof_tctx_state_purgatory; + destroy_tctx = false; + destroy_gctx = false; + break; + case prof_tctx_state_initializing: + case prof_tctx_state_purgatory: + default: + not_reached(); + destroy_tctx = false; + destroy_gctx = false; + } + malloc_mutex_unlock(tsd_tsdn(tsd), gctx->lock); + if (destroy_gctx) { + prof_gctx_try_destroy(tsd, prof_tdata_get(tsd, false), gctx); + } + if (destroy_tctx) { + idalloctm(tsd_tsdn(tsd), tctx, NULL, NULL, true, true); + } +} + +void +prof_tctx_try_destroy(tsd_t *tsd, prof_tctx_t *tctx) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), tctx->tdata->lock); + if (prof_tctx_should_destroy(tsd, tctx)) { + /* tctx->tdata->lock will be released in prof_tctx_destroy(). */ + prof_tctx_destroy(tsd, tctx); + } else { + malloc_mutex_unlock(tsd_tsdn(tsd), tctx->tdata->lock); + } +} + +/******************************************************************************/ diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/prof_log.c b/src/duckdb/extension/jemalloc/jemalloc/src/prof_log.c new file mode 100644 index 000000000..f4000aecf --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/prof_log.c @@ -0,0 +1,716 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/buf_writer.h" +#include "jemalloc/internal/ckh.h" +#include "jemalloc/internal/emitter.h" +#include "jemalloc/internal/hash.h" +#include "jemalloc/internal/malloc_io.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/prof_data.h" +#include "jemalloc/internal/prof_log.h" +#include "jemalloc/internal/prof_sys.h" + +bool opt_prof_log = false; +typedef enum prof_logging_state_e prof_logging_state_t; +enum prof_logging_state_e { + prof_logging_state_stopped, + prof_logging_state_started, + prof_logging_state_dumping +}; + +/* + * - stopped: log_start never called, or previous log_stop has completed. + * - started: log_start called, log_stop not called yet. Allocations are logged. + * - dumping: log_stop called but not finished; samples are not logged anymore. + */ +static prof_logging_state_t prof_logging_state = prof_logging_state_stopped; + +/* Used in unit tests. */ +static bool prof_log_dummy = false; + +/* Incremented for every log file that is output. */ +static uint64_t log_seq = 0; +static char log_filename[ + /* Minimize memory bloat for non-prof builds. */ +#ifdef JEMALLOC_PROF + PATH_MAX + +#endif + 1]; + +/* Timestamp for most recent call to log_start(). */ +static nstime_t log_start_timestamp; + +/* Increment these when adding to the log_bt and log_thr linked lists. */ +static size_t log_bt_index = 0; +static size_t log_thr_index = 0; + +/* Linked list node definitions. These are only used in this file. */ +typedef struct prof_bt_node_s prof_bt_node_t; + +struct prof_bt_node_s { + prof_bt_node_t *next; + size_t index; + prof_bt_t bt; + /* Variable size backtrace vector pointed to by bt. */ + void *vec[1]; +}; + +typedef struct prof_thr_node_s prof_thr_node_t; + +struct prof_thr_node_s { + prof_thr_node_t *next; + size_t index; + uint64_t thr_uid; + /* Variable size based on thr_name_sz. */ + char name[1]; +}; + +typedef struct prof_alloc_node_s prof_alloc_node_t; + +/* This is output when logging sampled allocations. */ +struct prof_alloc_node_s { + prof_alloc_node_t *next; + /* Indices into an array of thread data. */ + size_t alloc_thr_ind; + size_t free_thr_ind; + + /* Indices into an array of backtraces. 
*/ + size_t alloc_bt_ind; + size_t free_bt_ind; + + uint64_t alloc_time_ns; + uint64_t free_time_ns; + + size_t usize; +}; + +/* + * Created on the first call to prof_try_log and deleted on prof_log_stop. + * These are the backtraces and threads that have already been logged by an + * allocation. + */ +static bool log_tables_initialized = false; +static ckh_t log_bt_node_set; +static ckh_t log_thr_node_set; + +/* Store linked lists for logged data. */ +static prof_bt_node_t *log_bt_first = NULL; +static prof_bt_node_t *log_bt_last = NULL; +static prof_thr_node_t *log_thr_first = NULL; +static prof_thr_node_t *log_thr_last = NULL; +static prof_alloc_node_t *log_alloc_first = NULL; +static prof_alloc_node_t *log_alloc_last = NULL; + +/* Protects the prof_logging_state and any log_{...} variable. */ +malloc_mutex_t log_mtx; + +/******************************************************************************/ +/* + * Function prototypes for static functions that are referenced prior to + * definition. + */ + +/* Hashtable functions for log_bt_node_set and log_thr_node_set. */ +static void prof_thr_node_hash(const void *key, size_t r_hash[2]); +static bool prof_thr_node_keycomp(const void *k1, const void *k2); +static void prof_bt_node_hash(const void *key, size_t r_hash[2]); +static bool prof_bt_node_keycomp(const void *k1, const void *k2); + +/******************************************************************************/ + +static size_t +prof_log_bt_index(tsd_t *tsd, prof_bt_t *bt) { + assert(prof_logging_state == prof_logging_state_started); + malloc_mutex_assert_owner(tsd_tsdn(tsd), &log_mtx); + + prof_bt_node_t dummy_node; + dummy_node.bt = *bt; + prof_bt_node_t *node; + + /* See if this backtrace is already cached in the table. */ + if (ckh_search(&log_bt_node_set, (void *)(&dummy_node), + (void **)(&node), NULL)) { + size_t sz = offsetof(prof_bt_node_t, vec) + + (bt->len * sizeof(void *)); + prof_bt_node_t *new_node = (prof_bt_node_t *) + iallocztm(tsd_tsdn(tsd), sz, sz_size2index(sz), false, NULL, + true, arena_get(TSDN_NULL, 0, true), true); + if (log_bt_first == NULL) { + log_bt_first = new_node; + log_bt_last = new_node; + } else { + log_bt_last->next = new_node; + log_bt_last = new_node; + } + + new_node->next = NULL; + new_node->index = log_bt_index; + /* + * Copy the backtrace: bt is inside a tdata or gctx, which + * might die before prof_log_stop is called. + */ + new_node->bt.len = bt->len; + memcpy(new_node->vec, bt->vec, bt->len * sizeof(void *)); + new_node->bt.vec = new_node->vec; + + log_bt_index++; + ckh_insert(tsd, &log_bt_node_set, (void *)new_node, NULL); + return new_node->index; + } else { + return node->index; + } +} + +static size_t +prof_log_thr_index(tsd_t *tsd, uint64_t thr_uid, const char *name) { + assert(prof_logging_state == prof_logging_state_started); + malloc_mutex_assert_owner(tsd_tsdn(tsd), &log_mtx); + + prof_thr_node_t dummy_node; + dummy_node.thr_uid = thr_uid; + prof_thr_node_t *node; + + /* See if this thread is already cached in the table. 
*/ + if (ckh_search(&log_thr_node_set, (void *)(&dummy_node), + (void **)(&node), NULL)) { + size_t sz = offsetof(prof_thr_node_t, name) + strlen(name) + 1; + prof_thr_node_t *new_node = (prof_thr_node_t *) + iallocztm(tsd_tsdn(tsd), sz, sz_size2index(sz), false, NULL, + true, arena_get(TSDN_NULL, 0, true), true); + if (log_thr_first == NULL) { + log_thr_first = new_node; + log_thr_last = new_node; + } else { + log_thr_last->next = new_node; + log_thr_last = new_node; + } + + new_node->next = NULL; + new_node->index = log_thr_index; + new_node->thr_uid = thr_uid; + strcpy(new_node->name, name); + + log_thr_index++; + ckh_insert(tsd, &log_thr_node_set, (void *)new_node, NULL); + return new_node->index; + } else { + return node->index; + } +} + +JEMALLOC_COLD +void +prof_try_log(tsd_t *tsd, size_t usize, prof_info_t *prof_info) { + cassert(config_prof); + prof_tctx_t *tctx = prof_info->alloc_tctx; + malloc_mutex_assert_owner(tsd_tsdn(tsd), tctx->tdata->lock); + + prof_tdata_t *cons_tdata = prof_tdata_get(tsd, false); + if (cons_tdata == NULL) { + /* + * We decide not to log these allocations. cons_tdata will be + * NULL only when the current thread is in a weird state (e.g. + * it's being destroyed). + */ + return; + } + + malloc_mutex_lock(tsd_tsdn(tsd), &log_mtx); + + if (prof_logging_state != prof_logging_state_started) { + goto label_done; + } + + if (!log_tables_initialized) { + bool err1 = ckh_new(tsd, &log_bt_node_set, PROF_CKH_MINITEMS, + prof_bt_node_hash, prof_bt_node_keycomp); + bool err2 = ckh_new(tsd, &log_thr_node_set, PROF_CKH_MINITEMS, + prof_thr_node_hash, prof_thr_node_keycomp); + if (err1 || err2) { + goto label_done; + } + log_tables_initialized = true; + } + + nstime_t alloc_time = prof_info->alloc_time; + nstime_t free_time; + nstime_prof_init_update(&free_time); + + size_t sz = sizeof(prof_alloc_node_t); + prof_alloc_node_t *new_node = (prof_alloc_node_t *) + iallocztm(tsd_tsdn(tsd), sz, sz_size2index(sz), false, NULL, true, + arena_get(TSDN_NULL, 0, true), true); + + const char *prod_thr_name = tctx->tdata->thread_name; + const char *cons_thr_name = prof_thread_name_get(tsd); + + prof_bt_t bt; + /* Initialize the backtrace, using the buffer in tdata to store it. */ + bt_init(&bt, cons_tdata->vec); + prof_backtrace(tsd, &bt); + prof_bt_t *cons_bt = &bt; + + /* We haven't destroyed tctx yet, so gctx should be good to read. 
*/ + prof_bt_t *prod_bt = &tctx->gctx->bt; + + new_node->next = NULL; + new_node->alloc_thr_ind = prof_log_thr_index(tsd, tctx->tdata->thr_uid, + prod_thr_name); + new_node->free_thr_ind = prof_log_thr_index(tsd, cons_tdata->thr_uid, + cons_thr_name); + new_node->alloc_bt_ind = prof_log_bt_index(tsd, prod_bt); + new_node->free_bt_ind = prof_log_bt_index(tsd, cons_bt); + new_node->alloc_time_ns = nstime_ns(&alloc_time); + new_node->free_time_ns = nstime_ns(&free_time); + new_node->usize = usize; + + if (log_alloc_first == NULL) { + log_alloc_first = new_node; + log_alloc_last = new_node; + } else { + log_alloc_last->next = new_node; + log_alloc_last = new_node; + } + +label_done: + malloc_mutex_unlock(tsd_tsdn(tsd), &log_mtx); +} + +static void +prof_bt_node_hash(const void *key, size_t r_hash[2]) { + const prof_bt_node_t *bt_node = (prof_bt_node_t *)key; + prof_bt_hash((void *)(&bt_node->bt), r_hash); +} + +static bool +prof_bt_node_keycomp(const void *k1, const void *k2) { + const prof_bt_node_t *bt_node1 = (prof_bt_node_t *)k1; + const prof_bt_node_t *bt_node2 = (prof_bt_node_t *)k2; + return prof_bt_keycomp((void *)(&bt_node1->bt), + (void *)(&bt_node2->bt)); +} + +static void +prof_thr_node_hash(const void *key, size_t r_hash[2]) { + const prof_thr_node_t *thr_node = (prof_thr_node_t *)key; + hash(&thr_node->thr_uid, sizeof(uint64_t), 0x94122f35U, r_hash); +} + +static bool +prof_thr_node_keycomp(const void *k1, const void *k2) { + const prof_thr_node_t *thr_node1 = (prof_thr_node_t *)k1; + const prof_thr_node_t *thr_node2 = (prof_thr_node_t *)k2; + return thr_node1->thr_uid == thr_node2->thr_uid; +} + +/* Used in unit tests. */ +size_t +prof_log_bt_count(void) { + cassert(config_prof); + size_t cnt = 0; + prof_bt_node_t *node = log_bt_first; + while (node != NULL) { + cnt++; + node = node->next; + } + return cnt; +} + +/* Used in unit tests. */ +size_t +prof_log_alloc_count(void) { + cassert(config_prof); + size_t cnt = 0; + prof_alloc_node_t *node = log_alloc_first; + while (node != NULL) { + cnt++; + node = node->next; + } + return cnt; +} + +/* Used in unit tests. */ +size_t +prof_log_thr_count(void) { + cassert(config_prof); + size_t cnt = 0; + prof_thr_node_t *node = log_thr_first; + while (node != NULL) { + cnt++; + node = node->next; + } + return cnt; +} + +/* Used in unit tests. */ +bool +prof_log_is_logging(void) { + cassert(config_prof); + return prof_logging_state == prof_logging_state_started; +} + +/* Used in unit tests. 
*/
+bool
+prof_log_rep_check(void) {
+	cassert(config_prof);
+	if (prof_logging_state == prof_logging_state_stopped
+	    && log_tables_initialized) {
+		return true;
+	}
+
+	if (log_bt_last != NULL && log_bt_last->next != NULL) {
+		return true;
+	}
+	if (log_thr_last != NULL && log_thr_last->next != NULL) {
+		return true;
+	}
+	if (log_alloc_last != NULL && log_alloc_last->next != NULL) {
+		return true;
+	}
+
+	size_t bt_count = prof_log_bt_count();
+	size_t thr_count = prof_log_thr_count();
+	size_t alloc_count = prof_log_alloc_count();
+
+	if (prof_logging_state == prof_logging_state_stopped) {
+		if (bt_count != 0 || thr_count != 0 || alloc_count != 0) {
+			return true;
+		}
+	}
+
+	prof_alloc_node_t *node = log_alloc_first;
+	while (node != NULL) {
+		if (node->alloc_bt_ind >= bt_count) {
+			return true;
+		}
+		if (node->free_bt_ind >= bt_count) {
+			return true;
+		}
+		if (node->alloc_thr_ind >= thr_count) {
+			return true;
+		}
+		if (node->free_thr_ind >= thr_count) {
+			return true;
+		}
+		if (node->alloc_time_ns > node->free_time_ns) {
+			return true;
+		}
+		node = node->next;
+	}
+
+	return false;
+}
+
+/* Used in unit tests. */
+void
+prof_log_dummy_set(bool new_value) {
+	cassert(config_prof);
+	prof_log_dummy = new_value;
+}
+
+/* Used as an atexit function to stop logging on exit. */
+static void
+prof_log_stop_final(void) {
+	tsd_t *tsd = tsd_fetch();
+	prof_log_stop(tsd_tsdn(tsd));
+}
+
+JEMALLOC_COLD
+bool
+prof_log_start(tsdn_t *tsdn, const char *filename) {
+	cassert(config_prof);
+
+	if (!opt_prof) {
+		return true;
+	}
+
+	bool ret = false;
+
+	malloc_mutex_lock(tsdn, &log_mtx);
+
+	static bool prof_log_atexit_called = false;
+	if (!prof_log_atexit_called) {
+		prof_log_atexit_called = true;
+		if (atexit(prof_log_stop_final) != 0) {
+			malloc_write("<jemalloc>: Error in atexit() "
+			    "for logging\n");
+			if (opt_abort) {
+				abort();
+			}
+			ret = true;
+			goto label_done;
+		}
+	}
+
+	if (prof_logging_state != prof_logging_state_stopped) {
+		ret = true;
+	} else if (filename == NULL) {
+		/* Make default name. */
+		prof_get_default_filename(tsdn, log_filename, log_seq);
+		log_seq++;
+		prof_logging_state = prof_logging_state_started;
+	} else if (strlen(filename) >= PROF_DUMP_FILENAME_LEN) {
+		ret = true;
+	} else {
+		strcpy(log_filename, filename);
+		prof_logging_state = prof_logging_state_started;
+	}
+
+	if (!ret) {
+		nstime_prof_init_update(&log_start_timestamp);
+	}
+label_done:
+	malloc_mutex_unlock(tsdn, &log_mtx);
+
+	return ret;
+}
+
+struct prof_emitter_cb_arg_s {
+	int fd;
+	ssize_t ret;
+};
+
+static void
+prof_emitter_write_cb(void *opaque, const char *to_write) {
+	struct prof_emitter_cb_arg_s *arg =
+	    (struct prof_emitter_cb_arg_s *)opaque;
+	size_t bytes = strlen(to_write);
+	if (prof_log_dummy) {
+		return;
+	}
+	arg->ret = malloc_write_fd(arg->fd, to_write, bytes);
+}
+
+/*
+ * prof_log_emit_{...} goes through the appropriate linked list, emitting each
+ * node to the json and deallocating it.
+ */ +static void +prof_log_emit_threads(tsd_t *tsd, emitter_t *emitter) { + emitter_json_array_kv_begin(emitter, "threads"); + prof_thr_node_t *thr_node = log_thr_first; + prof_thr_node_t *thr_old_node; + while (thr_node != NULL) { + emitter_json_object_begin(emitter); + + emitter_json_kv(emitter, "thr_uid", emitter_type_uint64, + &thr_node->thr_uid); + + char *thr_name = thr_node->name; + + emitter_json_kv(emitter, "thr_name", emitter_type_string, + &thr_name); + + emitter_json_object_end(emitter); + thr_old_node = thr_node; + thr_node = thr_node->next; + idalloctm(tsd_tsdn(tsd), thr_old_node, NULL, NULL, true, true); + } + emitter_json_array_end(emitter); +} + +static void +prof_log_emit_traces(tsd_t *tsd, emitter_t *emitter) { + emitter_json_array_kv_begin(emitter, "stack_traces"); + prof_bt_node_t *bt_node = log_bt_first; + prof_bt_node_t *bt_old_node; + /* + * Calculate how many hex digits we need: twice number of bytes, two for + * "0x", and then one more for terminating '\0'. + */ + char buf[2 * sizeof(intptr_t) + 3]; + size_t buf_sz = sizeof(buf); + while (bt_node != NULL) { + emitter_json_array_begin(emitter); + size_t i; + for (i = 0; i < bt_node->bt.len; i++) { + malloc_snprintf(buf, buf_sz, "%p", bt_node->bt.vec[i]); + char *trace_str = buf; + emitter_json_value(emitter, emitter_type_string, + &trace_str); + } + emitter_json_array_end(emitter); + + bt_old_node = bt_node; + bt_node = bt_node->next; + idalloctm(tsd_tsdn(tsd), bt_old_node, NULL, NULL, true, true); + } + emitter_json_array_end(emitter); +} + +static void +prof_log_emit_allocs(tsd_t *tsd, emitter_t *emitter) { + emitter_json_array_kv_begin(emitter, "allocations"); + prof_alloc_node_t *alloc_node = log_alloc_first; + prof_alloc_node_t *alloc_old_node; + while (alloc_node != NULL) { + emitter_json_object_begin(emitter); + + emitter_json_kv(emitter, "alloc_thread", emitter_type_size, + &alloc_node->alloc_thr_ind); + + emitter_json_kv(emitter, "free_thread", emitter_type_size, + &alloc_node->free_thr_ind); + + emitter_json_kv(emitter, "alloc_trace", emitter_type_size, + &alloc_node->alloc_bt_ind); + + emitter_json_kv(emitter, "free_trace", emitter_type_size, + &alloc_node->free_bt_ind); + + emitter_json_kv(emitter, "alloc_timestamp", + emitter_type_uint64, &alloc_node->alloc_time_ns); + + emitter_json_kv(emitter, "free_timestamp", emitter_type_uint64, + &alloc_node->free_time_ns); + + emitter_json_kv(emitter, "usize", emitter_type_uint64, + &alloc_node->usize); + + emitter_json_object_end(emitter); + + alloc_old_node = alloc_node; + alloc_node = alloc_node->next; + idalloctm(tsd_tsdn(tsd), alloc_old_node, NULL, NULL, true, + true); + } + emitter_json_array_end(emitter); +} + +static void +prof_log_emit_metadata(emitter_t *emitter) { + emitter_json_object_kv_begin(emitter, "info"); + + nstime_t now; + + nstime_prof_init_update(&now); + uint64_t ns = nstime_ns(&now) - nstime_ns(&log_start_timestamp); + emitter_json_kv(emitter, "duration", emitter_type_uint64, &ns); + + char *vers = JEMALLOC_VERSION; + emitter_json_kv(emitter, "version", + emitter_type_string, &vers); + + emitter_json_kv(emitter, "lg_sample_rate", + emitter_type_int, &lg_prof_sample); + + const char *res_type = prof_time_res_mode_names[opt_prof_time_res]; + emitter_json_kv(emitter, "prof_time_resolution", emitter_type_string, + &res_type); + + int pid = prof_getpid(); + emitter_json_kv(emitter, "pid", emitter_type_int, &pid); + + emitter_json_object_end(emitter); +} + +#define PROF_LOG_STOP_BUFSIZE PROF_DUMP_BUFSIZE +JEMALLOC_COLD +bool 
+prof_log_stop(tsdn_t *tsdn) {
+	cassert(config_prof);
+	if (!opt_prof || !prof_booted) {
+		return true;
+	}
+
+	tsd_t *tsd = tsdn_tsd(tsdn);
+	malloc_mutex_lock(tsdn, &log_mtx);
+
+	if (prof_logging_state != prof_logging_state_started) {
+		malloc_mutex_unlock(tsdn, &log_mtx);
+		return true;
+	}
+
+	/*
+	 * Set the state to dumping. We'll set it to stopped when we're done.
+	 * Since other threads won't be able to start/stop/log when the state is
+	 * dumping, we don't have to hold the lock during the whole method.
+	 */
+	prof_logging_state = prof_logging_state_dumping;
+	malloc_mutex_unlock(tsdn, &log_mtx);
+
+	emitter_t emitter;
+
+	/* Create a file. */
+
+	int fd;
+	if (prof_log_dummy) {
+		fd = 0;
+	} else {
+		fd = creat(log_filename, 0644);
+	}
+
+	if (fd == -1) {
+		malloc_printf("<jemalloc>: creat() for log file \"%s\" "
+		    " failed with %d\n", log_filename, errno);
+		if (opt_abort) {
+			abort();
+		}
+		return true;
+	}
+
+	struct prof_emitter_cb_arg_s arg;
+	arg.fd = fd;
+
+	buf_writer_t buf_writer;
+	buf_writer_init(tsdn, &buf_writer, prof_emitter_write_cb, &arg, NULL,
+	    PROF_LOG_STOP_BUFSIZE);
+	emitter_init(&emitter, emitter_output_json_compact, buf_writer_cb,
+	    &buf_writer);
+
+	emitter_begin(&emitter);
+	prof_log_emit_metadata(&emitter);
+	prof_log_emit_threads(tsd, &emitter);
+	prof_log_emit_traces(tsd, &emitter);
+	prof_log_emit_allocs(tsd, &emitter);
+	emitter_end(&emitter);
+
+	buf_writer_terminate(tsdn, &buf_writer);
+
+	/* Reset global state. */
+	if (log_tables_initialized) {
+		ckh_delete(tsd, &log_bt_node_set);
+		ckh_delete(tsd, &log_thr_node_set);
+	}
+	log_tables_initialized = false;
+	log_bt_index = 0;
+	log_thr_index = 0;
+	log_bt_first = NULL;
+	log_bt_last = NULL;
+	log_thr_first = NULL;
+	log_thr_last = NULL;
+	log_alloc_first = NULL;
+	log_alloc_last = NULL;
+
+	malloc_mutex_lock(tsdn, &log_mtx);
+	prof_logging_state = prof_logging_state_stopped;
+	malloc_mutex_unlock(tsdn, &log_mtx);
+
+	if (prof_log_dummy) {
+		return false;
+	}
+	return close(fd) || arg.ret == -1;
+}
+#undef PROF_LOG_STOP_BUFSIZE
+
+JEMALLOC_COLD
+bool
+prof_log_init(tsd_t *tsd) {
+	cassert(config_prof);
+	if (malloc_mutex_init(&log_mtx, "prof_log",
+	    WITNESS_RANK_PROF_LOG, malloc_mutex_rank_exclusive)) {
+		return true;
+	}
+
+	if (opt_prof_log) {
+		prof_log_start(tsd_tsdn(tsd), NULL);
+	}
+
+	return false;
+}
+
+/******************************************************************************/
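The logging lifecycle in this file is normally driven through jemalloc's mallctl interface rather than by calling prof_log_start/prof_log_stop directly. A minimal usage sketch, assuming the experimental "prof.log_start"/"prof.log_stop" control names and a profiling-enabled build; the output filename here is hypothetical:

#include <jemalloc/jemalloc.h>
#include <stdlib.h>

static void
log_a_workload(void) {
	/* Assumes opt_prof is enabled; both calls return 0 on success. */
	const char *filename = "prof_log.json"; /* hypothetical output name */
	if (mallctl("prof.log_start", NULL, NULL, (void *)&filename,
	    sizeof(const char *)) != 0) {
		return;
	}
	void *p = malloc(1 << 20); /* sampled allocations get logged */
	free(p);
	mallctl("prof.log_stop", NULL, NULL, NULL, 0); /* writes the JSON dump */
}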
diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/prof_recent.c b/src/duckdb/extension/jemalloc/jemalloc/src/prof_recent.c
new file mode 100644
index 000000000..b5639b4cb
--- /dev/null
+++ b/src/duckdb/extension/jemalloc/jemalloc/src/prof_recent.c
@@ -0,0 +1,602 @@
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/jemalloc_internal_includes.h"
+
+#include "jemalloc/internal/assert.h"
+#include "jemalloc/internal/buf_writer.h"
+#include "jemalloc/internal/emitter.h"
+#include "jemalloc/internal/prof_data.h"
+#include "jemalloc/internal/prof_recent.h"
+
+ssize_t opt_prof_recent_alloc_max = PROF_RECENT_ALLOC_MAX_DEFAULT;
+malloc_mutex_t prof_recent_alloc_mtx; /* Protects the fields below */
+static atomic_zd_t prof_recent_alloc_max;
+static ssize_t prof_recent_alloc_count = 0;
+prof_recent_list_t prof_recent_alloc_list;
+
+malloc_mutex_t prof_recent_dump_mtx; /* Protects dumping. */
+
+static void
+prof_recent_alloc_max_init(void) {
+	atomic_store_zd(&prof_recent_alloc_max, opt_prof_recent_alloc_max,
+	    ATOMIC_RELAXED);
+}
+
+static inline ssize_t
+prof_recent_alloc_max_get_no_lock(void) {
+	return atomic_load_zd(&prof_recent_alloc_max, ATOMIC_RELAXED);
+}
+
+static inline ssize_t
+prof_recent_alloc_max_get(tsd_t *tsd) {
+	malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx);
+	return prof_recent_alloc_max_get_no_lock();
+}
+
+static inline ssize_t
+prof_recent_alloc_max_update(tsd_t *tsd, ssize_t max) {
+	malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx);
+	ssize_t old_max = prof_recent_alloc_max_get(tsd);
+	atomic_store_zd(&prof_recent_alloc_max, max, ATOMIC_RELAXED);
+	return old_max;
+}
+
+static prof_recent_t *
+prof_recent_allocate_node(tsdn_t *tsdn) {
+	return (prof_recent_t *)iallocztm(tsdn, sizeof(prof_recent_t),
+	    sz_size2index(sizeof(prof_recent_t)), false, NULL, true,
+	    arena_get(tsdn, 0, false), true);
+}
+
+static void
+prof_recent_free_node(tsdn_t *tsdn, prof_recent_t *node) {
+	assert(node != NULL);
+	assert(isalloc(tsdn, node) == sz_s2u(sizeof(prof_recent_t)));
+	idalloctm(tsdn, node, NULL, NULL, true, true);
+}
+
+static inline void
+increment_recent_count(tsd_t *tsd, prof_tctx_t *tctx) {
+	malloc_mutex_assert_owner(tsd_tsdn(tsd), tctx->tdata->lock);
+	++tctx->recent_count;
+	assert(tctx->recent_count > 0);
+}
+
+bool
+prof_recent_alloc_prepare(tsd_t *tsd, prof_tctx_t *tctx) {
+	cassert(config_prof);
+	assert(opt_prof && prof_booted);
+	malloc_mutex_assert_owner(tsd_tsdn(tsd), tctx->tdata->lock);
+	malloc_mutex_assert_not_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx);
+
+	/*
+	 * Check whether last-N mode is turned on without trying to acquire the
+	 * lock, so as to optimize for the following two scenarios:
+	 * (1) Last-N mode is switched off;
+	 * (2) Dumping, during which last-N mode is temporarily turned off so
+	 * as not to block sampled allocations.
+	 */
+	if (prof_recent_alloc_max_get_no_lock() == 0) {
+		return false;
+	}
+
+	/*
+	 * Increment recent_count to hold the tctx so that it won't be gone
+	 * even after tctx->tdata->lock is released. This acts as a
+	 * "placeholder"; the real recording of the allocation requires a lock
+	 * on prof_recent_alloc_mtx and is done in prof_recent_alloc (when
+	 * tctx->tdata->lock has been released).
+	 */
+	increment_recent_count(tsd, tctx);
+	return true;
+}
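prof_recent_alloc_prepare's unlocked check works because prof_recent_alloc_max is an atomic that is only written while prof_recent_alloc_mtx is held; a stale read on the fast path is harmless as long as the slow path revalidates under the lock. A stand-alone sketch of the same pattern, with hypothetical names and C11 primitives standing in for jemalloc's internal atomics:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

static atomic_long limit;	/* written only while mtx is held */
static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;

static bool
try_record(void) {
	/* Fast path: feature off, take no lock; a stale read is fine. */
	if (atomic_load_explicit(&limit, memory_order_relaxed) == 0) {
		return false;
	}
	pthread_mutex_lock(&mtx);
	/* Slow path: revalidate, since limit may have become 0 meanwhile. */
	bool ok = atomic_load_explicit(&limit, memory_order_relaxed) != 0;
	if (ok) {
		/* ... record the allocation, as prof_recent_alloc() does ... */
	}
	pthread_mutex_unlock(&mtx);
	return ok;
}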
+
+static void
+decrement_recent_count(tsd_t *tsd, prof_tctx_t *tctx) {
+	malloc_mutex_assert_not_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx);
+	assert(tctx != NULL);
+	malloc_mutex_lock(tsd_tsdn(tsd), tctx->tdata->lock);
+	assert(tctx->recent_count > 0);
+	--tctx->recent_count;
+	prof_tctx_try_destroy(tsd, tctx);
+}
+
+static inline edata_t *
+prof_recent_alloc_edata_get_no_lock(const prof_recent_t *n) {
+	return (edata_t *)atomic_load_p(&n->alloc_edata, ATOMIC_ACQUIRE);
+}
+
+edata_t *
+prof_recent_alloc_edata_get_no_lock_test(const prof_recent_t *n) {
+	cassert(config_prof);
+	return prof_recent_alloc_edata_get_no_lock(n);
+}
+
+static inline edata_t *
+prof_recent_alloc_edata_get(tsd_t *tsd, const prof_recent_t *n) {
+	malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx);
+	return prof_recent_alloc_edata_get_no_lock(n);
+}
+
+static void
+prof_recent_alloc_edata_set(tsd_t *tsd, prof_recent_t *n, edata_t *edata) {
+	malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx);
+	atomic_store_p(&n->alloc_edata, edata, ATOMIC_RELEASE);
+}
+
+void
+edata_prof_recent_alloc_init(edata_t *edata) {
+	cassert(config_prof);
+	edata_prof_recent_alloc_set_dont_call_directly(edata, NULL);
+}
+
+static inline prof_recent_t *
+edata_prof_recent_alloc_get_no_lock(const edata_t *edata) {
+	cassert(config_prof);
+	return edata_prof_recent_alloc_get_dont_call_directly(edata);
+}
+
+prof_recent_t *
+edata_prof_recent_alloc_get_no_lock_test(const edata_t *edata) {
+	cassert(config_prof);
+	return edata_prof_recent_alloc_get_no_lock(edata);
+}
+
+static inline prof_recent_t *
+edata_prof_recent_alloc_get(tsd_t *tsd, const edata_t *edata) {
+	malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx);
+	prof_recent_t *recent_alloc =
+	    edata_prof_recent_alloc_get_no_lock(edata);
+	assert(recent_alloc == NULL ||
+	    prof_recent_alloc_edata_get(tsd, recent_alloc) == edata);
+	return recent_alloc;
+}
+
+static prof_recent_t *
+edata_prof_recent_alloc_update_internal(tsd_t *tsd, edata_t *edata,
+    prof_recent_t *recent_alloc) {
+	malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx);
+	prof_recent_t *old_recent_alloc =
+	    edata_prof_recent_alloc_get(tsd, edata);
+	edata_prof_recent_alloc_set_dont_call_directly(edata, recent_alloc);
+	return old_recent_alloc;
+}
+
+static void
+edata_prof_recent_alloc_set(tsd_t *tsd, edata_t *edata,
+    prof_recent_t *recent_alloc) {
+	malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx);
+	assert(recent_alloc != NULL);
+	prof_recent_t *old_recent_alloc =
+	    edata_prof_recent_alloc_update_internal(tsd, edata, recent_alloc);
+	assert(old_recent_alloc == NULL);
+	prof_recent_alloc_edata_set(tsd, recent_alloc, edata);
+}
+
+static void
+edata_prof_recent_alloc_reset(tsd_t *tsd, edata_t *edata,
+    prof_recent_t *recent_alloc) {
+	malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx);
+	assert(recent_alloc != NULL);
+	prof_recent_t *old_recent_alloc =
+	    edata_prof_recent_alloc_update_internal(tsd, edata, NULL);
+	assert(old_recent_alloc == recent_alloc);
+	assert(edata == prof_recent_alloc_edata_get(tsd, recent_alloc));
+	prof_recent_alloc_edata_set(tsd, recent_alloc, NULL);
+}
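The set/reset pair above maintains a two-way link: the extent points at its recent-allocation record and the record points back at the extent, so either side can find and detach the other. A reduced sketch of the invariant they enforce, using hypothetical simplified types in place of the real edata_t/prof_recent_t:

#include <assert.h>
#include <stddef.h>

typedef struct extent_s extent_sketch_t;	/* stand-ins, not the real structs */
typedef struct record_s record_sketch_t;
struct extent_s { record_sketch_t *recent; };
struct record_s { extent_sketch_t *edata; };

static void
link_record(extent_sketch_t *e, record_sketch_t *r) {
	assert(e->recent == NULL);	/* set() requires no prior record */
	e->recent = r;
	r->edata = e;			/* both directions now agree */
}

static void
unlink_record(extent_sketch_t *e, record_sketch_t *r) {
	assert(e->recent == r && r->edata == e);
	e->recent = NULL;		/* reset() severs both sides */
	r->edata = NULL;
}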
deallocation. + */ +void +prof_recent_alloc_reset(tsd_t *tsd, edata_t *edata) { + cassert(config_prof); + /* + * Check whether the recent allocation record still exists without + * trying to acquire the lock. + */ + if (edata_prof_recent_alloc_get_no_lock(edata) == NULL) { + return; + } + + prof_tctx_t *dalloc_tctx = prof_tctx_create(tsd); + /* + * In case dalloc_tctx is NULL, e.g. due to OOM, we will not record the + * deallocation time / tctx, which is handled later, after we check + * again when holding the lock. + */ + + if (dalloc_tctx != NULL) { + malloc_mutex_lock(tsd_tsdn(tsd), dalloc_tctx->tdata->lock); + increment_recent_count(tsd, dalloc_tctx); + dalloc_tctx->prepared = false; + malloc_mutex_unlock(tsd_tsdn(tsd), dalloc_tctx->tdata->lock); + } + + malloc_mutex_lock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + /* Check again after acquiring the lock. */ + prof_recent_t *recent = edata_prof_recent_alloc_get(tsd, edata); + if (recent != NULL) { + assert(nstime_equals_zero(&recent->dalloc_time)); + assert(recent->dalloc_tctx == NULL); + if (dalloc_tctx != NULL) { + nstime_prof_update(&recent->dalloc_time); + recent->dalloc_tctx = dalloc_tctx; + dalloc_tctx = NULL; + } + edata_prof_recent_alloc_reset(tsd, edata, recent); + } + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + + if (dalloc_tctx != NULL) { + /* We lost the rase - the allocation record was just gone. */ + decrement_recent_count(tsd, dalloc_tctx); + } +} + +static void +prof_recent_alloc_evict_edata(tsd_t *tsd, prof_recent_t *recent_alloc) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + edata_t *edata = prof_recent_alloc_edata_get(tsd, recent_alloc); + if (edata != NULL) { + edata_prof_recent_alloc_reset(tsd, edata, recent_alloc); + } +} + +static bool +prof_recent_alloc_is_empty(tsd_t *tsd) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + if (ql_empty(&prof_recent_alloc_list)) { + assert(prof_recent_alloc_count == 0); + return true; + } else { + assert(prof_recent_alloc_count > 0); + return false; + } +} + +static void +prof_recent_alloc_assert_count(tsd_t *tsd) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + if (!config_debug) { + return; + } + ssize_t count = 0; + prof_recent_t *n; + ql_foreach(n, &prof_recent_alloc_list, link) { + ++count; + } + assert(count == prof_recent_alloc_count); + assert(prof_recent_alloc_max_get(tsd) == -1 || + count <= prof_recent_alloc_max_get(tsd)); +} + +void +prof_recent_alloc(tsd_t *tsd, edata_t *edata, size_t size, size_t usize) { + cassert(config_prof); + assert(edata != NULL); + prof_tctx_t *tctx = edata_prof_tctx_get(edata); + + malloc_mutex_assert_not_owner(tsd_tsdn(tsd), tctx->tdata->lock); + malloc_mutex_lock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + prof_recent_alloc_assert_count(tsd); + + /* + * Reserve a new prof_recent_t node if needed. If needed, we release + * the prof_recent_alloc_mtx lock and allocate. Then, rather than + * immediately checking for OOM, we regain the lock and try to make use + * of the reserve node if needed. 
+	 *
+	 *          \ now | no need | need but OOMed | need and allocated
+	 *  later    \    |         |                |
+	 * ------------------------------------------------------------
+	 *  no need      | (1)     | (2)            | (3)
+	 * ------------------------------------------------------------
+	 *  need         | (4)     | (5)            | (6)
+	 *
+	 * First, "(4)" never happens, because we don't release the lock in the
+	 * middle if there's no need for a new node; in such cases "(1)" always
+	 * takes place, which is trivial.
+	 *
+	 * Out of the remaining four scenarios, "(6)" is the common case and is
+	 * trivial. "(5)" is also trivial, in which case we'll rollback the
+	 * effect of prof_recent_alloc_prepare() as expected.
+	 *
+	 * "(2)" / "(3)" occurs when the need for a new node is gone after we
+	 * regain the lock. If the new node is successfully allocated, i.e. in
+	 * the case of "(3)", we'll release it in the end; otherwise, i.e. in
+	 * the case of "(2)", we do nothing - we're lucky that the OOM ends up
+	 * doing no harm at all.
+	 *
+	 * Therefore, the only performance cost of the "release lock" ->
+	 * "allocate" -> "regain lock" design is the "(3)" case, but it happens
+	 * very rarely, so the cost is relatively small compared to the gain of
+	 * not having to have the lock order of prof_recent_alloc_mtx above all
+	 * the allocation locks.
+	 */
+	prof_recent_t *reserve = NULL;
+	if (prof_recent_alloc_max_get(tsd) == -1 ||
+	    prof_recent_alloc_count < prof_recent_alloc_max_get(tsd)) {
+		assert(prof_recent_alloc_max_get(tsd) != 0);
+		malloc_mutex_unlock(tsd_tsdn(tsd), &prof_recent_alloc_mtx);
+		reserve = prof_recent_allocate_node(tsd_tsdn(tsd));
+		malloc_mutex_lock(tsd_tsdn(tsd), &prof_recent_alloc_mtx);
+		prof_recent_alloc_assert_count(tsd);
+	}
+
+	if (prof_recent_alloc_max_get(tsd) == 0) {
+		assert(prof_recent_alloc_is_empty(tsd));
+		goto label_rollback;
+	}
+
+	prof_tctx_t *old_alloc_tctx, *old_dalloc_tctx;
+	if (prof_recent_alloc_count == prof_recent_alloc_max_get(tsd)) {
+		/* If upper limit is reached, rotate the head. */
+		assert(prof_recent_alloc_max_get(tsd) != -1);
+		assert(!prof_recent_alloc_is_empty(tsd));
+		prof_recent_t *head = ql_first(&prof_recent_alloc_list);
+		old_alloc_tctx = head->alloc_tctx;
+		assert(old_alloc_tctx != NULL);
+		old_dalloc_tctx = head->dalloc_tctx;
+		prof_recent_alloc_evict_edata(tsd, head);
+		ql_rotate(&prof_recent_alloc_list, link);
+	} else {
+		/* Otherwise make use of the new node. */
+		assert(prof_recent_alloc_max_get(tsd) == -1 ||
+		    prof_recent_alloc_count < prof_recent_alloc_max_get(tsd));
+		if (reserve == NULL) {
+			goto label_rollback;
+		}
+		ql_elm_new(reserve, link);
+		ql_tail_insert(&prof_recent_alloc_list, reserve, link);
+		reserve = NULL;
+		old_alloc_tctx = NULL;
+		old_dalloc_tctx = NULL;
+		++prof_recent_alloc_count;
+	}
+
+	/* Fill content into the tail node. */
+	prof_recent_t *tail = ql_last(&prof_recent_alloc_list, link);
+	assert(tail != NULL);
+	tail->size = size;
+	tail->usize = usize;
+	nstime_copy(&tail->alloc_time, edata_prof_alloc_time_get(edata));
+	tail->alloc_tctx = tctx;
+	nstime_init_zero(&tail->dalloc_time);
+	tail->dalloc_tctx = NULL;
+	edata_prof_recent_alloc_set(tsd, edata, tail);
+
+	assert(!prof_recent_alloc_is_empty(tsd));
+	prof_recent_alloc_assert_count(tsd);
+	malloc_mutex_unlock(tsd_tsdn(tsd), &prof_recent_alloc_mtx);
+
+	if (reserve != NULL) {
+		prof_recent_free_node(tsd_tsdn(tsd), reserve);
+	}
+
+	/*
+	 * Asynchronously handle the tctx of the old node, so that there's no
+	 * simultaneous holdings of prof_recent_alloc_mtx and tdata->lock.
+ * In the worst case this may delay the tctx release but it's better + * than holding prof_recent_alloc_mtx for longer. + */ + if (old_alloc_tctx != NULL) { + decrement_recent_count(tsd, old_alloc_tctx); + } + if (old_dalloc_tctx != NULL) { + decrement_recent_count(tsd, old_dalloc_tctx); + } + return; + +label_rollback: + assert(edata_prof_recent_alloc_get(tsd, edata) == NULL); + prof_recent_alloc_assert_count(tsd); + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + if (reserve != NULL) { + prof_recent_free_node(tsd_tsdn(tsd), reserve); + } + decrement_recent_count(tsd, tctx); +} + +ssize_t +prof_recent_alloc_max_ctl_read(void) { + cassert(config_prof); + /* Don't bother to acquire the lock. */ + return prof_recent_alloc_max_get_no_lock(); +} + +static void +prof_recent_alloc_restore_locked(tsd_t *tsd, prof_recent_list_t *to_delete) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + ssize_t max = prof_recent_alloc_max_get(tsd); + if (max == -1 || prof_recent_alloc_count <= max) { + /* Easy case - no need to alter the list. */ + ql_new(to_delete); + prof_recent_alloc_assert_count(tsd); + return; + } + + prof_recent_t *node; + ql_foreach(node, &prof_recent_alloc_list, link) { + if (prof_recent_alloc_count == max) { + break; + } + prof_recent_alloc_evict_edata(tsd, node); + --prof_recent_alloc_count; + } + assert(prof_recent_alloc_count == max); + + ql_move(to_delete, &prof_recent_alloc_list); + if (max == 0) { + assert(node == NULL); + } else { + assert(node != NULL); + ql_split(to_delete, node, &prof_recent_alloc_list, link); + } + assert(!ql_empty(to_delete)); + prof_recent_alloc_assert_count(tsd); +} + +static void +prof_recent_alloc_async_cleanup(tsd_t *tsd, prof_recent_list_t *to_delete) { + malloc_mutex_assert_not_owner(tsd_tsdn(tsd), &prof_recent_dump_mtx); + malloc_mutex_assert_not_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + while (!ql_empty(to_delete)) { + prof_recent_t *node = ql_first(to_delete); + ql_remove(to_delete, node, link); + decrement_recent_count(tsd, node->alloc_tctx); + if (node->dalloc_tctx != NULL) { + decrement_recent_count(tsd, node->dalloc_tctx); + } + prof_recent_free_node(tsd_tsdn(tsd), node); + } +} + +ssize_t +prof_recent_alloc_max_ctl_write(tsd_t *tsd, ssize_t max) { + cassert(config_prof); + assert(max >= -1); + malloc_mutex_lock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + prof_recent_alloc_assert_count(tsd); + const ssize_t old_max = prof_recent_alloc_max_update(tsd, max); + prof_recent_list_t to_delete; + prof_recent_alloc_restore_locked(tsd, &to_delete); + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + prof_recent_alloc_async_cleanup(tsd, &to_delete); + return old_max; +} + +static void +prof_recent_alloc_dump_bt(emitter_t *emitter, prof_tctx_t *tctx) { + char bt_buf[2 * sizeof(intptr_t) + 3]; + char *s = bt_buf; + assert(tctx != NULL); + prof_bt_t *bt = &tctx->gctx->bt; + for (size_t i = 0; i < bt->len; ++i) { + malloc_snprintf(bt_buf, sizeof(bt_buf), "%p", bt->vec[i]); + emitter_json_value(emitter, emitter_type_string, &s); + } +} + +static void +prof_recent_alloc_dump_node(emitter_t *emitter, prof_recent_t *node) { + emitter_json_object_begin(emitter); + + emitter_json_kv(emitter, "size", emitter_type_size, &node->size); + emitter_json_kv(emitter, "usize", emitter_type_size, &node->usize); + bool released = prof_recent_alloc_edata_get_no_lock(node) == NULL; + emitter_json_kv(emitter, "released", emitter_type_bool, &released); + + emitter_json_kv(emitter, "alloc_thread_uid", emitter_type_uint64, + 
&node->alloc_tctx->thr_uid); + prof_tdata_t *alloc_tdata = node->alloc_tctx->tdata; + assert(alloc_tdata != NULL); + if (!prof_thread_name_empty(alloc_tdata)) { + const char *thread_name = alloc_tdata->thread_name; + emitter_json_kv(emitter, "alloc_thread_name", + emitter_type_string, &thread_name); + } + uint64_t alloc_time_ns = nstime_ns(&node->alloc_time); + emitter_json_kv(emitter, "alloc_time", emitter_type_uint64, + &alloc_time_ns); + emitter_json_array_kv_begin(emitter, "alloc_trace"); + prof_recent_alloc_dump_bt(emitter, node->alloc_tctx); + emitter_json_array_end(emitter); + + if (released && node->dalloc_tctx != NULL) { + emitter_json_kv(emitter, "dalloc_thread_uid", + emitter_type_uint64, &node->dalloc_tctx->thr_uid); + prof_tdata_t *dalloc_tdata = node->dalloc_tctx->tdata; + assert(dalloc_tdata != NULL); + if (!prof_thread_name_empty(dalloc_tdata)) { + const char *thread_name = dalloc_tdata->thread_name; + emitter_json_kv(emitter, "dalloc_thread_name", + emitter_type_string, &thread_name); + } + assert(!nstime_equals_zero(&node->dalloc_time)); + uint64_t dalloc_time_ns = nstime_ns(&node->dalloc_time); + emitter_json_kv(emitter, "dalloc_time", emitter_type_uint64, + &dalloc_time_ns); + emitter_json_array_kv_begin(emitter, "dalloc_trace"); + prof_recent_alloc_dump_bt(emitter, node->dalloc_tctx); + emitter_json_array_end(emitter); + } + + emitter_json_object_end(emitter); +} + +#define PROF_RECENT_PRINT_BUFSIZE 65536 +JEMALLOC_COLD +void +prof_recent_alloc_dump(tsd_t *tsd, write_cb_t *write_cb, void *cbopaque) { + cassert(config_prof); + malloc_mutex_lock(tsd_tsdn(tsd), &prof_recent_dump_mtx); + buf_writer_t buf_writer; + buf_writer_init(tsd_tsdn(tsd), &buf_writer, write_cb, cbopaque, NULL, + PROF_RECENT_PRINT_BUFSIZE); + emitter_t emitter; + emitter_init(&emitter, emitter_output_json_compact, buf_writer_cb, + &buf_writer); + prof_recent_list_t temp_list; + + malloc_mutex_lock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + prof_recent_alloc_assert_count(tsd); + ssize_t dump_max = prof_recent_alloc_max_get(tsd); + ql_move(&temp_list, &prof_recent_alloc_list); + ssize_t dump_count = prof_recent_alloc_count; + prof_recent_alloc_count = 0; + prof_recent_alloc_assert_count(tsd); + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + + emitter_begin(&emitter); + uint64_t sample_interval = (uint64_t)1U << lg_prof_sample; + emitter_json_kv(&emitter, "sample_interval", emitter_type_uint64, + &sample_interval); + emitter_json_kv(&emitter, "recent_alloc_max", emitter_type_ssize, + &dump_max); + emitter_json_array_kv_begin(&emitter, "recent_alloc"); + prof_recent_t *node; + ql_foreach(node, &temp_list, link) { + prof_recent_alloc_dump_node(&emitter, node); + } + emitter_json_array_end(&emitter); + emitter_end(&emitter); + + malloc_mutex_lock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + prof_recent_alloc_assert_count(tsd); + ql_concat(&temp_list, &prof_recent_alloc_list, link); + ql_move(&prof_recent_alloc_list, &temp_list); + prof_recent_alloc_count += dump_count; + prof_recent_alloc_restore_locked(tsd, &temp_list); + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + + buf_writer_terminate(tsd_tsdn(tsd), &buf_writer); + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_recent_dump_mtx); + + prof_recent_alloc_async_cleanup(tsd, &temp_list); +} +#undef PROF_RECENT_PRINT_BUFSIZE + +bool +prof_recent_init(void) { + cassert(config_prof); + prof_recent_alloc_max_init(); + + if (malloc_mutex_init(&prof_recent_alloc_mtx, "prof_recent_alloc", + WITNESS_RANK_PROF_RECENT_ALLOC, 
malloc_mutex_rank_exclusive)) {
+		return true;
+	}
+
+	if (malloc_mutex_init(&prof_recent_dump_mtx, "prof_recent_dump",
+	    WITNESS_RANK_PROF_RECENT_DUMP, malloc_mutex_rank_exclusive)) {
+		return true;
+	}
+
+	ql_new(&prof_recent_alloc_list);
+
+	return false;
+}
diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/prof_stats.c b/src/duckdb/extension/jemalloc/jemalloc/src/prof_stats.c
new file mode 100644
index 000000000..5d1a506bb
--- /dev/null
+++ b/src/duckdb/extension/jemalloc/jemalloc/src/prof_stats.c
@@ -0,0 +1,57 @@
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/jemalloc_internal_includes.h"
+
+#include "jemalloc/internal/prof_stats.h"
+
+bool opt_prof_stats = false;
+malloc_mutex_t prof_stats_mtx;
+static prof_stats_t prof_stats_live[PROF_SC_NSIZES];
+static prof_stats_t prof_stats_accum[PROF_SC_NSIZES];
+
+static void
+prof_stats_enter(tsd_t *tsd, szind_t ind) {
+	assert(opt_prof && opt_prof_stats);
+	assert(ind < SC_NSIZES);
+	malloc_mutex_lock(tsd_tsdn(tsd), &prof_stats_mtx);
+}
+
+static void
+prof_stats_leave(tsd_t *tsd) {
+	malloc_mutex_unlock(tsd_tsdn(tsd), &prof_stats_mtx);
+}
+
+void
+prof_stats_inc(tsd_t *tsd, szind_t ind, size_t size) {
+	cassert(config_prof);
+	prof_stats_enter(tsd, ind);
+	prof_stats_live[ind].req_sum += size;
+	prof_stats_live[ind].count++;
+	prof_stats_accum[ind].req_sum += size;
+	prof_stats_accum[ind].count++;
+	prof_stats_leave(tsd);
+}
+
+void
+prof_stats_dec(tsd_t *tsd, szind_t ind, size_t size) {
+	cassert(config_prof);
+	prof_stats_enter(tsd, ind);
+	prof_stats_live[ind].req_sum -= size;
+	prof_stats_live[ind].count--;
+	prof_stats_leave(tsd);
+}
+
+void
+prof_stats_get_live(tsd_t *tsd, szind_t ind, prof_stats_t *stats) {
+	cassert(config_prof);
+	prof_stats_enter(tsd, ind);
+	memcpy(stats, &prof_stats_live[ind], sizeof(prof_stats_t));
+	prof_stats_leave(tsd);
+}
+
+void
+prof_stats_get_accum(tsd_t *tsd, szind_t ind, prof_stats_t *stats) {
+	cassert(config_prof);
+	prof_stats_enter(tsd, ind);
+	memcpy(stats, &prof_stats_accum[ind], sizeof(prof_stats_t));
+	prof_stats_leave(tsd);
+}
diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/prof_sys.c b/src/duckdb/extension/jemalloc/jemalloc/src/prof_sys.c
new file mode 100644
index 000000000..8a904040e
--- /dev/null
+++ b/src/duckdb/extension/jemalloc/jemalloc/src/prof_sys.c
@@ -0,0 +1,944 @@
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/jemalloc_internal_includes.h"
+
+#include "jemalloc/internal/buf_writer.h"
+#include "jemalloc/internal/ctl.h"
+#include "jemalloc/internal/prof_data.h"
+#include "jemalloc/internal/prof_sys.h"
+
+#ifdef JEMALLOC_PROF_LIBUNWIND
+#define UNW_LOCAL_ONLY
+#include <libunwind.h>
+#endif
+
+#ifdef JEMALLOC_PROF_LIBGCC
+/*
+ * We have a circular dependency -- jemalloc_internal.h tells us if we should
+ * use libgcc's unwinding functionality, but after we've included that, we've
+ * already hooked _Unwind_Backtrace. We'll temporarily disable hooking.
+ */
+#undef _Unwind_Backtrace
+#include <unwind.h>
+#define _Unwind_Backtrace JEMALLOC_TEST_HOOK(_Unwind_Backtrace, test_hooks_libc_hook)
+#endif
+
+/******************************************************************************/
+
+malloc_mutex_t prof_dump_filename_mtx;
+
+static uint64_t prof_dump_seq;
+static uint64_t prof_dump_iseq;
+static uint64_t prof_dump_mseq;
+static uint64_t prof_dump_useq;
+
+static char *prof_prefix = NULL;
+
+/* The fallback allocator profiling functionality will use.
*/ +base_t *prof_base; + +void +bt_init(prof_bt_t *bt, void **vec) { + cassert(config_prof); + + bt->vec = vec; + bt->len = 0; +} + +#ifdef JEMALLOC_PROF_LIBUNWIND +static void +prof_backtrace_impl(void **vec, unsigned *len, unsigned max_len) { + int nframes; + + cassert(config_prof); + assert(*len == 0); + assert(vec != NULL); + assert(max_len <= PROF_BT_MAX_LIMIT); + + nframes = unw_backtrace(vec, max_len); + if (nframes <= 0) { + return; + } + *len = nframes; +} +#elif (defined(JEMALLOC_PROF_LIBGCC)) +static _Unwind_Reason_Code +prof_unwind_init_callback(struct _Unwind_Context *context, void *arg) { + cassert(config_prof); + + return _URC_NO_REASON; +} + +static _Unwind_Reason_Code +prof_unwind_callback(struct _Unwind_Context *context, void *arg) { + prof_unwind_data_t *data = (prof_unwind_data_t *)arg; + void *ip; + + cassert(config_prof); + + ip = (void *)_Unwind_GetIP(context); + if (ip == NULL) { + return _URC_END_OF_STACK; + } + data->vec[*data->len] = ip; + (*data->len)++; + if (*data->len == data->max) { + return _URC_END_OF_STACK; + } + + return _URC_NO_REASON; +} + +static void +prof_backtrace_impl(void **vec, unsigned *len, unsigned max_len) { + prof_unwind_data_t data = {vec, len, max_len}; + + cassert(config_prof); + assert(vec != NULL); + assert(max_len <= PROF_BT_MAX_LIMIT); + + _Unwind_Backtrace(prof_unwind_callback, &data); +} +#elif (defined(JEMALLOC_PROF_GCC)) +JEMALLOC_DIAGNOSTIC_PUSH +JEMALLOC_DIAGNOSTIC_IGNORE_FRAME_ADDRESS +static void +prof_backtrace_impl(void **vec, unsigned *len, unsigned max_len) { +/* The input arg must be a constant for __builtin_return_address. */ +#define BT_FRAME(i) \ + if ((i) < max_len) { \ + void *p; \ + if (__builtin_frame_address(i) == 0) { \ + return; \ + } \ + p = __builtin_return_address(i); \ + if (p == NULL) { \ + return; \ + } \ + vec[(i)] = p; \ + *len = (i) + 1; \ + } else { \ + return; \ + } + + cassert(config_prof); + assert(vec != NULL); + assert(max_len <= PROF_BT_MAX_LIMIT); + + BT_FRAME(0) + BT_FRAME(1) + BT_FRAME(2) + BT_FRAME(3) + BT_FRAME(4) + BT_FRAME(5) + BT_FRAME(6) + BT_FRAME(7) + BT_FRAME(8) + BT_FRAME(9) + + BT_FRAME(10) + BT_FRAME(11) + BT_FRAME(12) + BT_FRAME(13) + BT_FRAME(14) + BT_FRAME(15) + BT_FRAME(16) + BT_FRAME(17) + BT_FRAME(18) + BT_FRAME(19) + + BT_FRAME(20) + BT_FRAME(21) + BT_FRAME(22) + BT_FRAME(23) + BT_FRAME(24) + BT_FRAME(25) + BT_FRAME(26) + BT_FRAME(27) + BT_FRAME(28) + BT_FRAME(29) + + BT_FRAME(30) + BT_FRAME(31) + BT_FRAME(32) + BT_FRAME(33) + BT_FRAME(34) + BT_FRAME(35) + BT_FRAME(36) + BT_FRAME(37) + BT_FRAME(38) + BT_FRAME(39) + + BT_FRAME(40) + BT_FRAME(41) + BT_FRAME(42) + BT_FRAME(43) + BT_FRAME(44) + BT_FRAME(45) + BT_FRAME(46) + BT_FRAME(47) + BT_FRAME(48) + BT_FRAME(49) + + BT_FRAME(50) + BT_FRAME(51) + BT_FRAME(52) + BT_FRAME(53) + BT_FRAME(54) + BT_FRAME(55) + BT_FRAME(56) + BT_FRAME(57) + BT_FRAME(58) + BT_FRAME(59) + + BT_FRAME(60) + BT_FRAME(61) + BT_FRAME(62) + BT_FRAME(63) + BT_FRAME(64) + BT_FRAME(65) + BT_FRAME(66) + BT_FRAME(67) + BT_FRAME(68) + BT_FRAME(69) + + BT_FRAME(70) + BT_FRAME(71) + BT_FRAME(72) + BT_FRAME(73) + BT_FRAME(74) + BT_FRAME(75) + BT_FRAME(76) + BT_FRAME(77) + BT_FRAME(78) + BT_FRAME(79) + + BT_FRAME(80) + BT_FRAME(81) + BT_FRAME(82) + BT_FRAME(83) + BT_FRAME(84) + BT_FRAME(85) + BT_FRAME(86) + BT_FRAME(87) + BT_FRAME(88) + BT_FRAME(89) + + BT_FRAME(90) + BT_FRAME(91) + BT_FRAME(92) + BT_FRAME(93) + BT_FRAME(94) + BT_FRAME(95) + BT_FRAME(96) + BT_FRAME(97) + BT_FRAME(98) + BT_FRAME(99) + + BT_FRAME(100) + BT_FRAME(101) + BT_FRAME(102) + BT_FRAME(103) + 
BT_FRAME(104) + BT_FRAME(105) + BT_FRAME(106) + BT_FRAME(107) + BT_FRAME(108) + BT_FRAME(109) + + BT_FRAME(110) + BT_FRAME(111) + BT_FRAME(112) + BT_FRAME(113) + BT_FRAME(114) + BT_FRAME(115) + BT_FRAME(116) + BT_FRAME(117) + BT_FRAME(118) + BT_FRAME(119) + + BT_FRAME(120) + BT_FRAME(121) + BT_FRAME(122) + BT_FRAME(123) + BT_FRAME(124) + BT_FRAME(125) + BT_FRAME(126) + BT_FRAME(127) + BT_FRAME(128) + BT_FRAME(129) + + BT_FRAME(130) + BT_FRAME(131) + BT_FRAME(132) + BT_FRAME(133) + BT_FRAME(134) + BT_FRAME(135) + BT_FRAME(136) + BT_FRAME(137) + BT_FRAME(138) + BT_FRAME(139) + + BT_FRAME(140) + BT_FRAME(141) + BT_FRAME(142) + BT_FRAME(143) + BT_FRAME(144) + BT_FRAME(145) + BT_FRAME(146) + BT_FRAME(147) + BT_FRAME(148) + BT_FRAME(149) + + BT_FRAME(150) + BT_FRAME(151) + BT_FRAME(152) + BT_FRAME(153) + BT_FRAME(154) + BT_FRAME(155) + BT_FRAME(156) + BT_FRAME(157) + BT_FRAME(158) + BT_FRAME(159) + + BT_FRAME(160) + BT_FRAME(161) + BT_FRAME(162) + BT_FRAME(163) + BT_FRAME(164) + BT_FRAME(165) + BT_FRAME(166) + BT_FRAME(167) + BT_FRAME(168) + BT_FRAME(169) + + BT_FRAME(170) + BT_FRAME(171) + BT_FRAME(172) + BT_FRAME(173) + BT_FRAME(174) + BT_FRAME(175) + BT_FRAME(176) + BT_FRAME(177) + BT_FRAME(178) + BT_FRAME(179) + + BT_FRAME(180) + BT_FRAME(181) + BT_FRAME(182) + BT_FRAME(183) + BT_FRAME(184) + BT_FRAME(185) + BT_FRAME(186) + BT_FRAME(187) + BT_FRAME(188) + BT_FRAME(189) + + BT_FRAME(190) + BT_FRAME(191) + BT_FRAME(192) + BT_FRAME(193) + BT_FRAME(194) + BT_FRAME(195) + BT_FRAME(196) + BT_FRAME(197) + BT_FRAME(198) + BT_FRAME(199) + + BT_FRAME(200) + BT_FRAME(201) + BT_FRAME(202) + BT_FRAME(203) + BT_FRAME(204) + BT_FRAME(205) + BT_FRAME(206) + BT_FRAME(207) + BT_FRAME(208) + BT_FRAME(209) + + BT_FRAME(210) + BT_FRAME(211) + BT_FRAME(212) + BT_FRAME(213) + BT_FRAME(214) + BT_FRAME(215) + BT_FRAME(216) + BT_FRAME(217) + BT_FRAME(218) + BT_FRAME(219) + + BT_FRAME(220) + BT_FRAME(221) + BT_FRAME(222) + BT_FRAME(223) + BT_FRAME(224) + BT_FRAME(225) + BT_FRAME(226) + BT_FRAME(227) + BT_FRAME(228) + BT_FRAME(229) + + BT_FRAME(230) + BT_FRAME(231) + BT_FRAME(232) + BT_FRAME(233) + BT_FRAME(234) + BT_FRAME(235) + BT_FRAME(236) + BT_FRAME(237) + BT_FRAME(238) + BT_FRAME(239) + + BT_FRAME(240) + BT_FRAME(241) + BT_FRAME(242) + BT_FRAME(243) + BT_FRAME(244) + BT_FRAME(245) + BT_FRAME(246) + BT_FRAME(247) + BT_FRAME(248) + BT_FRAME(249) + + BT_FRAME(250) + BT_FRAME(251) + BT_FRAME(252) + BT_FRAME(253) + BT_FRAME(254) + BT_FRAME(255) +#undef BT_FRAME +JEMALLOC_DIAGNOSTIC_POP +} +#else +static void +prof_backtrace_impl(void **vec, unsigned *len, unsigned max_len) { + cassert(config_prof); + not_reached(); +} +#endif + +void +prof_backtrace(tsd_t *tsd, prof_bt_t *bt) { + cassert(config_prof); + prof_backtrace_hook_t prof_backtrace_hook = prof_backtrace_hook_get(); + assert(prof_backtrace_hook != NULL); + + pre_reentrancy(tsd, NULL); + prof_backtrace_hook(bt->vec, &bt->len, opt_prof_bt_max); + post_reentrancy(tsd); +} + +void +prof_hooks_init(void) { + prof_backtrace_hook_set(&prof_backtrace_impl); + prof_dump_hook_set(NULL); + prof_sample_hook_set(NULL); + prof_sample_free_hook_set(NULL); +} + +void +prof_unwind_init(void) { +#ifdef JEMALLOC_PROF_LIBGCC + /* + * Cause the backtracing machinery to allocate its internal + * state before enabling profiling. 
+ */ + _Unwind_Backtrace(prof_unwind_init_callback, NULL); +#endif +} + +static int +prof_sys_thread_name_read_impl(char *buf, size_t limit) { +#if defined(JEMALLOC_HAVE_PTHREAD_GETNAME_NP) + return pthread_getname_np(pthread_self(), buf, limit); +#elif defined(JEMALLOC_HAVE_PTHREAD_GET_NAME_NP) + pthread_get_name_np(pthread_self(), buf, limit); + return 0; +#else + return ENOSYS; +#endif +} +prof_sys_thread_name_read_t *JET_MUTABLE prof_sys_thread_name_read = + prof_sys_thread_name_read_impl; + +void +prof_sys_thread_name_fetch(tsd_t *tsd) { + prof_tdata_t *tdata = prof_tdata_get(tsd, true); + if (tdata == NULL) { + return; + } + + if (prof_sys_thread_name_read(tdata->thread_name, + PROF_THREAD_NAME_MAX_LEN) != 0) { + prof_thread_name_clear(tdata); + } + + tdata->thread_name[PROF_THREAD_NAME_MAX_LEN - 1] = '\0'; +} + +int +prof_getpid(void) { +#ifdef _WIN32 + return GetCurrentProcessId(); +#else + return getpid(); +#endif +} + +long +prof_get_pid_namespace() { + long ret = 0; + +#if defined(_WIN32) || defined(__APPLE__) + // Not supported, do nothing. +#else + char buf[PATH_MAX]; + const char* linkname = +# if defined(__FreeBSD__) || defined(__DragonFly__) + "/proc/curproc/ns/pid" +# else + "/proc/self/ns/pid" +# endif + ; + ssize_t linklen = +# ifndef JEMALLOC_READLINKAT + readlink(linkname, buf, PATH_MAX) +# else + readlinkat(AT_FDCWD, linkname, buf, PATH_MAX) +# endif + ; + + // namespace string is expected to be like pid:[4026531836] + if (linklen > 0) { + // Trim the trailing "]" + buf[linklen-1] = '\0'; + char* index = strtok(buf, "pid:["); + ret = atol(index); + } +#endif + + return ret; +} + +/* + * This buffer is rather large for stack allocation, so use a single buffer for + * all profile dumps; protected by prof_dump_mtx. + */ +static char prof_dump_buf[PROF_DUMP_BUFSIZE]; + +typedef struct prof_dump_arg_s prof_dump_arg_t; +struct prof_dump_arg_s { + /* + * Whether error should be handled locally: if true, then we print out + * error message as well as abort (if opt_abort is true) when an error + * occurred, and we also report the error back to the caller in the end; + * if false, then we only report the error back to the caller in the + * end. + */ + const bool handle_error_locally; + /* + * Whether there has been an error in the dumping process, which could + * have happened either in file opening or in file writing. When an + * error has already occurred, we will stop further writing to the file. + */ + bool error; + /* File descriptor of the dump file. */ + int prof_dump_fd; +}; + +static void +prof_dump_check_possible_error(prof_dump_arg_t *arg, bool err_cond, + const char *format, ...) 
{
+	assert(!arg->error);
+	if (!err_cond) {
+		return;
+	}
+
+	arg->error = true;
+	if (!arg->handle_error_locally) {
+		return;
+	}
+
+	va_list ap;
+	char buf[PROF_PRINTF_BUFSIZE];
+	va_start(ap, format);
+	malloc_vsnprintf(buf, sizeof(buf), format, ap);
+	va_end(ap);
+	malloc_write(buf);
+
+	if (opt_abort) {
+		abort();
+	}
+}
+
+static int
+prof_dump_open_file_impl(const char *filename, int mode) {
+	return creat(filename, mode);
+}
+prof_dump_open_file_t *JET_MUTABLE prof_dump_open_file =
+    prof_dump_open_file_impl;
+
+static void
+prof_dump_open(prof_dump_arg_t *arg, const char *filename) {
+	arg->prof_dump_fd = prof_dump_open_file(filename, 0644);
+	prof_dump_check_possible_error(arg, arg->prof_dump_fd == -1,
+	    "<jemalloc>: failed to open \"%s\"\n", filename);
+}
+
+prof_dump_write_file_t *JET_MUTABLE prof_dump_write_file = malloc_write_fd;
+
+static void
+prof_dump_flush(void *opaque, const char *s) {
+	cassert(config_prof);
+	prof_dump_arg_t *arg = (prof_dump_arg_t *)opaque;
+	if (!arg->error) {
+		ssize_t err = prof_dump_write_file(arg->prof_dump_fd, s,
+		    strlen(s));
+		prof_dump_check_possible_error(arg, err == -1,
+		    "<jemalloc>: failed to write during heap profile flush\n");
+	}
+}
+
+static void
+prof_dump_close(prof_dump_arg_t *arg) {
+	if (arg->prof_dump_fd != -1) {
+		close(arg->prof_dump_fd);
+	}
+}
+
+#ifdef __APPLE__
+#include <mach-o/dyld.h>
+
+#ifdef __LP64__
+typedef struct mach_header_64 mach_header_t;
+typedef struct segment_command_64 segment_command_t;
+#define MH_MAGIC_VALUE MH_MAGIC_64
+#define MH_CIGAM_VALUE MH_CIGAM_64
+#define LC_SEGMENT_VALUE LC_SEGMENT_64
+#else
+typedef struct mach_header mach_header_t;
+typedef struct segment_command segment_command_t;
+#define MH_MAGIC_VALUE MH_MAGIC
+#define MH_CIGAM_VALUE MH_CIGAM
+#define LC_SEGMENT_VALUE LC_SEGMENT
+#endif
+
+static void
+prof_dump_dyld_image_vmaddr(buf_writer_t *buf_writer, uint32_t image_index) {
+	const mach_header_t *header = (const mach_header_t *)
+	    _dyld_get_image_header(image_index);
+	if (header == NULL || (header->magic != MH_MAGIC_VALUE &&
+	    header->magic != MH_CIGAM_VALUE)) {
+		// Invalid header
+		return;
+	}
+
+	intptr_t slide = _dyld_get_image_vmaddr_slide(image_index);
+	const char *name = _dyld_get_image_name(image_index);
+	struct load_command *load_cmd = (struct load_command *)
+	    ((char *)header + sizeof(mach_header_t));
+	for (uint32_t i = 0; load_cmd && (i < header->ncmds); i++) {
+		if (load_cmd->cmd == LC_SEGMENT_VALUE) {
+			const segment_command_t *segment_cmd =
+			    (const segment_command_t *)load_cmd;
+			if (!strcmp(segment_cmd->segname, "__TEXT")) {
+				char buffer[PATH_MAX + 1];
+				malloc_snprintf(buffer, sizeof(buffer),
+				    "%016llx-%016llx: %s\n", segment_cmd->vmaddr + slide,
+				    segment_cmd->vmaddr + slide + segment_cmd->vmsize, name);
+				buf_writer_cb(buf_writer, buffer);
+				return;
+			}
+		}
+		load_cmd =
+		    (struct load_command *)((char *)load_cmd + load_cmd->cmdsize);
+	}
+}
+
+static void
+prof_dump_dyld_maps(buf_writer_t *buf_writer) {
+	uint32_t image_count = _dyld_image_count();
+	for (uint32_t i = 0; i < image_count; i++) {
+		prof_dump_dyld_image_vmaddr(buf_writer, i);
+	}
+}
+
+prof_dump_open_maps_t *JET_MUTABLE prof_dump_open_maps = NULL;
+
+static void
+prof_dump_maps(buf_writer_t *buf_writer) {
+	buf_writer_cb(buf_writer, "\nMAPPED_LIBRARIES:\n");
+	/* No proc map file to read on MacOS, dump dyld maps for backtrace. */
+	prof_dump_dyld_maps(buf_writer);
+}
+#else /* !__APPLE__ */
+#ifndef _WIN32
+JEMALLOC_FORMAT_PRINTF(1, 2)
+static int
+prof_open_maps_internal(const char *format, ...)
{ + int mfd; + va_list ap; + char filename[PATH_MAX + 1]; + + va_start(ap, format); + malloc_vsnprintf(filename, sizeof(filename), format, ap); + va_end(ap); + +#if defined(O_CLOEXEC) + mfd = open(filename, O_RDONLY | O_CLOEXEC); +#else + mfd = open(filename, O_RDONLY); + if (mfd != -1) { + fcntl(mfd, F_SETFD, fcntl(mfd, F_GETFD) | FD_CLOEXEC); + } +#endif + + return mfd; +} +#endif + +static int +prof_dump_open_maps_impl(void) { + int mfd; + + cassert(config_prof); +#if defined(__FreeBSD__) || defined(__DragonFly__) + mfd = prof_open_maps_internal("/proc/curproc/map"); +#elif defined(_WIN32) + mfd = -1; // Not implemented +#else + int pid = prof_getpid(); + + mfd = prof_open_maps_internal("/proc/%d/task/%d/maps", pid, pid); + if (mfd == -1) { + mfd = prof_open_maps_internal("/proc/%d/maps", pid); + } +#endif + return mfd; +} +prof_dump_open_maps_t *JET_MUTABLE prof_dump_open_maps = + prof_dump_open_maps_impl; + +static ssize_t +prof_dump_read_maps_cb(void *read_cbopaque, void *buf, size_t limit) { + int mfd = *(int *)read_cbopaque; + assert(mfd != -1); + return malloc_read_fd(mfd, buf, limit); +} + +static void +prof_dump_maps(buf_writer_t *buf_writer) { + int mfd = prof_dump_open_maps(); + if (mfd == -1) { + return; + } + + buf_writer_cb(buf_writer, "\nMAPPED_LIBRARIES:\n"); + buf_writer_pipe(buf_writer, prof_dump_read_maps_cb, &mfd); + close(mfd); +} +#endif /* __APPLE__ */ + +static bool +prof_dump(tsd_t *tsd, bool propagate_err, const char *filename, + bool leakcheck) { + cassert(config_prof); + assert(tsd_reentrancy_level_get(tsd) == 0); + + prof_tdata_t * tdata = prof_tdata_get(tsd, true); + if (tdata == NULL) { + return true; + } + + prof_dump_arg_t arg = {/* handle_error_locally */ !propagate_err, + /* error */ false, /* prof_dump_fd */ -1}; + + pre_reentrancy(tsd, NULL); + malloc_mutex_lock(tsd_tsdn(tsd), &prof_dump_mtx); + + prof_dump_open(&arg, filename); + buf_writer_t buf_writer; + bool err = buf_writer_init(tsd_tsdn(tsd), &buf_writer, prof_dump_flush, + &arg, prof_dump_buf, PROF_DUMP_BUFSIZE); + assert(!err); + prof_dump_impl(tsd, buf_writer_cb, &buf_writer, tdata, leakcheck); + prof_dump_maps(&buf_writer); + buf_writer_terminate(tsd_tsdn(tsd), &buf_writer); + prof_dump_close(&arg); + + prof_dump_hook_t dump_hook = prof_dump_hook_get(); + if (dump_hook != NULL) { + dump_hook(filename); + } + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_dump_mtx); + post_reentrancy(tsd); + + return arg.error; +} + +/* + * If profiling is off, then PROF_DUMP_FILENAME_LEN is 1, so we'll end up + * calling strncpy with a size of 0, which triggers a -Wstringop-truncation + * warning (strncpy can never actually be called in this case, since we bail out + * much earlier when config_prof is false). This function works around the + * warning to let us leave the warning on. + */ +static inline void +prof_strncpy(char *UNUSED dest, const char *UNUSED src, size_t UNUSED size) { + cassert(config_prof); +#ifdef JEMALLOC_PROF + strncpy(dest, src, size); +#endif +} + +static const char * +prof_prefix_get(tsdn_t* tsdn) { + malloc_mutex_assert_owner(tsdn, &prof_dump_filename_mtx); + + return prof_prefix == NULL ? 
opt_prof_prefix : prof_prefix; +} + +static bool +prof_prefix_is_empty(tsdn_t *tsdn) { + malloc_mutex_lock(tsdn, &prof_dump_filename_mtx); + bool ret = (prof_prefix_get(tsdn)[0] == '\0'); + malloc_mutex_unlock(tsdn, &prof_dump_filename_mtx); + return ret; +} + +#define DUMP_FILENAME_BUFSIZE (PATH_MAX + 1) +#define VSEQ_INVALID UINT64_C(0xffffffffffffffff) +static void +prof_dump_filename(tsd_t *tsd, char *filename, char v, uint64_t vseq) { + cassert(config_prof); + + assert(tsd_reentrancy_level_get(tsd) == 0); + const char *prefix = prof_prefix_get(tsd_tsdn(tsd)); + + if (vseq != VSEQ_INVALID) { + if (opt_prof_pid_namespace) { + /* "<prefix>.<pidns>.<pid>.<seq>.v<vseq>.heap" */ + malloc_snprintf(filename, DUMP_FILENAME_BUFSIZE, + "%s.%ld.%d.%"FMTu64".%c%"FMTu64".heap", prefix, + prof_get_pid_namespace(), prof_getpid(), prof_dump_seq, v, + vseq); + } else { + /* "<prefix>.<pid>.<seq>.v<vseq>.heap" */ + malloc_snprintf(filename, DUMP_FILENAME_BUFSIZE, + "%s.%d.%"FMTu64".%c%"FMTu64".heap", prefix, prof_getpid(), + prof_dump_seq, v, vseq); + } + } else { + if (opt_prof_pid_namespace) { + /* "<prefix>.<pidns>.<pid>.<seq>.<v>.heap" */ + malloc_snprintf(filename, DUMP_FILENAME_BUFSIZE, + "%s.%ld.%d.%"FMTu64".%c.heap", prefix, + prof_get_pid_namespace(), prof_getpid(), prof_dump_seq, v); + } else { + /* "<prefix>.<pid>.<seq>.<v>.heap" */ + malloc_snprintf(filename, DUMP_FILENAME_BUFSIZE, + "%s.%d.%"FMTu64".%c.heap", prefix, prof_getpid(), + prof_dump_seq, v); + } + } + prof_dump_seq++; +} + +void +prof_get_default_filename(tsdn_t *tsdn, char *filename, uint64_t ind) { + malloc_mutex_lock(tsdn, &prof_dump_filename_mtx); + if (opt_prof_pid_namespace) { + malloc_snprintf(filename, PROF_DUMP_FILENAME_LEN, + "%s.%ld.%d.%"FMTu64".json", prof_prefix_get(tsdn), + prof_get_pid_namespace(), prof_getpid(), ind); + } else { + malloc_snprintf(filename, PROF_DUMP_FILENAME_LEN, + "%s.%d.%"FMTu64".json", prof_prefix_get(tsdn), prof_getpid(), ind); + } + malloc_mutex_unlock(tsdn, &prof_dump_filename_mtx); +} + +void +prof_fdump_impl(tsd_t *tsd) { + char filename[DUMP_FILENAME_BUFSIZE]; + + assert(!prof_prefix_is_empty(tsd_tsdn(tsd))); + malloc_mutex_lock(tsd_tsdn(tsd), &prof_dump_filename_mtx); + prof_dump_filename(tsd, filename, 'f', VSEQ_INVALID); + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_dump_filename_mtx); + prof_dump(tsd, false, filename, opt_prof_leak); +} + +bool +prof_prefix_set(tsdn_t *tsdn, const char *prefix) { + cassert(config_prof); + ctl_mtx_assert_held(tsdn); + if (prefix == NULL) { + return true; + } + malloc_mutex_lock(tsdn, &prof_dump_filename_mtx); + if (prof_prefix == NULL) { + malloc_mutex_unlock(tsdn, &prof_dump_filename_mtx); + /* Everything is still guarded by ctl_mtx.
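+ * Dropping prof_dump_filename_mtx around the allocation is therefore safe: + * a concurrent prof_prefix_set serializes on ctl_mtx (asserted above).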
*/ + char *buffer = base_alloc(tsdn, prof_base, + PROF_DUMP_FILENAME_LEN, QUANTUM); + if (buffer == NULL) { + return true; + } + malloc_mutex_lock(tsdn, &prof_dump_filename_mtx); + prof_prefix = buffer; + } + assert(prof_prefix != NULL); + + prof_strncpy(prof_prefix, prefix, PROF_DUMP_FILENAME_LEN - 1); + prof_prefix[PROF_DUMP_FILENAME_LEN - 1] = '\0'; + malloc_mutex_unlock(tsdn, &prof_dump_filename_mtx); + + return false; +} + +void +prof_idump_impl(tsd_t *tsd) { + malloc_mutex_lock(tsd_tsdn(tsd), &prof_dump_filename_mtx); + if (prof_prefix_get(tsd_tsdn(tsd))[0] == '\0') { + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_dump_filename_mtx); + return; + } + char filename[PATH_MAX + 1]; + prof_dump_filename(tsd, filename, 'i', prof_dump_iseq); + prof_dump_iseq++; + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_dump_filename_mtx); + prof_dump(tsd, false, filename, false); +} + +bool +prof_mdump_impl(tsd_t *tsd, const char *filename) { + char filename_buf[DUMP_FILENAME_BUFSIZE]; + if (filename == NULL) { + /* No filename specified, so automatically generate one. */ + malloc_mutex_lock(tsd_tsdn(tsd), &prof_dump_filename_mtx); + if (prof_prefix_get(tsd_tsdn(tsd))[0] == '\0') { + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_dump_filename_mtx); + return true; + } + prof_dump_filename(tsd, filename_buf, 'm', prof_dump_mseq); + prof_dump_mseq++; + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_dump_filename_mtx); + filename = filename_buf; + } + return prof_dump(tsd, true, filename, false); +} + +void +prof_gdump_impl(tsd_t *tsd) { + tsdn_t *tsdn = tsd_tsdn(tsd); + malloc_mutex_lock(tsdn, &prof_dump_filename_mtx); + if (prof_prefix_get(tsdn)[0] == '\0') { + malloc_mutex_unlock(tsdn, &prof_dump_filename_mtx); + return; + } + char filename[DUMP_FILENAME_BUFSIZE]; + prof_dump_filename(tsd, filename, 'u', prof_dump_useq); + prof_dump_useq++; + malloc_mutex_unlock(tsdn, &prof_dump_filename_mtx); + prof_dump(tsd, false, filename, false); +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/psset.c b/src/duckdb/extension/jemalloc/jemalloc/src/psset.c new file mode 100644 index 000000000..559668165 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/psset.c @@ -0,0 +1,376 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/psset.h" + +#include "jemalloc/internal/fb.h" + +void +psset_init(psset_t *psset) { + for (unsigned i = 0; i < PSSET_NPSIZES; i++) { + hpdata_age_heap_new(&psset->pageslabs[i]); + } + fb_init(psset->pageslab_bitmap, PSSET_NPSIZES); + memset(&psset->merged_stats, 0, sizeof(psset->merged_stats)); + memset(&psset->stats, 0, sizeof(psset->stats)); + hpdata_empty_list_init(&psset->empty); + for (int i = 0; i < PSSET_NPURGE_LISTS; i++) { + hpdata_purge_list_init(&psset->to_purge[i]); + } + fb_init(psset->purge_bitmap, PSSET_NPURGE_LISTS); + hpdata_hugify_list_init(&psset->to_hugify); +} + +static void +psset_bin_stats_accum(psset_bin_stats_t *dst, psset_bin_stats_t *src) { + dst->npageslabs += src->npageslabs; + dst->nactive += src->nactive; + dst->ndirty += src->ndirty; +} + +void +psset_stats_accum(psset_stats_t *dst, psset_stats_t *src) { + psset_bin_stats_accum(&dst->full_slabs[0], &src->full_slabs[0]); + psset_bin_stats_accum(&dst->full_slabs[1], &src->full_slabs[1]); + psset_bin_stats_accum(&dst->empty_slabs[0], &src->empty_slabs[0]); + psset_bin_stats_accum(&dst->empty_slabs[1], &src->empty_slabs[1]); + for (pszind_t i = 0; i < PSSET_NPSIZES; i++) { + psset_bin_stats_accum(&dst->nonfull_slabs[i][0], + 
&src->nonfull_slabs[i][0]); + psset_bin_stats_accum(&dst->nonfull_slabs[i][1], + &src->nonfull_slabs[i][1]); + } +} + +/* + * The stats maintenance strategy is to remove a pageslab's contribution to the + * stats when we call psset_update_begin, and re-add it (to a potentially new + * bin) when we call psset_update_end. + */ +JEMALLOC_ALWAYS_INLINE void +psset_bin_stats_insert_remove(psset_t *psset, psset_bin_stats_t *binstats, + hpdata_t *ps, bool insert) { + size_t mul = insert ? (size_t)1 : (size_t)-1; + size_t huge_idx = (size_t)hpdata_huge_get(ps); + + binstats[huge_idx].npageslabs += mul * 1; + binstats[huge_idx].nactive += mul * hpdata_nactive_get(ps); + binstats[huge_idx].ndirty += mul * hpdata_ndirty_get(ps); + + psset->merged_stats.npageslabs += mul * 1; + psset->merged_stats.nactive += mul * hpdata_nactive_get(ps); + psset->merged_stats.ndirty += mul * hpdata_ndirty_get(ps); + + if (config_debug) { + psset_bin_stats_t check_stats = {0}; + for (size_t huge = 0; huge <= 1; huge++) { + psset_bin_stats_accum(&check_stats, + &psset->stats.full_slabs[huge]); + psset_bin_stats_accum(&check_stats, + &psset->stats.empty_slabs[huge]); + for (pszind_t pind = 0; pind < PSSET_NPSIZES; pind++) { + psset_bin_stats_accum(&check_stats, + &psset->stats.nonfull_slabs[pind][huge]); + } + } + assert(psset->merged_stats.npageslabs + == check_stats.npageslabs); + assert(psset->merged_stats.nactive == check_stats.nactive); + assert(psset->merged_stats.ndirty == check_stats.ndirty); + } +} + +static void +psset_bin_stats_insert(psset_t *psset, psset_bin_stats_t *binstats, + hpdata_t *ps) { + psset_bin_stats_insert_remove(psset, binstats, ps, true); +} + +static void +psset_bin_stats_remove(psset_t *psset, psset_bin_stats_t *binstats, + hpdata_t *ps) { + psset_bin_stats_insert_remove(psset, binstats, ps, false); +} + +static pszind_t +psset_hpdata_heap_index(const hpdata_t *ps) { + assert(!hpdata_full(ps)); + assert(!hpdata_empty(ps)); + size_t longest_free_range = hpdata_longest_free_range_get(ps); + pszind_t pind = sz_psz2ind(sz_psz_quantize_floor( + longest_free_range << LG_PAGE)); + assert(pind < PSSET_NPSIZES); + return pind; +} + +static void +psset_hpdata_heap_remove(psset_t *psset, hpdata_t *ps) { + pszind_t pind = psset_hpdata_heap_index(ps); + hpdata_age_heap_remove(&psset->pageslabs[pind], ps); + if (hpdata_age_heap_empty(&psset->pageslabs[pind])) { + fb_unset(psset->pageslab_bitmap, PSSET_NPSIZES, (size_t)pind); + } +} + +static void +psset_hpdata_heap_insert(psset_t *psset, hpdata_t *ps) { + pszind_t pind = psset_hpdata_heap_index(ps); + if (hpdata_age_heap_empty(&psset->pageslabs[pind])) { + fb_set(psset->pageslab_bitmap, PSSET_NPSIZES, (size_t)pind); + } + hpdata_age_heap_insert(&psset->pageslabs[pind], ps); +} + +static void +psset_stats_insert(psset_t* psset, hpdata_t *ps) { + if (hpdata_empty(ps)) { + psset_bin_stats_insert(psset, psset->stats.empty_slabs, ps); + } else if (hpdata_full(ps)) { + psset_bin_stats_insert(psset, psset->stats.full_slabs, ps); + } else { + pszind_t pind = psset_hpdata_heap_index(ps); + psset_bin_stats_insert(psset, psset->stats.nonfull_slabs[pind], + ps); + } +} + +static void +psset_stats_remove(psset_t *psset, hpdata_t *ps) { + if (hpdata_empty(ps)) { + psset_bin_stats_remove(psset, psset->stats.empty_slabs, ps); + } else if (hpdata_full(ps)) { + psset_bin_stats_remove(psset, psset->stats.full_slabs, ps); + } else { + pszind_t pind = psset_hpdata_heap_index(ps); + psset_bin_stats_remove(psset, psset->stats.nonfull_slabs[pind], + ps); + } +} + +/* + * Put ps 
into some container so that it can be found during future allocation + * requests. + */ +static void +psset_alloc_container_insert(psset_t *psset, hpdata_t *ps) { + assert(!hpdata_in_psset_alloc_container_get(ps)); + hpdata_in_psset_alloc_container_set(ps, true); + if (hpdata_empty(ps)) { + /* + * This prepend, paired with popping the head in psset_fit, + * means we implement LIFO ordering for the empty slabs set, + * which seems reasonable. + */ + hpdata_empty_list_prepend(&psset->empty, ps); + } else if (hpdata_full(ps)) { + /* + * We don't need to keep track of the full slabs; we're never + * going to return them from a psset_pick_alloc call. + */ + } else { + psset_hpdata_heap_insert(psset, ps); + } +} + +/* Remove ps from those collections. */ +static void +psset_alloc_container_remove(psset_t *psset, hpdata_t *ps) { + assert(hpdata_in_psset_alloc_container_get(ps)); + hpdata_in_psset_alloc_container_set(ps, false); + + if (hpdata_empty(ps)) { + hpdata_empty_list_remove(&psset->empty, ps); + } else if (hpdata_full(ps)) { + /* Same as above -- do nothing in this case. */ + } else { + psset_hpdata_heap_remove(psset, ps); + } +} + +static size_t +psset_purge_list_ind(hpdata_t *ps) { + size_t ndirty = hpdata_ndirty_get(ps); + /* Shouldn't have something with no dirty pages purgeable. */ + assert(ndirty > 0); + /* + * Higher indices correspond to lists we'd like to purge earlier; make + * the two highest indices correspond to empty lists, which we attempt + * to purge before purging any non-empty list. This has two advantages: + * - Empty page slabs are the least likely to get reused (we'll only + * pick them for an allocation if we have no other choice). + * - Empty page slabs can purge every dirty page they contain in a + * single call, which is not usually the case. + * + * We purge hugeified empty slabs before nonhugeified ones, on the basis + * that they are fully dirty, while nonhugified slabs might not be, so + * we free up more pages more easily. + */ + if (hpdata_nactive_get(ps) == 0) { + if (hpdata_huge_get(ps)) { + return PSSET_NPURGE_LISTS - 1; + } else { + return PSSET_NPURGE_LISTS - 2; + } + } + + pszind_t pind = sz_psz2ind(sz_psz_quantize_floor(ndirty << LG_PAGE)); + /* + * For non-empty slabs, we may reuse them again. Prefer purging + * non-hugeified slabs before hugeified ones then, among pages of + * similar dirtiness. We still get some benefit from the hugification. + */ + return (size_t)pind * 2 + (hpdata_huge_get(ps) ? 0 : 1); +} + +static void +psset_maybe_remove_purge_list(psset_t *psset, hpdata_t *ps) { + /* + * Remove the hpdata from its purge list (if it's in one). Even if it's + * going to stay in the same one, by appending it during + * psset_update_end, we move it to the end of its queue, so that we + * purge LRU within a given dirtiness bucket. 
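+ * For example, if purging changes ndirty so that psset_purge_list_ind maps + * the slab to a different bucket, the remove here plus the append in + * psset_maybe_insert_purge_list lands it at the tail of the new bucket.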
+ */ + if (hpdata_purge_allowed_get(ps)) { + size_t ind = psset_purge_list_ind(ps); + hpdata_purge_list_t *purge_list = &psset->to_purge[ind]; + hpdata_purge_list_remove(purge_list, ps); + if (hpdata_purge_list_empty(purge_list)) { + fb_unset(psset->purge_bitmap, PSSET_NPURGE_LISTS, ind); + } + } +} + +static void +psset_maybe_insert_purge_list(psset_t *psset, hpdata_t *ps) { + if (hpdata_purge_allowed_get(ps)) { + size_t ind = psset_purge_list_ind(ps); + hpdata_purge_list_t *purge_list = &psset->to_purge[ind]; + if (hpdata_purge_list_empty(purge_list)) { + fb_set(psset->purge_bitmap, PSSET_NPURGE_LISTS, ind); + } + hpdata_purge_list_append(purge_list, ps); + } + +} + +void +psset_update_begin(psset_t *psset, hpdata_t *ps) { + hpdata_assert_consistent(ps); + assert(hpdata_in_psset_get(ps)); + hpdata_updating_set(ps, true); + psset_stats_remove(psset, ps); + if (hpdata_in_psset_alloc_container_get(ps)) { + /* + * Some metadata updates can break alloc container invariants + * (e.g. the longest free range determines the hpdata_heap_t the + * pageslab lives in). + */ + assert(hpdata_alloc_allowed_get(ps)); + psset_alloc_container_remove(psset, ps); + } + psset_maybe_remove_purge_list(psset, ps); + /* + * We don't update presence in the hugify list; we try to keep it FIFO, + * even in the presence of other metadata updates. We'll update + * presence at the end of the metadata update if necessary. + */ +} + +void +psset_update_end(psset_t *psset, hpdata_t *ps) { + assert(hpdata_in_psset_get(ps)); + hpdata_updating_set(ps, false); + psset_stats_insert(psset, ps); + + /* + * The update begin should have removed ps from whatever alloc container + * it was in. + */ + assert(!hpdata_in_psset_alloc_container_get(ps)); + if (hpdata_alloc_allowed_get(ps)) { + psset_alloc_container_insert(psset, ps); + } + psset_maybe_insert_purge_list(psset, ps); + + if (hpdata_hugify_allowed_get(ps) + && !hpdata_in_psset_hugify_container_get(ps)) { + hpdata_in_psset_hugify_container_set(ps, true); + hpdata_hugify_list_append(&psset->to_hugify, ps); + } else if (!hpdata_hugify_allowed_get(ps) + && hpdata_in_psset_hugify_container_get(ps)) { + hpdata_in_psset_hugify_container_set(ps, false); + hpdata_hugify_list_remove(&psset->to_hugify, ps); + } + hpdata_assert_consistent(ps); +} + +hpdata_t * +psset_pick_alloc(psset_t *psset, size_t size) { + assert((size & PAGE_MASK) == 0); + assert(size <= HUGEPAGE); + + pszind_t min_pind = sz_psz2ind(sz_psz_quantize_ceil(size)); + pszind_t pind = (pszind_t)fb_ffs(psset->pageslab_bitmap, PSSET_NPSIZES, + (size_t)min_pind); + if (pind == PSSET_NPSIZES) { + return hpdata_empty_list_first(&psset->empty); + } + hpdata_t *ps = hpdata_age_heap_first(&psset->pageslabs[pind]); + if (ps == NULL) { + return NULL; + } + + hpdata_assert_consistent(ps); + + return ps; +} + +hpdata_t * +psset_pick_purge(psset_t *psset) { + ssize_t ind_ssz = fb_fls(psset->purge_bitmap, PSSET_NPURGE_LISTS, + PSSET_NPURGE_LISTS - 1); + if (ind_ssz < 0) { + return NULL; + } + pszind_t ind = (pszind_t)ind_ssz; + assert(ind < PSSET_NPURGE_LISTS); + hpdata_t *ps = hpdata_purge_list_first(&psset->to_purge[ind]); + assert(ps != NULL); + return ps; +} + +hpdata_t * +psset_pick_hugify(psset_t *psset) { + return hpdata_hugify_list_first(&psset->to_hugify); +} + +void +psset_insert(psset_t *psset, hpdata_t *ps) { + hpdata_in_psset_set(ps, true); + + psset_stats_insert(psset, ps); + if (hpdata_alloc_allowed_get(ps)) { + psset_alloc_container_insert(psset, ps); + } + psset_maybe_insert_purge_list(psset, ps); + + if 
(hpdata_hugify_allowed_get(ps)) { + hpdata_in_psset_hugify_container_set(ps, true); + hpdata_hugify_list_append(&psset->to_hugify, ps); + } +} + +void +psset_remove(psset_t *psset, hpdata_t *ps) { + hpdata_in_psset_set(ps, false); + + psset_stats_remove(psset, ps); + if (hpdata_in_psset_alloc_container_get(ps)) { + psset_alloc_container_remove(psset, ps); + } + psset_maybe_remove_purge_list(psset, ps); + if (hpdata_in_psset_hugify_container_get(ps)) { + hpdata_in_psset_hugify_container_set(ps, false); + hpdata_hugify_list_remove(&psset->to_hugify, ps); + } +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/rtree.c b/src/duckdb/extension/jemalloc/jemalloc/src/rtree.c new file mode 100644 index 000000000..b6ac04b75 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/rtree.c @@ -0,0 +1,261 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/mutex.h" + +/* + * Only the most significant bits of keys passed to rtree_{read,write}() are + * used. + */ +bool +rtree_new(rtree_t *rtree, base_t *base, bool zeroed) { +#ifdef JEMALLOC_JET + if (!zeroed) { + memset(rtree, 0, sizeof(rtree_t)); /* Clear root. */ + } +#else + assert(zeroed); +#endif + rtree->base = base; + + if (malloc_mutex_init(&rtree->init_lock, "rtree", WITNESS_RANK_RTREE, + malloc_mutex_rank_exclusive)) { + return true; + } + + return false; +} + +static rtree_node_elm_t * +rtree_node_alloc(tsdn_t *tsdn, rtree_t *rtree, size_t nelms) { + return (rtree_node_elm_t *)base_alloc_rtree(tsdn, rtree->base, + nelms * sizeof(rtree_node_elm_t)); +} + +static rtree_leaf_elm_t * +rtree_leaf_alloc(tsdn_t *tsdn, rtree_t *rtree, size_t nelms) { + return (rtree_leaf_elm_t *)base_alloc_rtree(tsdn, rtree->base, + nelms * sizeof(rtree_leaf_elm_t)); +} + +static rtree_node_elm_t * +rtree_node_init(tsdn_t *tsdn, rtree_t *rtree, unsigned level, + atomic_p_t *elmp) { + malloc_mutex_lock(tsdn, &rtree->init_lock); + /* + * If *elmp is non-null, then it was initialized with the init lock + * held, so we can get by with 'relaxed' here. + */ + rtree_node_elm_t *node = atomic_load_p(elmp, ATOMIC_RELAXED); + if (node == NULL) { + node = rtree_node_alloc(tsdn, rtree, ZU(1) << + rtree_levels[level].bits); + if (node == NULL) { + malloc_mutex_unlock(tsdn, &rtree->init_lock); + return NULL; + } + /* + * Even though we hold the lock, a later reader might not; we + * need release semantics. + */ + atomic_store_p(elmp, node, ATOMIC_RELEASE); + } + malloc_mutex_unlock(tsdn, &rtree->init_lock); + + return node; +} + +static rtree_leaf_elm_t * +rtree_leaf_init(tsdn_t *tsdn, rtree_t *rtree, atomic_p_t *elmp) { + malloc_mutex_lock(tsdn, &rtree->init_lock); + /* + * If *elmp is non-null, then it was initialized with the init lock + * held, so we can get by with 'relaxed' here. + */ + rtree_leaf_elm_t *leaf = atomic_load_p(elmp, ATOMIC_RELAXED); + if (leaf == NULL) { + leaf = rtree_leaf_alloc(tsdn, rtree, ZU(1) << + rtree_levels[RTREE_HEIGHT-1].bits); + if (leaf == NULL) { + malloc_mutex_unlock(tsdn, &rtree->init_lock); + return NULL; + } + /* + * Even though we hold the lock, a later reader might not; we + * need release semantics. 
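+ * (The matching acquire is the non-dependent load in + * rtree_child_leaf_tryread.)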
+ */ + atomic_store_p(elmp, leaf, ATOMIC_RELEASE); + } + malloc_mutex_unlock(tsdn, &rtree->init_lock); + + return leaf; +} + +static bool +rtree_node_valid(rtree_node_elm_t *node) { + return ((uintptr_t)node != (uintptr_t)0); +} + +static bool +rtree_leaf_valid(rtree_leaf_elm_t *leaf) { + return ((uintptr_t)leaf != (uintptr_t)0); +} + +static rtree_node_elm_t * +rtree_child_node_tryread(rtree_node_elm_t *elm, bool dependent) { + rtree_node_elm_t *node; + + if (dependent) { + node = (rtree_node_elm_t *)atomic_load_p(&elm->child, + ATOMIC_RELAXED); + } else { + node = (rtree_node_elm_t *)atomic_load_p(&elm->child, + ATOMIC_ACQUIRE); + } + + assert(!dependent || node != NULL); + return node; +} + +static rtree_node_elm_t * +rtree_child_node_read(tsdn_t *tsdn, rtree_t *rtree, rtree_node_elm_t *elm, + unsigned level, bool dependent) { + rtree_node_elm_t *node; + + node = rtree_child_node_tryread(elm, dependent); + if (!dependent && unlikely(!rtree_node_valid(node))) { + node = rtree_node_init(tsdn, rtree, level + 1, &elm->child); + } + assert(!dependent || node != NULL); + return node; +} + +static rtree_leaf_elm_t * +rtree_child_leaf_tryread(rtree_node_elm_t *elm, bool dependent) { + rtree_leaf_elm_t *leaf; + + if (dependent) { + leaf = (rtree_leaf_elm_t *)atomic_load_p(&elm->child, + ATOMIC_RELAXED); + } else { + leaf = (rtree_leaf_elm_t *)atomic_load_p(&elm->child, + ATOMIC_ACQUIRE); + } + + assert(!dependent || leaf != NULL); + return leaf; +} + +static rtree_leaf_elm_t * +rtree_child_leaf_read(tsdn_t *tsdn, rtree_t *rtree, rtree_node_elm_t *elm, + unsigned level, bool dependent) { + rtree_leaf_elm_t *leaf; + + leaf = rtree_child_leaf_tryread(elm, dependent); + if (!dependent && unlikely(!rtree_leaf_valid(leaf))) { + leaf = rtree_leaf_init(tsdn, rtree, &elm->child); + } + assert(!dependent || leaf != NULL); + return leaf; +} + +rtree_leaf_elm_t * +rtree_leaf_elm_lookup_hard(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, + uintptr_t key, bool dependent, bool init_missing) { + rtree_node_elm_t *node; + rtree_leaf_elm_t *leaf; +#if RTREE_HEIGHT > 1 + node = rtree->root; +#else + leaf = rtree->root; +#endif + + if (config_debug) { + uintptr_t leafkey = rtree_leafkey(key); + for (unsigned i = 0; i < RTREE_CTX_NCACHE; i++) { + assert(rtree_ctx->cache[i].leafkey != leafkey); + } + for (unsigned i = 0; i < RTREE_CTX_NCACHE_L2; i++) { + assert(rtree_ctx->l2_cache[i].leafkey != leafkey); + } + } + +#define RTREE_GET_CHILD(level) { \ + assert(level < RTREE_HEIGHT-1); \ + if (level != 0 && !dependent && \ + unlikely(!rtree_node_valid(node))) { \ + return NULL; \ + } \ + uintptr_t subkey = rtree_subkey(key, level); \ + if (level + 2 < RTREE_HEIGHT) { \ + node = init_missing ? \ + rtree_child_node_read(tsdn, rtree, \ + &node[subkey], level, dependent) : \ + rtree_child_node_tryread(&node[subkey], \ + dependent); \ + } else { \ + leaf = init_missing ? \ + rtree_child_leaf_read(tsdn, rtree, \ + &node[subkey], level, dependent) : \ + rtree_child_leaf_tryread(&node[subkey], \ + dependent); \ + } \ + } + /* + * Cache replacement upon hard lookup (i.e. L1 & L2 rtree cache miss): + * (1) evict last entry in L2 cache; (2) move the collision slot from L1 + * cache down to L2; and 3) fill L1. 
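+ * Roughly: l2_cache[1..] = l2_cache[0..N-2]; l2_cache[0] = cache[slot]; + * cache[slot] = (leafkey, leaf), mirroring the memmove and assignments in + * RTREE_GET_LEAF below.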
+ */ +#define RTREE_GET_LEAF(level) { \ + assert(level == RTREE_HEIGHT-1); \ + if (!dependent && unlikely(!rtree_leaf_valid(leaf))) { \ + return NULL; \ + } \ + if (RTREE_CTX_NCACHE_L2 > 1) { \ + memmove(&rtree_ctx->l2_cache[1], \ + &rtree_ctx->l2_cache[0], \ + sizeof(rtree_ctx_cache_elm_t) * \ + (RTREE_CTX_NCACHE_L2 - 1)); \ + } \ + size_t slot = rtree_cache_direct_map(key); \ + rtree_ctx->l2_cache[0].leafkey = \ + rtree_ctx->cache[slot].leafkey; \ + rtree_ctx->l2_cache[0].leaf = \ + rtree_ctx->cache[slot].leaf; \ + uintptr_t leafkey = rtree_leafkey(key); \ + rtree_ctx->cache[slot].leafkey = leafkey; \ + rtree_ctx->cache[slot].leaf = leaf; \ + uintptr_t subkey = rtree_subkey(key, level); \ + return &leaf[subkey]; \ + } + if (RTREE_HEIGHT > 1) { + RTREE_GET_CHILD(0) + } + if (RTREE_HEIGHT > 2) { + RTREE_GET_CHILD(1) + } + if (RTREE_HEIGHT > 3) { + for (unsigned i = 2; i < RTREE_HEIGHT-1; i++) { + RTREE_GET_CHILD(i) + } + } + RTREE_GET_LEAF(RTREE_HEIGHT-1) +#undef RTREE_GET_CHILD +#undef RTREE_GET_LEAF + not_reached(); +} + +void +rtree_ctx_data_init(rtree_ctx_t *ctx) { + for (unsigned i = 0; i < RTREE_CTX_NCACHE; i++) { + rtree_ctx_cache_elm_t *cache = &ctx->cache[i]; + cache->leafkey = RTREE_LEAFKEY_INVALID; + cache->leaf = NULL; + } + for (unsigned i = 0; i < RTREE_CTX_NCACHE_L2; i++) { + rtree_ctx_cache_elm_t *cache = &ctx->l2_cache[i]; + cache->leafkey = RTREE_LEAFKEY_INVALID; + cache->leaf = NULL; + } +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/safety_check.c b/src/duckdb/extension/jemalloc/jemalloc/src/safety_check.c new file mode 100644 index 000000000..d3f68fbc1 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/safety_check.c @@ -0,0 +1,46 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +static safety_check_abort_hook_t safety_check_abort; + +void safety_check_fail_sized_dealloc(bool current_dealloc, const void *ptr, + size_t true_size, size_t input_size) { + char *src = current_dealloc ? "the current pointer being freed" : + "in thread cache, possibly from previous deallocations"; + char *suggest_debug_build = config_debug ? "" : " --enable-debug or"; + + safety_check_fail(": size mismatch detected (true size %zu " + "vs input size %zu), likely caused by application sized " + "deallocation bugs (source address: %p, %s). Suggest building with" + "%s address sanitizer for debugging. Abort.\n", + true_size, input_size, ptr, src, suggest_debug_build); +} + +void safety_check_set_abort(safety_check_abort_hook_t abort_fn) { + safety_check_abort = abort_fn; +} + +/* + * In addition to malloc_write, also embed hint msg in the abort function name + * because there are cases only logging crash stack traces. + */ +static void +safety_check_detected_heap_corruption___run_address_sanitizer_build_to_debug(const char *buf) { + if (safety_check_abort == NULL) { + malloc_write(buf); + abort(); + } else { + safety_check_abort(buf); + } +} + +void safety_check_fail(const char *format, ...) 
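+/* + * Formats into a fixed-size stack buffer, then hands the message to the + * deliberately long-named handler above. + */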
{ + char buf[MALLOC_PRINTF_BUFSIZE]; + + va_list ap; + va_start(ap, format); + malloc_vsnprintf(buf, MALLOC_PRINTF_BUFSIZE, format, ap); + va_end(ap); + + safety_check_detected_heap_corruption___run_address_sanitizer_build_to_debug(buf); +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/san.c b/src/duckdb/extension/jemalloc/jemalloc/src/san.c new file mode 100644 index 000000000..28ea3d7cb --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/san.c @@ -0,0 +1,208 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/ehooks.h" +#include "jemalloc/internal/san.h" +#include "jemalloc/internal/tsd.h" + +/* The sanitizer options. */ +size_t opt_san_guard_large = SAN_GUARD_LARGE_EVERY_N_EXTENTS_DEFAULT; +size_t opt_san_guard_small = SAN_GUARD_SMALL_EVERY_N_EXTENTS_DEFAULT; + +/* Aligned (-1 is off) ptrs will be junked & stashed on dealloc. */ +ssize_t opt_lg_san_uaf_align = SAN_LG_UAF_ALIGN_DEFAULT; + +/* + * Initialized in san_init(). When disabled, the mask is set to (uintptr_t)-1 + * to always fail the nonfast_align check. + */ +uintptr_t san_cache_bin_nonfast_mask = SAN_CACHE_BIN_NONFAST_MASK_DEFAULT; + +static inline void +san_find_guarded_addr(edata_t *edata, void **guard1, void **guard2, + void **addr, size_t size, bool left, bool right) { + assert(!edata_guarded_get(edata)); + assert(size % PAGE == 0); + *addr = edata_base_get(edata); + if (left) { + *guard1 = *addr; + *addr = ((byte_t *)*addr) + SAN_PAGE_GUARD; + } else { + *guard1 = NULL; + } + + if (right) { + *guard2 = ((byte_t *)*addr) + size; + } else { + *guard2 = NULL; + } +} + +static inline void +san_find_unguarded_addr(edata_t *edata, void **guard1, void **guard2, + void **addr, size_t size, bool left, bool right) { + assert(edata_guarded_get(edata)); + assert(size % PAGE == 0); + *addr = edata_base_get(edata); + if (right) { + *guard2 = ((byte_t *)*addr) + size; + } else { + *guard2 = NULL; + } + + if (left) { + *guard1 = ((byte_t *)*addr) - SAN_PAGE_GUARD; + assert(*guard1 != NULL); + *addr = *guard1; + } else { + *guard1 = NULL; + } +} + +void +san_guard_pages(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, emap_t *emap, + bool left, bool right, bool remap) { + assert(left || right); + if (remap) { + emap_deregister_boundary(tsdn, emap, edata); + } + + size_t size_with_guards = edata_size_get(edata); + size_t usize = (left && right) + ? san_two_side_unguarded_sz(size_with_guards) + : san_one_side_unguarded_sz(size_with_guards); + + void *guard1, *guard2, *addr; + san_find_guarded_addr(edata, &guard1, &guard2, &addr, usize, left, + right); + + assert(edata_state_get(edata) == extent_state_active); + ehooks_guard(tsdn, ehooks, guard1, guard2); + + /* Update the guarded addr and usable size of the edata. */ + edata_size_set(edata, usize); + edata_addr_set(edata, addr); + edata_guarded_set(edata, true); + + if (remap) { + emap_register_boundary(tsdn, emap, edata, SC_NSIZES, + /* slab */ false); + } +} + +static void +san_unguard_pages_impl(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + emap_t *emap, bool left, bool right, bool remap) { + assert(left || right); + /* Remove the inner boundary which no longer exists. 
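+ * (i.e. the boundary registered for just the usable pages; when remap is + * true, an outer boundary spanning the guards is re-registered at the end + * of this function)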
*/ + if (remap) { + assert(edata_state_get(edata) == extent_state_active); + emap_deregister_boundary(tsdn, emap, edata); + } else { + assert(edata_state_get(edata) == extent_state_retained); + } + + size_t size = edata_size_get(edata); + size_t size_with_guards = (left && right) + ? san_two_side_guarded_sz(size) + : san_one_side_guarded_sz(size); + + void *guard1, *guard2, *addr; + san_find_unguarded_addr(edata, &guard1, &guard2, &addr, size, left, + right); + + ehooks_unguard(tsdn, ehooks, (void *)guard1, (void *)guard2); + + /* Update the true addr and usable size of the edata. */ + edata_size_set(edata, size_with_guards); + edata_addr_set(edata, (void *)addr); + edata_guarded_set(edata, false); + + /* + * Then re-register the outer boundary including the guards, if + * requested. + */ + if (remap) { + emap_register_boundary(tsdn, emap, edata, SC_NSIZES, + /* slab */ false); + } +} + +void +san_unguard_pages(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + emap_t *emap, bool left, bool right) { + san_unguard_pages_impl(tsdn, ehooks, edata, emap, left, right, + /* remap */ true); +} + +void +san_unguard_pages_pre_destroy(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + emap_t *emap) { + emap_assert_not_mapped(tsdn, emap, edata); + /* + * We don't want to touch the emap of about-to-be-destroyed extents, as + * they have been unmapped upon eviction from the retained ecache. Also, + * we unguard the extents to the right, because retained extents only + * own their right guard page per san_bump_alloc's logic. + */ + san_unguard_pages_impl(tsdn, ehooks, edata, emap, /* left */ false, + /* right */ true, /* remap */ false); +} + +static bool +san_stashed_corrupted(void *ptr, size_t size) { + if (san_junk_ptr_should_slow()) { + for (size_t i = 0; i < size; i++) { + if (((char *)ptr)[i] != (char)uaf_detect_junk) { + return true; + } + } + return false; + } + + void *first, *mid, *last; + san_junk_ptr_locations(ptr, size, &first, &mid, &last); + if (*(uintptr_t *)first != uaf_detect_junk || + *(uintptr_t *)mid != uaf_detect_junk || + *(uintptr_t *)last != uaf_detect_junk) { + return true; + } + + return false; +} + +void +san_check_stashed_ptrs(void **ptrs, size_t nstashed, size_t usize) { + /* + * Verify that the junk-filled & stashed pointers remain unchanged, to + * detect write-after-free.
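+ * Depending on san_junk_ptr_should_slow, san_stashed_corrupted above either + * scans every byte or checks only the first, middle, and last words (per + * san_junk_ptr_locations).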
+ */ + for (size_t n = 0; n < nstashed; n++) { + void *stashed = ptrs[n]; + assert(stashed != NULL); + assert(cache_bin_nonfast_aligned(stashed)); + if (unlikely(san_stashed_corrupted(stashed, usize))) { + safety_check_fail(": Write-after-free " + "detected on deallocated pointer %p (size %zu).\n", + stashed, usize); + } + } +} + +void +tsd_san_init(tsd_t *tsd) { + *tsd_san_extents_until_guard_smallp_get(tsd) = opt_san_guard_small; + *tsd_san_extents_until_guard_largep_get(tsd) = opt_san_guard_large; +} + +void +san_init(ssize_t lg_san_uaf_align) { + assert(lg_san_uaf_align == -1 || lg_san_uaf_align >= LG_PAGE); + if (lg_san_uaf_align == -1) { + san_cache_bin_nonfast_mask = (uintptr_t)-1; + return; + } + + san_cache_bin_nonfast_mask = ((uintptr_t)1 << lg_san_uaf_align) - 1; +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/san_bump.c b/src/duckdb/extension/jemalloc/jemalloc/src/san_bump.c new file mode 100644 index 000000000..888974555 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/san_bump.c @@ -0,0 +1,104 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/san_bump.h" +#include "jemalloc/internal/pac.h" +#include "jemalloc/internal/san.h" +#include "jemalloc/internal/ehooks.h" +#include "jemalloc/internal/edata_cache.h" + +static bool +san_bump_grow_locked(tsdn_t *tsdn, san_bump_alloc_t *sba, pac_t *pac, + ehooks_t *ehooks, size_t size); + +edata_t * +san_bump_alloc(tsdn_t *tsdn, san_bump_alloc_t* sba, pac_t *pac, + ehooks_t *ehooks, size_t size, bool zero) { + assert(san_bump_enabled()); + + edata_t* to_destroy; + size_t guarded_size = san_one_side_guarded_sz(size); + + malloc_mutex_lock(tsdn, &sba->mtx); + + if (sba->curr_reg == NULL || + edata_size_get(sba->curr_reg) < guarded_size) { + /* + * If the current region can't accommodate the allocation, + * try replacing it with a larger one and destroy current if the + * replacement succeeds. 
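+ * The replacement comes from san_bump_grow_locked below and is at least + * SBA_RETAINED_ALLOC_SIZE bytes, so later requests are typically carved + * out of it by splitting.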
+ */ + to_destroy = sba->curr_reg; + bool err = san_bump_grow_locked(tsdn, sba, pac, ehooks, + guarded_size); + if (err) { + goto label_err; + } + } else { + to_destroy = NULL; + } + assert(guarded_size <= edata_size_get(sba->curr_reg)); + size_t trail_size = edata_size_get(sba->curr_reg) - guarded_size; + + edata_t* edata; + if (trail_size != 0) { + edata_t* curr_reg_trail = extent_split_wrapper(tsdn, pac, + ehooks, sba->curr_reg, guarded_size, trail_size, + /* holding_core_locks */ true); + if (curr_reg_trail == NULL) { + goto label_err; + } + edata = sba->curr_reg; + sba->curr_reg = curr_reg_trail; + } else { + edata = sba->curr_reg; + sba->curr_reg = NULL; + } + + malloc_mutex_unlock(tsdn, &sba->mtx); + + assert(!edata_guarded_get(edata)); + assert(sba->curr_reg == NULL || !edata_guarded_get(sba->curr_reg)); + assert(to_destroy == NULL || !edata_guarded_get(to_destroy)); + + if (to_destroy != NULL) { + extent_destroy_wrapper(tsdn, pac, ehooks, to_destroy); + } + + san_guard_pages(tsdn, ehooks, edata, pac->emap, /* left */ false, + /* right */ true, /* remap */ true); + + if (extent_commit_zero(tsdn, ehooks, edata, /* commit */ true, zero, + /* growing_retained */ false)) { + extent_record(tsdn, pac, ehooks, &pac->ecache_retained, + edata); + return NULL; + } + + if (config_prof) { + extent_gdump_add(tsdn, edata); + } + + return edata; +label_err: + malloc_mutex_unlock(tsdn, &sba->mtx); + return NULL; +} + +static bool +san_bump_grow_locked(tsdn_t *tsdn, san_bump_alloc_t *sba, pac_t *pac, + ehooks_t *ehooks, size_t size) { + malloc_mutex_assert_owner(tsdn, &sba->mtx); + + bool committed = false, zeroed = false; + size_t alloc_size = size > SBA_RETAINED_ALLOC_SIZE ? size : + SBA_RETAINED_ALLOC_SIZE; + assert((alloc_size & PAGE_MASK) == 0); + sba->curr_reg = extent_alloc_wrapper(tsdn, pac, ehooks, NULL, + alloc_size, PAGE, zeroed, &committed, + /* growing_retained */ true); + if (sba->curr_reg == NULL) { + return true; + } + return false; +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/sc.c b/src/duckdb/extension/jemalloc/jemalloc/src/sc.c new file mode 100644 index 000000000..e4a94d89f --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/sc.c @@ -0,0 +1,306 @@ +#include "jemalloc/internal/jemalloc_preamble.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/bit_util.h" +#include "jemalloc/internal/bitmap.h" +#include "jemalloc/internal/pages.h" +#include "jemalloc/internal/sc.h" + +/* + * This module computes the size classes used to satisfy allocations. The logic + * here was ported more or less line-by-line from a shell script, and because of + * that is not the most idiomatic C. Eventually we should fix this, but for now + * at least the damage is compartmentalized to this file. + */ + +size_t +reg_size_compute(int lg_base, int lg_delta, int ndelta) { + return (ZU(1) << lg_base) + (ZU(ndelta) << lg_delta); +} + +/* Returns the number of pages in the slab. */ +static int +slab_size(int lg_page, int lg_base, int lg_delta, int ndelta) { + size_t page = (ZU(1) << lg_page); + size_t reg_size = reg_size_compute(lg_base, lg_delta, ndelta); + + size_t try_slab_size = page; + size_t try_nregs = try_slab_size / reg_size; + size_t perfect_slab_size = 0; + bool perfect = false; + /* + * This loop continues until we find the least common multiple of the + * page size and size class size. Size classes are all of the form + * base + ndelta * delta == (ndelta + base/delta) * delta, which is + * (ndelta + ngroup) * delta.
The way we choose slabbing strategies + * means that delta is at most the page size and ndelta < ngroup. So + * the loop executes for at most 2 * ngroup - 1 iterations, which is + * also the bound on the number of pages in a slab chosen by default. + * With the current default settings, this is at most 7. + */ + while (!perfect) { + perfect_slab_size = try_slab_size; + size_t perfect_nregs = try_nregs; + try_slab_size += page; + try_nregs = try_slab_size / reg_size; + if (perfect_slab_size == perfect_nregs * reg_size) { + perfect = true; + } + } + return (int)(perfect_slab_size / page); +} + +static void +size_class( + /* Output. */ + sc_t *sc, + /* Configuration decisions. */ + int lg_max_lookup, int lg_page, int lg_ngroup, + /* Inputs specific to the size class. */ + int index, int lg_base, int lg_delta, int ndelta) { + sc->index = index; + sc->lg_base = lg_base; + sc->lg_delta = lg_delta; + sc->ndelta = ndelta; + size_t size = reg_size_compute(lg_base, lg_delta, ndelta); + sc->psz = (size % (ZU(1) << lg_page) == 0); + if (index == 0) { + assert(!sc->psz); + } + if (size < (ZU(1) << (lg_page + lg_ngroup))) { + sc->bin = true; + sc->pgs = slab_size(lg_page, lg_base, lg_delta, ndelta); + } else { + sc->bin = false; + sc->pgs = 0; + } + if (size <= (ZU(1) << lg_max_lookup)) { + sc->lg_delta_lookup = lg_delta; + } else { + sc->lg_delta_lookup = 0; + } +} + +static void +size_classes( + /* Output. */ + sc_data_t *sc_data, + /* Determined by the system. */ + size_t lg_ptr_size, int lg_quantum, + /* Configuration decisions. */ + int lg_tiny_min, int lg_max_lookup, int lg_page, int lg_ngroup) { + int ptr_bits = (1 << lg_ptr_size) * 8; + int ngroup = (1 << lg_ngroup); + int ntiny = 0; + int nlbins = 0; + int lg_tiny_maxclass = (unsigned)-1; + int nbins = 0; + int npsizes = 0; + + int index = 0; + + int ndelta = 0; + int lg_base = lg_tiny_min; + int lg_delta = lg_base; + + /* Outputs that we update as we go. */ + size_t lookup_maxclass = 0; + size_t small_maxclass = 0; + int lg_large_minclass = 0; + size_t large_maxclass = 0; + + /* Tiny size classes. */ + while (lg_base < lg_quantum) { + sc_t *sc = &sc_data->sc[index]; + size_class(sc, lg_max_lookup, lg_page, lg_ngroup, index, + lg_base, lg_delta, ndelta); + if (sc->lg_delta_lookup != 0) { + nlbins = index + 1; + } + if (sc->psz) { + npsizes++; + } + if (sc->bin) { + nbins++; + } + ntiny++; + /* Final written value is correct. */ + lg_tiny_maxclass = lg_base; + index++; + lg_delta = lg_base; + lg_base++; + } + + /* First non-tiny (pseudo) group. */ + if (ntiny != 0) { + sc_t *sc = &sc_data->sc[index]; + /* + * See the note in sc.h; the first non-tiny size class has an + * unusual encoding. + */ + lg_base--; + ndelta = 1; + size_class(sc, lg_max_lookup, lg_page, lg_ngroup, index, + lg_base, lg_delta, ndelta); + index++; + lg_base++; + lg_delta++; + if (sc->psz) { + npsizes++; + } + if (sc->bin) { + nbins++; + } + } + while (ndelta < ngroup) { + sc_t *sc = &sc_data->sc[index]; + size_class(sc, lg_max_lookup, lg_page, lg_ngroup, index, + lg_base, lg_delta, ndelta); + index++; + ndelta++; + if (sc->psz) { + npsizes++; + } + if (sc->bin) { + nbins++; + } + } + + /* All remaining groups. 
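+ * Each group covers sizes in (2^lg_base, 2^(lg_base + 1)] in ngroup steps + * of delta; the final group stops one step early so that no size class + * reaches 2^(ptr_bits - 1).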
*/ + lg_base = lg_base + lg_ngroup; + while (lg_base < ptr_bits - 1) { + ndelta = 1; + int ndelta_limit; + if (lg_base == ptr_bits - 2) { + ndelta_limit = ngroup - 1; + } else { + ndelta_limit = ngroup; + } + while (ndelta <= ndelta_limit) { + sc_t *sc = &sc_data->sc[index]; + size_class(sc, lg_max_lookup, lg_page, lg_ngroup, index, + lg_base, lg_delta, ndelta); + if (sc->lg_delta_lookup != 0) { + nlbins = index + 1; + /* Final written value is correct. */ + lookup_maxclass = (ZU(1) << lg_base) + + (ZU(ndelta) << lg_delta); + } + if (sc->psz) { + npsizes++; + } + if (sc->bin) { + nbins++; + /* Final written value is correct. */ + small_maxclass = (ZU(1) << lg_base) + + (ZU(ndelta) << lg_delta); + if (lg_ngroup > 0) { + lg_large_minclass = lg_base + 1; + } else { + lg_large_minclass = lg_base + 2; + } + } + large_maxclass = (ZU(1) << lg_base) + + (ZU(ndelta) << lg_delta); + index++; + ndelta++; + } + lg_base++; + lg_delta++; + } + /* Additional outputs. */ + int nsizes = index; + unsigned lg_ceil_nsizes = lg_ceil(nsizes); + + /* Fill in the output data. */ + sc_data->ntiny = ntiny; + sc_data->nlbins = nlbins; + sc_data->nbins = nbins; + sc_data->nsizes = nsizes; + sc_data->lg_ceil_nsizes = lg_ceil_nsizes; + sc_data->npsizes = npsizes; + sc_data->lg_tiny_maxclass = lg_tiny_maxclass; + sc_data->lookup_maxclass = lookup_maxclass; + sc_data->small_maxclass = small_maxclass; + sc_data->lg_large_minclass = lg_large_minclass; + sc_data->large_minclass = (ZU(1) << lg_large_minclass); + sc_data->large_maxclass = large_maxclass; + + /* + * We compute these values in two ways: + * - Incrementally, as above. + * - In macros, in sc.h. + * The computation is easier when done incrementally, but putting it in + * a constant makes it available to the fast paths without having to + * touch the extra global cacheline. We assert, however, that the two + * computations are equivalent. + */ + assert(sc_data->npsizes == SC_NPSIZES); + assert(sc_data->lg_tiny_maxclass == SC_LG_TINY_MAXCLASS); + assert(sc_data->small_maxclass == SC_SMALL_MAXCLASS); + assert(sc_data->large_minclass == SC_LARGE_MINCLASS); + assert(sc_data->lg_large_minclass == SC_LG_LARGE_MINCLASS); + assert(sc_data->large_maxclass == SC_LARGE_MAXCLASS); + + /* + * In the allocation fastpath, we want to assume that we can + * unconditionally subtract the requested allocation size from + * a ssize_t, and detect passing through 0 correctly. This + * results in optimal generated code. For this to work, the + * maximum allocation size must be less than SSIZE_MAX. + */ + assert(SC_LARGE_MAXCLASS < SSIZE_MAX); +} + +void +sc_data_init(sc_data_t *sc_data) { + size_classes(sc_data, LG_SIZEOF_PTR, LG_QUANTUM, SC_LG_TINY_MIN, + SC_LG_MAX_LOOKUP, LG_PAGE, SC_LG_NGROUP); + + sc_data->initialized = true; +} + +static void +sc_data_update_sc_slab_size(sc_t *sc, size_t reg_size, size_t pgs_guess) { + size_t min_pgs = reg_size / PAGE; + if (reg_size % PAGE != 0) { + min_pgs++; + } + /* + * BITMAP_MAXBITS is actually determined by putting the smallest + * possible size-class on one page, so this can never be 0. 
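+ * Worked example (hypothetical size): reg_size = 3 * PAGE / 2 yields + * min_pgs = 2; pgs_guess is then clamped into [min_pgs, max_pgs] below.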
+ */ + size_t max_pgs = BITMAP_MAXBITS * reg_size / PAGE; + + assert(min_pgs <= max_pgs); + assert(min_pgs > 0); + assert(max_pgs >= 1); + if (pgs_guess < min_pgs) { + sc->pgs = (int)min_pgs; + } else if (pgs_guess > max_pgs) { + sc->pgs = (int)max_pgs; + } else { + sc->pgs = (int)pgs_guess; + } +} + +void +sc_data_update_slab_size(sc_data_t *data, size_t begin, size_t end, int pgs) { + assert(data->initialized); + for (int i = 0; i < data->nsizes; i++) { + sc_t *sc = &data->sc[i]; + if (!sc->bin) { + break; + } + size_t reg_size = reg_size_compute(sc->lg_base, sc->lg_delta, + sc->ndelta); + if (begin <= reg_size && reg_size <= end) { + sc_data_update_sc_slab_size(sc, reg_size, pgs); + } + } +} + +void +sc_boot(sc_data_t *data) { + sc_data_init(data); +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/sec.c b/src/duckdb/extension/jemalloc/jemalloc/src/sec.c new file mode 100644 index 000000000..19d69ff45 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/sec.c @@ -0,0 +1,423 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/sec.h" + +static edata_t *sec_alloc(tsdn_t *tsdn, pai_t *self, size_t size, + size_t alignment, bool zero, bool guarded, bool frequent_reuse, + bool *deferred_work_generated); +static bool sec_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata, + size_t old_size, size_t new_size, bool zero, bool *deferred_work_generated); +static bool sec_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata, + size_t old_size, size_t new_size, bool *deferred_work_generated); +static void sec_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata, + bool *deferred_work_generated); + +static void +sec_bin_init(sec_bin_t *bin) { + bin->being_batch_filled = false; + bin->bytes_cur = 0; + edata_list_active_init(&bin->freelist); +} + +bool +sec_init(tsdn_t *tsdn, sec_t *sec, base_t *base, pai_t *fallback, + const sec_opts_t *opts) { + assert(opts->max_alloc >= PAGE); + + size_t max_alloc = PAGE_FLOOR(opts->max_alloc); + pszind_t npsizes = sz_psz2ind(max_alloc) + 1; + + size_t sz_shards = opts->nshards * sizeof(sec_shard_t); + size_t sz_bins = opts->nshards * (size_t)npsizes * sizeof(sec_bin_t); + size_t sz_alloc = sz_shards + sz_bins; + void *dynalloc = base_alloc(tsdn, base, sz_alloc, CACHELINE); + if (dynalloc == NULL) { + return true; + } + sec_shard_t *shard_cur = (sec_shard_t *)dynalloc; + sec->shards = shard_cur; + sec_bin_t *bin_cur = (sec_bin_t *)&shard_cur[opts->nshards]; + /* Just for asserts, below. */ + sec_bin_t *bin_start = bin_cur; + + for (size_t i = 0; i < opts->nshards; i++) { + sec_shard_t *shard = shard_cur; + shard_cur++; + bool err = malloc_mutex_init(&shard->mtx, "sec_shard", + WITNESS_RANK_SEC_SHARD, malloc_mutex_rank_exclusive); + if (err) { + return true; + } + shard->enabled = true; + shard->bins = bin_cur; + for (pszind_t j = 0; j < npsizes; j++) { + sec_bin_init(&shard->bins[j]); + bin_cur++; + } + shard->bytes_cur = 0; + shard->to_flush_next = 0; + } + /* + * Should have exactly matched the bin_start to the first unused byte + * after the shards. + */ + assert((void *)shard_cur == (void *)bin_start); + /* And the last bin to use up the last bytes of the allocation. */ + assert((char *)bin_cur == ((char *)dynalloc + sz_alloc)); + sec->fallback = fallback; + + + sec->opts = *opts; + sec->npsizes = npsizes; + + /* + * Initialize these last so that an improper use of an SEC whose + * initialization failed will segfault in an easy-to-spot way. 
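+ * (e.g. a call through a never-set pai function pointer, rather than + * subtly wrong cache behavior)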
+ */ + sec->pai.alloc = &sec_alloc; + sec->pai.alloc_batch = &pai_alloc_batch_default; + sec->pai.expand = &sec_expand; + sec->pai.shrink = &sec_shrink; + sec->pai.dalloc = &sec_dalloc; + sec->pai.dalloc_batch = &pai_dalloc_batch_default; + + return false; +} + +static sec_shard_t * +sec_shard_pick(tsdn_t *tsdn, sec_t *sec) { + /* + * Eventually, we should implement affinity, tracking source shard using + * the edata_t's newly freed up fields. For now, just randomly + * distribute across all shards. + */ + if (tsdn_null(tsdn)) { + return &sec->shards[0]; + } + tsd_t *tsd = tsdn_tsd(tsdn); + uint8_t *idxp = tsd_sec_shardp_get(tsd); + if (*idxp == (uint8_t)-1) { + /* + * First use; initialize using the trick from Daniel Lemire's + * "A fast alternative to the modulo reduction": use a 64-bit + * number to store 32 bits, since we'll deliberately overflow + * when we multiply by the number of shards. + */ + uint64_t rand32 = prng_lg_range_u64(tsd_prng_statep_get(tsd), 32); + uint32_t idx = + (uint32_t)((rand32 * (uint64_t)sec->opts.nshards) >> 32); + assert(idx < (uint32_t)sec->opts.nshards); + *idxp = (uint8_t)idx; + } + return &sec->shards[*idxp]; +} + +/* + * Perhaps surprisingly, this can be called on the alloc pathways; if we hit an + * empty cache, we'll try to fill it, which can push the shard over its limit. + */ +static void +sec_flush_some_and_unlock(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + edata_list_active_t to_flush; + edata_list_active_init(&to_flush); + while (shard->bytes_cur > sec->opts.bytes_after_flush) { + /* Pick a victim. */ + sec_bin_t *bin = &shard->bins[shard->to_flush_next]; + + /* Update our victim-picking state. */ + shard->to_flush_next++; + if (shard->to_flush_next == sec->npsizes) { + shard->to_flush_next = 0; + } + + assert(shard->bytes_cur >= bin->bytes_cur); + if (bin->bytes_cur != 0) { + shard->bytes_cur -= bin->bytes_cur; + bin->bytes_cur = 0; + edata_list_active_concat(&to_flush, &bin->freelist); + } + /* + * Either bin->bytes_cur was 0, in which case we didn't touch + * the bin list but it should be empty anyways (or else we + * missed a bytes_cur update on a list modification), or it + * *wasn't* 0 and we emptied it ourselves. Either way, it should + * be empty now.
+ */ + assert(edata_list_active_empty(&bin->freelist)); + } + + malloc_mutex_unlock(tsdn, &shard->mtx); + bool deferred_work_generated = false; + pai_dalloc_batch(tsdn, sec->fallback, &to_flush, + &deferred_work_generated); +} + +static edata_t * +sec_shard_alloc_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard, + sec_bin_t *bin) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + if (!shard->enabled) { + return NULL; + } + edata_t *edata = edata_list_active_first(&bin->freelist); + if (edata != NULL) { + edata_list_active_remove(&bin->freelist, edata); + assert(edata_size_get(edata) <= bin->bytes_cur); + bin->bytes_cur -= edata_size_get(edata); + assert(edata_size_get(edata) <= shard->bytes_cur); + shard->bytes_cur -= edata_size_get(edata); + } + return edata; +} + +static edata_t * +sec_batch_fill_and_alloc(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard, + sec_bin_t *bin, size_t size, bool frequent_reuse) { + malloc_mutex_assert_not_owner(tsdn, &shard->mtx); + + edata_list_active_t result; + edata_list_active_init(&result); + bool deferred_work_generated = false; + size_t nalloc = pai_alloc_batch(tsdn, sec->fallback, size, + 1 + sec->opts.batch_fill_extra, &result, frequent_reuse, + &deferred_work_generated); + + edata_t *ret = edata_list_active_first(&result); + if (ret != NULL) { + edata_list_active_remove(&result, ret); + } + + malloc_mutex_lock(tsdn, &shard->mtx); + bin->being_batch_filled = false; + /* + * Handle the easy case first: nothing to cache. Note that this can + * only happen in case of OOM, since sec_alloc checks the expected + * number of allocs, and doesn't bother going down the batch_fill + * pathway if there won't be anything left to cache. So to be in this + * code path, we must have asked for > 1 alloc, but only gotten 1 back. 
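+ * For instance, with batch_fill_extra = 3 (a hypothetical setting), the + * pai_alloc_batch call above requests 4 extents; reaching this branch + * means the fallback delivered at most one.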
+ */ + if (nalloc <= 1) { + malloc_mutex_unlock(tsdn, &shard->mtx); + return ret; + } + + size_t new_cached_bytes = (nalloc - 1) * size; + + edata_list_active_concat(&bin->freelist, &result); + bin->bytes_cur += new_cached_bytes; + shard->bytes_cur += new_cached_bytes; + + if (shard->bytes_cur > sec->opts.max_bytes) { + sec_flush_some_and_unlock(tsdn, sec, shard); + } else { + malloc_mutex_unlock(tsdn, &shard->mtx); + } + + return ret; +} + +static edata_t * +sec_alloc(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment, bool zero, + bool guarded, bool frequent_reuse, bool *deferred_work_generated) { + assert((size & PAGE_MASK) == 0); + assert(!guarded); + + sec_t *sec = (sec_t *)self; + + if (zero || alignment > PAGE || sec->opts.nshards == 0 + || size > sec->opts.max_alloc) { + return pai_alloc(tsdn, sec->fallback, size, alignment, zero, + /* guarded */ false, frequent_reuse, + deferred_work_generated); + } + pszind_t pszind = sz_psz2ind(size); + assert(pszind < sec->npsizes); + + sec_shard_t *shard = sec_shard_pick(tsdn, sec); + sec_bin_t *bin = &shard->bins[pszind]; + bool do_batch_fill = false; + + malloc_mutex_lock(tsdn, &shard->mtx); + edata_t *edata = sec_shard_alloc_locked(tsdn, sec, shard, bin); + if (edata == NULL) { + if (!bin->being_batch_filled + && sec->opts.batch_fill_extra > 0) { + bin->being_batch_filled = true; + do_batch_fill = true; + } + } + malloc_mutex_unlock(tsdn, &shard->mtx); + if (edata == NULL) { + if (do_batch_fill) { + edata = sec_batch_fill_and_alloc(tsdn, sec, shard, bin, + size, frequent_reuse); + } else { + edata = pai_alloc(tsdn, sec->fallback, size, alignment, + zero, /* guarded */ false, frequent_reuse, + deferred_work_generated); + } + } + return edata; +} + +static bool +sec_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size, + size_t new_size, bool zero, bool *deferred_work_generated) { + sec_t *sec = (sec_t *)self; + return pai_expand(tsdn, sec->fallback, edata, old_size, new_size, zero, + deferred_work_generated); +} + +static bool +sec_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size, + size_t new_size, bool *deferred_work_generated) { + sec_t *sec = (sec_t *)self; + return pai_shrink(tsdn, sec->fallback, edata, old_size, new_size, + deferred_work_generated); +} + +static void +sec_flush_all_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + shard->bytes_cur = 0; + edata_list_active_t to_flush; + edata_list_active_init(&to_flush); + for (pszind_t i = 0; i < sec->npsizes; i++) { + sec_bin_t *bin = &shard->bins[i]; + bin->bytes_cur = 0; + edata_list_active_concat(&to_flush, &bin->freelist); + } + + /* + * Ordinarily we would try to avoid doing the batch deallocation while + * holding the shard mutex, but the flush_all pathways only happen when + * we're disabling the HPA or resetting the arena, both of which are + * rare pathways. + */ + bool deferred_work_generated = false; + pai_dalloc_batch(tsdn, sec->fallback, &to_flush, + &deferred_work_generated); +} + +static void +sec_shard_dalloc_and_unlock(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard, + edata_t *edata) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + assert(shard->bytes_cur <= sec->opts.max_bytes); + size_t size = edata_size_get(edata); + pszind_t pszind = sz_psz2ind(size); + assert(pszind < sec->npsizes); + /* + * Prepending here results in LIFO allocation per bin, which seems + * reasonable. 
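+ * (the most recently freed extent is handed out first, which presumably + * keeps reuse cache-warm)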
+ */ + sec_bin_t *bin = &shard->bins[pszind]; + edata_list_active_prepend(&bin->freelist, edata); + bin->bytes_cur += size; + shard->bytes_cur += size; + if (shard->bytes_cur > sec->opts.max_bytes) { + /* + * We've exceeded the shard limit. We make two nods in the + * direction of fragmentation avoidance: we flush everything in + * the shard, rather than one particular bin, and we hold the + * lock while flushing (in case one of the extents we flush is + * highly preferred from a fragmentation-avoidance perspective + * in the backing allocator). This has the extra advantage of + * not requiring advanced cache balancing strategies. + */ + sec_flush_some_and_unlock(tsdn, sec, shard); + malloc_mutex_assert_not_owner(tsdn, &shard->mtx); + } else { + malloc_mutex_unlock(tsdn, &shard->mtx); + } +} + +static void +sec_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata, + bool *deferred_work_generated) { + sec_t *sec = (sec_t *)self; + if (sec->opts.nshards == 0 + || edata_size_get(edata) > sec->opts.max_alloc) { + pai_dalloc(tsdn, sec->fallback, edata, + deferred_work_generated); + return; + } + sec_shard_t *shard = sec_shard_pick(tsdn, sec); + malloc_mutex_lock(tsdn, &shard->mtx); + if (shard->enabled) { + sec_shard_dalloc_and_unlock(tsdn, sec, shard, edata); + } else { + malloc_mutex_unlock(tsdn, &shard->mtx); + pai_dalloc(tsdn, sec->fallback, edata, + deferred_work_generated); + } +} + +void +sec_flush(tsdn_t *tsdn, sec_t *sec) { + for (size_t i = 0; i < sec->opts.nshards; i++) { + malloc_mutex_lock(tsdn, &sec->shards[i].mtx); + sec_flush_all_locked(tsdn, sec, &sec->shards[i]); + malloc_mutex_unlock(tsdn, &sec->shards[i].mtx); + } +} + +void +sec_disable(tsdn_t *tsdn, sec_t *sec) { + for (size_t i = 0; i < sec->opts.nshards; i++) { + malloc_mutex_lock(tsdn, &sec->shards[i].mtx); + sec->shards[i].enabled = false; + sec_flush_all_locked(tsdn, sec, &sec->shards[i]); + malloc_mutex_unlock(tsdn, &sec->shards[i].mtx); + } +} + +void +sec_stats_merge(tsdn_t *tsdn, sec_t *sec, sec_stats_t *stats) { + size_t sum = 0; + for (size_t i = 0; i < sec->opts.nshards; i++) { + /* + * We could save these lock acquisitions by making bytes_cur + * atomic, but stats collection is rare anyways and we expect + * the number and type of stats to get more interesting. 
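+ *
+ * The atomic variant would read roughly as follows, assuming bytes_cur
+ * were turned into an atomic_zu_t (hypothetical; not what we do),
+ * trading the per-shard lock for a possibly-torn view across shards:
+ *
+ *   sum += atomic_load_zu(&sec->shards[i].bytes_cur,
+ *       ATOMIC_RELAXED);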
+ */ + malloc_mutex_lock(tsdn, &sec->shards[i].mtx); + sum += sec->shards[i].bytes_cur; + malloc_mutex_unlock(tsdn, &sec->shards[i].mtx); + } + stats->bytes += sum; +} + +void +sec_mutex_stats_read(tsdn_t *tsdn, sec_t *sec, + mutex_prof_data_t *mutex_prof_data) { + for (size_t i = 0; i < sec->opts.nshards; i++) { + malloc_mutex_lock(tsdn, &sec->shards[i].mtx); + malloc_mutex_prof_accum(tsdn, mutex_prof_data, + &sec->shards[i].mtx); + malloc_mutex_unlock(tsdn, &sec->shards[i].mtx); + } +} + +void +sec_prefork2(tsdn_t *tsdn, sec_t *sec) { + for (size_t i = 0; i < sec->opts.nshards; i++) { + malloc_mutex_prefork(tsdn, &sec->shards[i].mtx); + } +} + +void +sec_postfork_parent(tsdn_t *tsdn, sec_t *sec) { + for (size_t i = 0; i < sec->opts.nshards; i++) { + malloc_mutex_postfork_parent(tsdn, &sec->shards[i].mtx); + } +} + +void +sec_postfork_child(tsdn_t *tsdn, sec_t *sec) { + for (size_t i = 0; i < sec->opts.nshards; i++) { + malloc_mutex_postfork_child(tsdn, &sec->shards[i].mtx); + } +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/stats.c b/src/duckdb/extension/jemalloc/jemalloc/src/stats.c new file mode 100644 index 000000000..fbfacabf2 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/stats.c @@ -0,0 +1,2080 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/ctl.h" +#include "jemalloc/internal/emitter.h" +#include "jemalloc/internal/fxp.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/mutex_prof.h" +#include "jemalloc/internal/prof_stats.h" + +static const char *const global_mutex_names[mutex_prof_num_global_mutexes] = { +#define OP(mtx) #mtx, + MUTEX_PROF_GLOBAL_MUTEXES +#undef OP +}; + +static const char *const arena_mutex_names[mutex_prof_num_arena_mutexes] = { +#define OP(mtx) #mtx, + MUTEX_PROF_ARENA_MUTEXES +#undef OP +}; + +#define CTL_GET(n, v, t) do { \ + size_t sz = sizeof(t); \ + xmallctl(n, (void *)v, &sz, NULL, 0); \ +} while (0) + +#define CTL_LEAF_PREPARE(mib, miblen, name) do { \ + assert(miblen < CTL_MAX_DEPTH); \ + size_t miblen_new = CTL_MAX_DEPTH; \ + xmallctlmibnametomib(mib, miblen, name, &miblen_new); \ + assert(miblen_new > miblen); \ +} while (0) + +#define CTL_LEAF(mib, miblen, leaf, v, t) do { \ + assert(miblen < CTL_MAX_DEPTH); \ + size_t miblen_new = CTL_MAX_DEPTH; \ + size_t sz = sizeof(t); \ + xmallctlbymibname(mib, miblen, leaf, &miblen_new, (void *)v, \ + &sz, NULL, 0); \ + assert(miblen_new == miblen + 1); \ +} while (0) + +#define CTL_MIB_GET(n, i, v, t, ind) do { \ + size_t mib[CTL_MAX_DEPTH]; \ + size_t miblen = sizeof(mib) / sizeof(size_t); \ + size_t sz = sizeof(t); \ + xmallctlnametomib(n, mib, &miblen); \ + mib[(ind)] = (i); \ + xmallctlbymib(mib, miblen, (void *)v, &sz, NULL, 0); \ +} while (0) + +#define CTL_M1_GET(n, i, v, t) CTL_MIB_GET(n, i, v, t, 1) +#define CTL_M2_GET(n, i, v, t) CTL_MIB_GET(n, i, v, t, 2) + +/******************************************************************************/ +/* Data. */ + +bool opt_stats_print = false; +char opt_stats_print_opts[stats_print_tot_num_options+1] = ""; + +int64_t opt_stats_interval = STATS_INTERVAL_DEFAULT; +char opt_stats_interval_opts[stats_print_tot_num_options+1] = ""; + +static counter_accum_t stats_interval_accumulated; +/* Per thread batch accum size for stats_interval. 
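+ * The intent (a sketch, not a guarantee): each thread counts allocated
+ * bytes locally and merges into stats_interval_accumulated only once a
+ * full batch accumulates, so the shared counter is touched roughly
+ * once per batch rather than once per allocation.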
*/ +static uint64_t stats_interval_accum_batch; + +/******************************************************************************/ + +static uint64_t +rate_per_second(uint64_t value, uint64_t uptime_ns) { + uint64_t billion = 1000000000; + if (uptime_ns == 0 || value == 0) { + return 0; + } + if (uptime_ns < billion) { + return value; + } else { + uint64_t uptime_s = uptime_ns / billion; + return value / uptime_s; + } +} + +/* Calculate x.yyy and output a string (takes a fixed sized char array). */ +static bool +get_rate_str(uint64_t dividend, uint64_t divisor, char str[6]) { + if (divisor == 0 || dividend > divisor) { + /* The rate is not supposed to be greater than 1. */ + return true; + } + if (dividend > 0) { + assert(UINT64_MAX / dividend >= 1000); + } + + unsigned n = (unsigned)((dividend * 1000) / divisor); + if (n < 10) { + malloc_snprintf(str, 6, "0.00%u", n); + } else if (n < 100) { + malloc_snprintf(str, 6, "0.0%u", n); + } else if (n < 1000) { + malloc_snprintf(str, 6, "0.%u", n); + } else { + malloc_snprintf(str, 6, "1"); + } + + return false; +} + +static void +mutex_stats_init_cols(emitter_row_t *row, const char *table_name, + emitter_col_t *name, + emitter_col_t col_uint64_t[mutex_prof_num_uint64_t_counters], + emitter_col_t col_uint32_t[mutex_prof_num_uint32_t_counters]) { + mutex_prof_uint64_t_counter_ind_t k_uint64_t = 0; + mutex_prof_uint32_t_counter_ind_t k_uint32_t = 0; + + emitter_col_t *col; + + if (name != NULL) { + emitter_col_init(name, row); + name->justify = emitter_justify_left; + name->width = 21; + name->type = emitter_type_title; + name->str_val = table_name; + } + +#define WIDTH_uint32_t 12 +#define WIDTH_uint64_t 16 +#define OP(counter, counter_type, human, derived, base_counter) \ + col = &col_##counter_type[k_##counter_type]; \ + ++k_##counter_type; \ + emitter_col_init(col, row); \ + col->justify = emitter_justify_right; \ + col->width = derived ? 
8 : WIDTH_##counter_type; \ + col->type = emitter_type_title; \ + col->str_val = human; + MUTEX_PROF_COUNTERS +#undef OP +#undef WIDTH_uint32_t +#undef WIDTH_uint64_t + col_uint64_t[mutex_counter_total_wait_time_ps].width = 10; +} + +static void +mutex_stats_read_global(size_t mib[], size_t miblen, const char *name, + emitter_col_t *col_name, + emitter_col_t col_uint64_t[mutex_prof_num_uint64_t_counters], + emitter_col_t col_uint32_t[mutex_prof_num_uint32_t_counters], + uint64_t uptime) { + CTL_LEAF_PREPARE(mib, miblen, name); + size_t miblen_name = miblen + 1; + + col_name->str_val = name; + + emitter_col_t *dst; +#define EMITTER_TYPE_uint32_t emitter_type_uint32 +#define EMITTER_TYPE_uint64_t emitter_type_uint64 +#define OP(counter, counter_type, human, derived, base_counter) \ + dst = &col_##counter_type[mutex_counter_##counter]; \ + dst->type = EMITTER_TYPE_##counter_type; \ + if (!derived) { \ + CTL_LEAF(mib, miblen_name, #counter, \ + (counter_type *)&dst->bool_val, counter_type); \ + } else { \ + emitter_col_t *base = \ + &col_##counter_type[mutex_counter_##base_counter]; \ + dst->counter_type##_val = \ + (counter_type)rate_per_second( \ + base->counter_type##_val, uptime); \ + } + MUTEX_PROF_COUNTERS +#undef OP +#undef EMITTER_TYPE_uint32_t +#undef EMITTER_TYPE_uint64_t +} + +static void +mutex_stats_read_arena(size_t mib[], size_t miblen, const char *name, + emitter_col_t *col_name, + emitter_col_t col_uint64_t[mutex_prof_num_uint64_t_counters], + emitter_col_t col_uint32_t[mutex_prof_num_uint32_t_counters], + uint64_t uptime) { + CTL_LEAF_PREPARE(mib, miblen, name); + size_t miblen_name = miblen + 1; + + col_name->str_val = name; + + emitter_col_t *dst; +#define EMITTER_TYPE_uint32_t emitter_type_uint32 +#define EMITTER_TYPE_uint64_t emitter_type_uint64 +#define OP(counter, counter_type, human, derived, base_counter) \ + dst = &col_##counter_type[mutex_counter_##counter]; \ + dst->type = EMITTER_TYPE_##counter_type; \ + if (!derived) { \ + CTL_LEAF(mib, miblen_name, #counter, \ + (counter_type *)&dst->bool_val, counter_type); \ + } else { \ + emitter_col_t *base = \ + &col_##counter_type[mutex_counter_##base_counter]; \ + dst->counter_type##_val = \ + (counter_type)rate_per_second( \ + base->counter_type##_val, uptime); \ + } + MUTEX_PROF_COUNTERS +#undef OP +#undef EMITTER_TYPE_uint32_t +#undef EMITTER_TYPE_uint64_t +} + +static void +mutex_stats_read_arena_bin(size_t mib[], size_t miblen, + emitter_col_t col_uint64_t[mutex_prof_num_uint64_t_counters], + emitter_col_t col_uint32_t[mutex_prof_num_uint32_t_counters], + uint64_t uptime) { + CTL_LEAF_PREPARE(mib, miblen, "mutex"); + size_t miblen_mutex = miblen + 1; + + emitter_col_t *dst; + +#define EMITTER_TYPE_uint32_t emitter_type_uint32 +#define EMITTER_TYPE_uint64_t emitter_type_uint64 +#define OP(counter, counter_type, human, derived, base_counter) \ + dst = &col_##counter_type[mutex_counter_##counter]; \ + dst->type = EMITTER_TYPE_##counter_type; \ + if (!derived) { \ + CTL_LEAF(mib, miblen_mutex, #counter, \ + (counter_type *)&dst->bool_val, counter_type); \ + } else { \ + emitter_col_t *base = \ + &col_##counter_type[mutex_counter_##base_counter]; \ + dst->counter_type##_val = \ + (counter_type)rate_per_second( \ + base->counter_type##_val, uptime); \ + } + MUTEX_PROF_COUNTERS +#undef OP +#undef EMITTER_TYPE_uint32_t +#undef EMITTER_TYPE_uint64_t +} + +/* "row" can be NULL to avoid emitting in table mode. 
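+ * Note the asymmetry in the OP expansion below: JSON emission skips
+ * derived counters entirely, since rates such as the (#/sec) columns
+ * can be recomputed by any consumer from the raw counters and uptime,
+ * while the table row shows raw and derived columns side by side.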
*/ +static void +mutex_stats_emit(emitter_t *emitter, emitter_row_t *row, + emitter_col_t col_uint64_t[mutex_prof_num_uint64_t_counters], + emitter_col_t col_uint32_t[mutex_prof_num_uint32_t_counters]) { + if (row != NULL) { + emitter_table_row(emitter, row); + } + + mutex_prof_uint64_t_counter_ind_t k_uint64_t = 0; + mutex_prof_uint32_t_counter_ind_t k_uint32_t = 0; + + emitter_col_t *col; + +#define EMITTER_TYPE_uint32_t emitter_type_uint32 +#define EMITTER_TYPE_uint64_t emitter_type_uint64 +#define OP(counter, type, human, derived, base_counter) \ + if (!derived) { \ + col = &col_##type[k_##type]; \ + ++k_##type; \ + emitter_json_kv(emitter, #counter, EMITTER_TYPE_##type, \ + (const void *)&col->bool_val); \ + } + MUTEX_PROF_COUNTERS; +#undef OP +#undef EMITTER_TYPE_uint32_t +#undef EMITTER_TYPE_uint64_t +} + +#define COL_DECLARE(column_name) \ + emitter_col_t col_##column_name; + +#define COL_INIT(row_name, column_name, left_or_right, col_width, etype)\ + emitter_col_init(&col_##column_name, &row_name); \ + col_##column_name.justify = emitter_justify_##left_or_right; \ + col_##column_name.width = col_width; \ + col_##column_name.type = emitter_type_##etype; + +#define COL(row_name, column_name, left_or_right, col_width, etype) \ + COL_DECLARE(column_name); \ + COL_INIT(row_name, column_name, left_or_right, col_width, etype) + +#define COL_HDR_DECLARE(column_name) \ + COL_DECLARE(column_name); \ + emitter_col_t header_##column_name; + +#define COL_HDR_INIT(row_name, column_name, human, left_or_right, \ + col_width, etype) \ + COL_INIT(row_name, column_name, left_or_right, col_width, etype)\ + emitter_col_init(&header_##column_name, &header_##row_name); \ + header_##column_name.justify = emitter_justify_##left_or_right; \ + header_##column_name.width = col_width; \ + header_##column_name.type = emitter_type_title; \ + header_##column_name.str_val = human ? 
human : #column_name; + +#define COL_HDR(row_name, column_name, human, left_or_right, col_width, \ + etype) \ + COL_HDR_DECLARE(column_name) \ + COL_HDR_INIT(row_name, column_name, human, left_or_right, \ + col_width, etype) + +JEMALLOC_COLD +static void +stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i, + uint64_t uptime) { + size_t page; + bool in_gap, in_gap_prev; + unsigned nbins, j; + + CTL_GET("arenas.page", &page, size_t); + + CTL_GET("arenas.nbins", &nbins, unsigned); + + emitter_row_t header_row; + emitter_row_init(&header_row); + + emitter_row_t row; + emitter_row_init(&row); + + bool prof_stats_on = config_prof && opt_prof && opt_prof_stats + && i == MALLCTL_ARENAS_ALL; + + COL_HDR(row, size, NULL, right, 20, size) + COL_HDR(row, ind, NULL, right, 4, unsigned) + COL_HDR(row, allocated, NULL, right, 13, size) + COL_HDR(row, nmalloc, NULL, right, 13, uint64) + COL_HDR(row, nmalloc_ps, "(#/sec)", right, 8, uint64) + COL_HDR(row, ndalloc, NULL, right, 13, uint64) + COL_HDR(row, ndalloc_ps, "(#/sec)", right, 8, uint64) + COL_HDR(row, nrequests, NULL, right, 13, uint64) + COL_HDR(row, nrequests_ps, "(#/sec)", right, 10, uint64) + COL_HDR_DECLARE(prof_live_requested); + COL_HDR_DECLARE(prof_live_count); + COL_HDR_DECLARE(prof_accum_requested); + COL_HDR_DECLARE(prof_accum_count); + if (prof_stats_on) { + COL_HDR_INIT(row, prof_live_requested, NULL, right, 21, uint64) + COL_HDR_INIT(row, prof_live_count, NULL, right, 17, uint64) + COL_HDR_INIT(row, prof_accum_requested, NULL, right, 21, uint64) + COL_HDR_INIT(row, prof_accum_count, NULL, right, 17, uint64) + } + COL_HDR(row, nshards, NULL, right, 9, unsigned) + COL_HDR(row, curregs, NULL, right, 13, size) + COL_HDR(row, curslabs, NULL, right, 13, size) + COL_HDR(row, nonfull_slabs, NULL, right, 15, size) + COL_HDR(row, regs, NULL, right, 5, unsigned) + COL_HDR(row, pgs, NULL, right, 4, size) + /* To buffer a right- and left-justified column. */ + COL_HDR(row, justify_spacer, NULL, right, 1, title) + COL_HDR(row, util, NULL, right, 6, title) + COL_HDR(row, nfills, NULL, right, 13, uint64) + COL_HDR(row, nfills_ps, "(#/sec)", right, 8, uint64) + COL_HDR(row, nflushes, NULL, right, 13, uint64) + COL_HDR(row, nflushes_ps, "(#/sec)", right, 8, uint64) + COL_HDR(row, nslabs, NULL, right, 13, uint64) + COL_HDR(row, nreslabs, NULL, right, 13, uint64) + COL_HDR(row, nreslabs_ps, "(#/sec)", right, 8, uint64) + + COL_HDR(row, pops, NULL, right, 10, uint64) + COL_HDR(row, pops_ps, "(#/sec)", right, 8, uint64) + COL_HDR(row, failed_push, NULL, right, 13, uint64) + COL_HDR(row, failed_push_ps, "(#/sec)", right, 8, uint64) + COL_HDR(row, push, NULL, right, 7, uint64) + COL_HDR(row, push_ps, "(#/sec)", right, 8, uint64) + COL_HDR(row, push_elem, NULL, right, 12, uint64) + COL_HDR(row, push_elem_ps, "(#/sec)", right, 8, uint64) + + /* Don't want to actually print the name. */ + header_justify_spacer.str_val = " "; + col_justify_spacer.str_val = " "; + + emitter_col_t col_mutex64[mutex_prof_num_uint64_t_counters]; + emitter_col_t col_mutex32[mutex_prof_num_uint32_t_counters]; + + emitter_col_t header_mutex64[mutex_prof_num_uint64_t_counters]; + emitter_col_t header_mutex32[mutex_prof_num_uint32_t_counters]; + + if (mutex) { + mutex_stats_init_cols(&row, NULL, NULL, col_mutex64, + col_mutex32); + mutex_stats_init_cols(&header_row, NULL, NULL, header_mutex64, + header_mutex32); + } + + /* + * We print a "bins:" header as part of the table row; we need to adjust + * the header size column to compensate. 
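+ * ("bins:" is five characters, hence the adjustment of
+ * header_size.width by 5 below.)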
+ */ + header_size.width -=5; + emitter_table_printf(emitter, "bins:"); + emitter_table_row(emitter, &header_row); + emitter_json_array_kv_begin(emitter, "bins"); + + size_t stats_arenas_mib[CTL_MAX_DEPTH]; + CTL_LEAF_PREPARE(stats_arenas_mib, 0, "stats.arenas"); + stats_arenas_mib[2] = i; + CTL_LEAF_PREPARE(stats_arenas_mib, 3, "bins"); + + size_t arenas_bin_mib[CTL_MAX_DEPTH]; + CTL_LEAF_PREPARE(arenas_bin_mib, 0, "arenas.bin"); + + size_t prof_stats_mib[CTL_MAX_DEPTH]; + if (prof_stats_on) { + CTL_LEAF_PREPARE(prof_stats_mib, 0, "prof.stats.bins"); + } + + for (j = 0, in_gap = false; j < nbins; j++) { + uint64_t nslabs; + size_t reg_size, slab_size, curregs; + size_t curslabs; + size_t nonfull_slabs; + uint32_t nregs, nshards; + uint64_t nmalloc, ndalloc, nrequests, nfills, nflushes; + uint64_t nreslabs; + uint64_t batch_pops, batch_failed_pushes, batch_pushes, + batch_pushed_elems; + prof_stats_t prof_live; + prof_stats_t prof_accum; + + stats_arenas_mib[4] = j; + arenas_bin_mib[2] = j; + + CTL_LEAF(stats_arenas_mib, 5, "nslabs", &nslabs, uint64_t); + + if (prof_stats_on) { + prof_stats_mib[3] = j; + CTL_LEAF(prof_stats_mib, 4, "live", &prof_live, + prof_stats_t); + CTL_LEAF(prof_stats_mib, 4, "accum", &prof_accum, + prof_stats_t); + } + + in_gap_prev = in_gap; + if (prof_stats_on) { + in_gap = (nslabs == 0 && prof_accum.count == 0); + } else { + in_gap = (nslabs == 0); + } + + if (in_gap_prev && !in_gap) { + emitter_table_printf(emitter, + " ---\n"); + } + + if (in_gap && !emitter_outputs_json(emitter)) { + continue; + } + + CTL_LEAF(arenas_bin_mib, 3, "size", ®_size, size_t); + CTL_LEAF(arenas_bin_mib, 3, "nregs", &nregs, uint32_t); + CTL_LEAF(arenas_bin_mib, 3, "slab_size", &slab_size, size_t); + CTL_LEAF(arenas_bin_mib, 3, "nshards", &nshards, uint32_t); + CTL_LEAF(stats_arenas_mib, 5, "nmalloc", &nmalloc, uint64_t); + CTL_LEAF(stats_arenas_mib, 5, "ndalloc", &ndalloc, uint64_t); + CTL_LEAF(stats_arenas_mib, 5, "curregs", &curregs, size_t); + CTL_LEAF(stats_arenas_mib, 5, "nrequests", &nrequests, + uint64_t); + CTL_LEAF(stats_arenas_mib, 5, "nfills", &nfills, uint64_t); + CTL_LEAF(stats_arenas_mib, 5, "nflushes", &nflushes, uint64_t); + CTL_LEAF(stats_arenas_mib, 5, "nreslabs", &nreslabs, uint64_t); + CTL_LEAF(stats_arenas_mib, 5, "curslabs", &curslabs, size_t); + CTL_LEAF(stats_arenas_mib, 5, "nonfull_slabs", &nonfull_slabs, + size_t); + + CTL_LEAF(stats_arenas_mib, 5, "batch_pops", &batch_pops, + uint64_t); + CTL_LEAF(stats_arenas_mib, 5, "batch_failed_pushes", + &batch_failed_pushes, uint64_t); + CTL_LEAF(stats_arenas_mib, 5, "batch_pushes", + &batch_pushes, uint64_t); + CTL_LEAF(stats_arenas_mib, 5, "batch_pushed_elems", + &batch_pushed_elems, uint64_t); + + if (mutex) { + mutex_stats_read_arena_bin(stats_arenas_mib, 5, + col_mutex64, col_mutex32, uptime); + } + + emitter_json_object_begin(emitter); + emitter_json_kv(emitter, "nmalloc", emitter_type_uint64, + &nmalloc); + emitter_json_kv(emitter, "ndalloc", emitter_type_uint64, + &ndalloc); + emitter_json_kv(emitter, "curregs", emitter_type_size, + &curregs); + emitter_json_kv(emitter, "nrequests", emitter_type_uint64, + &nrequests); + if (prof_stats_on) { + emitter_json_kv(emitter, "prof_live_requested", + emitter_type_uint64, &prof_live.req_sum); + emitter_json_kv(emitter, "prof_live_count", + emitter_type_uint64, &prof_live.count); + emitter_json_kv(emitter, "prof_accum_requested", + emitter_type_uint64, &prof_accum.req_sum); + emitter_json_kv(emitter, "prof_accum_count", + emitter_type_uint64, &prof_accum.count); + } + 
emitter_json_kv(emitter, "nfills", emitter_type_uint64, + &nfills); + emitter_json_kv(emitter, "nflushes", emitter_type_uint64, + &nflushes); + emitter_json_kv(emitter, "nreslabs", emitter_type_uint64, + &nreslabs); + emitter_json_kv(emitter, "curslabs", emitter_type_size, + &curslabs); + emitter_json_kv(emitter, "nonfull_slabs", emitter_type_size, + &nonfull_slabs); + emitter_json_kv(emitter, "batch_pops", + emitter_type_uint64, &batch_pops); + emitter_json_kv(emitter, "batch_failed_pushes", + emitter_type_uint64, &batch_failed_pushes); + emitter_json_kv(emitter, "batch_pushes", + emitter_type_uint64, &batch_pushes); + emitter_json_kv(emitter, "batch_pushed_elems", + emitter_type_uint64, &batch_pushed_elems); + if (mutex) { + emitter_json_object_kv_begin(emitter, "mutex"); + mutex_stats_emit(emitter, NULL, col_mutex64, + col_mutex32); + emitter_json_object_end(emitter); + } + emitter_json_object_end(emitter); + + size_t availregs = nregs * curslabs; + char util[6]; + if (get_rate_str((uint64_t)curregs, (uint64_t)availregs, util)) + { + if (availregs == 0) { + malloc_snprintf(util, sizeof(util), "1"); + } else if (curregs > availregs) { + /* + * Race detected: the counters were read in + * separate mallctl calls and concurrent + * operations happened in between. In this case + * no meaningful utilization can be computed. + */ + malloc_snprintf(util, sizeof(util), " race"); + } else { + not_reached(); + } + } + + col_size.size_val = reg_size; + col_ind.unsigned_val = j; + col_allocated.size_val = curregs * reg_size; + col_nmalloc.uint64_val = nmalloc; + col_nmalloc_ps.uint64_val = rate_per_second(nmalloc, uptime); + col_ndalloc.uint64_val = ndalloc; + col_ndalloc_ps.uint64_val = rate_per_second(ndalloc, uptime); + col_nrequests.uint64_val = nrequests; + col_nrequests_ps.uint64_val = rate_per_second(nrequests, uptime); + if (prof_stats_on) { + col_prof_live_requested.uint64_val = prof_live.req_sum; + col_prof_live_count.uint64_val = prof_live.count; + col_prof_accum_requested.uint64_val = + prof_accum.req_sum; + col_prof_accum_count.uint64_val = prof_accum.count; + } + col_nshards.unsigned_val = nshards; + col_curregs.size_val = curregs; + col_curslabs.size_val = curslabs; + col_nonfull_slabs.size_val = nonfull_slabs; + col_regs.unsigned_val = nregs; + col_pgs.size_val = slab_size / page; + col_util.str_val = util; + col_nfills.uint64_val = nfills; + col_nfills_ps.uint64_val = rate_per_second(nfills, uptime); + col_nflushes.uint64_val = nflushes; + col_nflushes_ps.uint64_val = rate_per_second(nflushes, uptime); + col_nslabs.uint64_val = nslabs; + col_nreslabs.uint64_val = nreslabs; + col_nreslabs_ps.uint64_val = rate_per_second(nreslabs, uptime); + + col_pops.uint64_val = batch_pops; + col_pops_ps.uint64_val + = rate_per_second(batch_pops, uptime); + + col_failed_push.uint64_val = batch_failed_pushes; + col_failed_push_ps.uint64_val + = rate_per_second(batch_failed_pushes, uptime); + col_push.uint64_val = batch_pushes; + col_push_ps.uint64_val + = rate_per_second(batch_pushes, uptime); + + col_push_elem.uint64_val = batch_pushed_elems; + col_push_elem_ps.uint64_val + = rate_per_second(batch_pushed_elems, uptime); + + /* + * Note that mutex columns were initialized above, if mutex == + * true. + */ + + emitter_table_row(emitter, &row); + } + emitter_json_array_end(emitter); /* Close "bins". 
*/ + + if (in_gap) { + emitter_table_printf(emitter, " ---\n"); + } +} + +JEMALLOC_COLD +static void +stats_arena_lextents_print(emitter_t *emitter, unsigned i, uint64_t uptime) { + unsigned nbins, nlextents, j; + bool in_gap, in_gap_prev; + + CTL_GET("arenas.nbins", &nbins, unsigned); + CTL_GET("arenas.nlextents", &nlextents, unsigned); + + emitter_row_t header_row; + emitter_row_init(&header_row); + emitter_row_t row; + emitter_row_init(&row); + + bool prof_stats_on = config_prof && opt_prof && opt_prof_stats + && i == MALLCTL_ARENAS_ALL; + + COL_HDR(row, size, NULL, right, 20, size) + COL_HDR(row, ind, NULL, right, 4, unsigned) + COL_HDR(row, allocated, NULL, right, 13, size) + COL_HDR(row, nmalloc, NULL, right, 13, uint64) + COL_HDR(row, nmalloc_ps, "(#/sec)", right, 8, uint64) + COL_HDR(row, ndalloc, NULL, right, 13, uint64) + COL_HDR(row, ndalloc_ps, "(#/sec)", right, 8, uint64) + COL_HDR(row, nrequests, NULL, right, 13, uint64) + COL_HDR(row, nrequests_ps, "(#/sec)", right, 8, uint64) + COL_HDR_DECLARE(prof_live_requested) + COL_HDR_DECLARE(prof_live_count) + COL_HDR_DECLARE(prof_accum_requested) + COL_HDR_DECLARE(prof_accum_count) + if (prof_stats_on) { + COL_HDR_INIT(row, prof_live_requested, NULL, right, 21, uint64) + COL_HDR_INIT(row, prof_live_count, NULL, right, 17, uint64) + COL_HDR_INIT(row, prof_accum_requested, NULL, right, 21, uint64) + COL_HDR_INIT(row, prof_accum_count, NULL, right, 17, uint64) + } + COL_HDR(row, curlextents, NULL, right, 13, size) + + /* As with bins, we label the large extents table. */ + header_size.width -= 6; + emitter_table_printf(emitter, "large:"); + emitter_table_row(emitter, &header_row); + emitter_json_array_kv_begin(emitter, "lextents"); + + size_t stats_arenas_mib[CTL_MAX_DEPTH]; + CTL_LEAF_PREPARE(stats_arenas_mib, 0, "stats.arenas"); + stats_arenas_mib[2] = i; + CTL_LEAF_PREPARE(stats_arenas_mib, 3, "lextents"); + + size_t arenas_lextent_mib[CTL_MAX_DEPTH]; + CTL_LEAF_PREPARE(arenas_lextent_mib, 0, "arenas.lextent"); + + size_t prof_stats_mib[CTL_MAX_DEPTH]; + if (prof_stats_on) { + CTL_LEAF_PREPARE(prof_stats_mib, 0, "prof.stats.lextents"); + } + + for (j = 0, in_gap = false; j < nlextents; j++) { + uint64_t nmalloc, ndalloc, nrequests; + size_t lextent_size, curlextents; + prof_stats_t prof_live; + prof_stats_t prof_accum; + + stats_arenas_mib[4] = j; + arenas_lextent_mib[2] = j; + + CTL_LEAF(stats_arenas_mib, 5, "nmalloc", &nmalloc, uint64_t); + CTL_LEAF(stats_arenas_mib, 5, "ndalloc", &ndalloc, uint64_t); + CTL_LEAF(stats_arenas_mib, 5, "nrequests", &nrequests, + uint64_t); + + in_gap_prev = in_gap; + in_gap = (nrequests == 0); + + if (in_gap_prev && !in_gap) { + emitter_table_printf(emitter, + " ---\n"); + } + + CTL_LEAF(arenas_lextent_mib, 3, "size", &lextent_size, size_t); + CTL_LEAF(stats_arenas_mib, 5, "curlextents", &curlextents, + size_t); + + if (prof_stats_on) { + prof_stats_mib[3] = j; + CTL_LEAF(prof_stats_mib, 4, "live", &prof_live, + prof_stats_t); + CTL_LEAF(prof_stats_mib, 4, "accum", &prof_accum, + prof_stats_t); + } + + emitter_json_object_begin(emitter); + if (prof_stats_on) { + emitter_json_kv(emitter, "prof_live_requested", + emitter_type_uint64, &prof_live.req_sum); + emitter_json_kv(emitter, "prof_live_count", + emitter_type_uint64, &prof_live.count); + emitter_json_kv(emitter, "prof_accum_requested", + emitter_type_uint64, &prof_accum.req_sum); + emitter_json_kv(emitter, "prof_accum_count", + emitter_type_uint64, &prof_accum.count); + } + emitter_json_kv(emitter, "curlextents", emitter_type_size, + 
&curlextents); + emitter_json_object_end(emitter); + + col_size.size_val = lextent_size; + col_ind.unsigned_val = nbins + j; + col_allocated.size_val = curlextents * lextent_size; + col_nmalloc.uint64_val = nmalloc; + col_nmalloc_ps.uint64_val = rate_per_second(nmalloc, uptime); + col_ndalloc.uint64_val = ndalloc; + col_ndalloc_ps.uint64_val = rate_per_second(ndalloc, uptime); + col_nrequests.uint64_val = nrequests; + col_nrequests_ps.uint64_val = rate_per_second(nrequests, uptime); + if (prof_stats_on) { + col_prof_live_requested.uint64_val = prof_live.req_sum; + col_prof_live_count.uint64_val = prof_live.count; + col_prof_accum_requested.uint64_val = + prof_accum.req_sum; + col_prof_accum_count.uint64_val = prof_accum.count; + } + col_curlextents.size_val = curlextents; + + if (!in_gap) { + emitter_table_row(emitter, &row); + } + } + emitter_json_array_end(emitter); /* Close "lextents". */ + if (in_gap) { + emitter_table_printf(emitter, " ---\n"); + } +} + +JEMALLOC_COLD +static void +stats_arena_extents_print(emitter_t *emitter, unsigned i) { + unsigned j; + bool in_gap, in_gap_prev; + emitter_row_t header_row; + emitter_row_init(&header_row); + emitter_row_t row; + emitter_row_init(&row); + + COL_HDR(row, size, NULL, right, 20, size) + COL_HDR(row, ind, NULL, right, 4, unsigned) + COL_HDR(row, ndirty, NULL, right, 13, size) + COL_HDR(row, dirty, NULL, right, 13, size) + COL_HDR(row, nmuzzy, NULL, right, 13, size) + COL_HDR(row, muzzy, NULL, right, 13, size) + COL_HDR(row, nretained, NULL, right, 13, size) + COL_HDR(row, retained, NULL, right, 13, size) + COL_HDR(row, ntotal, NULL, right, 13, size) + COL_HDR(row, total, NULL, right, 13, size) + + /* Label this section. */ + header_size.width -= 8; + emitter_table_printf(emitter, "extents:"); + emitter_table_row(emitter, &header_row); + emitter_json_array_kv_begin(emitter, "extents"); + + size_t stats_arenas_mib[CTL_MAX_DEPTH]; + CTL_LEAF_PREPARE(stats_arenas_mib, 0, "stats.arenas"); + stats_arenas_mib[2] = i; + CTL_LEAF_PREPARE(stats_arenas_mib, 3, "extents"); + + in_gap = false; + for (j = 0; j < SC_NPSIZES; j++) { + size_t ndirty, nmuzzy, nretained, total, dirty_bytes, + muzzy_bytes, retained_bytes, total_bytes; + stats_arenas_mib[4] = j; + + CTL_LEAF(stats_arenas_mib, 5, "ndirty", &ndirty, size_t); + CTL_LEAF(stats_arenas_mib, 5, "nmuzzy", &nmuzzy, size_t); + CTL_LEAF(stats_arenas_mib, 5, "nretained", &nretained, size_t); + CTL_LEAF(stats_arenas_mib, 5, "dirty_bytes", &dirty_bytes, + size_t); + CTL_LEAF(stats_arenas_mib, 5, "muzzy_bytes", &muzzy_bytes, + size_t); + CTL_LEAF(stats_arenas_mib, 5, "retained_bytes", + &retained_bytes, size_t); + + total = ndirty + nmuzzy + nretained; + total_bytes = dirty_bytes + muzzy_bytes + retained_bytes; + + in_gap_prev = in_gap; + in_gap = (total == 0); + + if (in_gap_prev && !in_gap) { + emitter_table_printf(emitter, + " ---\n"); + } + + emitter_json_object_begin(emitter); + emitter_json_kv(emitter, "ndirty", emitter_type_size, &ndirty); + emitter_json_kv(emitter, "nmuzzy", emitter_type_size, &nmuzzy); + emitter_json_kv(emitter, "nretained", emitter_type_size, + &nretained); + + emitter_json_kv(emitter, "dirty_bytes", emitter_type_size, + &dirty_bytes); + emitter_json_kv(emitter, "muzzy_bytes", emitter_type_size, + &muzzy_bytes); + emitter_json_kv(emitter, "retained_bytes", emitter_type_size, + &retained_bytes); + emitter_json_object_end(emitter); + + col_size.size_val = sz_pind2sz(j); + col_ind.size_val = j; + col_ndirty.size_val = ndirty; + col_dirty.size_val = dirty_bytes; + col_nmuzzy.size_val 
= nmuzzy; + col_muzzy.size_val = muzzy_bytes; + col_nretained.size_val = nretained; + col_retained.size_val = retained_bytes; + col_ntotal.size_val = total; + col_total.size_val = total_bytes; + + if (!in_gap) { + emitter_table_row(emitter, &row); + } + } + emitter_json_array_end(emitter); /* Close "extents". */ + if (in_gap) { + emitter_table_printf(emitter, " ---\n"); + } +} + +static void +stats_arena_hpa_shard_print(emitter_t *emitter, unsigned i, uint64_t uptime) { + emitter_row_t header_row; + emitter_row_init(&header_row); + emitter_row_t row; + emitter_row_init(&row); + + uint64_t npurge_passes; + uint64_t npurges; + uint64_t nhugifies; + uint64_t ndehugifies; + + CTL_M2_GET("stats.arenas.0.hpa_shard.npurge_passes", + i, &npurge_passes, uint64_t); + CTL_M2_GET("stats.arenas.0.hpa_shard.npurges", + i, &npurges, uint64_t); + CTL_M2_GET("stats.arenas.0.hpa_shard.nhugifies", + i, &nhugifies, uint64_t); + CTL_M2_GET("stats.arenas.0.hpa_shard.ndehugifies", + i, &ndehugifies, uint64_t); + + size_t npageslabs_huge; + size_t nactive_huge; + size_t ndirty_huge; + + size_t npageslabs_nonhuge; + size_t nactive_nonhuge; + size_t ndirty_nonhuge; + size_t nretained_nonhuge; + + size_t sec_bytes; + CTL_M2_GET("stats.arenas.0.hpa_sec_bytes", i, &sec_bytes, size_t); + emitter_kv(emitter, "sec_bytes", "Bytes in small extent cache", + emitter_type_size, &sec_bytes); + + /* First, global stats. */ + emitter_table_printf(emitter, + "HPA shard stats:\n" + " Purge passes: %" FMTu64 " (%" FMTu64 " / sec)\n" + " Purges: %" FMTu64 " (%" FMTu64 " / sec)\n" + " Hugeifies: %" FMTu64 " (%" FMTu64 " / sec)\n" + " Dehugifies: %" FMTu64 " (%" FMTu64 " / sec)\n" + "\n", + npurge_passes, rate_per_second(npurge_passes, uptime), + npurges, rate_per_second(npurges, uptime), + nhugifies, rate_per_second(nhugifies, uptime), + ndehugifies, rate_per_second(ndehugifies, uptime)); + + emitter_json_object_kv_begin(emitter, "hpa_shard"); + emitter_json_kv(emitter, "npurge_passes", emitter_type_uint64, + &npurge_passes); + emitter_json_kv(emitter, "npurges", emitter_type_uint64, + &npurges); + emitter_json_kv(emitter, "nhugifies", emitter_type_uint64, + &nhugifies); + emitter_json_kv(emitter, "ndehugifies", emitter_type_uint64, + &ndehugifies); + + /* Next, full slab stats. 
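+ * Nonhuge nretained is not reported directly; it is derived below (and
+ * again for the empty and nonfull tables) from the identity
+ *
+ *   npageslabs * HUGEPAGE_PAGES == nactive + ndirty + nretained
+ *
+ * i.e. every page of a nonhuge pageslab is either active, dirty, or
+ * retained.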
+ */
+ CTL_M2_GET("stats.arenas.0.hpa_shard.full_slabs.npageslabs_huge",
+     i, &npageslabs_huge, size_t);
+ CTL_M2_GET("stats.arenas.0.hpa_shard.full_slabs.nactive_huge",
+     i, &nactive_huge, size_t);
+ CTL_M2_GET("stats.arenas.0.hpa_shard.full_slabs.ndirty_huge",
+     i, &ndirty_huge, size_t);
+
+ CTL_M2_GET("stats.arenas.0.hpa_shard.full_slabs.npageslabs_nonhuge",
+     i, &npageslabs_nonhuge, size_t);
+ CTL_M2_GET("stats.arenas.0.hpa_shard.full_slabs.nactive_nonhuge",
+     i, &nactive_nonhuge, size_t);
+ CTL_M2_GET("stats.arenas.0.hpa_shard.full_slabs.ndirty_nonhuge",
+     i, &ndirty_nonhuge, size_t);
+ nretained_nonhuge = npageslabs_nonhuge * HUGEPAGE_PAGES
+     - nactive_nonhuge - ndirty_nonhuge;
+
+ emitter_table_printf(emitter,
+     " In full slabs:\n"
+     " npageslabs: %zu huge, %zu nonhuge\n"
+     " nactive: %zu huge, %zu nonhuge \n"
+     " ndirty: %zu huge, %zu nonhuge \n"
+     " nretained: 0 huge, %zu nonhuge \n",
+     npageslabs_huge, npageslabs_nonhuge,
+     nactive_huge, nactive_nonhuge,
+     ndirty_huge, ndirty_nonhuge,
+     nretained_nonhuge);
+
+ emitter_json_object_kv_begin(emitter, "full_slabs");
+ emitter_json_kv(emitter, "npageslabs_huge", emitter_type_size,
+     &npageslabs_huge);
+ emitter_json_kv(emitter, "nactive_huge", emitter_type_size,
+     &nactive_huge);
+ emitter_json_kv(emitter, "ndirty_huge", emitter_type_size,
+     &ndirty_huge);
+ emitter_json_kv(emitter, "npageslabs_nonhuge", emitter_type_size,
+     &npageslabs_nonhuge);
+ emitter_json_kv(emitter, "nactive_nonhuge", emitter_type_size,
+     &nactive_nonhuge);
+ emitter_json_kv(emitter, "ndirty_nonhuge", emitter_type_size,
+     &ndirty_nonhuge);
+ emitter_json_object_end(emitter); /* End "full_slabs" */
+
+ /* Next, empty slab stats. */
+ CTL_M2_GET("stats.arenas.0.hpa_shard.empty_slabs.npageslabs_huge",
+     i, &npageslabs_huge, size_t);
+ CTL_M2_GET("stats.arenas.0.hpa_shard.empty_slabs.nactive_huge",
+     i, &nactive_huge, size_t);
+ CTL_M2_GET("stats.arenas.0.hpa_shard.empty_slabs.ndirty_huge",
+     i, &ndirty_huge, size_t);
+
+ CTL_M2_GET("stats.arenas.0.hpa_shard.empty_slabs.npageslabs_nonhuge",
+     i, &npageslabs_nonhuge, size_t);
+ CTL_M2_GET("stats.arenas.0.hpa_shard.empty_slabs.nactive_nonhuge",
+     i, &nactive_nonhuge, size_t);
+ CTL_M2_GET("stats.arenas.0.hpa_shard.empty_slabs.ndirty_nonhuge",
+     i, &ndirty_nonhuge, size_t);
+ nretained_nonhuge = npageslabs_nonhuge * HUGEPAGE_PAGES
+     - nactive_nonhuge - ndirty_nonhuge;
+
+ emitter_table_printf(emitter,
+     " In empty slabs:\n"
+     " npageslabs: %zu huge, %zu nonhuge\n"
+     " nactive: %zu huge, %zu nonhuge \n"
+     " ndirty: %zu huge, %zu nonhuge \n"
+     " nretained: 0 huge, %zu nonhuge \n",
+     npageslabs_huge, npageslabs_nonhuge,
+     nactive_huge, nactive_nonhuge,
+     ndirty_huge, ndirty_nonhuge,
+     nretained_nonhuge);
+
+ emitter_json_object_kv_begin(emitter, "empty_slabs");
+ emitter_json_kv(emitter, "npageslabs_huge", emitter_type_size,
+     &npageslabs_huge);
+ emitter_json_kv(emitter, "nactive_huge", emitter_type_size,
+     &nactive_huge);
+ emitter_json_kv(emitter, "ndirty_huge", emitter_type_size,
+     &ndirty_huge);
+ emitter_json_kv(emitter, "npageslabs_nonhuge", emitter_type_size,
+     &npageslabs_nonhuge);
+ emitter_json_kv(emitter, "nactive_nonhuge", emitter_type_size,
+     &nactive_nonhuge);
+ emitter_json_kv(emitter, "ndirty_nonhuge", emitter_type_size,
+     &ndirty_nonhuge);
+ emitter_json_object_end(emitter); /* End "empty_slabs" */
+
+ /* Last, nonfull slab stats.
*/ + COL_HDR(row, size, NULL, right, 20, size) + COL_HDR(row, ind, NULL, right, 4, unsigned) + COL_HDR(row, npageslabs_huge, NULL, right, 16, size) + COL_HDR(row, nactive_huge, NULL, right, 16, size) + COL_HDR(row, ndirty_huge, NULL, right, 16, size) + COL_HDR(row, npageslabs_nonhuge, NULL, right, 20, size) + COL_HDR(row, nactive_nonhuge, NULL, right, 20, size) + COL_HDR(row, ndirty_nonhuge, NULL, right, 20, size) + COL_HDR(row, nretained_nonhuge, NULL, right, 20, size) + + size_t stats_arenas_mib[CTL_MAX_DEPTH]; + CTL_LEAF_PREPARE(stats_arenas_mib, 0, "stats.arenas"); + stats_arenas_mib[2] = i; + CTL_LEAF_PREPARE(stats_arenas_mib, 3, "hpa_shard.nonfull_slabs"); + + emitter_table_printf(emitter, " In nonfull slabs:\n"); + emitter_table_row(emitter, &header_row); + emitter_json_array_kv_begin(emitter, "nonfull_slabs"); + bool in_gap = false; + for (pszind_t j = 0; j < PSSET_NPSIZES && j < SC_NPSIZES; j++) { + stats_arenas_mib[5] = j; + + CTL_LEAF(stats_arenas_mib, 6, "npageslabs_huge", + &npageslabs_huge, size_t); + CTL_LEAF(stats_arenas_mib, 6, "nactive_huge", + &nactive_huge, size_t); + CTL_LEAF(stats_arenas_mib, 6, "ndirty_huge", + &ndirty_huge, size_t); + + CTL_LEAF(stats_arenas_mib, 6, "npageslabs_nonhuge", + &npageslabs_nonhuge, size_t); + CTL_LEAF(stats_arenas_mib, 6, "nactive_nonhuge", + &nactive_nonhuge, size_t); + CTL_LEAF(stats_arenas_mib, 6, "ndirty_nonhuge", + &ndirty_nonhuge, size_t); + nretained_nonhuge = npageslabs_nonhuge * HUGEPAGE_PAGES + - nactive_nonhuge - ndirty_nonhuge; + + bool in_gap_prev = in_gap; + in_gap = (npageslabs_huge == 0 && npageslabs_nonhuge == 0); + if (in_gap_prev && !in_gap) { + emitter_table_printf(emitter, + " ---\n"); + } + + col_size.size_val = sz_pind2sz(j); + col_ind.size_val = j; + col_npageslabs_huge.size_val = npageslabs_huge; + col_nactive_huge.size_val = nactive_huge; + col_ndirty_huge.size_val = ndirty_huge; + col_npageslabs_nonhuge.size_val = npageslabs_nonhuge; + col_nactive_nonhuge.size_val = nactive_nonhuge; + col_ndirty_nonhuge.size_val = ndirty_nonhuge; + col_nretained_nonhuge.size_val = nretained_nonhuge; + if (!in_gap) { + emitter_table_row(emitter, &row); + } + + emitter_json_object_begin(emitter); + emitter_json_kv(emitter, "npageslabs_huge", emitter_type_size, + &npageslabs_huge); + emitter_json_kv(emitter, "nactive_huge", emitter_type_size, + &nactive_huge); + emitter_json_kv(emitter, "ndirty_huge", emitter_type_size, + &ndirty_huge); + emitter_json_kv(emitter, "npageslabs_nonhuge", emitter_type_size, + &npageslabs_nonhuge); + emitter_json_kv(emitter, "nactive_nonhuge", emitter_type_size, + &nactive_nonhuge); + emitter_json_kv(emitter, "ndirty_nonhuge", emitter_type_size, + &ndirty_nonhuge); + emitter_json_object_end(emitter); + } + emitter_json_array_end(emitter); /* End "nonfull_slabs" */ + emitter_json_object_end(emitter); /* End "hpa_shard" */ + if (in_gap) { + emitter_table_printf(emitter, " ---\n"); + } +} + +static void +stats_arena_mutexes_print(emitter_t *emitter, unsigned arena_ind, uint64_t uptime) { + emitter_row_t row; + emitter_col_t col_name; + emitter_col_t col64[mutex_prof_num_uint64_t_counters]; + emitter_col_t col32[mutex_prof_num_uint32_t_counters]; + + emitter_row_init(&row); + mutex_stats_init_cols(&row, "", &col_name, col64, col32); + + emitter_json_object_kv_begin(emitter, "mutexes"); + emitter_table_row(emitter, &row); + + size_t stats_arenas_mib[CTL_MAX_DEPTH]; + CTL_LEAF_PREPARE(stats_arenas_mib, 0, "stats.arenas"); + stats_arenas_mib[2] = arena_ind; + CTL_LEAF_PREPARE(stats_arenas_mib, 3, "mutexes"); 
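+
+ /*
+  * At this point stats_arenas_mib holds the partial path
+  * "stats.arenas.<arena_ind>.mutexes"; mutex_stats_read_arena()
+  * appends the mutex name, and CTL_LEAF then appends the counter
+  * leaf, yielding e.g. "stats.arenas.0.mutexes.large.num_ops"
+  * (index and counter chosen for illustration).
+  */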
+ + for (mutex_prof_arena_ind_t i = 0; i < mutex_prof_num_arena_mutexes; + i++) { + const char *name = arena_mutex_names[i]; + emitter_json_object_kv_begin(emitter, name); + mutex_stats_read_arena(stats_arenas_mib, 4, name, &col_name, + col64, col32, uptime); + mutex_stats_emit(emitter, &row, col64, col32); + emitter_json_object_end(emitter); /* Close the mutex dict. */ + } + emitter_json_object_end(emitter); /* End "mutexes". */ +} + +JEMALLOC_COLD +static void +stats_arena_print(emitter_t *emitter, unsigned i, bool bins, bool large, + bool mutex, bool extents, bool hpa) { + char name[ARENA_NAME_LEN]; + char *namep = name; + unsigned nthreads; + const char *dss; + ssize_t dirty_decay_ms, muzzy_decay_ms; + size_t page, pactive, pdirty, pmuzzy, mapped, retained; + size_t base, internal, resident, metadata_edata, metadata_rtree, + metadata_thp, extent_avail; + uint64_t dirty_npurge, dirty_nmadvise, dirty_purged; + uint64_t muzzy_npurge, muzzy_nmadvise, muzzy_purged; + size_t small_allocated; + uint64_t small_nmalloc, small_ndalloc, small_nrequests, small_nfills, + small_nflushes; + size_t large_allocated; + uint64_t large_nmalloc, large_ndalloc, large_nrequests, large_nfills, + large_nflushes; + size_t tcache_bytes, tcache_stashed_bytes, abandoned_vm; + uint64_t uptime; + + CTL_GET("arenas.page", &page, size_t); + if (i != MALLCTL_ARENAS_ALL && i != MALLCTL_ARENAS_DESTROYED) { + CTL_M1_GET("arena.0.name", i, (void *)&namep, const char *); + emitter_kv(emitter, "name", "name", emitter_type_string, &namep); + } + + CTL_M2_GET("stats.arenas.0.nthreads", i, &nthreads, unsigned); + emitter_kv(emitter, "nthreads", "assigned threads", + emitter_type_unsigned, &nthreads); + + CTL_M2_GET("stats.arenas.0.uptime", i, &uptime, uint64_t); + emitter_kv(emitter, "uptime_ns", "uptime", emitter_type_uint64, + &uptime); + + CTL_M2_GET("stats.arenas.0.dss", i, &dss, const char *); + emitter_kv(emitter, "dss", "dss allocation precedence", + emitter_type_string, &dss); + + CTL_M2_GET("stats.arenas.0.dirty_decay_ms", i, &dirty_decay_ms, + ssize_t); + CTL_M2_GET("stats.arenas.0.muzzy_decay_ms", i, &muzzy_decay_ms, + ssize_t); + CTL_M2_GET("stats.arenas.0.pactive", i, &pactive, size_t); + CTL_M2_GET("stats.arenas.0.pdirty", i, &pdirty, size_t); + CTL_M2_GET("stats.arenas.0.pmuzzy", i, &pmuzzy, size_t); + CTL_M2_GET("stats.arenas.0.dirty_npurge", i, &dirty_npurge, uint64_t); + CTL_M2_GET("stats.arenas.0.dirty_nmadvise", i, &dirty_nmadvise, + uint64_t); + CTL_M2_GET("stats.arenas.0.dirty_purged", i, &dirty_purged, uint64_t); + CTL_M2_GET("stats.arenas.0.muzzy_npurge", i, &muzzy_npurge, uint64_t); + CTL_M2_GET("stats.arenas.0.muzzy_nmadvise", i, &muzzy_nmadvise, + uint64_t); + CTL_M2_GET("stats.arenas.0.muzzy_purged", i, &muzzy_purged, uint64_t); + + emitter_row_t decay_row; + emitter_row_init(&decay_row); + + /* JSON-style emission. 
*/ + emitter_json_kv(emitter, "dirty_decay_ms", emitter_type_ssize, + &dirty_decay_ms); + emitter_json_kv(emitter, "muzzy_decay_ms", emitter_type_ssize, + &muzzy_decay_ms); + + emitter_json_kv(emitter, "pactive", emitter_type_size, &pactive); + emitter_json_kv(emitter, "pdirty", emitter_type_size, &pdirty); + emitter_json_kv(emitter, "pmuzzy", emitter_type_size, &pmuzzy); + + emitter_json_kv(emitter, "dirty_npurge", emitter_type_uint64, + &dirty_npurge); + emitter_json_kv(emitter, "dirty_nmadvise", emitter_type_uint64, + &dirty_nmadvise); + emitter_json_kv(emitter, "dirty_purged", emitter_type_uint64, + &dirty_purged); + + emitter_json_kv(emitter, "muzzy_npurge", emitter_type_uint64, + &muzzy_npurge); + emitter_json_kv(emitter, "muzzy_nmadvise", emitter_type_uint64, + &muzzy_nmadvise); + emitter_json_kv(emitter, "muzzy_purged", emitter_type_uint64, + &muzzy_purged); + + /* Table-style emission. */ + COL(decay_row, decay_type, right, 9, title); + col_decay_type.str_val = "decaying:"; + + COL(decay_row, decay_time, right, 6, title); + col_decay_time.str_val = "time"; + + COL(decay_row, decay_npages, right, 13, title); + col_decay_npages.str_val = "npages"; + + COL(decay_row, decay_sweeps, right, 13, title); + col_decay_sweeps.str_val = "sweeps"; + + COL(decay_row, decay_madvises, right, 13, title); + col_decay_madvises.str_val = "madvises"; + + COL(decay_row, decay_purged, right, 13, title); + col_decay_purged.str_val = "purged"; + + /* Title row. */ + emitter_table_row(emitter, &decay_row); + + /* Dirty row. */ + col_decay_type.str_val = "dirty:"; + + if (dirty_decay_ms >= 0) { + col_decay_time.type = emitter_type_ssize; + col_decay_time.ssize_val = dirty_decay_ms; + } else { + col_decay_time.type = emitter_type_title; + col_decay_time.str_val = "N/A"; + } + + col_decay_npages.type = emitter_type_size; + col_decay_npages.size_val = pdirty; + + col_decay_sweeps.type = emitter_type_uint64; + col_decay_sweeps.uint64_val = dirty_npurge; + + col_decay_madvises.type = emitter_type_uint64; + col_decay_madvises.uint64_val = dirty_nmadvise; + + col_decay_purged.type = emitter_type_uint64; + col_decay_purged.uint64_val = dirty_purged; + + emitter_table_row(emitter, &decay_row); + + /* Muzzy row. */ + col_decay_type.str_val = "muzzy:"; + + if (muzzy_decay_ms >= 0) { + col_decay_time.type = emitter_type_ssize; + col_decay_time.ssize_val = muzzy_decay_ms; + } else { + col_decay_time.type = emitter_type_title; + col_decay_time.str_val = "N/A"; + } + + col_decay_npages.type = emitter_type_size; + col_decay_npages.size_val = pmuzzy; + + col_decay_sweeps.type = emitter_type_uint64; + col_decay_sweeps.uint64_val = muzzy_npurge; + + col_decay_madvises.type = emitter_type_uint64; + col_decay_madvises.uint64_val = muzzy_nmadvise; + + col_decay_purged.type = emitter_type_uint64; + col_decay_purged.uint64_val = muzzy_purged; + + emitter_table_row(emitter, &decay_row); + + /* Small / large / total allocation counts. 
*/ + emitter_row_t alloc_count_row; + emitter_row_init(&alloc_count_row); + + COL(alloc_count_row, count_title, left, 21, title); + col_count_title.str_val = ""; + + COL(alloc_count_row, count_allocated, right, 16, title); + col_count_allocated.str_val = "allocated"; + + COL(alloc_count_row, count_nmalloc, right, 16, title); + col_count_nmalloc.str_val = "nmalloc"; + COL(alloc_count_row, count_nmalloc_ps, right, 10, title); + col_count_nmalloc_ps.str_val = "(#/sec)"; + + COL(alloc_count_row, count_ndalloc, right, 16, title); + col_count_ndalloc.str_val = "ndalloc"; + COL(alloc_count_row, count_ndalloc_ps, right, 10, title); + col_count_ndalloc_ps.str_val = "(#/sec)"; + + COL(alloc_count_row, count_nrequests, right, 16, title); + col_count_nrequests.str_val = "nrequests"; + COL(alloc_count_row, count_nrequests_ps, right, 10, title); + col_count_nrequests_ps.str_val = "(#/sec)"; + + COL(alloc_count_row, count_nfills, right, 16, title); + col_count_nfills.str_val = "nfill"; + COL(alloc_count_row, count_nfills_ps, right, 10, title); + col_count_nfills_ps.str_val = "(#/sec)"; + + COL(alloc_count_row, count_nflushes, right, 16, title); + col_count_nflushes.str_val = "nflush"; + COL(alloc_count_row, count_nflushes_ps, right, 10, title); + col_count_nflushes_ps.str_val = "(#/sec)"; + + emitter_table_row(emitter, &alloc_count_row); + + col_count_nmalloc_ps.type = emitter_type_uint64; + col_count_ndalloc_ps.type = emitter_type_uint64; + col_count_nrequests_ps.type = emitter_type_uint64; + col_count_nfills_ps.type = emitter_type_uint64; + col_count_nflushes_ps.type = emitter_type_uint64; + +#define GET_AND_EMIT_ALLOC_STAT(small_or_large, name, valtype) \ + CTL_M2_GET("stats.arenas.0." #small_or_large "." #name, i, \ + &small_or_large##_##name, valtype##_t); \ + emitter_json_kv(emitter, #name, emitter_type_##valtype, \ + &small_or_large##_##name); \ + col_count_##name.type = emitter_type_##valtype; \ + col_count_##name.valtype##_val = small_or_large##_##name; + + emitter_json_object_kv_begin(emitter, "small"); + col_count_title.str_val = "small:"; + + GET_AND_EMIT_ALLOC_STAT(small, allocated, size) + GET_AND_EMIT_ALLOC_STAT(small, nmalloc, uint64) + col_count_nmalloc_ps.uint64_val = + rate_per_second(col_count_nmalloc.uint64_val, uptime); + GET_AND_EMIT_ALLOC_STAT(small, ndalloc, uint64) + col_count_ndalloc_ps.uint64_val = + rate_per_second(col_count_ndalloc.uint64_val, uptime); + GET_AND_EMIT_ALLOC_STAT(small, nrequests, uint64) + col_count_nrequests_ps.uint64_val = + rate_per_second(col_count_nrequests.uint64_val, uptime); + GET_AND_EMIT_ALLOC_STAT(small, nfills, uint64) + col_count_nfills_ps.uint64_val = + rate_per_second(col_count_nfills.uint64_val, uptime); + GET_AND_EMIT_ALLOC_STAT(small, nflushes, uint64) + col_count_nflushes_ps.uint64_val = + rate_per_second(col_count_nflushes.uint64_val, uptime); + + emitter_table_row(emitter, &alloc_count_row); + emitter_json_object_end(emitter); /* Close "small". 
+ */
+
+ emitter_json_object_kv_begin(emitter, "large");
+ col_count_title.str_val = "large:";
+
+ GET_AND_EMIT_ALLOC_STAT(large, allocated, size)
+ GET_AND_EMIT_ALLOC_STAT(large, nmalloc, uint64)
+ col_count_nmalloc_ps.uint64_val =
+     rate_per_second(col_count_nmalloc.uint64_val, uptime);
+ GET_AND_EMIT_ALLOC_STAT(large, ndalloc, uint64)
+ col_count_ndalloc_ps.uint64_val =
+     rate_per_second(col_count_ndalloc.uint64_val, uptime);
+ GET_AND_EMIT_ALLOC_STAT(large, nrequests, uint64)
+ col_count_nrequests_ps.uint64_val =
+     rate_per_second(col_count_nrequests.uint64_val, uptime);
+ GET_AND_EMIT_ALLOC_STAT(large, nfills, uint64)
+ col_count_nfills_ps.uint64_val =
+     rate_per_second(col_count_nfills.uint64_val, uptime);
+ GET_AND_EMIT_ALLOC_STAT(large, nflushes, uint64)
+ col_count_nflushes_ps.uint64_val =
+     rate_per_second(col_count_nflushes.uint64_val, uptime);
+
+ emitter_table_row(emitter, &alloc_count_row);
+ emitter_json_object_end(emitter); /* Close "large". */
+
+#undef GET_AND_EMIT_ALLOC_STAT
+
+ /* Aggregated small + large stats are emitted only in table mode. */
+ col_count_title.str_val = "total:";
+ col_count_allocated.size_val = small_allocated + large_allocated;
+ col_count_nmalloc.uint64_val = small_nmalloc + large_nmalloc;
+ col_count_ndalloc.uint64_val = small_ndalloc + large_ndalloc;
+ col_count_nrequests.uint64_val = small_nrequests + large_nrequests;
+ col_count_nfills.uint64_val = small_nfills + large_nfills;
+ col_count_nflushes.uint64_val = small_nflushes + large_nflushes;
+ col_count_nmalloc_ps.uint64_val =
+     rate_per_second(col_count_nmalloc.uint64_val, uptime);
+ col_count_ndalloc_ps.uint64_val =
+     rate_per_second(col_count_ndalloc.uint64_val, uptime);
+ col_count_nrequests_ps.uint64_val =
+     rate_per_second(col_count_nrequests.uint64_val, uptime);
+ col_count_nfills_ps.uint64_val =
+     rate_per_second(col_count_nfills.uint64_val, uptime);
+ col_count_nflushes_ps.uint64_val =
+     rate_per_second(col_count_nflushes.uint64_val, uptime);
+ emitter_table_row(emitter, &alloc_count_row);
+
+ emitter_row_t mem_count_row;
+ emitter_row_init(&mem_count_row);
+
+ emitter_col_t mem_count_title;
+ emitter_col_init(&mem_count_title, &mem_count_row);
+ mem_count_title.justify = emitter_justify_left;
+ mem_count_title.width = 21;
+ mem_count_title.type = emitter_type_title;
+ mem_count_title.str_val = "";
+
+ emitter_col_t mem_count_val;
+ emitter_col_init(&mem_count_val, &mem_count_row);
+ mem_count_val.justify = emitter_justify_right;
+ mem_count_val.width = 16;
+ mem_count_val.type = emitter_type_title;
+ mem_count_val.str_val = "";
+
+ emitter_table_row(emitter, &mem_count_row);
+ mem_count_val.type = emitter_type_size;
+
+ /* Active count in bytes is emitted only in table mode.
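+ * (JSON consumers can reconstruct it as pactive * page from the
+ * fields emitted above and in the general section.)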
*/ + mem_count_title.str_val = "active:"; + mem_count_val.size_val = pactive * page; + emitter_table_row(emitter, &mem_count_row); + +#define GET_AND_EMIT_MEM_STAT(stat) \ + CTL_M2_GET("stats.arenas.0."#stat, i, &stat, size_t); \ + emitter_json_kv(emitter, #stat, emitter_type_size, &stat); \ + mem_count_title.str_val = #stat":"; \ + mem_count_val.size_val = stat; \ + emitter_table_row(emitter, &mem_count_row); + + GET_AND_EMIT_MEM_STAT(mapped) + GET_AND_EMIT_MEM_STAT(retained) + GET_AND_EMIT_MEM_STAT(base) + GET_AND_EMIT_MEM_STAT(internal) + GET_AND_EMIT_MEM_STAT(metadata_edata) + GET_AND_EMIT_MEM_STAT(metadata_rtree) + GET_AND_EMIT_MEM_STAT(metadata_thp) + GET_AND_EMIT_MEM_STAT(tcache_bytes) + GET_AND_EMIT_MEM_STAT(tcache_stashed_bytes) + GET_AND_EMIT_MEM_STAT(resident) + GET_AND_EMIT_MEM_STAT(abandoned_vm) + GET_AND_EMIT_MEM_STAT(extent_avail) +#undef GET_AND_EMIT_MEM_STAT + + if (mutex) { + stats_arena_mutexes_print(emitter, i, uptime); + } + if (bins) { + stats_arena_bins_print(emitter, mutex, i, uptime); + } + if (large) { + stats_arena_lextents_print(emitter, i, uptime); + } + if (extents) { + stats_arena_extents_print(emitter, i); + } + if (hpa) { + stats_arena_hpa_shard_print(emitter, i, uptime); + } +} + +JEMALLOC_COLD +static void +stats_general_print(emitter_t *emitter) { + const char *cpv; + bool bv, bv2; + unsigned uv; + uint32_t u32v; + uint64_t u64v; + int64_t i64v; + ssize_t ssv, ssv2; + size_t sv, bsz, usz, u32sz, u64sz, i64sz, ssz, sssz, cpsz; + + bsz = sizeof(bool); + usz = sizeof(unsigned); + ssz = sizeof(size_t); + sssz = sizeof(ssize_t); + cpsz = sizeof(const char *); + u32sz = sizeof(uint32_t); + i64sz = sizeof(int64_t); + u64sz = sizeof(uint64_t); + + CTL_GET("version", &cpv, const char *); + emitter_kv(emitter, "version", "Version", emitter_type_string, &cpv); + + /* config. */ + emitter_dict_begin(emitter, "config", "Build-time option settings"); +#define CONFIG_WRITE_BOOL(name) \ + do { \ + CTL_GET("config."#name, &bv, bool); \ + emitter_kv(emitter, #name, "config."#name, \ + emitter_type_bool, &bv); \ + } while (0) + + CONFIG_WRITE_BOOL(cache_oblivious); + CONFIG_WRITE_BOOL(debug); + CONFIG_WRITE_BOOL(fill); + CONFIG_WRITE_BOOL(lazy_lock); + emitter_kv(emitter, "malloc_conf", "config.malloc_conf", + emitter_type_string, &config_malloc_conf); + + CONFIG_WRITE_BOOL(opt_safety_checks); + CONFIG_WRITE_BOOL(prof); + CONFIG_WRITE_BOOL(prof_libgcc); + CONFIG_WRITE_BOOL(prof_libunwind); + CONFIG_WRITE_BOOL(stats); + CONFIG_WRITE_BOOL(utrace); + CONFIG_WRITE_BOOL(xmalloc); +#undef CONFIG_WRITE_BOOL + emitter_dict_end(emitter); /* Close "config" dict. */ + + /* opt. 
*/ +#define OPT_WRITE(name, var, size, emitter_type) \ + if (je_mallctl("opt."name, (void *)&var, &size, NULL, 0) == \ + 0) { \ + emitter_kv(emitter, name, "opt."name, emitter_type, \ + &var); \ + } + +#define OPT_WRITE_MUTABLE(name, var1, var2, size, emitter_type, \ + altname) \ + if (je_mallctl("opt."name, (void *)&var1, &size, NULL, 0) == \ + 0 && je_mallctl(altname, (void *)&var2, &size, NULL, 0) \ + == 0) { \ + emitter_kv_note(emitter, name, "opt."name, \ + emitter_type, &var1, altname, emitter_type, \ + &var2); \ + } + +#define OPT_WRITE_BOOL(name) OPT_WRITE(name, bv, bsz, emitter_type_bool) +#define OPT_WRITE_BOOL_MUTABLE(name, altname) \ + OPT_WRITE_MUTABLE(name, bv, bv2, bsz, emitter_type_bool, altname) + +#define OPT_WRITE_UNSIGNED(name) \ + OPT_WRITE(name, uv, usz, emitter_type_unsigned) + +#define OPT_WRITE_INT64(name) \ + OPT_WRITE(name, i64v, i64sz, emitter_type_int64) +#define OPT_WRITE_UINT64(name) \ + OPT_WRITE(name, u64v, u64sz, emitter_type_uint64) + +#define OPT_WRITE_SIZE_T(name) \ + OPT_WRITE(name, sv, ssz, emitter_type_size) +#define OPT_WRITE_SSIZE_T(name) \ + OPT_WRITE(name, ssv, sssz, emitter_type_ssize) +#define OPT_WRITE_SSIZE_T_MUTABLE(name, altname) \ + OPT_WRITE_MUTABLE(name, ssv, ssv2, sssz, emitter_type_ssize, \ + altname) + +#define OPT_WRITE_CHAR_P(name) \ + OPT_WRITE(name, cpv, cpsz, emitter_type_string) + + emitter_dict_begin(emitter, "opt", "Run-time option settings"); + + /* + * opt.malloc_conf. + * + * Sources are documented in https://jemalloc.net/jemalloc.3.html#tuning + * - (Not Included Here) The string specified via --with-malloc-conf, + * which is already printed out above as config.malloc_conf + * - (Included) The string pointed to by the global variable malloc_conf + * - (Included) The “name” of the file referenced by the symbolic link + * named /etc/malloc.conf + * - (Included) The value of the environment variable MALLOC_CONF + * - (Optional, Unofficial) The string pointed to by the global variable + * malloc_conf_2_conf_harder, which is hidden from the public. + * + * Note: The outputs are strictly ordered by priorities (low -> high). 
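+ *
+ * For example (values hypothetical): if the malloc_conf global
+ * variable sets "narenas:4" while the MALLOC_CONF environment
+ * variable sets "narenas:8", the environment variable wins, since
+ * later sources in this list override earlier ones.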
+ * + */ +#define MALLOC_CONF_WRITE(name, message) \ + if (je_mallctl("opt.malloc_conf."name, (void *)&cpv, &cpsz, NULL, 0) != \ + 0) { \ + cpv = ""; \ + } \ + emitter_kv(emitter, name, message, emitter_type_string, &cpv); + + MALLOC_CONF_WRITE("global_var", "Global variable malloc_conf"); + MALLOC_CONF_WRITE("symlink", "Symbolic link malloc.conf"); + MALLOC_CONF_WRITE("env_var", "Environment variable MALLOC_CONF"); + /* As this config is unofficial, skip the output if it's NULL */ + if (je_mallctl("opt.malloc_conf.global_var_2_conf_harder", + (void *)&cpv, &cpsz, NULL, 0) == 0) { + emitter_kv(emitter, "global_var_2_conf_harder", "Global " + "variable malloc_conf_2_conf_harder", emitter_type_string, &cpv); + } +#undef MALLOC_CONF_WRITE + + OPT_WRITE_BOOL("abort") + OPT_WRITE_BOOL("abort_conf") + OPT_WRITE_BOOL("cache_oblivious") + OPT_WRITE_BOOL("confirm_conf") + OPT_WRITE_BOOL("retain") + OPT_WRITE_CHAR_P("dss") + OPT_WRITE_UNSIGNED("narenas") + OPT_WRITE_CHAR_P("percpu_arena") + OPT_WRITE_SIZE_T("oversize_threshold") + OPT_WRITE_BOOL("hpa") + OPT_WRITE_SIZE_T("hpa_slab_max_alloc") + OPT_WRITE_SIZE_T("hpa_hugification_threshold") + OPT_WRITE_UINT64("hpa_hugify_delay_ms") + OPT_WRITE_UINT64("hpa_min_purge_interval_ms") + OPT_WRITE_BOOL("hpa_strict_min_purge_interval") + if (je_mallctl("opt.hpa_dirty_mult", (void *)&u32v, &u32sz, NULL, 0) + == 0) { + /* + * We cheat a little and "know" the secret meaning of this + * representation. + */ + if (u32v == (uint32_t)-1) { + const char *neg1 = "-1"; + emitter_kv(emitter, "hpa_dirty_mult", + "opt.hpa_dirty_mult", emitter_type_string, &neg1); + } else { + char buf[FXP_BUF_SIZE]; + fxp_print(u32v, buf); + const char *bufp = buf; + emitter_kv(emitter, "hpa_dirty_mult", + "opt.hpa_dirty_mult", emitter_type_string, &bufp); + } + } + OPT_WRITE_SIZE_T("hpa_sec_nshards") + OPT_WRITE_SIZE_T("hpa_sec_max_alloc") + OPT_WRITE_SIZE_T("hpa_sec_max_bytes") + OPT_WRITE_SIZE_T("hpa_sec_bytes_after_flush") + OPT_WRITE_SIZE_T("hpa_sec_batch_fill_extra") + OPT_WRITE_CHAR_P("metadata_thp") + OPT_WRITE_INT64("mutex_max_spin") + OPT_WRITE_BOOL_MUTABLE("background_thread", "background_thread") + OPT_WRITE_SSIZE_T_MUTABLE("dirty_decay_ms", "arenas.dirty_decay_ms") + OPT_WRITE_SSIZE_T_MUTABLE("muzzy_decay_ms", "arenas.muzzy_decay_ms") + OPT_WRITE_SIZE_T("lg_extent_max_active_fit") + OPT_WRITE_CHAR_P("junk") + OPT_WRITE_BOOL("zero") + OPT_WRITE_BOOL("utrace") + OPT_WRITE_BOOL("xmalloc") + OPT_WRITE_BOOL("experimental_infallible_new") + OPT_WRITE_SIZE_T("max_batched_size") + OPT_WRITE_SIZE_T("remote_free_max") + OPT_WRITE_SIZE_T("remote_free_max_batch") + OPT_WRITE_BOOL("tcache") + OPT_WRITE_SIZE_T("tcache_max") + OPT_WRITE_UNSIGNED("tcache_nslots_small_min") + OPT_WRITE_UNSIGNED("tcache_nslots_small_max") + OPT_WRITE_UNSIGNED("tcache_nslots_large") + OPT_WRITE_SSIZE_T("lg_tcache_nslots_mul") + OPT_WRITE_SIZE_T("tcache_gc_incr_bytes") + OPT_WRITE_SIZE_T("tcache_gc_delay_bytes") + OPT_WRITE_UNSIGNED("lg_tcache_flush_small_div") + OPT_WRITE_UNSIGNED("lg_tcache_flush_large_div") + OPT_WRITE_UNSIGNED("debug_double_free_max_scan") + OPT_WRITE_CHAR_P("thp") + OPT_WRITE_BOOL("prof") + OPT_WRITE_UNSIGNED("prof_bt_max") + OPT_WRITE_CHAR_P("prof_prefix") + OPT_WRITE_BOOL_MUTABLE("prof_active", "prof.active") + OPT_WRITE_BOOL_MUTABLE("prof_thread_active_init", + "prof.thread_active_init") + OPT_WRITE_SSIZE_T_MUTABLE("lg_prof_sample", "prof.lg_sample") + OPT_WRITE_BOOL("prof_accum") + OPT_WRITE_SSIZE_T("lg_prof_interval") + OPT_WRITE_BOOL("prof_gdump") + OPT_WRITE_BOOL("prof_final") + 
OPT_WRITE_BOOL("prof_leak") + OPT_WRITE_BOOL("prof_leak_error") + OPT_WRITE_BOOL("stats_print") + OPT_WRITE_CHAR_P("stats_print_opts") + OPT_WRITE_BOOL("stats_print") + OPT_WRITE_CHAR_P("stats_print_opts") + OPT_WRITE_INT64("stats_interval") + OPT_WRITE_CHAR_P("stats_interval_opts") + OPT_WRITE_CHAR_P("zero_realloc") + + emitter_dict_end(emitter); /* Close "opt". */ + +#undef OPT_WRITE +#undef OPT_WRITE_MUTABLE +#undef OPT_WRITE_BOOL +#undef OPT_WRITE_BOOL_MUTABLE +#undef OPT_WRITE_UNSIGNED +#undef OPT_WRITE_SSIZE_T +#undef OPT_WRITE_SSIZE_T_MUTABLE +#undef OPT_WRITE_CHAR_P + + /* prof. */ + if (config_prof) { + emitter_dict_begin(emitter, "prof", "Profiling settings"); + + CTL_GET("prof.thread_active_init", &bv, bool); + emitter_kv(emitter, "thread_active_init", + "prof.thread_active_init", emitter_type_bool, &bv); + + CTL_GET("prof.active", &bv, bool); + emitter_kv(emitter, "active", "prof.active", emitter_type_bool, + &bv); + + CTL_GET("prof.gdump", &bv, bool); + emitter_kv(emitter, "gdump", "prof.gdump", emitter_type_bool, + &bv); + + CTL_GET("prof.interval", &u64v, uint64_t); + emitter_kv(emitter, "interval", "prof.interval", + emitter_type_uint64, &u64v); + + CTL_GET("prof.lg_sample", &ssv, ssize_t); + emitter_kv(emitter, "lg_sample", "prof.lg_sample", + emitter_type_ssize, &ssv); + + emitter_dict_end(emitter); /* Close "prof". */ + } + + /* arenas. */ + /* + * The json output sticks arena info into an "arenas" dict; the table + * output puts them at the top-level. + */ + emitter_json_object_kv_begin(emitter, "arenas"); + + CTL_GET("arenas.narenas", &uv, unsigned); + emitter_kv(emitter, "narenas", "Arenas", emitter_type_unsigned, &uv); + + /* + * Decay settings are emitted only in json mode; in table mode, they're + * emitted as notes with the opt output, above. + */ + CTL_GET("arenas.dirty_decay_ms", &ssv, ssize_t); + emitter_json_kv(emitter, "dirty_decay_ms", emitter_type_ssize, &ssv); + + CTL_GET("arenas.muzzy_decay_ms", &ssv, ssize_t); + emitter_json_kv(emitter, "muzzy_decay_ms", emitter_type_ssize, &ssv); + + CTL_GET("arenas.quantum", &sv, size_t); + emitter_kv(emitter, "quantum", "Quantum size", emitter_type_size, &sv); + + CTL_GET("arenas.page", &sv, size_t); + emitter_kv(emitter, "page", "Page size", emitter_type_size, &sv); + + CTL_GET("arenas.hugepage", &sv, size_t); + emitter_kv(emitter, "hugepage", "Hugepage size", emitter_type_size, + &sv); + + if (je_mallctl("arenas.tcache_max", (void *)&sv, &ssz, NULL, 0) == 0) { + emitter_kv(emitter, "tcache_max", + "Maximum thread-cached size class", emitter_type_size, &sv); + } + + unsigned arenas_nbins; + CTL_GET("arenas.nbins", &arenas_nbins, unsigned); + emitter_kv(emitter, "nbins", "Number of bin size classes", + emitter_type_unsigned, &arenas_nbins); + + unsigned arenas_nhbins; + CTL_GET("arenas.nhbins", &arenas_nhbins, unsigned); + emitter_kv(emitter, "nhbins", "Number of thread-cache bin size classes", + emitter_type_unsigned, &arenas_nhbins); + + /* + * We do enough mallctls in a loop that we actually want to omit them + * (not just omit the printing). 
+ */ + if (emitter_outputs_json(emitter)) { + emitter_json_array_kv_begin(emitter, "bin"); + size_t arenas_bin_mib[CTL_MAX_DEPTH]; + CTL_LEAF_PREPARE(arenas_bin_mib, 0, "arenas.bin"); + for (unsigned i = 0; i < arenas_nbins; i++) { + arenas_bin_mib[2] = i; + emitter_json_object_begin(emitter); + + CTL_LEAF(arenas_bin_mib, 3, "size", &sv, size_t); + emitter_json_kv(emitter, "size", emitter_type_size, + &sv); + + CTL_LEAF(arenas_bin_mib, 3, "nregs", &u32v, uint32_t); + emitter_json_kv(emitter, "nregs", emitter_type_uint32, + &u32v); + + CTL_LEAF(arenas_bin_mib, 3, "slab_size", &sv, size_t); + emitter_json_kv(emitter, "slab_size", emitter_type_size, + &sv); + + CTL_LEAF(arenas_bin_mib, 3, "nshards", &u32v, uint32_t); + emitter_json_kv(emitter, "nshards", emitter_type_uint32, + &u32v); + + emitter_json_object_end(emitter); + } + emitter_json_array_end(emitter); /* Close "bin". */ + } + + unsigned nlextents; + CTL_GET("arenas.nlextents", &nlextents, unsigned); + emitter_kv(emitter, "nlextents", "Number of large size classes", + emitter_type_unsigned, &nlextents); + + if (emitter_outputs_json(emitter)) { + emitter_json_array_kv_begin(emitter, "lextent"); + size_t arenas_lextent_mib[CTL_MAX_DEPTH]; + CTL_LEAF_PREPARE(arenas_lextent_mib, 0, "arenas.lextent"); + for (unsigned i = 0; i < nlextents; i++) { + arenas_lextent_mib[2] = i; + emitter_json_object_begin(emitter); + + CTL_LEAF(arenas_lextent_mib, 3, "size", &sv, size_t); + emitter_json_kv(emitter, "size", emitter_type_size, + &sv); + + emitter_json_object_end(emitter); + } + emitter_json_array_end(emitter); /* Close "lextent". */ + } + + emitter_json_object_end(emitter); /* Close "arenas" */ +} + +JEMALLOC_COLD +static void +stats_print_helper(emitter_t *emitter, bool merged, bool destroyed, + bool unmerged, bool bins, bool large, bool mutex, bool extents, bool hpa) { + /* + * These should be deleted. We keep them around for a while, to aid in + * the transition to the emitter code. + */ + size_t allocated, active, metadata, metadata_edata, metadata_rtree, + metadata_thp, resident, mapped, retained; + size_t num_background_threads; + size_t zero_reallocs; + uint64_t background_thread_num_runs, background_thread_run_interval; + + CTL_GET("stats.allocated", &allocated, size_t); + CTL_GET("stats.active", &active, size_t); + CTL_GET("stats.metadata", &metadata, size_t); + CTL_GET("stats.metadata_edata", &metadata_edata, size_t); + CTL_GET("stats.metadata_rtree", &metadata_rtree, size_t); + CTL_GET("stats.metadata_thp", &metadata_thp, size_t); + CTL_GET("stats.resident", &resident, size_t); + CTL_GET("stats.mapped", &mapped, size_t); + CTL_GET("stats.retained", &retained, size_t); + + CTL_GET("stats.zero_reallocs", &zero_reallocs, size_t); + + if (have_background_thread) { + CTL_GET("stats.background_thread.num_threads", + &num_background_threads, size_t); + CTL_GET("stats.background_thread.num_runs", + &background_thread_num_runs, uint64_t); + CTL_GET("stats.background_thread.run_interval", + &background_thread_run_interval, uint64_t); + } else { + num_background_threads = 0; + background_thread_num_runs = 0; + background_thread_run_interval = 0; + } + + /* Generic global stats. 
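+ *
+ * As a rough sanity guide for readers of these fields (per the jemalloc
+ * manual): allocated <= active, since active is rounded up to whole
+ * pages, and resident and mapped are each at least as large as active.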
*/ + emitter_json_object_kv_begin(emitter, "stats"); + emitter_json_kv(emitter, "allocated", emitter_type_size, &allocated); + emitter_json_kv(emitter, "active", emitter_type_size, &active); + emitter_json_kv(emitter, "metadata", emitter_type_size, &metadata); + emitter_json_kv(emitter, "metadata_edata", emitter_type_size, + &metadata_edata); + emitter_json_kv(emitter, "metadata_rtree", emitter_type_size, + &metadata_rtree); + emitter_json_kv(emitter, "metadata_thp", emitter_type_size, + &metadata_thp); + emitter_json_kv(emitter, "resident", emitter_type_size, &resident); + emitter_json_kv(emitter, "mapped", emitter_type_size, &mapped); + emitter_json_kv(emitter, "retained", emitter_type_size, &retained); + emitter_json_kv(emitter, "zero_reallocs", emitter_type_size, + &zero_reallocs); + + emitter_table_printf(emitter, "Allocated: %zu, active: %zu, " + "metadata: %zu (n_thp %zu, edata %zu, rtree %zu), resident: %zu, " + "mapped: %zu, retained: %zu\n", allocated, active, metadata, + metadata_thp, metadata_edata, metadata_rtree, resident, mapped, + retained); + + /* Strange behaviors */ + emitter_table_printf(emitter, + "Count of realloc(non-null-ptr, 0) calls: %zu\n", zero_reallocs); + + /* Background thread stats. */ + emitter_json_object_kv_begin(emitter, "background_thread"); + emitter_json_kv(emitter, "num_threads", emitter_type_size, + &num_background_threads); + emitter_json_kv(emitter, "num_runs", emitter_type_uint64, + &background_thread_num_runs); + emitter_json_kv(emitter, "run_interval", emitter_type_uint64, + &background_thread_run_interval); + emitter_json_object_end(emitter); /* Close "background_thread". */ + + emitter_table_printf(emitter, "Background threads: %zu, " + "num_runs: %"FMTu64", run_interval: %"FMTu64" ns\n", + num_background_threads, background_thread_num_runs, + background_thread_run_interval); + + if (mutex) { + emitter_row_t row; + emitter_col_t name; + emitter_col_t col64[mutex_prof_num_uint64_t_counters]; + emitter_col_t col32[mutex_prof_num_uint32_t_counters]; + uint64_t uptime; + + emitter_row_init(&row); + mutex_stats_init_cols(&row, "", &name, col64, col32); + + emitter_table_row(emitter, &row); + emitter_json_object_kv_begin(emitter, "mutexes"); + + CTL_M2_GET("stats.arenas.0.uptime", 0, &uptime, uint64_t); + + size_t stats_mutexes_mib[CTL_MAX_DEPTH]; + CTL_LEAF_PREPARE(stats_mutexes_mib, 0, "stats.mutexes"); + for (int i = 0; i < mutex_prof_num_global_mutexes; i++) { + mutex_stats_read_global(stats_mutexes_mib, 2, + global_mutex_names[i], &name, col64, col32, uptime); + emitter_json_object_kv_begin(emitter, global_mutex_names[i]); + mutex_stats_emit(emitter, &row, col64, col32); + emitter_json_object_end(emitter); + } + + emitter_json_object_end(emitter); /* Close "mutexes". */ + } + + emitter_json_object_end(emitter); /* Close "stats". 
*/
+
+ if (merged || destroyed || unmerged) {
+ unsigned narenas;
+
+ emitter_json_object_kv_begin(emitter, "stats.arenas");
+
+ CTL_GET("arenas.narenas", &narenas, unsigned);
+ size_t mib[3];
+ size_t miblen = sizeof(mib) / sizeof(size_t);
+ size_t sz;
+ VARIABLE_ARRAY_UNSAFE(bool, initialized, narenas);
+ bool destroyed_initialized;
+ unsigned i, ninitialized;
+
+ xmallctlnametomib("arena.0.initialized", mib, &miblen);
+ for (i = ninitialized = 0; i < narenas; i++) {
+ mib[1] = i;
+ sz = sizeof(bool);
+ xmallctlbymib(mib, miblen, &initialized[i], &sz,
+ NULL, 0);
+ if (initialized[i]) {
+ ninitialized++;
+ }
+ }
+ mib[1] = MALLCTL_ARENAS_DESTROYED;
+ sz = sizeof(bool);
+ xmallctlbymib(mib, miblen, &destroyed_initialized, &sz,
+ NULL, 0);
+
+ /* Merged stats. */
+ if (merged && (ninitialized > 1 || !unmerged)) {
+ /* Print merged arena stats. */
+ emitter_table_printf(emitter, "Merged arenas stats:\n");
+ emitter_json_object_kv_begin(emitter, "merged");
+ stats_arena_print(emitter, MALLCTL_ARENAS_ALL, bins,
+ large, mutex, extents, hpa);
+ emitter_json_object_end(emitter); /* Close "merged". */
+ }
+
+ /* Destroyed stats. */
+ if (destroyed_initialized && destroyed) {
+ /* Print destroyed arena stats. */
+ emitter_table_printf(emitter,
+ "Destroyed arenas stats:\n");
+ emitter_json_object_kv_begin(emitter, "destroyed");
+ stats_arena_print(emitter, MALLCTL_ARENAS_DESTROYED,
+ bins, large, mutex, extents, hpa);
+ emitter_json_object_end(emitter); /* Close "destroyed". */
+ }
+
+ /* Unmerged stats. */
+ if (unmerged) {
+ for (i = 0; i < narenas; i++) {
+ if (initialized[i]) {
+ char arena_ind_str[20];
+ malloc_snprintf(arena_ind_str,
+ sizeof(arena_ind_str), "%u", i);
+ emitter_json_object_kv_begin(emitter,
+ arena_ind_str);
+ emitter_table_printf(emitter,
+ "arenas[%s]:\n", arena_ind_str);
+ stats_arena_print(emitter, i, bins,
+ large, mutex, extents, hpa);
+ /* Close "<arena_ind_str>". */
+ emitter_json_object_end(emitter);
+ }
+ }
+ }
+ emitter_json_object_end(emitter); /* Close "stats.arenas". */
+ }
+}
+
+void
+stats_print(write_cb_t *write_cb, void *cbopaque, const char *opts) {
+ int err;
+ uint64_t epoch;
+ size_t u64sz;
+#define OPTION(o, v, d, s) bool v = d;
+ STATS_PRINT_OPTIONS
+#undef OPTION
+
+ /*
+ * Refresh stats, in case mallctl() was called by the application.
+ *
+ * Check for OOM here, since refreshing the ctl cache can trigger
+ * allocation. In practice, none of the subsequent mallctl()-related
+ * calls in this function will cause OOM if this one succeeds.
+ */
+ epoch = 1;
+ u64sz = sizeof(uint64_t);
+ err = je_mallctl("epoch", (void *)&epoch, &u64sz, (void *)&epoch,
+ sizeof(uint64_t));
+ if (err != 0) {
+ if (err == EAGAIN) {
+ malloc_write("<jemalloc>: Memory allocation failure in "
+ "mallctl(\"epoch\", ...)\n");
+ return;
+ }
+ malloc_write("<jemalloc>: Failure in mallctl(\"epoch\", "
+ "...)\n");
+ abort();
+ }
+
+ if (opts != NULL) {
+ for (unsigned i = 0; opts[i] != '\0'; i++) {
+ switch (opts[i]) {
+#define OPTION(o, v, d, s) case o: v = s; break;
+ STATS_PRINT_OPTIONS
+#undef OPTION
+ default:;
+ }
+ }
+ }
+
+ emitter_t emitter;
+ emitter_init(&emitter,
+ json ?
emitter_output_json_compact : emitter_output_table, + write_cb, cbopaque); + emitter_begin(&emitter); + emitter_table_printf(&emitter, "___ Begin jemalloc statistics ___\n"); + emitter_json_object_kv_begin(&emitter, "jemalloc"); + + if (general) { + stats_general_print(&emitter); + } + if (config_stats) { + stats_print_helper(&emitter, merged, destroyed, unmerged, + bins, large, mutex, extents, hpa); + } + + emitter_json_object_end(&emitter); /* Closes the "jemalloc" dict. */ + emitter_table_printf(&emitter, "--- End jemalloc statistics ---\n"); + emitter_end(&emitter); +} + +uint64_t +stats_interval_new_event_wait(tsd_t *tsd) { + return stats_interval_accum_batch; +} + +uint64_t +stats_interval_postponed_event_wait(tsd_t *tsd) { + return TE_MIN_START_WAIT; +} + +void +stats_interval_event_handler(tsd_t *tsd, uint64_t elapsed) { + assert(elapsed > 0 && elapsed != TE_INVALID_ELAPSED); + if (counter_accum(tsd_tsdn(tsd), &stats_interval_accumulated, + elapsed)) { + je_malloc_stats_print(NULL, NULL, opt_stats_interval_opts); + } +} + +bool +stats_boot(void) { + uint64_t stats_interval; + if (opt_stats_interval < 0) { + assert(opt_stats_interval == -1); + stats_interval = 0; + stats_interval_accum_batch = 0; + } else{ + /* See comments in stats.h */ + stats_interval = (opt_stats_interval > 0) ? + opt_stats_interval : 1; + uint64_t batch = stats_interval >> + STATS_INTERVAL_ACCUM_LG_BATCH_SIZE; + if (batch > STATS_INTERVAL_ACCUM_BATCH_MAX) { + batch = STATS_INTERVAL_ACCUM_BATCH_MAX; + } else if (batch == 0) { + batch = 1; + } + stats_interval_accum_batch = batch; + } + + return counter_accum_init(&stats_interval_accumulated, stats_interval); +} + +void +stats_prefork(tsdn_t *tsdn) { + counter_prefork(tsdn, &stats_interval_accumulated); +} + +void +stats_postfork_parent(tsdn_t *tsdn) { + counter_postfork_parent(tsdn, &stats_interval_accumulated); +} + +void +stats_postfork_child(tsdn_t *tsdn) { + counter_postfork_child(tsdn, &stats_interval_accumulated); +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/sz.c b/src/duckdb/extension/jemalloc/jemalloc/src/sz.c new file mode 100644 index 000000000..89def9d50 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/sz.c @@ -0,0 +1,115 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" +#include "jemalloc/internal/sz.h" + +JEMALLOC_ALIGNED(CACHELINE) +size_t sz_pind2sz_tab[SC_NPSIZES+1]; +size_t sz_large_pad; + +size_t +sz_psz_quantize_floor(size_t size) { + size_t ret; + pszind_t pind; + + assert(size > 0); + assert((size & PAGE_MASK) == 0); + + pind = sz_psz2ind(size - sz_large_pad + 1); + if (pind == 0) { + /* + * Avoid underflow. This short-circuit would also do the right + * thing for all sizes in the range for which there are + * PAGE-spaced size classes, but it's simplest to just handle + * the one case that would cause erroneous results. + */ + return size; + } + ret = sz_pind2sz(pind - 1) + sz_large_pad; + assert(ret <= size); + return ret; +} + +size_t +sz_psz_quantize_ceil(size_t size) { + size_t ret; + + assert(size > 0); + assert(size - sz_large_pad <= SC_LARGE_MAXCLASS); + assert((size & PAGE_MASK) == 0); + + ret = sz_psz_quantize_floor(size); + if (ret < size) { + /* + * Skip a quantization that may have an adequately large extent, + * because under-sized extents may be mixed in. This only + * happens when an unusual size is requested, i.e. 
for aligned
+ * allocation, and is just one of several places where linear
+ * search would potentially find sufficiently aligned available
+ * memory somewhere lower.
+ */
+ ret = sz_pind2sz(sz_psz2ind(ret - sz_large_pad + 1)) +
+ sz_large_pad;
+ }
+ return ret;
+}
+
+static void
+sz_boot_pind2sz_tab(const sc_data_t *sc_data) {
+ int pind = 0;
+ for (unsigned i = 0; i < SC_NSIZES; i++) {
+ const sc_t *sc = &sc_data->sc[i];
+ if (sc->psz) {
+ sz_pind2sz_tab[pind] = (ZU(1) << sc->lg_base)
+ + (ZU(sc->ndelta) << sc->lg_delta);
+ pind++;
+ }
+ }
+ for (int i = pind; i <= (int)SC_NPSIZES; i++) {
+ sz_pind2sz_tab[i] = sc_data->large_maxclass + PAGE;
+ }
+}
+
+JEMALLOC_ALIGNED(CACHELINE)
+size_t sz_index2size_tab[SC_NSIZES];
+
+static void
+sz_boot_index2size_tab(const sc_data_t *sc_data) {
+ for (unsigned i = 0; i < SC_NSIZES; i++) {
+ const sc_t *sc = &sc_data->sc[i];
+ sz_index2size_tab[i] = (ZU(1) << sc->lg_base)
+ + (ZU(sc->ndelta) << (sc->lg_delta));
+ }
+}
+
+/*
+ * To keep this table small, we divide sizes by the tiny min size, which gives
+ * the smallest interval for which the result can change.
+ */
+JEMALLOC_ALIGNED(CACHELINE)
+uint8_t sz_size2index_tab[(SC_LOOKUP_MAXCLASS >> SC_LG_TINY_MIN) + 1];
+
+static void
+sz_boot_size2index_tab(const sc_data_t *sc_data) {
+ size_t dst_max = (SC_LOOKUP_MAXCLASS >> SC_LG_TINY_MIN) + 1;
+ size_t dst_ind = 0;
+ for (unsigned sc_ind = 0; sc_ind < SC_NSIZES && dst_ind < dst_max;
+ sc_ind++) {
+ const sc_t *sc = &sc_data->sc[sc_ind];
+ size_t sz = (ZU(1) << sc->lg_base)
+ + (ZU(sc->ndelta) << sc->lg_delta);
+ size_t max_ind = ((sz + (ZU(1) << SC_LG_TINY_MIN) - 1)
+ >> SC_LG_TINY_MIN);
+ for (; dst_ind <= max_ind && dst_ind < dst_max; dst_ind++) {
+ assert(sc_ind < 1 << (sizeof(uint8_t) * 8));
+ sz_size2index_tab[dst_ind] = (uint8_t)sc_ind;
+ }
+ }
+}
+
+void
+sz_boot(const sc_data_t *sc_data, bool cache_oblivious) {
+ sz_large_pad = cache_oblivious ? PAGE : 0;
+ sz_boot_pind2sz_tab(sc_data);
+ sz_boot_index2size_tab(sc_data);
+ sz_boot_size2index_tab(sc_data);
+}
diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/tcache.c b/src/duckdb/extension/jemalloc/jemalloc/src/tcache.c
new file mode 100644
index 000000000..b90907ad7
--- /dev/null
+++ b/src/duckdb/extension/jemalloc/jemalloc/src/tcache.c
@@ -0,0 +1,1553 @@
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/jemalloc_internal_includes.h"
+
+#include "jemalloc/internal/assert.h"
+#include "jemalloc/internal/base.h"
+#include "jemalloc/internal/mutex.h"
+#include "jemalloc/internal/safety_check.h"
+#include "jemalloc/internal/san.h"
+#include "jemalloc/internal/sc.h"
+
+/******************************************************************************/
+/* Data. */
+
+bool opt_tcache = true;
+
+/* global_do_not_change_tcache_maxclass is set to 32KB by default. */
+size_t opt_tcache_max = ((size_t)1) << 15;
+
+/* Reasonable defaults for min and max values. */
+unsigned opt_tcache_nslots_small_min = 20;
+unsigned opt_tcache_nslots_small_max = 200;
+unsigned opt_tcache_nslots_large = 20;
+
+/*
+ * We attempt to make the number of slots in a tcache bin for a given size class
+ * equal to the number of objects in a slab times some multiplier. By default,
+ * the multiplier is 2 (i.e. we set the maximum number of objects in the tcache
+ * to twice the number of objects in a slab).
+ * This is bounded by some other constraints as well, like the fact that it
+ * must be even, must be less than opt_tcache_nslots_small_max, etc..
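+ *
+ * A worked example with the defaults here (lg mul = 1, min = 20,
+ * max = 200): a size class whose slab holds 8 regions gets 8 << 1 == 16
+ * slots, clamped up to 20; one holding 64 regions gets 128; one holding
+ * 512 regions would get 1024, clamped down to 200.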
+ */ +ssize_t opt_lg_tcache_nslots_mul = 1; + +/* + * Number of allocation bytes between tcache incremental GCs. Again, this + * default just seems to work well; more tuning is possible. + */ +size_t opt_tcache_gc_incr_bytes = 65536; + +/* + * With default settings, we may end up flushing small bins frequently with + * small flush amounts. To limit this tendency, we can set a number of bytes to + * "delay" by. If we try to flush N M-byte items, we decrease that size-class's + * delay by N * M. So, if delay is 1024 and we're looking at the 64-byte size + * class, we won't do any flushing until we've been asked to flush 1024/64 == 16 + * items. This can happen in any configuration (i.e. being asked to flush 16 + * items once, or 4 items 4 times). + * + * Practically, this is stored as a count of items in a uint8_t, so the + * effective maximum value for a size class is 255 * sz. + */ +size_t opt_tcache_gc_delay_bytes = 0; + +/* + * When a cache bin is flushed because it's full, how much of it do we flush? + * By default, we flush half the maximum number of items. + */ +unsigned opt_lg_tcache_flush_small_div = 1; +unsigned opt_lg_tcache_flush_large_div = 1; + +/* + * Number of cache bins enabled, including both large and small. This value + * is only used to initialize tcache_nbins in the per-thread tcache. + * Directly modifying it will not affect threads already launched. + */ +unsigned global_do_not_change_tcache_nbins; +/* + * Max size class to be cached (can be small or large). This value is only used + * to initialize tcache_max in the per-thread tcache. Directly modifying it + * will not affect threads already launched. + */ +size_t global_do_not_change_tcache_maxclass; + +/* + * Default bin info for each bin. Will be initialized in malloc_conf_init + * and tcache_boot and should not be modified after that. + */ +static cache_bin_info_t opt_tcache_ncached_max[TCACHE_NBINS_MAX] = {{0}}; +/* + * Marks whether a bin's info is set already. This is used in + * tcache_bin_info_compute to avoid overwriting ncached_max specified by + * malloc_conf. It should be set only when parsing malloc_conf. + */ +static bool opt_tcache_ncached_max_set[TCACHE_NBINS_MAX] = {0}; + +tcaches_t *tcaches; + +/* Index of first element within tcaches that has never been used. */ +static unsigned tcaches_past; + +/* Head of singly linked list tracking available tcaches elements. */ +static tcaches_t *tcaches_avail; + +/* Protects tcaches{,_past,_avail}. 
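+ *
+ * The intended usage pattern: a new explicit tcache reuses an element
+ * popped off tcaches_avail when that list is nonempty, and otherwise
+ * claims the never-used slot tcaches[tcaches_past] and bumps the
+ * high-water mark.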
*/ +static malloc_mutex_t tcaches_mtx; + +/******************************************************************************/ + +size_t +tcache_salloc(tsdn_t *tsdn, const void *ptr) { + return arena_salloc(tsdn, ptr); +} + +uint64_t +tcache_gc_new_event_wait(tsd_t *tsd) { + return opt_tcache_gc_incr_bytes; +} + +uint64_t +tcache_gc_postponed_event_wait(tsd_t *tsd) { + return TE_MIN_START_WAIT; +} + +uint64_t +tcache_gc_dalloc_new_event_wait(tsd_t *tsd) { + return opt_tcache_gc_incr_bytes; +} + +uint64_t +tcache_gc_dalloc_postponed_event_wait(tsd_t *tsd) { + return TE_MIN_START_WAIT; +} + +static uint8_t +tcache_gc_item_delay_compute(szind_t szind) { + assert(szind < SC_NBINS); + size_t sz = sz_index2size(szind); + size_t item_delay = opt_tcache_gc_delay_bytes / sz; + size_t delay_max = ZU(1) + << (sizeof(((tcache_slow_t *)NULL)->bin_flush_delay_items[0]) * 8); + if (item_delay >= delay_max) { + item_delay = delay_max - 1; + } + return (uint8_t)item_delay; +} + +static void +tcache_gc_small(tsd_t *tsd, tcache_slow_t *tcache_slow, tcache_t *tcache, + szind_t szind) { + /* Aim to flush 3/4 of items below low-water. */ + assert(szind < SC_NBINS); + + cache_bin_t *cache_bin = &tcache->bins[szind]; + assert(!tcache_bin_disabled(szind, cache_bin, tcache->tcache_slow)); + cache_bin_sz_t ncached = cache_bin_ncached_get_local(cache_bin); + cache_bin_sz_t low_water = cache_bin_low_water_get(cache_bin); + assert(!tcache_slow->bin_refilled[szind]); + + size_t nflush = low_water - (low_water >> 2); + if (nflush < tcache_slow->bin_flush_delay_items[szind]) { + /* Workaround for a conversion warning. */ + uint8_t nflush_uint8 = (uint8_t)nflush; + assert(sizeof(tcache_slow->bin_flush_delay_items[0]) == + sizeof(nflush_uint8)); + tcache_slow->bin_flush_delay_items[szind] -= nflush_uint8; + return; + } + + tcache_slow->bin_flush_delay_items[szind] + = tcache_gc_item_delay_compute(szind); + tcache_bin_flush_small(tsd, tcache, cache_bin, szind, + (unsigned)(ncached - nflush)); + + /* + * Reduce fill count by 2X. Limit lg_fill_div such that + * the fill count is always at least 1. + */ + if ((cache_bin_ncached_max_get(cache_bin) >> + tcache_slow->lg_fill_div[szind]) > 1) { + tcache_slow->lg_fill_div[szind]++; + } +} + +static void +tcache_gc_large(tsd_t *tsd, tcache_slow_t *tcache_slow, tcache_t *tcache, + szind_t szind) { + /* Like the small GC; flush 3/4 of untouched items. 
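+ *
+ * E.g. with ncached == 40 and low_water == 20, the call below keeps
+ * 40 - 20 + (20 >> 2) == 25 items, i.e. it flushes 15 of the 20 items
+ * that went untouched since the last GC pass.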
*/ + assert(szind >= SC_NBINS); + cache_bin_t *cache_bin = &tcache->bins[szind]; + assert(!tcache_bin_disabled(szind, cache_bin, tcache->tcache_slow)); + cache_bin_sz_t ncached = cache_bin_ncached_get_local(cache_bin); + cache_bin_sz_t low_water = cache_bin_low_water_get(cache_bin); + tcache_bin_flush_large(tsd, tcache, cache_bin, szind, + (unsigned)(ncached - low_water + (low_water >> 2))); +} + +static void +tcache_event(tsd_t *tsd) { + tcache_t *tcache = tcache_get(tsd); + if (tcache == NULL) { + return; + } + + tcache_slow_t *tcache_slow = tsd_tcache_slowp_get(tsd); + szind_t szind = tcache_slow->next_gc_bin; + bool is_small = (szind < SC_NBINS); + cache_bin_t *cache_bin = &tcache->bins[szind]; + + if (tcache_bin_disabled(szind, cache_bin, tcache_slow)) { + goto label_done; + } + + tcache_bin_flush_stashed(tsd, tcache, cache_bin, szind, is_small); + cache_bin_sz_t low_water = cache_bin_low_water_get(cache_bin); + if (low_water > 0) { + if (is_small) { + tcache_gc_small(tsd, tcache_slow, tcache, szind); + } else { + tcache_gc_large(tsd, tcache_slow, tcache, szind); + } + } else if (is_small && tcache_slow->bin_refilled[szind]) { + assert(low_water == 0); + /* + * Increase fill count by 2X for small bins. Make sure + * lg_fill_div stays greater than 0. + */ + if (tcache_slow->lg_fill_div[szind] > 1) { + tcache_slow->lg_fill_div[szind]--; + } + tcache_slow->bin_refilled[szind] = false; + } + cache_bin_low_water_set(cache_bin); + +label_done: + tcache_slow->next_gc_bin++; + if (tcache_slow->next_gc_bin == tcache_nbins_get(tcache_slow)) { + tcache_slow->next_gc_bin = 0; + } +} + +void +tcache_gc_event_handler(tsd_t *tsd, uint64_t elapsed) { + assert(elapsed == TE_INVALID_ELAPSED); + tcache_event(tsd); +} + +void +tcache_gc_dalloc_event_handler(tsd_t *tsd, uint64_t elapsed) { + assert(elapsed == TE_INVALID_ELAPSED); + tcache_event(tsd); +} + +void * +tcache_alloc_small_hard(tsdn_t *tsdn, arena_t *arena, + tcache_t *tcache, cache_bin_t *cache_bin, szind_t binind, + bool *tcache_success) { + tcache_slow_t *tcache_slow = tcache->tcache_slow; + void *ret; + + assert(tcache_slow->arena != NULL); + assert(!tcache_bin_disabled(binind, cache_bin, tcache_slow)); + cache_bin_sz_t nfill = cache_bin_ncached_max_get(cache_bin) + >> tcache_slow->lg_fill_div[binind]; + if (nfill == 0) { + nfill = 1; + } + arena_cache_bin_fill_small(tsdn, arena, cache_bin, binind, nfill); + tcache_slow->bin_refilled[binind] = true; + ret = cache_bin_alloc(cache_bin, tcache_success); + + return ret; +} + +static const void * +tcache_bin_flush_ptr_getter(void *arr_ctx, size_t ind) { + cache_bin_ptr_array_t *arr = (cache_bin_ptr_array_t *)arr_ctx; + return arr->ptr[ind]; +} + +static void +tcache_bin_flush_metadata_visitor(void *szind_sum_ctx, + emap_full_alloc_ctx_t *alloc_ctx) { + size_t *szind_sum = (size_t *)szind_sum_ctx; + *szind_sum -= alloc_ctx->szind; + util_prefetch_write_range(alloc_ctx->edata, sizeof(edata_t)); +} + +JEMALLOC_NOINLINE static void +tcache_bin_flush_size_check_fail(cache_bin_ptr_array_t *arr, szind_t szind, + size_t nptrs, emap_batch_lookup_result_t *edatas) { + bool found_mismatch = false; + for (size_t i = 0; i < nptrs; i++) { + szind_t true_szind = edata_szind_get(edatas[i].edata); + if (true_szind != szind) { + found_mismatch = true; + safety_check_fail_sized_dealloc( + /* current_dealloc */ false, + /* ptr */ tcache_bin_flush_ptr_getter(arr, i), + /* true_size */ sz_index2size(true_szind), + /* input_size */ sz_index2size(szind)); + } + } + assert(found_mismatch); +} + +static void 
+tcache_bin_flush_edatas_lookup(tsd_t *tsd, cache_bin_ptr_array_t *arr,
+ szind_t binind, size_t nflush, emap_batch_lookup_result_t *edatas) {
+
+ /*
+ * This gets compiled away when config_opt_safety_checks is false.
+ * Checks for sized deallocation bugs, failing early rather than
+ * corrupting metadata.
+ */
+ size_t szind_sum = binind * nflush;
+ emap_edata_lookup_batch(tsd, &arena_emap_global, nflush,
+ &tcache_bin_flush_ptr_getter, (void *)arr,
+ &tcache_bin_flush_metadata_visitor, (void *)&szind_sum,
+ edatas);
+ if (config_opt_safety_checks && unlikely(szind_sum != 0)) {
+ tcache_bin_flush_size_check_fail(arr, binind, nflush, edatas);
+ }
+}
+
+JEMALLOC_ALWAYS_INLINE void
+tcache_bin_flush_impl_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin,
+ szind_t binind, cache_bin_ptr_array_t *ptrs, unsigned nflush) {
+ tcache_slow_t *tcache_slow = tcache->tcache_slow;
+ /*
+ * A couple lookup calls take tsdn; declare it once for convenience
+ * instead of calling tsd_tsdn(tsd) all the time.
+ */
+ tsdn_t *tsdn = tsd_tsdn(tsd);
+
+ assert(binind < SC_NBINS);
+ arena_t *tcache_arena = tcache_slow->arena;
+ assert(tcache_arena != NULL);
+ unsigned tcache_binshard = tsd_binshardsp_get(tsdn_tsd(tsdn))->binshard[binind];
+
+ /*
+ * Variable length array must have > 0 length; the last element is never
+ * touched (it's just included to satisfy the no-zero-length rule).
+ */
+ VARIABLE_ARRAY(emap_batch_lookup_result_t, item_edata, nflush + 1);
+ tcache_bin_flush_edatas_lookup(tsd, ptrs, binind, nflush, item_edata);
+
+ /*
+ * The slabs where we freed the last remaining object in the slab (and
+ * so need to free the slab itself).
+ * Used only if small == true.
+ */
+ unsigned dalloc_count = 0;
+ VARIABLE_ARRAY(edata_t *, dalloc_slabs, nflush + 1);
+
+ /*
+ * There's an edge case where we need to deallocate more slabs than we
+ * have elements of dalloc_slabs. This can happen if we end up
+ * deallocating items batched by another thread in addition to ones
+ * flushed from the cache. Since this is not very likely (most small
+ * object deallocations don't free up a whole slab), we don't want to
+ * burn the stack space to keep those excess slabs in an array. Instead
+ * we'll maintain an overflow list.
+ */
+ edata_list_active_t dalloc_slabs_extra;
+ edata_list_active_init(&dalloc_slabs_extra);
+
+ /*
+ * We're about to grab a bunch of locks. If one of them happens to be
+ * the one guarding the arena-level stats counters we flush our
+ * thread-local ones to, we do so under one critical section.
+ */
+ bool merged_stats = false;
+
+ /*
+ * We maintain the invariant that all edatas yet to be flushed are
+ * contained in the half-open range [flush_start, flush_end). We'll
+ * repeatedly partition the array so that the unflushed items are at the
+ * end.
+ */
+ unsigned flush_start = 0;
+
+ while (flush_start < nflush) {
+ /*
+ * After our partitioning step, all objects to flush will be in
+ * the half-open range [prev_flush_start, flush_start), and
+ * flush_start will be updated to correspond to the next loop
+ * iteration.
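+ *
+ * A small trace: with per-object (arena, shard) owners [A, B, A, A, C],
+ * the first pass swaps the A's forward to leave [A, A, A | B, C] with
+ * flush_start == 3, flushes the A run under a single bin lock, and the
+ * following iterations repeat the same game on the [B, C] tail.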
+ */
+ unsigned prev_flush_start = flush_start;
+
+ edata_t *cur_edata = item_edata[flush_start].edata;
+ unsigned cur_arena_ind = edata_arena_ind_get(cur_edata);
+ arena_t *cur_arena = arena_get(tsdn, cur_arena_ind, false);
+
+ unsigned cur_binshard = edata_binshard_get(cur_edata);
+ bin_t *cur_bin = arena_get_bin(cur_arena, binind,
+ cur_binshard);
+ assert(cur_binshard < bin_infos[binind].n_shards);
+
+ /*
+ * Start off the partition; item_edata[i] always matches itself
+ * of course.
+ */
+ flush_start++;
+ for (unsigned i = flush_start; i < nflush; i++) {
+ void *ptr = ptrs->ptr[i];
+ edata_t *edata = item_edata[i].edata;
+ assert(ptr != NULL && edata != NULL);
+ assert((uintptr_t)ptr >= (uintptr_t)edata_addr_get(edata));
+ assert((uintptr_t)ptr < (uintptr_t)edata_past_get(edata));
+ if (edata_arena_ind_get(edata) == cur_arena_ind
+ && edata_binshard_get(edata) == cur_binshard) {
+ /* Swap the edatas. */
+ emap_batch_lookup_result_t temp_edata
+ = item_edata[flush_start];
+ item_edata[flush_start] = item_edata[i];
+ item_edata[i] = temp_edata;
+ /* Swap the pointers */
+ void *temp_ptr = ptrs->ptr[flush_start];
+ ptrs->ptr[flush_start] = ptrs->ptr[i];
+ ptrs->ptr[i] = temp_ptr;
+ flush_start++;
+ }
+ }
+ /* Make sure we implemented partitioning correctly. */
+ if (config_debug) {
+ for (unsigned i = prev_flush_start; i < flush_start;
+ i++) {
+ edata_t *edata = item_edata[i].edata;
+ unsigned arena_ind = edata_arena_ind_get(edata);
+ assert(arena_ind == cur_arena_ind);
+ unsigned binshard = edata_binshard_get(edata);
+ assert(binshard == cur_binshard);
+ }
+ for (unsigned i = flush_start; i < nflush; i++) {
+ edata_t *edata = item_edata[i].edata;
+ assert(edata_arena_ind_get(edata)
+ != cur_arena_ind
+ || edata_binshard_get(edata)
+ != cur_binshard);
+ }
+ }
+
+ /*
+ * We never batch when flushing to our home-base bin shard,
+ * since it's likely that we'll have to acquire that lock anyway
+ * when flushing stats.
+ *
+ * A plausible check we could add to can_batch is
+ * '&& arena_is_auto(cur_arena)'. The motivation would be that
+ * we have a higher tolerance for dubious user assumptions
+ * around non-auto arenas (e.g. "if I deallocate every object I
+ * allocated, and then call tcache.flush, then the arena stats
+ * must reflect zero live allocations").
+ *
+ * This is dubious for a couple reasons:
+ * - We already don't provide perfect fidelity for stats
+ * counting (e.g. for profiled allocations, whose size can
+ * inflate in stats).
+ * - Hanging load-bearing guarantees around stats impedes
+ * scalability in general.
+ *
+ * There are some "complete" strategies we could do instead:
+ * - Add an arena.<i>.quiesce call to pop all bins for users who
+ * do want those stats accounted for.
+ * - Make batchability a user-controllable per-arena option.
+ * - Do a batch pop after every mutex acquisition for which we
+ * want to provide accurate stats. This gives perfectly
+ * accurate stats, but can cause weird performance effects
+ * (because doing stats collection can now result in slabs
+ * becoming empty, and therefore purging, large mutex
+ * acquisition, etc.).
+ * - Propagate the "why" behind a flush down to the level of the
+ * batcher, and include a batch pop attempt down full tcache
+ * flushing pathways. This is just a lot of plumbing and
+ * internal complexity.
+ *
+ * We don't do any of these right now, but the decision calculus
+ * and tradeoffs are subtle enough that the reasoning was worth
+ * leaving in this comment.
+ */
+ bool bin_is_batched = arena_bin_has_batch(binind);
+ bool home_binshard = (cur_arena == tcache_arena
+ && cur_binshard == tcache_binshard);
+ bool can_batch = (flush_start - prev_flush_start
+ <= opt_bin_info_remote_free_max_batch)
+ && !home_binshard && bin_is_batched;
+
+ /*
+ * We try to avoid the batching pathway if we can, so we always
+ * at least *try* to lock.
+ */
+ bool locked = false;
+ bool batched = false;
+ bool batch_failed = false;
+ if (can_batch) {
+ locked = !malloc_mutex_trylock(tsdn, &cur_bin->lock);
+ }
+ if (can_batch && !locked) {
+ bin_with_batch_t *batched_bin =
+ (bin_with_batch_t *)cur_bin;
+ size_t push_idx = batcher_push_begin(tsdn,
+ &batched_bin->remote_frees,
+ flush_start - prev_flush_start);
+ bin_batching_test_after_push(push_idx);
+
+ if (push_idx != BATCHER_NO_IDX) {
+ batched = true;
+ unsigned nbatched
+ = flush_start - prev_flush_start;
+ for (unsigned i = 0; i < nbatched; i++) {
+ unsigned src_ind = prev_flush_start + i;
+ batched_bin->remote_free_data[
+ push_idx + i].ptr
+ = ptrs->ptr[src_ind];
+ batched_bin->remote_free_data[
+ push_idx + i].slab
+ = item_edata[src_ind].edata;
+ }
+ batcher_push_end(tsdn,
+ &batched_bin->remote_frees);
+ } else {
+ batch_failed = true;
+ }
+ }
+ if (!batched) {
+ if (!locked) {
+ malloc_mutex_lock(tsdn, &cur_bin->lock);
+ }
+ /*
+ * Unlike other stats (which only ever get flushed into
+ * a tcache's associated arena), batch_failed counts get
+ * accumulated into the bin where the push attempt
+ * failed.
+ */
+ if (config_stats && batch_failed) {
+ cur_bin->stats.batch_failed_pushes++;
+ }
+
+ /*
+ * Flush stats first, if that was the right lock. Note
+ * that we don't actually have to flush stats into the
+ * current thread's binshard. Flushing into any binshard
+ * in the same arena is enough; we don't expose stats on
+ * per-binshard basis (just per-bin).
+ */
+ if (config_stats && tcache_arena == cur_arena
+ && !merged_stats) {
+ merged_stats = true;
+ cur_bin->stats.nflushes++;
+ cur_bin->stats.nrequests +=
+ cache_bin->tstats.nrequests;
+ cache_bin->tstats.nrequests = 0;
+ }
+ unsigned preallocated_slabs = nflush;
+ unsigned ndalloc_slabs = arena_bin_batch_get_ndalloc_slabs(
+ preallocated_slabs);
+
+ /* Next, flush our own objects. */
+ /* Init only to avoid used-uninitialized warning. */
+ arena_dalloc_bin_locked_info_t dalloc_bin_info = {0};
+ arena_dalloc_bin_locked_begin(&dalloc_bin_info, binind);
+ for (unsigned i = prev_flush_start; i < flush_start;
+ i++) {
+ void *ptr = ptrs->ptr[i];
+ edata_t *edata = item_edata[i].edata;
+ arena_dalloc_bin_locked_step(tsdn, cur_arena,
+ cur_bin, &dalloc_bin_info, binind, edata,
+ ptr, dalloc_slabs, ndalloc_slabs,
+ &dalloc_count, &dalloc_slabs_extra);
+ }
+ /*
+ * Lastly, flush any batched objects (from other
+ * threads).
+ */
+ if (bin_is_batched) {
+ arena_bin_flush_batch_impl(tsdn, cur_arena,
+ cur_bin, &dalloc_bin_info, binind,
+ dalloc_slabs, ndalloc_slabs,
+ &dalloc_count, &dalloc_slabs_extra);
+ }
+
+ arena_dalloc_bin_locked_finish(tsdn, cur_arena, cur_bin,
+ &dalloc_bin_info);
+ malloc_mutex_unlock(tsdn, &cur_bin->lock);
+ }
+ arena_decay_ticks(tsdn, cur_arena,
+ flush_start - prev_flush_start);
+ }
+
+ /* Handle all deferred slab dalloc.
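+ *
+ * These are the slabs that became empty during the flush. Freeing them
+ * was deferred until after the bin locks were dropped, since a slab
+ * dalloc can reach back into arena-level machinery (decay, purging)
+ * that we'd rather not enter while holding a bin lock.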
*/ + for (unsigned i = 0; i < dalloc_count; i++) { + edata_t *slab = dalloc_slabs[i]; + arena_slab_dalloc(tsdn, arena_get_from_edata(slab), slab); + } + while (!edata_list_active_empty(&dalloc_slabs_extra)) { + edata_t *slab = edata_list_active_first(&dalloc_slabs_extra); + edata_list_active_remove(&dalloc_slabs_extra, slab); + arena_slab_dalloc(tsdn, arena_get_from_edata(slab), slab); + } + + if (config_stats && !merged_stats) { + /* + * The flush loop didn't happen to flush to this + * thread's arena, so the stats didn't get merged. + * Manually do so now. + */ + bin_t *bin = arena_bin_choose(tsdn, tcache_arena, + binind, NULL); + malloc_mutex_lock(tsdn, &bin->lock); + bin->stats.nflushes++; + bin->stats.nrequests += cache_bin->tstats.nrequests; + cache_bin->tstats.nrequests = 0; + malloc_mutex_unlock(tsdn, &bin->lock); + } +} + +JEMALLOC_ALWAYS_INLINE void +tcache_bin_flush_impl_large(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin, + szind_t binind, cache_bin_ptr_array_t *ptrs, unsigned nflush) { + tcache_slow_t *tcache_slow = tcache->tcache_slow; + /* + * A couple lookup calls take tsdn; declare it once for convenience + * instead of calling tsd_tsdn(tsd) all the time. + */ + tsdn_t *tsdn = tsd_tsdn(tsd); + + assert(binind < tcache_nbins_get(tcache_slow)); + arena_t *tcache_arena = tcache_slow->arena; + assert(tcache_arena != NULL); + + /* + * Variable length array must have > 0 length; the last element is never + * touched (it's just included to satisfy the no-zero-length rule). + */ + VARIABLE_ARRAY(emap_batch_lookup_result_t, item_edata, nflush + 1); + tcache_bin_flush_edatas_lookup(tsd, ptrs, binind, nflush, item_edata); + + /* + * We're about to grab a bunch of locks. If one of them happens to be + * the one guarding the arena-level stats counters we flush our + * thread-local ones to, we do so under one critical section. + */ + bool merged_stats = false; + while (nflush > 0) { + /* Lock the arena, or bin, associated with the first object. */ + edata_t *edata = item_edata[0].edata; + unsigned cur_arena_ind = edata_arena_ind_get(edata); + arena_t *cur_arena = arena_get(tsdn, cur_arena_ind, false); + + if (!arena_is_auto(cur_arena)) { + malloc_mutex_lock(tsdn, &cur_arena->large_mtx); + } + + /* + * If we acquired the right lock and have some stats to flush, + * flush them. + */ + if (config_stats && tcache_arena == cur_arena + && !merged_stats) { + merged_stats = true; + arena_stats_large_flush_nrequests_add(tsdn, + &tcache_arena->stats, binind, + cache_bin->tstats.nrequests); + cache_bin->tstats.nrequests = 0; + } + + /* + * Large allocations need special prep done. Afterwards, we can + * drop the large lock. + */ + for (unsigned i = 0; i < nflush; i++) { + void *ptr = ptrs->ptr[i]; + edata = item_edata[i].edata; + assert(ptr != NULL && edata != NULL); + + if (edata_arena_ind_get(edata) == cur_arena_ind) { + large_dalloc_prep_locked(tsdn, + edata); + } + } + if (!arena_is_auto(cur_arena)) { + malloc_mutex_unlock(tsdn, &cur_arena->large_mtx); + } + + /* Deallocate whatever we can. */ + unsigned ndeferred = 0; + for (unsigned i = 0; i < nflush; i++) { + void *ptr = ptrs->ptr[i]; + edata = item_edata[i].edata; + assert(ptr != NULL && edata != NULL); + if (edata_arena_ind_get(edata) != cur_arena_ind) { + /* + * The object was allocated either via a + * different arena, or a different bin in this + * arena. Either way, stash the object so that + * it can be handled in a future pass. 
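+ *
+ * E.g. flushing large objects owned by arenas [A, B, A]: the first
+ * pass locks A, finishes both A objects, and compacts B to the front
+ * (ndeferred == 1); the second pass then drains B, so each distinct
+ * source arena costs one trip around the while loop.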
+ */ + ptrs->ptr[ndeferred] = ptr; + item_edata[ndeferred].edata = edata; + ndeferred++; + continue; + } + if (large_dalloc_safety_checks(edata, ptr, binind)) { + /* See the comment in isfree. */ + continue; + } + large_dalloc_finish(tsdn, edata); + } + arena_decay_ticks(tsdn, cur_arena, nflush - ndeferred); + nflush = ndeferred; + } + + if (config_stats && !merged_stats) { + arena_stats_large_flush_nrequests_add(tsdn, + &tcache_arena->stats, binind, + cache_bin->tstats.nrequests); + cache_bin->tstats.nrequests = 0; + } +} + +JEMALLOC_ALWAYS_INLINE void +tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin, + szind_t binind, cache_bin_ptr_array_t *ptrs, unsigned nflush, bool small) { + assert(ptrs != NULL && ptrs->ptr != NULL); + unsigned nflush_batch, nflushed = 0; + cache_bin_ptr_array_t ptrs_batch; + do { + nflush_batch = nflush - nflushed; + if (nflush_batch > CACHE_BIN_NFLUSH_BATCH_MAX) { + nflush_batch = CACHE_BIN_NFLUSH_BATCH_MAX; + } + assert(nflush_batch <= CACHE_BIN_NFLUSH_BATCH_MAX); + (&ptrs_batch)->n = (cache_bin_sz_t)nflush_batch; + (&ptrs_batch)->ptr = ptrs->ptr + nflushed; + /* + * The small/large flush logic is very similar; you might conclude that + * it's a good opportunity to share code. We've tried this, and by and + * large found this to obscure more than it helps; there are so many + * fiddly bits around things like stats handling, precisely when and + * which mutexes are acquired, etc., that almost all code ends up being + * gated behind 'if (small) { ... } else { ... }'. Even though the + * '...' is morally equivalent, the code itself needs slight tweaks. + */ + if (small) { + tcache_bin_flush_impl_small(tsd, tcache, cache_bin, binind, + &ptrs_batch, nflush_batch); + } else { + tcache_bin_flush_impl_large(tsd, tcache, cache_bin, binind, + &ptrs_batch, nflush_batch); + } + nflushed += nflush_batch; + } while (nflushed < nflush); + assert(nflush == nflushed); + assert((ptrs->ptr + nflush) == ((&ptrs_batch)->ptr + nflush_batch)); +} + +JEMALLOC_ALWAYS_INLINE void +tcache_bin_flush_bottom(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin, + szind_t binind, unsigned rem, bool small) { + assert(rem <= cache_bin_ncached_max_get(cache_bin)); + assert(!tcache_bin_disabled(binind, cache_bin, tcache->tcache_slow)); + cache_bin_sz_t orig_nstashed = cache_bin_nstashed_get_local(cache_bin); + tcache_bin_flush_stashed(tsd, tcache, cache_bin, binind, small); + + cache_bin_sz_t ncached = cache_bin_ncached_get_local(cache_bin); + assert((cache_bin_sz_t)rem <= ncached + orig_nstashed); + if ((cache_bin_sz_t)rem > ncached) { + /* + * The flush_stashed above could have done enough flushing, if + * there were many items stashed. Validate that: 1) non zero + * stashed, and 2) bin stack has available space now. + */ + assert(orig_nstashed > 0); + assert(ncached + cache_bin_nstashed_get_local(cache_bin) + < cache_bin_ncached_max_get(cache_bin)); + /* Still go through the flush logic for stats purpose only. 
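+ *
+ * (With rem clamped to ncached below, nflush works out to zero, so no
+ * items actually move; the call still reaches the !merged_stats tail of
+ * the flush implementation, which merges the thread-local
+ * tstats.nrequests into the arena's bin stats.)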
*/ + rem = ncached; + } + cache_bin_sz_t nflush = ncached - (cache_bin_sz_t)rem; + + CACHE_BIN_PTR_ARRAY_DECLARE(ptrs, nflush); + cache_bin_init_ptr_array_for_flush(cache_bin, &ptrs, nflush); + + tcache_bin_flush_impl(tsd, tcache, cache_bin, binind, &ptrs, nflush, + small); + + cache_bin_finish_flush(cache_bin, &ptrs, nflush); +} + +void +tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin, + szind_t binind, unsigned rem) { + tcache_bin_flush_bottom(tsd, tcache, cache_bin, binind, rem, + /* small */ true); +} + +void +tcache_bin_flush_large(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin, + szind_t binind, unsigned rem) { + tcache_bin_flush_bottom(tsd, tcache, cache_bin, binind, rem, + /* small */ false); +} + +/* + * Flushing stashed happens when 1) tcache fill, 2) tcache flush, or 3) tcache + * GC event. This makes sure that the stashed items do not hold memory for too + * long, and new buffers can only be allocated when nothing is stashed. + * + * The downside is, the time between stash and flush may be relatively short, + * especially when the request rate is high. It lowers the chance of detecting + * write-after-free -- however that is a delayed detection anyway, and is less + * of a focus than the memory overhead. + */ +void +tcache_bin_flush_stashed(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin, + szind_t binind, bool is_small) { + assert(!tcache_bin_disabled(binind, cache_bin, tcache->tcache_slow)); + /* + * The two below are for assertion only. The content of original cached + * items remain unchanged -- the stashed items reside on the other end + * of the stack. Checking the stack head and ncached to verify. + */ + void *head_content = *cache_bin->stack_head; + cache_bin_sz_t orig_cached = cache_bin_ncached_get_local(cache_bin); + + cache_bin_sz_t nstashed = cache_bin_nstashed_get_local(cache_bin); + assert(orig_cached + nstashed <= cache_bin_ncached_max_get(cache_bin)); + if (nstashed == 0) { + return; + } + + CACHE_BIN_PTR_ARRAY_DECLARE(ptrs, nstashed); + cache_bin_init_ptr_array_for_stashed(cache_bin, binind, &ptrs, + nstashed); + san_check_stashed_ptrs(ptrs.ptr, nstashed, sz_index2size(binind)); + tcache_bin_flush_impl(tsd, tcache, cache_bin, binind, &ptrs, nstashed, + is_small); + cache_bin_finish_flush_stashed(cache_bin); + + assert(cache_bin_nstashed_get_local(cache_bin) == 0); + assert(cache_bin_ncached_get_local(cache_bin) == orig_cached); + assert(head_content == *cache_bin->stack_head); +} + +JET_EXTERN bool +tcache_get_default_ncached_max_set(szind_t ind) { + return opt_tcache_ncached_max_set[ind]; +} + +JET_EXTERN const cache_bin_info_t * +tcache_get_default_ncached_max(void) { + return opt_tcache_ncached_max; +} + +bool +tcache_bin_ncached_max_read(tsd_t *tsd, size_t bin_size, + cache_bin_sz_t *ncached_max) { + if (bin_size > TCACHE_MAXCLASS_LIMIT) { + return true; + } + + if (!tcache_available(tsd)) { + *ncached_max = 0; + return false; + } + + tcache_t *tcache = tsd_tcachep_get(tsd); + assert(tcache != NULL); + szind_t bin_ind = sz_size2index(bin_size); + + cache_bin_t *bin = &tcache->bins[bin_ind]; + *ncached_max = tcache_bin_disabled(bin_ind, bin, tcache->tcache_slow) ? + 0: cache_bin_ncached_max_get(bin); + return false; +} + +void +tcache_arena_associate(tsdn_t *tsdn, tcache_slow_t *tcache_slow, + tcache_t *tcache, arena_t *arena) { + assert(tcache_slow->arena == NULL); + tcache_slow->arena = arena; + + if (config_stats) { + /* Link into list of extant tcaches. 
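+ *
+ * Stats builds are the only consumers of this bookkeeping: the list and
+ * the cache bin array descriptors let a later stats walk (and the
+ * tcache_stats_merge in tcache_arena_dissociate) find every live thread
+ * cache attached to the arena.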
*/ + malloc_mutex_lock(tsdn, &arena->tcache_ql_mtx); + + ql_elm_new(tcache_slow, link); + ql_tail_insert(&arena->tcache_ql, tcache_slow, link); + cache_bin_array_descriptor_init( + &tcache_slow->cache_bin_array_descriptor, tcache->bins); + ql_tail_insert(&arena->cache_bin_array_descriptor_ql, + &tcache_slow->cache_bin_array_descriptor, link); + + malloc_mutex_unlock(tsdn, &arena->tcache_ql_mtx); + } +} + +static void +tcache_arena_dissociate(tsdn_t *tsdn, tcache_slow_t *tcache_slow, + tcache_t *tcache) { + arena_t *arena = tcache_slow->arena; + assert(arena != NULL); + if (config_stats) { + /* Unlink from list of extant tcaches. */ + malloc_mutex_lock(tsdn, &arena->tcache_ql_mtx); + if (config_debug) { + bool in_ql = false; + tcache_slow_t *iter; + ql_foreach(iter, &arena->tcache_ql, link) { + if (iter == tcache_slow) { + in_ql = true; + break; + } + } + assert(in_ql); + } + ql_remove(&arena->tcache_ql, tcache_slow, link); + ql_remove(&arena->cache_bin_array_descriptor_ql, + &tcache_slow->cache_bin_array_descriptor, link); + tcache_stats_merge(tsdn, tcache_slow->tcache, arena); + malloc_mutex_unlock(tsdn, &arena->tcache_ql_mtx); + } + tcache_slow->arena = NULL; +} + +void +tcache_arena_reassociate(tsdn_t *tsdn, tcache_slow_t *tcache_slow, + tcache_t *tcache, arena_t *arena) { + tcache_arena_dissociate(tsdn, tcache_slow, tcache); + tcache_arena_associate(tsdn, tcache_slow, tcache, arena); +} + +static void +tcache_default_settings_init(tcache_slow_t *tcache_slow) { + assert(tcache_slow != NULL); + assert(global_do_not_change_tcache_maxclass != 0); + assert(global_do_not_change_tcache_nbins != 0); + tcache_slow->tcache_nbins = global_do_not_change_tcache_nbins; +} + +static void +tcache_init(tsd_t *tsd, tcache_slow_t *tcache_slow, tcache_t *tcache, + void *mem, const cache_bin_info_t *tcache_bin_info) { + tcache->tcache_slow = tcache_slow; + tcache_slow->tcache = tcache; + + memset(&tcache_slow->link, 0, sizeof(ql_elm(tcache_t))); + tcache_slow->next_gc_bin = 0; + tcache_slow->arena = NULL; + tcache_slow->dyn_alloc = mem; + + /* + * We reserve cache bins for all small size classes, even if some may + * not get used (i.e. bins higher than tcache_nbins). This allows + * the fast and common paths to access cache bin metadata safely w/o + * worrying about which ones are disabled. + */ + unsigned tcache_nbins = tcache_nbins_get(tcache_slow); + size_t cur_offset = 0; + cache_bin_preincrement(tcache_bin_info, tcache_nbins, mem, + &cur_offset); + for (unsigned i = 0; i < tcache_nbins; i++) { + if (i < SC_NBINS) { + tcache_slow->lg_fill_div[i] = 1; + tcache_slow->bin_refilled[i] = false; + tcache_slow->bin_flush_delay_items[i] + = tcache_gc_item_delay_compute(i); + } + cache_bin_t *cache_bin = &tcache->bins[i]; + if (tcache_bin_info[i].ncached_max > 0) { + cache_bin_init(cache_bin, &tcache_bin_info[i], mem, + &cur_offset); + } else { + cache_bin_init_disabled(cache_bin, + tcache_bin_info[i].ncached_max); + } + } + /* + * Initialize all disabled bins to a state that can safely and + * efficiently fail all fastpath alloc / free, so that no additional + * check around tcache_nbins is needed on fastpath. Yet we still + * store the ncached_max in the bin_info for future usage. 
+ */ + for (unsigned i = tcache_nbins; i < TCACHE_NBINS_MAX; i++) { + cache_bin_t *cache_bin = &tcache->bins[i]; + cache_bin_init_disabled(cache_bin, + tcache_bin_info[i].ncached_max); + assert(tcache_bin_disabled(i, cache_bin, tcache->tcache_slow)); + } + + cache_bin_postincrement(mem, &cur_offset); + if (config_debug) { + /* Sanity check that the whole stack is used. */ + size_t size, alignment; + cache_bin_info_compute_alloc(tcache_bin_info, tcache_nbins, + &size, &alignment); + assert(cur_offset == size); + } +} + +static inline unsigned +tcache_ncached_max_compute(szind_t szind) { + if (szind >= SC_NBINS) { + return opt_tcache_nslots_large; + } + unsigned slab_nregs = bin_infos[szind].nregs; + + /* We may modify these values; start with the opt versions. */ + unsigned nslots_small_min = opt_tcache_nslots_small_min; + unsigned nslots_small_max = opt_tcache_nslots_small_max; + + /* + * Clamp values to meet our constraints -- even, nonzero, min < max, and + * suitable for a cache bin size. + */ + if (opt_tcache_nslots_small_max > CACHE_BIN_NCACHED_MAX) { + nslots_small_max = CACHE_BIN_NCACHED_MAX; + } + if (nslots_small_min % 2 != 0) { + nslots_small_min++; + } + if (nslots_small_max % 2 != 0) { + nslots_small_max--; + } + if (nslots_small_min < 2) { + nslots_small_min = 2; + } + if (nslots_small_max < 2) { + nslots_small_max = 2; + } + if (nslots_small_min > nslots_small_max) { + nslots_small_min = nslots_small_max; + } + + unsigned candidate; + if (opt_lg_tcache_nslots_mul < 0) { + candidate = slab_nregs >> (-opt_lg_tcache_nslots_mul); + } else { + candidate = slab_nregs << opt_lg_tcache_nslots_mul; + } + if (candidate % 2 != 0) { + /* + * We need the candidate size to be even -- we assume that we + * can divide by two and get a positive number (e.g. when + * flushing). + */ + ++candidate; + } + if (candidate <= nslots_small_min) { + return nslots_small_min; + } else if (candidate <= nslots_small_max) { + return candidate; + } else { + return nslots_small_max; + } +} + +JET_EXTERN void +tcache_bin_info_compute(cache_bin_info_t tcache_bin_info[TCACHE_NBINS_MAX]) { + /* + * Compute the values for each bin, but for bins with indices larger + * than tcache_nbins, no items will be cached. + */ + for (szind_t i = 0; i < TCACHE_NBINS_MAX; i++) { + unsigned ncached_max = tcache_get_default_ncached_max_set(i) ? + (unsigned)tcache_get_default_ncached_max()[i].ncached_max: + tcache_ncached_max_compute(i); + assert(ncached_max <= CACHE_BIN_NCACHED_MAX); + cache_bin_info_init(&tcache_bin_info[i], + (cache_bin_sz_t)ncached_max); + } +} + +static bool +tsd_tcache_data_init_impl(tsd_t *tsd, arena_t *arena, + const cache_bin_info_t *tcache_bin_info) { + tcache_slow_t *tcache_slow = tsd_tcache_slowp_get_unsafe(tsd); + tcache_t *tcache = tsd_tcachep_get_unsafe(tsd); + + assert(cache_bin_still_zero_initialized(&tcache->bins[0])); + unsigned tcache_nbins = tcache_nbins_get(tcache_slow); + size_t size, alignment; + cache_bin_info_compute_alloc(tcache_bin_info, tcache_nbins, + &size, &alignment); + + void *mem; + if (cache_bin_stack_use_thp()) { + /* Alignment is ignored since it comes from THP. */ + assert(alignment == QUANTUM); + mem = b0_alloc_tcache_stack(tsd_tsdn(tsd), size); + } else { + size = sz_sa2u(size, alignment); + mem = ipallocztm(tsd_tsdn(tsd), size, alignment, true, NULL, + true, arena_get(TSDN_NULL, 0, true)); + } + if (mem == NULL) { + return true; + } + + tcache_init(tsd, tcache_slow, tcache, mem, tcache_bin_info); + /* + * Initialization is a bit tricky here. 
After malloc init is done, all
+ * threads can rely on arena_choose and associate tcache accordingly.
+ * However, the thread that does actual malloc bootstrapping relies on
+ * functional tsd, and it can only rely on a0. In that case, we
+ * associate its tcache to a0 temporarily, and later on
+ * arena_choose_hard() will re-associate properly.
+ */
+ tcache_slow->arena = NULL;
+ if (!malloc_initialized()) {
+ /* If in initialization, assign to a0. */
+ arena = arena_get(tsd_tsdn(tsd), 0, false);
+ tcache_arena_associate(tsd_tsdn(tsd), tcache_slow, tcache,
+ arena);
+ } else {
+ if (arena == NULL) {
+ arena = arena_choose(tsd, NULL);
+ }
+ /* This may happen if thread.tcache.enabled is used. */
+ if (tcache_slow->arena == NULL) {
+ tcache_arena_associate(tsd_tsdn(tsd), tcache_slow,
+ tcache, arena);
+ }
+ }
+ assert(arena == tcache_slow->arena);
+
+ return false;
+}
+
+/* Initialize auto tcache (embedded in TSD). */
+static bool
+tsd_tcache_data_init(tsd_t *tsd, arena_t *arena,
+ const cache_bin_info_t tcache_bin_info[TCACHE_NBINS_MAX]) {
+ assert(tcache_bin_info != NULL);
+ return tsd_tcache_data_init_impl(tsd, arena, tcache_bin_info);
+}
+
+/* Create a manual tcache for the tcache.create mallctl. */
+tcache_t *
+tcache_create_explicit(tsd_t *tsd) {
+ /*
+ * We place the cache bin stacks, then the tcache_t, then a pointer to
+ * the beginning of the whole allocation (for freeing). This makes sure
+ * the cache bins have the requested alignment.
+ */
+ unsigned tcache_nbins = global_do_not_change_tcache_nbins;
+ size_t tcache_size, alignment;
+ cache_bin_info_compute_alloc(tcache_get_default_ncached_max(),
+ tcache_nbins, &tcache_size, &alignment);
+
+ size_t size = tcache_size + sizeof(tcache_t)
+ + sizeof(tcache_slow_t);
+ /* Naturally align the pointer stacks. */
+ size = PTR_CEILING(size);
+ size = sz_sa2u(size, alignment);
+
+ void *mem = ipallocztm(tsd_tsdn(tsd), size, alignment,
+ true, NULL, true, arena_get(TSDN_NULL, 0, true));
+ if (mem == NULL) {
+ return NULL;
+ }
+ tcache_t *tcache = (void *)((byte_t *)mem + tcache_size);
+ tcache_slow_t *tcache_slow =
+ (void *)((byte_t *)mem + tcache_size + sizeof(tcache_t));
+ tcache_default_settings_init(tcache_slow);
+ tcache_init(tsd, tcache_slow, tcache, mem,
+ tcache_get_default_ncached_max());
+
+ tcache_arena_associate(tsd_tsdn(tsd), tcache_slow, tcache,
+ arena_ichoose(tsd, NULL));
+
+ return tcache;
+}
+
+bool
+tsd_tcache_enabled_data_init(tsd_t *tsd) {
+ /* Called upon tsd initialization. */
+ tsd_tcache_enabled_set(tsd, opt_tcache);
+ /*
+ * tcache is not available yet, but we need to set up its tcache_nbins
+ * in advance.
+ */
+ tcache_default_settings_init(tsd_tcache_slowp_get(tsd));
+ tsd_slow_update(tsd);
+
+ if (opt_tcache) {
+ /* Trigger tcache init. */
+ tsd_tcache_data_init(tsd, NULL,
+ tcache_get_default_ncached_max());
+ }
+
+ return false;
+}
+
+void
+tcache_enabled_set(tsd_t *tsd, bool enabled) {
+ bool was_enabled = tsd_tcache_enabled_get(tsd);
+
+ if (!was_enabled && enabled) {
+ tsd_tcache_data_init(tsd, NULL,
+ tcache_get_default_ncached_max());
+ } else if (was_enabled && !enabled) {
+ tcache_cleanup(tsd);
+ }
+ /* Commit the state last. Above calls check current state.
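+ *
+ * (tcache_cleanup and tsd_tcache_data_init consult the old flag value,
+ * e.g. through tcache_available(), so flipping it before making the
+ * calls above would have them observe the new state instead of the one
+ * being torn down or built up.)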
+void
+thread_tcache_max_set(tsd_t *tsd, size_t tcache_max) {
+	assert(tcache_max <= TCACHE_MAXCLASS_LIMIT);
+	assert(tcache_max == sz_s2u(tcache_max));
+	tcache_t *tcache = tsd_tcachep_get(tsd);
+	tcache_slow_t *tcache_slow = tcache->tcache_slow;
+	cache_bin_info_t tcache_bin_info[TCACHE_NBINS_MAX] = {{0}};
+	assert(tcache != NULL && tcache_slow != NULL);
+
+	bool enabled = tcache_available(tsd);
+	arena_t *assigned_arena;
+	if (enabled) {
+		assigned_arena = tcache_slow->arena;
+		/* Carry over the bin settings during the reboot. */
+		tcache_bin_settings_backup(tcache, tcache_bin_info);
+		/* Shut down and reboot the tcache for a clean slate. */
+		tcache_cleanup(tsd);
+	}
+
+	/*
+	 * Still set tcache_nbins of the tcache even if the tcache is not
+	 * available yet because the values are stored in tsd_t and are
+	 * always available for changing.
+	 */
+	tcache_max_set(tcache_slow, tcache_max);
+
+	if (enabled) {
+		tsd_tcache_data_init(tsd, assigned_arena, tcache_bin_info);
+	}
+
+	assert(tcache_nbins_get(tcache_slow) == sz_size2index(tcache_max) + 1);
+}
+
+static bool
+tcache_bin_info_settings_parse(const char *bin_settings_segment_cur,
+    size_t len_left, cache_bin_info_t tcache_bin_info[TCACHE_NBINS_MAX],
+    bool bin_info_is_set[TCACHE_NBINS_MAX]) {
+	do {
+		size_t size_start, size_end;
+		size_t ncached_max;
+		bool err = multi_setting_parse_next(&bin_settings_segment_cur,
+		    &len_left, &size_start, &size_end, &ncached_max);
+		if (err) {
+			return true;
+		}
+		if (size_end > TCACHE_MAXCLASS_LIMIT) {
+			size_end = TCACHE_MAXCLASS_LIMIT;
+		}
+		if (size_start > TCACHE_MAXCLASS_LIMIT ||
+		    size_start > size_end) {
+			continue;
+		}
+		/* May get called before sz_init (during malloc_conf_init). */
+		szind_t bin_start = sz_size2index_compute(size_start);
+		szind_t bin_end = sz_size2index_compute(size_end);
+		if (ncached_max > CACHE_BIN_NCACHED_MAX) {
+			ncached_max = (size_t)CACHE_BIN_NCACHED_MAX;
+		}
+		for (szind_t i = bin_start; i <= bin_end; i++) {
+			cache_bin_info_init(&tcache_bin_info[i],
+			    (cache_bin_sz_t)ncached_max);
+			if (bin_info_is_set != NULL) {
+				bin_info_is_set[i] = true;
+			}
+		}
+	} while (len_left > 0);
+
+	return false;
+}
+
+bool
+tcache_bin_info_default_init(const char *bin_settings_segment_cur,
+    size_t len_left) {
+	return tcache_bin_info_settings_parse(bin_settings_segment_cur,
+	    len_left, opt_tcache_ncached_max, opt_tcache_ncached_max_set);
+}
+
+bool
+tcache_bins_ncached_max_write(tsd_t *tsd, char *settings, size_t len) {
+	assert(tcache_available(tsd));
+	assert(len != 0);
+	tcache_t *tcache = tsd_tcachep_get(tsd);
+	assert(tcache != NULL);
+	cache_bin_info_t tcache_bin_info[TCACHE_NBINS_MAX];
+	tcache_bin_settings_backup(tcache, tcache_bin_info);
+
+	if (tcache_bin_info_settings_parse(settings, len, tcache_bin_info,
+	    NULL)) {
+		return true;
+	}
+
+	arena_t *assigned_arena = tcache->tcache_slow->arena;
+	tcache_cleanup(tsd);
+	tsd_tcache_data_init(tsd, assigned_arena, tcache_bin_info);
+
+	return false;
+}
+
+static void
+tcache_flush_cache(tsd_t *tsd, tcache_t *tcache) {
+	tcache_slow_t *tcache_slow = tcache->tcache_slow;
+	assert(tcache_slow->arena != NULL);
+
+	for (unsigned i = 0; i < tcache_nbins_get(tcache_slow); i++) {
+		cache_bin_t *cache_bin = &tcache->bins[i];
+		if (tcache_bin_disabled(i, cache_bin, tcache_slow)) {
+			continue;
+		}
+		if (i < SC_NBINS) {
+			tcache_bin_flush_small(tsd, tcache, cache_bin, i, 0);
+		} else {
+			tcache_bin_flush_large(tsd, tcache, cache_bin, i, 0);
+		}
+		if (config_stats) {
assert(cache_bin->tstats.nrequests == 0); + } + } +} + +void +tcache_flush(tsd_t *tsd) { + assert(tcache_available(tsd)); + tcache_flush_cache(tsd, tsd_tcachep_get(tsd)); +} + +static void +tcache_destroy(tsd_t *tsd, tcache_t *tcache, bool tsd_tcache) { + tcache_slow_t *tcache_slow = tcache->tcache_slow; + tcache_flush_cache(tsd, tcache); + arena_t *arena = tcache_slow->arena; + tcache_arena_dissociate(tsd_tsdn(tsd), tcache_slow, tcache); + + if (tsd_tcache) { + cache_bin_t *cache_bin = &tcache->bins[0]; + cache_bin_assert_empty(cache_bin); + } + if (tsd_tcache && cache_bin_stack_use_thp()) { + b0_dalloc_tcache_stack(tsd_tsdn(tsd), tcache_slow->dyn_alloc); + } else { + idalloctm(tsd_tsdn(tsd), tcache_slow->dyn_alloc, NULL, NULL, + true, true); + } + + /* + * The deallocation and tcache flush above may not trigger decay since + * we are on the tcache shutdown path (potentially with non-nominal + * tsd). Manually trigger decay to avoid pathological cases. Also + * include arena 0 because the tcache array is allocated from it. + */ + arena_decay(tsd_tsdn(tsd), arena_get(tsd_tsdn(tsd), 0, false), + false, false); + + if (arena_nthreads_get(arena, false) == 0 && + !background_thread_enabled()) { + /* Force purging when no threads assigned to the arena anymore. */ + arena_decay(tsd_tsdn(tsd), arena, + /* is_background_thread */ false, /* all */ true); + } else { + arena_decay(tsd_tsdn(tsd), arena, + /* is_background_thread */ false, /* all */ false); + } +} + +/* For auto tcache (embedded in TSD) only. */ +void +tcache_cleanup(tsd_t *tsd) { + tcache_t *tcache = tsd_tcachep_get(tsd); + if (!tcache_available(tsd)) { + assert(tsd_tcache_enabled_get(tsd) == false); + assert(cache_bin_still_zero_initialized(&tcache->bins[0])); + return; + } + assert(tsd_tcache_enabled_get(tsd)); + assert(!cache_bin_still_zero_initialized(&tcache->bins[0])); + + tcache_destroy(tsd, tcache, true); + /* Make sure all bins used are reinitialized to the clean state. */ + memset(tcache->bins, 0, sizeof(cache_bin_t) * TCACHE_NBINS_MAX); +} + +void +tcache_stats_merge(tsdn_t *tsdn, tcache_t *tcache, arena_t *arena) { + cassert(config_stats); + + /* Merge and reset tcache stats. 
*/ + for (unsigned i = 0; i < tcache_nbins_get(tcache->tcache_slow); i++) { + cache_bin_t *cache_bin = &tcache->bins[i]; + if (tcache_bin_disabled(i, cache_bin, tcache->tcache_slow)) { + continue; + } + if (i < SC_NBINS) { + bin_t *bin = arena_bin_choose(tsdn, arena, i, NULL); + malloc_mutex_lock(tsdn, &bin->lock); + bin->stats.nrequests += cache_bin->tstats.nrequests; + malloc_mutex_unlock(tsdn, &bin->lock); + } else { + arena_stats_large_flush_nrequests_add(tsdn, + &arena->stats, i, cache_bin->tstats.nrequests); + } + cache_bin->tstats.nrequests = 0; + } +} + +static bool +tcaches_create_prep(tsd_t *tsd, base_t *base) { + bool err; + + malloc_mutex_assert_owner(tsd_tsdn(tsd), &tcaches_mtx); + + if (tcaches == NULL) { + tcaches = base_alloc(tsd_tsdn(tsd), base, + sizeof(tcache_t *) * (MALLOCX_TCACHE_MAX+1), CACHELINE); + if (tcaches == NULL) { + err = true; + goto label_return; + } + } + + if (tcaches_avail == NULL && tcaches_past > MALLOCX_TCACHE_MAX) { + err = true; + goto label_return; + } + + err = false; +label_return: + return err; +} + +bool +tcaches_create(tsd_t *tsd, base_t *base, unsigned *r_ind) { + witness_assert_depth(tsdn_witness_tsdp_get(tsd_tsdn(tsd)), 0); + + bool err; + + malloc_mutex_lock(tsd_tsdn(tsd), &tcaches_mtx); + + if (tcaches_create_prep(tsd, base)) { + err = true; + goto label_return; + } + + tcache_t *tcache = tcache_create_explicit(tsd); + if (tcache == NULL) { + err = true; + goto label_return; + } + + tcaches_t *elm; + if (tcaches_avail != NULL) { + elm = tcaches_avail; + tcaches_avail = tcaches_avail->next; + elm->tcache = tcache; + *r_ind = (unsigned)(elm - tcaches); + } else { + elm = &tcaches[tcaches_past]; + elm->tcache = tcache; + *r_ind = tcaches_past; + tcaches_past++; + } + + err = false; +label_return: + malloc_mutex_unlock(tsd_tsdn(tsd), &tcaches_mtx); + witness_assert_depth(tsdn_witness_tsdp_get(tsd_tsdn(tsd)), 0); + return err; +} + +static tcache_t * +tcaches_elm_remove(tsd_t *tsd, tcaches_t *elm, bool allow_reinit) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &tcaches_mtx); + + if (elm->tcache == NULL) { + return NULL; + } + tcache_t *tcache = elm->tcache; + if (allow_reinit) { + elm->tcache = TCACHES_ELM_NEED_REINIT; + } else { + elm->tcache = NULL; + } + + if (tcache == TCACHES_ELM_NEED_REINIT) { + return NULL; + } + return tcache; +} + +void +tcaches_flush(tsd_t *tsd, unsigned ind) { + malloc_mutex_lock(tsd_tsdn(tsd), &tcaches_mtx); + tcache_t *tcache = tcaches_elm_remove(tsd, &tcaches[ind], true); + malloc_mutex_unlock(tsd_tsdn(tsd), &tcaches_mtx); + if (tcache != NULL) { + /* Destroy the tcache; recreate in tcaches_get() if needed. */ + tcache_destroy(tsd, tcache, false); + } +} + +void +tcaches_destroy(tsd_t *tsd, unsigned ind) { + malloc_mutex_lock(tsd_tsdn(tsd), &tcaches_mtx); + tcaches_t *elm = &tcaches[ind]; + tcache_t *tcache = tcaches_elm_remove(tsd, elm, false); + elm->next = tcaches_avail; + tcaches_avail = elm; + malloc_mutex_unlock(tsd_tsdn(tsd), &tcaches_mtx); + if (tcache != NULL) { + tcache_destroy(tsd, tcache, false); + } +} + +bool +tcache_boot(tsdn_t *tsdn, base_t *base) { + global_do_not_change_tcache_maxclass = sz_s2u(opt_tcache_max); + assert(global_do_not_change_tcache_maxclass <= TCACHE_MAXCLASS_LIMIT); + global_do_not_change_tcache_nbins = + sz_size2index(global_do_not_change_tcache_maxclass) + 1; + /* + * Pre-compute default bin info and store the results in + * opt_tcache_ncached_max. 
After the changes here, + * opt_tcache_ncached_max should not be modified and should always be + * accessed using tcache_get_default_ncached_max. + */ + tcache_bin_info_compute(opt_tcache_ncached_max); + + if (malloc_mutex_init(&tcaches_mtx, "tcaches", WITNESS_RANK_TCACHES, + malloc_mutex_rank_exclusive)) { + return true; + } + + return false; +} + +void +tcache_prefork(tsdn_t *tsdn) { + malloc_mutex_prefork(tsdn, &tcaches_mtx); +} + +void +tcache_postfork_parent(tsdn_t *tsdn) { + malloc_mutex_postfork_parent(tsdn, &tcaches_mtx); +} + +void +tcache_postfork_child(tsdn_t *tsdn) { + malloc_mutex_postfork_child(tsdn, &tcaches_mtx); +} + +void tcache_assert_initialized(tcache_t *tcache) { + assert(!cache_bin_still_zero_initialized(&tcache->bins[0])); +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/test_hooks.c b/src/duckdb/extension/jemalloc/jemalloc/src/test_hooks.c new file mode 100644 index 000000000..40621199d --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/test_hooks.c @@ -0,0 +1,12 @@ +#include "jemalloc/internal/jemalloc_preamble.h" + +/* + * The hooks are a little bit screwy -- they're not genuinely exported in the + * sense that we want them available to end-users, but we do want them visible + * from outside the generated library, so that we can use them in test code. + */ +JEMALLOC_EXPORT +void (*test_hooks_arena_new_hook)(void) = NULL; + +JEMALLOC_EXPORT +void (*test_hooks_libc_hook)(void) = NULL; diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/thread_event.c b/src/duckdb/extension/jemalloc/jemalloc/src/thread_event.c new file mode 100644 index 000000000..37eb5827d --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/thread_event.c @@ -0,0 +1,343 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/thread_event.h" + +/* + * Signatures for event specific functions. These functions should be defined + * by the modules owning each event. The signatures here verify that the + * definitions follow the right format. + * + * The first two are functions computing new / postponed event wait time. New + * event wait time is the time till the next event if an event is currently + * being triggered; postponed event wait time is the time till the next event + * if an event should be triggered but needs to be postponed, e.g. when the TSD + * is not nominal or during reentrancy. + * + * The third is the event handler function, which is called whenever an event + * is triggered. The parameter is the elapsed time since the last time an + * event of the same type was triggered. + */ +#define E(event, condition_unused, is_alloc_event_unused) \ +uint64_t event##_new_event_wait(tsd_t *tsd); \ +uint64_t event##_postponed_event_wait(tsd_t *tsd); \ +void event##_event_handler(tsd_t *tsd, uint64_t elapsed); + +ITERATE_OVER_ALL_EVENTS +#undef E + +/* Signatures for internal functions fetching elapsed time. 
*/ +#define E(event, condition_unused, is_alloc_event_unused) \ +static uint64_t event##_fetch_elapsed(tsd_t *tsd); + +ITERATE_OVER_ALL_EVENTS +#undef E + +static uint64_t +tcache_gc_fetch_elapsed(tsd_t *tsd) { + return TE_INVALID_ELAPSED; +} + +static uint64_t +tcache_gc_dalloc_fetch_elapsed(tsd_t *tsd) { + return TE_INVALID_ELAPSED; +} + +static uint64_t +prof_sample_fetch_elapsed(tsd_t *tsd) { + uint64_t last_event = thread_allocated_last_event_get(tsd); + uint64_t last_sample_event = prof_sample_last_event_get(tsd); + prof_sample_last_event_set(tsd, last_event); + return last_event - last_sample_event; +} + +static uint64_t +stats_interval_fetch_elapsed(tsd_t *tsd) { + uint64_t last_event = thread_allocated_last_event_get(tsd); + uint64_t last_stats_event = stats_interval_last_event_get(tsd); + stats_interval_last_event_set(tsd, last_event); + return last_event - last_stats_event; +} + +static uint64_t +peak_alloc_fetch_elapsed(tsd_t *tsd) { + return TE_INVALID_ELAPSED; +} + +static uint64_t +peak_dalloc_fetch_elapsed(tsd_t *tsd) { + return TE_INVALID_ELAPSED; +} + +/* Per event facilities done. */ + +static bool +te_ctx_has_active_events(te_ctx_t *ctx) { + assert(config_debug); +#define E(event, condition, alloc_event) \ + if (condition && alloc_event == ctx->is_alloc) { \ + return true; \ + } + ITERATE_OVER_ALL_EVENTS +#undef E + return false; +} + +static uint64_t +te_next_event_compute(tsd_t *tsd, bool is_alloc) { + uint64_t wait = TE_MAX_START_WAIT; +#define E(event, condition, alloc_event) \ + if (is_alloc == alloc_event && condition) { \ + uint64_t event_wait = \ + event##_event_wait_get(tsd); \ + assert(event_wait <= TE_MAX_START_WAIT); \ + if (event_wait > 0U && event_wait < wait) { \ + wait = event_wait; \ + } \ + } + + ITERATE_OVER_ALL_EVENTS +#undef E + assert(wait <= TE_MAX_START_WAIT); + return wait; +} + +static void +te_assert_invariants_impl(tsd_t *tsd, te_ctx_t *ctx) { + uint64_t current_bytes = te_ctx_current_bytes_get(ctx); + uint64_t last_event = te_ctx_last_event_get(ctx); + uint64_t next_event = te_ctx_next_event_get(ctx); + uint64_t next_event_fast = te_ctx_next_event_fast_get(ctx); + + assert(last_event != next_event); + if (next_event > TE_NEXT_EVENT_FAST_MAX || !tsd_fast(tsd)) { + assert(next_event_fast == 0U); + } else { + assert(next_event_fast == next_event); + } + + /* The subtraction is intentionally susceptible to underflow. */ + uint64_t interval = next_event - last_event; + + /* The subtraction is intentionally susceptible to underflow. */ + assert(current_bytes - last_event < interval); + uint64_t min_wait = te_next_event_compute(tsd, te_ctx_is_alloc(ctx)); + /* + * next_event should have been pushed up only except when no event is + * on and the TSD is just initialized. The last_event == 0U guard + * below is stronger than needed, but having an exactly accurate guard + * is more complicated to implement. + */ + assert((!te_ctx_has_active_events(ctx) && last_event == 0U) || + interval == min_wait || + (interval < min_wait && interval == TE_MAX_INTERVAL)); +} + +void +te_assert_invariants_debug(tsd_t *tsd) { + te_ctx_t ctx; + te_ctx_get(tsd, &ctx, true); + te_assert_invariants_impl(tsd, &ctx); + + te_ctx_get(tsd, &ctx, false); + te_assert_invariants_impl(tsd, &ctx); +} + +/* + * Synchronization around the fast threshold in tsd -- + * There are two threads to consider in the synchronization here: + * - The owner of the tsd being updated by a slow path change + * - The remote thread, doing that slow path change. 
+ *
+ * As a design constraint, we want to ensure that a slow-path transition cannot
+ * be ignored for arbitrarily long, and that if the remote thread causes a
+ * slow-path transition and then communicates with the owner thread that it has
+ * occurred, then the owner will go down the slow path on the next allocator
+ * operation (so we can't just wait until the owner hits its slow path reset
+ * condition on its own).
+ *
+ * Here's our strategy to do that:
+ *
+ * The remote thread will update the slow-path stores to TSD variables, issue a
+ * SEQ_CST fence, and then update the TSD next_event_fast counter. The owner
+ * thread will update next_event_fast, issue a SEQ_CST fence, and then check
+ * its TSD to see if it's on the slow path.
+ *
+ * This is fairly straightforward when 64-bit atomics are supported. Assume that
+ * the remote fence is sandwiched between two owner fences in the reset pathway.
+ * The case where there is no preceding or trailing owner fence (i.e. because
+ * the owner thread is near the beginning or end of its life) can be analyzed
+ * similarly. The owner store to next_event_fast preceding the earlier owner
+ * fence will be earlier in coherence order than the remote store to it, so that
+ * the owner thread will go down the slow path once the store becomes visible to
+ * it, which is no later than the time of the second fence.
+ *
+ * The case where we don't support 64-bit atomics is trickier, since word
+ * tearing is possible. We'll repeat the same analysis, and look at the two
+ * owner fences sandwiching the remote fence. The next_event_fast stores done
+ * alongside the earlier owner fence cannot overwrite any of the remote stores
+ * (since they precede the earlier owner fence in sb, which precedes the remote
+ * fence in sc, which precedes the remote stores in sb). After the second owner
+ * fence there will be a re-check of the slow-path variables anyways, so the
+ * "owner will notice that it's on the slow path eventually" guarantee is
+ * satisfied. To make sure that the out-of-band-messaging constraint is as well,
+ * note that either the message passing is sequenced before the second owner
+ * fence (in which case the remote stores happen before the second set of owner
+ * stores, so malloc sees a value of zero for next_event_fast and goes down the
+ * slow path), or it is not (in which case the owner sees the tsd slow-path
+ * writes on its previous update). This leaves open the possibility that the
+ * remote thread will (at some arbitrary point in the future) zero out one half
+ * of the owner thread's next_event_fast, but that's always safe (it just sends
+ * it down the slow path earlier).
+ */
+static void
+te_ctx_next_event_fast_update(te_ctx_t *ctx) {
+	uint64_t next_event = te_ctx_next_event_get(ctx);
+	uint64_t next_event_fast = (next_event <= TE_NEXT_EVENT_FAST_MAX) ?
+	    next_event : 0U;
+	te_ctx_next_event_fast_set(ctx, next_event_fast);
+}
+
+void
+te_recompute_fast_threshold(tsd_t *tsd) {
+	if (tsd_state_get(tsd) != tsd_state_nominal) {
+		/* Check first because this is also called on purgatory. */
+		te_next_event_fast_set_non_nominal(tsd);
+		return;
+	}
+
+	te_ctx_t ctx;
+	te_ctx_get(tsd, &ctx, true);
+	te_ctx_next_event_fast_update(&ctx);
+	te_ctx_get(tsd, &ctx, false);
+	te_ctx_next_event_fast_update(&ctx);
+
+	atomic_fence(ATOMIC_SEQ_CST);
+	if (tsd_state_get(tsd) != tsd_state_nominal) {
+		te_next_event_fast_set_non_nominal(tsd);
+	}
+}
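The fence argument above is easier to follow with the protocol written out. Below is a standalone C11 sketch of the owner/remote handshake; it is illustrative only, the names are invented, and the real code uses jemalloc's atomic wrappers rather than <stdatomic.h> directly:

```c
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

/* Standalone sketch of the protocol described above; not jemalloc code. */
static _Atomic bool slow_path_flag;      /* stand-in for the tsd slow-path state */
static _Atomic uint64_t next_event_fast; /* stand-in for the fast-path counter */

/* Remote thread: publish the slow-path state, fence, then clobber the
 * counter so the owner falls off its fast path. */
static void remote_slow_path_change(void) {
	atomic_store_explicit(&slow_path_flag, true, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	atomic_store_explicit(&next_event_fast, 0, memory_order_relaxed);
}

/* Owner thread: refresh the counter, fence, then re-check the slow-path
 * state. If the remote stores already landed, the re-check sees them; if
 * not, the remote zeroing of next_event_fast lands afterwards and sends
 * the owner down the slow path on a later operation. */
static bool owner_recompute(uint64_t fresh_threshold) {
	atomic_store_explicit(&next_event_fast, fresh_threshold,
	    memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	return atomic_load_explicit(&slow_path_flag, memory_order_relaxed);
}
```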
+static void
+te_adjust_thresholds_helper(tsd_t *tsd, te_ctx_t *ctx,
+    uint64_t wait) {
+	/*
+	 * The next threshold based on future events can only be adjusted after
+	 * progressing the last_event counter (which is set to current).
+	 */
+	assert(te_ctx_current_bytes_get(ctx) == te_ctx_last_event_get(ctx));
+	assert(wait <= TE_MAX_START_WAIT);
+
+	uint64_t next_event = te_ctx_last_event_get(ctx) + (wait <=
+	    TE_MAX_INTERVAL ? wait : TE_MAX_INTERVAL);
+	te_ctx_next_event_set(tsd, ctx, next_event);
+}
+
+static uint64_t
+te_clip_event_wait(uint64_t event_wait) {
+	assert(event_wait > 0U);
+	if (TE_MIN_START_WAIT > 1U &&
+	    unlikely(event_wait < TE_MIN_START_WAIT)) {
+		event_wait = TE_MIN_START_WAIT;
+	}
+	if (TE_MAX_START_WAIT < UINT64_MAX &&
+	    unlikely(event_wait > TE_MAX_START_WAIT)) {
+		event_wait = TE_MAX_START_WAIT;
+	}
+	return event_wait;
+}
+
+void
+te_event_trigger(tsd_t *tsd, te_ctx_t *ctx) {
+	/* usize has already been added to thread_allocated. */
+	uint64_t bytes_after = te_ctx_current_bytes_get(ctx);
+	/* The subtraction is intentionally susceptible to underflow. */
+	uint64_t accumbytes = bytes_after - te_ctx_last_event_get(ctx);
+
+	te_ctx_last_event_set(ctx, bytes_after);
+
+	bool allow_event_trigger = tsd_nominal(tsd) &&
+	    tsd_reentrancy_level_get(tsd) == 0;
+	bool is_alloc = ctx->is_alloc;
+	uint64_t wait = TE_MAX_START_WAIT;
+
+#define E(event, condition, alloc_event) \
+	bool is_##event##_triggered = false; \
+	if (is_alloc == alloc_event && condition) { \
+		uint64_t event_wait = event##_event_wait_get(tsd); \
+		assert(event_wait <= TE_MAX_START_WAIT); \
+		if (event_wait > accumbytes) { \
+			event_wait -= accumbytes; \
+		} else if (!allow_event_trigger) { \
+			event_wait = event##_postponed_event_wait(tsd); \
+		} else { \
+			is_##event##_triggered = true; \
+			event_wait = event##_new_event_wait(tsd); \
+		} \
+		event_wait = te_clip_event_wait(event_wait); \
+		event##_event_wait_set(tsd, event_wait); \
+		if (event_wait < wait) { \
+			wait = event_wait; \
+		} \
+	}
+
+	ITERATE_OVER_ALL_EVENTS
+#undef E
+
+	assert(wait <= TE_MAX_START_WAIT);
+	te_adjust_thresholds_helper(tsd, ctx, wait);
+	te_assert_invariants(tsd);
+
+#define E(event, condition, alloc_event) \
+	if (is_alloc == alloc_event && condition && \
+	    is_##event##_triggered) { \
+		assert(allow_event_trigger); \
+		uint64_t elapsed = event##_fetch_elapsed(tsd); \
+		event##_event_handler(tsd, elapsed); \
+	}
+
+	ITERATE_OVER_ALL_EVENTS
+#undef E
+
+	te_assert_invariants(tsd);
+}
+
+static void
+te_init(tsd_t *tsd, bool is_alloc) {
+	te_ctx_t ctx;
+	te_ctx_get(tsd, &ctx, is_alloc);
+	/*
+	 * Reset the last event to current, which starts the events from a clean
+	 * state. This is necessary when re-initializing the tsd event counters.
+	 *
+	 * The event counters maintain a relationship with the current bytes:
+	 * last_event <= current < next_event. When a reinit happens (e.g.
+	 * reincarnated tsd), the last event needs progressing because all
+	 * events start fresh from the current bytes.
+ */ + te_ctx_last_event_set(&ctx, te_ctx_current_bytes_get(&ctx)); + + uint64_t wait = TE_MAX_START_WAIT; +#define E(event, condition, alloc_event) \ + if (is_alloc == alloc_event && condition) { \ + uint64_t event_wait = event##_new_event_wait(tsd); \ + event_wait = te_clip_event_wait(event_wait); \ + event##_event_wait_set(tsd, event_wait); \ + if (event_wait < wait) { \ + wait = event_wait; \ + } \ + } + + ITERATE_OVER_ALL_EVENTS +#undef E + te_adjust_thresholds_helper(tsd, &ctx, wait); +} + +void +tsd_te_init(tsd_t *tsd) { + /* Make sure no overflow for the bytes accumulated on event_trigger. */ + assert(TE_MAX_INTERVAL <= UINT64_MAX - SC_LARGE_MAXCLASS + 1); + te_init(tsd, true); + te_init(tsd, false); + te_assert_invariants(tsd); +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/ticker.c b/src/duckdb/extension/jemalloc/jemalloc/src/ticker.c new file mode 100644 index 000000000..790b5c200 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/ticker.c @@ -0,0 +1,32 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +/* + * To avoid using floating point math down core paths (still necessary because + * versions of the glibc dynamic loader that did not preserve xmm registers are + * still somewhat common, requiring us to be compilable with -mno-sse), and also + * to avoid generally expensive library calls, we use a precomputed table of + * values. We want to sample U uniformly on [0, 1], and then compute + * ceil(log(u)/log(1-1/nticks)). We're mostly interested in the case where + * nticks is reasonably big, so 1/log(1-1/nticks) is well-approximated by + * -nticks. + * + * To compute log(u), we sample an integer in [1, 64] and divide, then just look + * up results in a table. As a space-compression mechanism, we store these as + * uint8_t by dividing the range (255) by the highest-magnitude value the log + * can take on, and using that as a multiplier. We then have to divide by that + * multiplier at the end of the computation. + * + * The values here are computed in src/ticker.py + */ + +const uint8_t ticker_geom_table[1 << TICKER_GEOM_NBITS] = { + 254, 211, 187, 169, 156, 144, 135, 127, + 120, 113, 107, 102, 97, 93, 89, 85, + 81, 77, 74, 71, 68, 65, 62, 60, + 57, 55, 53, 50, 48, 46, 44, 42, + 40, 39, 37, 35, 33, 32, 30, 29, + 27, 26, 24, 23, 21, 20, 19, 18, + 16, 15, 14, 13, 12, 10, 9, 8, + 7, 6, 5, 4, 3, 2, 1, 0 +}; diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/tsd.c b/src/duckdb/extension/jemalloc/jemalloc/src/tsd.c new file mode 100644 index 000000000..a4db8e360 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/tsd.c @@ -0,0 +1,565 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/san.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/rtree.h" + +/******************************************************************************/ +/* Data. 
*/ + +/* TSD_INITIALIZER triggers "-Wmissing-field-initializer" */ +JEMALLOC_DIAGNOSTIC_PUSH +JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS + +#ifdef JEMALLOC_MALLOC_THREAD_CLEANUP +JEMALLOC_TSD_TYPE_ATTR(tsd_t) tsd_tls = TSD_INITIALIZER; +JEMALLOC_TSD_TYPE_ATTR(bool) JEMALLOC_TLS_MODEL tsd_initialized = false; +bool tsd_booted = false; +#elif (defined(JEMALLOC_TLS)) +JEMALLOC_TSD_TYPE_ATTR(tsd_t) tsd_tls = TSD_INITIALIZER; +pthread_key_t tsd_tsd; +bool tsd_booted = false; +#elif (defined(_WIN32)) +DWORD tsd_tsd; +tsd_wrapper_t tsd_boot_wrapper = {false, TSD_INITIALIZER}; +bool tsd_booted = false; +#else + +/* + * This contains a mutex, but it's pretty convenient to allow the mutex code to + * have a dependency on tsd. So we define the struct here, and only refer to it + * by pointer in the header. + */ +struct tsd_init_head_s { + ql_head(tsd_init_block_t) blocks; + malloc_mutex_t lock; +}; + +pthread_key_t tsd_tsd; +tsd_init_head_t tsd_init_head = { + ql_head_initializer(blocks), + MALLOC_MUTEX_INITIALIZER +}; + +tsd_wrapper_t tsd_boot_wrapper = { + false, + TSD_INITIALIZER +}; +bool tsd_booted = false; +#endif + +JEMALLOC_DIAGNOSTIC_POP + +/******************************************************************************/ + +/* A list of all the tsds in the nominal state. */ +typedef ql_head(tsd_t) tsd_list_t; +static tsd_list_t tsd_nominal_tsds = ql_head_initializer(tsd_nominal_tsds); +static malloc_mutex_t tsd_nominal_tsds_lock; + +/* How many slow-path-enabling features are turned on. */ +static atomic_u32_t tsd_global_slow_count = ATOMIC_INIT(0); + +static bool +tsd_in_nominal_list(tsd_t *tsd) { + tsd_t *tsd_list; + bool found = false; + /* + * We don't know that tsd is nominal; it might not be safe to get data + * out of it here. + */ + malloc_mutex_lock(TSDN_NULL, &tsd_nominal_tsds_lock); + ql_foreach(tsd_list, &tsd_nominal_tsds, TSD_MANGLE(tsd_link)) { + if (tsd == tsd_list) { + found = true; + break; + } + } + malloc_mutex_unlock(TSDN_NULL, &tsd_nominal_tsds_lock); + return found; +} + +static void +tsd_add_nominal(tsd_t *tsd) { + assert(!tsd_in_nominal_list(tsd)); + assert(tsd_state_get(tsd) <= tsd_state_nominal_max); + ql_elm_new(tsd, TSD_MANGLE(tsd_link)); + malloc_mutex_lock(tsd_tsdn(tsd), &tsd_nominal_tsds_lock); + ql_tail_insert(&tsd_nominal_tsds, tsd, TSD_MANGLE(tsd_link)); + malloc_mutex_unlock(tsd_tsdn(tsd), &tsd_nominal_tsds_lock); +} + +static void +tsd_remove_nominal(tsd_t *tsd) { + assert(tsd_in_nominal_list(tsd)); + assert(tsd_state_get(tsd) <= tsd_state_nominal_max); + malloc_mutex_lock(tsd_tsdn(tsd), &tsd_nominal_tsds_lock); + ql_remove(&tsd_nominal_tsds, tsd, TSD_MANGLE(tsd_link)); + malloc_mutex_unlock(tsd_tsdn(tsd), &tsd_nominal_tsds_lock); +} + +static void +tsd_force_recompute(tsdn_t *tsdn) { + /* + * The stores to tsd->state here need to synchronize with the exchange + * in tsd_slow_update. + */ + atomic_fence(ATOMIC_RELEASE); + malloc_mutex_lock(tsdn, &tsd_nominal_tsds_lock); + tsd_t *remote_tsd; + ql_foreach(remote_tsd, &tsd_nominal_tsds, TSD_MANGLE(tsd_link)) { + assert(tsd_atomic_load(&remote_tsd->state, ATOMIC_RELAXED) + <= tsd_state_nominal_max); + tsd_atomic_store(&remote_tsd->state, + tsd_state_nominal_recompute, ATOMIC_RELAXED); + /* See comments in te_recompute_fast_threshold(). 
*/ + atomic_fence(ATOMIC_SEQ_CST); + te_next_event_fast_set_non_nominal(remote_tsd); + } + malloc_mutex_unlock(tsdn, &tsd_nominal_tsds_lock); +} + +void +tsd_global_slow_inc(tsdn_t *tsdn) { + atomic_fetch_add_u32(&tsd_global_slow_count, 1, ATOMIC_RELAXED); + /* + * We unconditionally force a recompute, even if the global slow count + * was already positive. If we didn't, then it would be possible for us + * to return to the user, have the user synchronize externally with some + * other thread, and then have that other thread not have picked up the + * update yet (since the original incrementing thread might still be + * making its way through the tsd list). + */ + tsd_force_recompute(tsdn); +} + +void tsd_global_slow_dec(tsdn_t *tsdn) { + atomic_fetch_sub_u32(&tsd_global_slow_count, 1, ATOMIC_RELAXED); + /* See the note in ..._inc(). */ + tsd_force_recompute(tsdn); +} + +static bool +tsd_local_slow(tsd_t *tsd) { + return !tsd_tcache_enabled_get(tsd) + || tsd_reentrancy_level_get(tsd) > 0; +} + +bool +tsd_global_slow(void) { + return atomic_load_u32(&tsd_global_slow_count, ATOMIC_RELAXED) > 0; +} + +/******************************************************************************/ + +static uint8_t +tsd_state_compute(tsd_t *tsd) { + if (!tsd_nominal(tsd)) { + return tsd_state_get(tsd); + } + /* We're in *a* nominal state; but which one? */ + if (malloc_slow || tsd_local_slow(tsd) || tsd_global_slow()) { + return tsd_state_nominal_slow; + } else { + return tsd_state_nominal; + } +} + +void +tsd_slow_update(tsd_t *tsd) { + uint8_t old_state; + do { + uint8_t new_state = tsd_state_compute(tsd); + old_state = tsd_atomic_exchange(&tsd->state, new_state, + ATOMIC_ACQUIRE); + } while (old_state == tsd_state_nominal_recompute); + + te_recompute_fast_threshold(tsd); +} + +void +tsd_state_set(tsd_t *tsd, uint8_t new_state) { + /* Only the tsd module can change the state *to* recompute. */ + assert(new_state != tsd_state_nominal_recompute); + uint8_t old_state = tsd_atomic_load(&tsd->state, ATOMIC_RELAXED); + if (old_state > tsd_state_nominal_max) { + /* + * Not currently in the nominal list, but it might need to be + * inserted there. + */ + assert(!tsd_in_nominal_list(tsd)); + tsd_atomic_store(&tsd->state, new_state, ATOMIC_RELAXED); + if (new_state <= tsd_state_nominal_max) { + tsd_add_nominal(tsd); + } + } else { + /* + * We're currently nominal. If the new state is non-nominal, + * great; we take ourselves off the list and just enter the new + * state. + */ + assert(tsd_in_nominal_list(tsd)); + if (new_state > tsd_state_nominal_max) { + tsd_remove_nominal(tsd); + tsd_atomic_store(&tsd->state, new_state, + ATOMIC_RELAXED); + } else { + /* + * This is the tricky case. We're transitioning from + * one nominal state to another. The caller can't know + * about any races that are occurring at the same time, + * so we always have to recompute no matter what. + */ + tsd_slow_update(tsd); + } + } + te_recompute_fast_threshold(tsd); +} + +static void +tsd_prng_state_init(tsd_t *tsd) { + /* + * A nondeterministic seed based on the address of tsd reduces + * the likelihood of lockstep non-uniform cache index + * utilization among identical concurrent processes, but at the + * cost of test repeatability. For debug builds, instead use a + * deterministic seed. + */ + *tsd_prng_statep_get(tsd) = config_debug ? 0 : + (uint64_t)(uintptr_t)tsd; +} + +static bool +tsd_data_init(tsd_t *tsd) { + /* + * We initialize the rtree context first (before the tcache), since the + * tcache initialization depends on it. 
+ */
+	rtree_ctx_data_init(tsd_rtree_ctxp_get_unsafe(tsd));
+	tsd_prng_state_init(tsd);
+	tsd_te_init(tsd); /* event_init may use the prng state above. */
+	tsd_san_init(tsd);
+	return tsd_tcache_enabled_data_init(tsd);
+}
+
+static void
+assert_tsd_data_cleanup_done(tsd_t *tsd) {
+	assert(!tsd_nominal(tsd));
+	assert(!tsd_in_nominal_list(tsd));
+	assert(*tsd_arenap_get_unsafe(tsd) == NULL);
+	assert(*tsd_iarenap_get_unsafe(tsd) == NULL);
+	assert(*tsd_tcache_enabledp_get_unsafe(tsd) == false);
+	assert(*tsd_prof_tdatap_get_unsafe(tsd) == NULL);
+}
+
+static bool
+tsd_data_init_nocleanup(tsd_t *tsd) {
+	assert(tsd_state_get(tsd) == tsd_state_reincarnated ||
+	    tsd_state_get(tsd) == tsd_state_minimal_initialized);
+	/*
+	 * During reincarnation, there is no guarantee that the cleanup function
+	 * will be called (deallocation may happen after all tsd destructors).
+	 * We set up tsd in a way that no cleanup is needed.
+	 */
+	rtree_ctx_data_init(tsd_rtree_ctxp_get_unsafe(tsd));
+	*tsd_tcache_enabledp_get_unsafe(tsd) = false;
+	*tsd_reentrancy_levelp_get(tsd) = 1;
+	tsd_prng_state_init(tsd);
+	tsd_te_init(tsd); /* event_init may use the prng state above. */
+	tsd_san_init(tsd);
+	assert_tsd_data_cleanup_done(tsd);
+
+	return false;
+}
+
+tsd_t *
+tsd_fetch_slow(tsd_t *tsd, bool minimal) {
+	assert(!tsd_fast(tsd));
+
+	if (tsd_state_get(tsd) == tsd_state_nominal_slow) {
+		/*
+		 * On slow path but no work needed. Note that we can't
+		 * necessarily *assert* that we're slow, because we might be
+		 * slow because of an asynchronous modification to global state,
+		 * which might be asynchronously modified *back*.
+		 */
+	} else if (tsd_state_get(tsd) == tsd_state_nominal_recompute) {
+		tsd_slow_update(tsd);
+	} else if (tsd_state_get(tsd) == tsd_state_uninitialized) {
+		if (!minimal) {
+			if (tsd_booted) {
+				tsd_state_set(tsd, tsd_state_nominal);
+				tsd_slow_update(tsd);
+				/* Trigger cleanup handler registration. */
+				tsd_set(tsd);
+				tsd_data_init(tsd);
+			}
+		} else {
+			tsd_state_set(tsd, tsd_state_minimal_initialized);
+			tsd_set(tsd);
+			tsd_data_init_nocleanup(tsd);
+			*tsd_min_init_state_nfetchedp_get(tsd) = 1;
+		}
+	} else if (tsd_state_get(tsd) == tsd_state_minimal_initialized) {
+		/*
+		 * If a thread only ever deallocates (e.g. dedicated reclamation
+		 * threads), we want to help it to eventually escape the slow
+		 * path (caused by the minimal initialized state). The nfetched
+		 * counter tracks the number of times the tsd has been accessed
+		 * under the min init state, and triggers the switch to nominal
+		 * once the max allowed count is reached.
+		 *
+		 * This means at most 128 deallocations stay on the slow path.
+		 *
+		 * Also see comments in free_default().
+		 */
+		uint8_t *nfetched = tsd_min_init_state_nfetchedp_get(tsd);
+		assert(*nfetched >= 1);
+		(*nfetched)++;
+		if (!minimal || *nfetched == TSD_MIN_INIT_STATE_MAX_FETCHED) {
+			/* Switch to fully initialized. */
+			tsd_state_set(tsd, tsd_state_nominal);
+			assert(*tsd_reentrancy_levelp_get(tsd) >= 1);
+			(*tsd_reentrancy_levelp_get(tsd))--;
+			tsd_slow_update(tsd);
+			tsd_data_init(tsd);
+		} else {
+			assert_tsd_data_cleanup_done(tsd);
+		}
+	} else if (tsd_state_get(tsd) == tsd_state_purgatory) {
+		tsd_state_set(tsd, tsd_state_reincarnated);
+		tsd_set(tsd);
+		tsd_data_init_nocleanup(tsd);
+	} else {
+		assert(tsd_state_get(tsd) == tsd_state_reincarnated);
+	}
+
+	return tsd;
+}
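To make the promotion rule in tsd_fetch_slow() concrete, here is a standalone sketch (invented names; not part of the diff) of the counter logic: a dealloc-only thread keeps taking the slow fetch path in the minimal state and upgrades itself to nominal on the 128th access:

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Invented stand-in for TSD_MIN_INIT_STATE_MAX_FETCHED. */
#define MAX_FETCHED 128

typedef enum { MINIMAL_INITIALIZED, NOMINAL } state_t;

/* Mirror of the promotion rule: count slow fetches in the minimal state
 * and switch to nominal once the max allowed count is reached. */
static state_t fetch_slow(state_t state, uint8_t *nfetched, bool minimal) {
	if (state != MINIMAL_INITIALIZED) {
		return state;
	}
	(*nfetched)++;
	if (!minimal || *nfetched == MAX_FETCHED) {
		return NOMINAL;
	}
	return state;
}

int main(void) {
	state_t state = MINIMAL_INITIALIZED;
	uint8_t nfetched = 1; /* set to 1 when the tsd is first initialized */
	int frees = 1;        /* the initializing deallocation was the first */
	while (state == MINIMAL_INITIALIZED) {
		state = fetch_slow(state, &nfetched, /* minimal */ true);
		frees++;
	}
	printf("promoted to nominal after %d slow deallocations\n", frees);
	return 0;
}
```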
+void *
+malloc_tsd_malloc(size_t size) {
+	return a0malloc(CACHELINE_CEILING(size));
+}
+
+void
+malloc_tsd_dalloc(void *wrapper) {
+	a0dalloc(wrapper);
+}
+
+#if defined(JEMALLOC_MALLOC_THREAD_CLEANUP) || defined(_WIN32)
+static unsigned ncleanups;
+static malloc_tsd_cleanup_t cleanups[MALLOC_TSD_CLEANUPS_MAX];
+
+#ifndef _WIN32
+JEMALLOC_EXPORT
+#endif
+void
+_malloc_thread_cleanup(void) {
+	bool pending[MALLOC_TSD_CLEANUPS_MAX], again;
+	unsigned i;
+
+	for (i = 0; i < ncleanups; i++) {
+		pending[i] = true;
+	}
+
+	do {
+		again = false;
+		for (i = 0; i < ncleanups; i++) {
+			if (pending[i]) {
+				pending[i] = cleanups[i]();
+				if (pending[i]) {
+					again = true;
+				}
+			}
+		}
+	} while (again);
+}
+
+#ifndef _WIN32
+JEMALLOC_EXPORT
+#endif
+void
+_malloc_tsd_cleanup_register(bool (*f)(void)) {
+	assert(ncleanups < MALLOC_TSD_CLEANUPS_MAX);
+	cleanups[ncleanups] = f;
+	ncleanups++;
+}
+
+#endif
+
+static void
+tsd_do_data_cleanup(tsd_t *tsd) {
+	prof_tdata_cleanup(tsd);
+	iarena_cleanup(tsd);
+	arena_cleanup(tsd);
+	tcache_cleanup(tsd);
+	witnesses_cleanup(tsd_witness_tsdp_get_unsafe(tsd));
+	*tsd_reentrancy_levelp_get(tsd) = 1;
+}
+
+void
+tsd_cleanup(void *arg) {
+	tsd_t *tsd = (tsd_t *)arg;
+
+	switch (tsd_state_get(tsd)) {
+	case tsd_state_uninitialized:
+		/* Do nothing. */
+		break;
+	case tsd_state_minimal_initialized:
+		/* This implies the thread only did free() in its lifetime. */
+		/* Fall through. */
+	case tsd_state_reincarnated:
+		/*
+		 * Reincarnated means another destructor deallocated memory
+		 * after the destructor was called. Cleanup isn't required but
+		 * is still called for testing and completeness.
+		 */
+		assert_tsd_data_cleanup_done(tsd);
+		JEMALLOC_FALLTHROUGH;
+	case tsd_state_nominal:
+	case tsd_state_nominal_slow:
+		tsd_do_data_cleanup(tsd);
+		tsd_state_set(tsd, tsd_state_purgatory);
+		tsd_set(tsd);
+		break;
+	case tsd_state_purgatory:
+		/*
+		 * The previous time this destructor was called, we set the
+		 * state to tsd_state_purgatory so that other destructors
+		 * wouldn't cause re-creation of the tsd. This time, do
+		 * nothing, and do not request another callback.
+		 */
+		break;
+	default:
+		not_reached();
+	}
+#ifdef JEMALLOC_JET
+	test_callback_t test_callback = *tsd_test_callbackp_get_unsafe(tsd);
+	int *data = tsd_test_datap_get_unsafe(tsd);
+	if (test_callback != NULL) {
+		test_callback(data);
+	}
+#endif
+}
+
+tsd_t *
+malloc_tsd_boot0(void) {
+	tsd_t *tsd;
+
+#if defined(JEMALLOC_MALLOC_THREAD_CLEANUP) || defined(_WIN32)
+	ncleanups = 0;
+#endif
+	if (malloc_mutex_init(&tsd_nominal_tsds_lock, "tsd_nominal_tsds_lock",
+	    WITNESS_RANK_OMIT, malloc_mutex_rank_exclusive)) {
+		return NULL;
+	}
+	if (tsd_boot0()) {
+		return NULL;
+	}
+	tsd = tsd_fetch();
+	return tsd;
+}
+
+void
+malloc_tsd_boot1(void) {
+	tsd_boot1();
+	tsd_t *tsd = tsd_fetch();
+	/* malloc_slow has been set properly. Update tsd_slow.
*/ + tsd_slow_update(tsd); +} + +#ifdef _WIN32 +static BOOL WINAPI +_tls_callback(HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpvReserved) { + switch (fdwReason) { +#ifdef JEMALLOC_LAZY_LOCK + case DLL_THREAD_ATTACH: + isthreaded = true; + break; +#endif + case DLL_THREAD_DETACH: + _malloc_thread_cleanup(); + break; + default: + break; + } + return true; +} + +/* + * We need to be able to say "read" here (in the "pragma section"), but have + * hooked "read". We won't read for the rest of the file, so we can get away + * with unhooking. + */ +#ifdef read +# undef read +#endif + +#ifdef _MSC_VER +# ifdef _M_IX86 +# pragma comment(linker, "/INCLUDE:__tls_used") +# pragma comment(linker, "/INCLUDE:_tls_callback") +# else +# pragma comment(linker, "/INCLUDE:_tls_used") +# pragma comment(linker, "/INCLUDE:" STRINGIFY(tls_callback) ) +# endif +# pragma section(".CRT$XLY",long,read) +#endif +JEMALLOC_SECTION(".CRT$XLY") JEMALLOC_ATTR(used) +BOOL (WINAPI *const tls_callback)(HINSTANCE hinstDLL, + DWORD fdwReason, LPVOID lpvReserved) = _tls_callback; +#endif + +#if (!defined(JEMALLOC_MALLOC_THREAD_CLEANUP) && !defined(JEMALLOC_TLS) && \ + !defined(_WIN32)) +void * +tsd_init_check_recursion(tsd_init_head_t *head, tsd_init_block_t *block) { + pthread_t self = pthread_self(); + tsd_init_block_t *iter; + + /* Check whether this thread has already inserted into the list. */ + malloc_mutex_lock(TSDN_NULL, &head->lock); + ql_foreach(iter, &head->blocks, link) { + if (iter->thread == self) { + malloc_mutex_unlock(TSDN_NULL, &head->lock); + return iter->data; + } + } + /* Insert block into list. */ + ql_elm_new(block, link); + block->thread = self; + ql_tail_insert(&head->blocks, block, link); + malloc_mutex_unlock(TSDN_NULL, &head->lock); + return NULL; +} + +void +tsd_init_finish(tsd_init_head_t *head, tsd_init_block_t *block) { + malloc_mutex_lock(TSDN_NULL, &head->lock); + ql_remove(&head->blocks, block, link); + malloc_mutex_unlock(TSDN_NULL, &head->lock); +} +#endif + +void +tsd_prefork(tsd_t *tsd) { + malloc_mutex_prefork(tsd_tsdn(tsd), &tsd_nominal_tsds_lock); +} + +void +tsd_postfork_parent(tsd_t *tsd) { + malloc_mutex_postfork_parent(tsd_tsdn(tsd), &tsd_nominal_tsds_lock); +} + +void +tsd_postfork_child(tsd_t *tsd) { + malloc_mutex_postfork_child(tsd_tsdn(tsd), &tsd_nominal_tsds_lock); + ql_new(&tsd_nominal_tsds); + + if (tsd_state_get(tsd) <= tsd_state_nominal_max) { + tsd_add_nominal(tsd); + } +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/util.c b/src/duckdb/extension/jemalloc/jemalloc/src/util.c new file mode 100644 index 000000000..b73848fb5 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/util.c @@ -0,0 +1,49 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/util.h" + +/* Reads the next size pair in a multi-sized option. 
*/ +bool +multi_setting_parse_next(const char **setting_segment_cur, size_t *len_left, + size_t *key_start, size_t *key_end, size_t *value) { + const char *cur = *setting_segment_cur; + char *end; + uintmax_t um; + + set_errno(0); + + /* First number, then '-' */ + um = malloc_strtoumax(cur, &end, 0); + if (get_errno() != 0 || *end != '-') { + return true; + } + *key_start = (size_t)um; + cur = end + 1; + + /* Second number, then ':' */ + um = malloc_strtoumax(cur, &end, 0); + if (get_errno() != 0 || *end != ':') { + return true; + } + *key_end = (size_t)um; + cur = end + 1; + + /* Last number */ + um = malloc_strtoumax(cur, &end, 0); + if (get_errno() != 0) { + return true; + } + *value = (size_t)um; + + /* Consume the separator if there is one. */ + if (*end == '|') { + end++; + } + + *len_left -= end - *setting_segment_cur; + *setting_segment_cur = end; + + return false; +} + diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/witness.c b/src/duckdb/extension/jemalloc/jemalloc/src/witness.c new file mode 100644 index 000000000..4474af04c --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/witness.c @@ -0,0 +1,122 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/malloc_io.h" + +void +witness_init(witness_t *witness, const char *name, witness_rank_t rank, + witness_comp_t *comp, void *opaque) { + witness->name = name; + witness->rank = rank; + witness->comp = comp; + witness->opaque = opaque; +} + +static void +witness_print_witness(witness_t *w, unsigned n) { + assert(n > 0); + if (n == 1) { + malloc_printf(" %s(%u)", w->name, w->rank); + } else { + malloc_printf(" %s(%u)X%u", w->name, w->rank, n); + } +} + +static void +witness_print_witnesses(const witness_list_t *witnesses) { + witness_t *w, *last = NULL; + unsigned n = 0; + ql_foreach(w, witnesses, link) { + if (last != NULL && w->rank > last->rank) { + assert(w->name != last->name); + witness_print_witness(last, n); + n = 0; + } else if (last != NULL) { + assert(w->rank == last->rank); + assert(w->name == last->name); + } + last = w; + ++n; + } + if (last != NULL) { + witness_print_witness(last, n); + } +} + +static void +witness_lock_error_impl(const witness_list_t *witnesses, + const witness_t *witness) { + malloc_printf(": Lock rank order reversal:"); + witness_print_witnesses(witnesses); + malloc_printf(" %s(%u)\n", witness->name, witness->rank); + abort(); +} +witness_lock_error_t *JET_MUTABLE witness_lock_error = witness_lock_error_impl; + +static void +witness_owner_error_impl(const witness_t *witness) { + malloc_printf(": Should own %s(%u)\n", witness->name, + witness->rank); + abort(); +} +witness_owner_error_t *JET_MUTABLE witness_owner_error = + witness_owner_error_impl; + +static void +witness_not_owner_error_impl(const witness_t *witness) { + malloc_printf(": Should not own %s(%u)\n", witness->name, + witness->rank); + abort(); +} +witness_not_owner_error_t *JET_MUTABLE witness_not_owner_error = + witness_not_owner_error_impl; + +static void +witness_depth_error_impl(const witness_list_t *witnesses, + witness_rank_t rank_inclusive, unsigned depth) { + malloc_printf(": Should own %u lock%s of rank >= %u:", depth, + (depth != 1) ? 
"s" : "", rank_inclusive); + witness_print_witnesses(witnesses); + malloc_printf("\n"); + abort(); +} +witness_depth_error_t *JET_MUTABLE witness_depth_error = + witness_depth_error_impl; + +void +witnesses_cleanup(witness_tsd_t *witness_tsd) { + witness_assert_lockless(witness_tsd_tsdn(witness_tsd)); + + /* Do nothing. */ +} + +void +witness_prefork(witness_tsd_t *witness_tsd) { + if (!config_debug) { + return; + } + witness_tsd->forking = true; +} + +void +witness_postfork_parent(witness_tsd_t *witness_tsd) { + if (!config_debug) { + return; + } + witness_tsd->forking = false; +} + +void +witness_postfork_child(witness_tsd_t *witness_tsd) { + if (!config_debug) { + return; + } +#ifndef JEMALLOC_MUTEX_INIT_CB + witness_list_t *witnesses; + + witnesses = &witness_tsd->witnesses; + ql_new(witnesses); +#endif + witness_tsd->forking = false; +} diff --git a/src/duckdb/extension/jemalloc/jemalloc/src/zone.c b/src/duckdb/extension/jemalloc/jemalloc/src/zone.c new file mode 100644 index 000000000..8a774a708 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc/src/zone.c @@ -0,0 +1,469 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" + +#ifdef JEMALLOC_ZONE + +/* Definitions of the following structs in malloc/malloc.h might be too old + * for the built binary to run on newer versions of OSX. So use the newest + * possible version of those structs. + */ +typedef struct _malloc_zone_t { + void *reserved1; + void *reserved2; + size_t (*size)(struct _malloc_zone_t *, const void *); + void *(*malloc)(struct _malloc_zone_t *, size_t); + void *(*calloc)(struct _malloc_zone_t *, size_t, size_t); + void *(*valloc)(struct _malloc_zone_t *, size_t); + void (*free)(struct _malloc_zone_t *, void *); + void *(*realloc)(struct _malloc_zone_t *, void *, size_t); + void (*destroy)(struct _malloc_zone_t *); + const char *zone_name; + unsigned (*batch_malloc)(struct _malloc_zone_t *, size_t, void **, unsigned); + void (*batch_free)(struct _malloc_zone_t *, void **, unsigned); + struct malloc_introspection_t *introspect; + unsigned version; + void *(*memalign)(struct _malloc_zone_t *, size_t, size_t); + void (*free_definite_size)(struct _malloc_zone_t *, void *, size_t); + size_t (*pressure_relief)(struct _malloc_zone_t *, size_t); +} malloc_zone_t; + +typedef struct { + vm_address_t address; + vm_size_t size; +} vm_range_t; + +typedef struct malloc_statistics_t { + unsigned blocks_in_use; + size_t size_in_use; + size_t max_size_in_use; + size_t size_allocated; +} malloc_statistics_t; + +typedef kern_return_t memory_reader_t(task_t, vm_address_t, vm_size_t, void **); + +typedef void vm_range_recorder_t(task_t, void *, unsigned type, vm_range_t *, unsigned); + +typedef struct malloc_introspection_t { + kern_return_t (*enumerator)(task_t, void *, unsigned, vm_address_t, memory_reader_t, vm_range_recorder_t); + size_t (*good_size)(malloc_zone_t *, size_t); + boolean_t (*check)(malloc_zone_t *); + void (*print)(malloc_zone_t *, boolean_t); + void (*log)(malloc_zone_t *, void *); + void (*force_lock)(malloc_zone_t *); + void (*force_unlock)(malloc_zone_t *); + void (*statistics)(malloc_zone_t *, malloc_statistics_t *); + boolean_t (*zone_locked)(malloc_zone_t *); + boolean_t (*enable_discharge_checking)(malloc_zone_t *); + boolean_t (*disable_discharge_checking)(malloc_zone_t *); + void (*discharge)(malloc_zone_t *, void *); +#ifdef __BLOCKS__ + void (*enumerate_discharged_pointers)(malloc_zone_t *, void (^)(void 
*, void *)); +#else + void *enumerate_unavailable_without_blocks; +#endif + void (*reinit_lock)(malloc_zone_t *); +} malloc_introspection_t; + +extern kern_return_t malloc_get_all_zones(task_t, memory_reader_t, vm_address_t **, unsigned *); + +extern malloc_zone_t *malloc_default_zone(void); + +extern void malloc_zone_register(malloc_zone_t *zone); + +extern void malloc_zone_unregister(malloc_zone_t *zone); + +/* + * The malloc_default_purgeable_zone() function is only available on >= 10.6. + * We need to check whether it is present at runtime, thus the weak_import. + */ +extern malloc_zone_t *malloc_default_purgeable_zone(void) +JEMALLOC_ATTR(weak_import); + +/******************************************************************************/ +/* Data. */ + +static malloc_zone_t *default_zone, *purgeable_zone; +static malloc_zone_t jemalloc_zone; +static struct malloc_introspection_t jemalloc_zone_introspect; +static pid_t zone_force_lock_pid = -1; + +/******************************************************************************/ +/* Function prototypes for non-inline static functions. */ + +static size_t zone_size(malloc_zone_t *zone, const void *ptr); +static void *zone_malloc(malloc_zone_t *zone, size_t size); +static void *zone_calloc(malloc_zone_t *zone, size_t num, size_t size); +static void *zone_valloc(malloc_zone_t *zone, size_t size); +static void zone_free(malloc_zone_t *zone, void *ptr); +static void *zone_realloc(malloc_zone_t *zone, void *ptr, size_t size); +static void *zone_memalign(malloc_zone_t *zone, size_t alignment, + size_t size); +static void zone_free_definite_size(malloc_zone_t *zone, void *ptr, + size_t size); +static void zone_destroy(malloc_zone_t *zone); +static unsigned zone_batch_malloc(struct _malloc_zone_t *zone, size_t size, + void **results, unsigned num_requested); +static void zone_batch_free(struct _malloc_zone_t *zone, + void **to_be_freed, unsigned num_to_be_freed); +static size_t zone_pressure_relief(struct _malloc_zone_t *zone, size_t goal); +static size_t zone_good_size(malloc_zone_t *zone, size_t size); +static kern_return_t zone_enumerator(task_t task, void *data, unsigned type_mask, + vm_address_t zone_address, memory_reader_t reader, + vm_range_recorder_t recorder); +static boolean_t zone_check(malloc_zone_t *zone); +static void zone_print(malloc_zone_t *zone, boolean_t verbose); +static void zone_log(malloc_zone_t *zone, void *address); +static void zone_force_lock(malloc_zone_t *zone); +static void zone_force_unlock(malloc_zone_t *zone); +static void zone_statistics(malloc_zone_t *zone, + malloc_statistics_t *stats); +static boolean_t zone_locked(malloc_zone_t *zone); +static void zone_reinit_lock(malloc_zone_t *zone); + +/******************************************************************************/ +/* + * Functions. + */ + +static size_t +zone_size(malloc_zone_t *zone, const void *ptr) { + /* + * There appear to be places within Darwin (such as setenv(3)) that + * cause calls to this function with pointers that *no* zone owns. If + * we knew that all pointers were owned by *some* zone, we could split + * our zone into two parts, and use one as the default allocator and + * the other as the default deallocator/reallocator. Since that will + * not work in practice, we must check all pointers to assure that they + * reside within a mapped extent before determining size. 
+ */ + return ivsalloc(tsdn_fetch(), ptr); +} + +static void * +zone_malloc(malloc_zone_t *zone, size_t size) { + return je_malloc(size); +} + +static void * +zone_calloc(malloc_zone_t *zone, size_t num, size_t size) { + return je_calloc(num, size); +} + +static void * +zone_valloc(malloc_zone_t *zone, size_t size) { + void *ret = NULL; /* Assignment avoids useless compiler warning. */ + + je_posix_memalign(&ret, PAGE, size); + + return ret; +} + +static void +zone_free(malloc_zone_t *zone, void *ptr) { + if (ivsalloc(tsdn_fetch(), ptr) != 0) { + je_free(ptr); + return; + } + + free(ptr); +} + +static void * +zone_realloc(malloc_zone_t *zone, void *ptr, size_t size) { + if (ivsalloc(tsdn_fetch(), ptr) != 0) { + return je_realloc(ptr, size); + } + + return realloc(ptr, size); +} + +static void * +zone_memalign(malloc_zone_t *zone, size_t alignment, size_t size) { + void *ret = NULL; /* Assignment avoids useless compiler warning. */ + + je_posix_memalign(&ret, alignment, size); + + return ret; +} + +static void +zone_free_definite_size(malloc_zone_t *zone, void *ptr, size_t size) { + size_t alloc_size; + + alloc_size = ivsalloc(tsdn_fetch(), ptr); + if (alloc_size != 0) { + assert(alloc_size == size); + je_free(ptr); + return; + } + + free(ptr); +} + +static void +zone_destroy(malloc_zone_t *zone) { + /* This function should never be called. */ + not_reached(); +} + +static unsigned +zone_batch_malloc(struct _malloc_zone_t *zone, size_t size, void **results, + unsigned num_requested) { + unsigned i; + + for (i = 0; i < num_requested; i++) { + results[i] = je_malloc(size); + if (!results[i]) + break; + } + + return i; +} + +static void +zone_batch_free(struct _malloc_zone_t *zone, void **to_be_freed, + unsigned num_to_be_freed) { + unsigned i; + + for (i = 0; i < num_to_be_freed; i++) { + zone_free(zone, to_be_freed[i]); + to_be_freed[i] = NULL; + } +} + +static size_t +zone_pressure_relief(struct _malloc_zone_t *zone, size_t goal) { + return 0; +} + +static size_t +zone_good_size(malloc_zone_t *zone, size_t size) { + if (size == 0) { + size = 1; + } + return sz_s2u(size); +} + +static kern_return_t +zone_enumerator(task_t task, void *data, unsigned type_mask, + vm_address_t zone_address, memory_reader_t reader, + vm_range_recorder_t recorder) { + return KERN_SUCCESS; +} + +static boolean_t +zone_check(malloc_zone_t *zone) { + return true; +} + +static void +zone_print(malloc_zone_t *zone, boolean_t verbose) { +} + +static void +zone_log(malloc_zone_t *zone, void *address) { +} + +static void +zone_force_lock(malloc_zone_t *zone) { + if (isthreaded) { + /* + * See the note in zone_force_unlock, below, to see why we need + * this. + */ + assert(zone_force_lock_pid == -1); + zone_force_lock_pid = getpid(); + jemalloc_prefork(); + } +} + +static void +zone_force_unlock(malloc_zone_t *zone) { + /* + * zone_force_lock and zone_force_unlock are the entry points to the + * forking machinery on OS X. The tricky thing is, the child is not + * allowed to unlock mutexes locked in the parent, even if owned by the + * forking thread (and the mutex type we use in OS X will fail an assert + * if we try). In the child, we can get away with reinitializing all + * the mutexes, which has the effect of unlocking them. In the parent, + * doing this would mean we wouldn't wake any waiters blocked on the + * mutexes we unlock. So, we record the pid of the current thread in + * zone_force_lock, and use that to detect if we're in the parent or + * child here, to decide which unlock logic we need. 
+ */ + if (isthreaded) { + assert(zone_force_lock_pid != -1); + if (getpid() == zone_force_lock_pid) { + jemalloc_postfork_parent(); + } else { + jemalloc_postfork_child(); + } + zone_force_lock_pid = -1; + } +} + +static void +zone_statistics(malloc_zone_t *zone, malloc_statistics_t *stats) { + /* We make no effort to actually fill the values */ + stats->blocks_in_use = 0; + stats->size_in_use = 0; + stats->max_size_in_use = 0; + stats->size_allocated = 0; +} + +static boolean_t +zone_locked(malloc_zone_t *zone) { + /* Pretend no lock is being held */ + return false; +} + +static void +zone_reinit_lock(malloc_zone_t *zone) { + /* As of OSX 10.12, this function is only used when force_unlock would + * be used if the zone version were < 9. So just use force_unlock. */ + zone_force_unlock(zone); +} + +static void +zone_init(void) { + jemalloc_zone.size = zone_size; + jemalloc_zone.malloc = zone_malloc; + jemalloc_zone.calloc = zone_calloc; + jemalloc_zone.valloc = zone_valloc; + jemalloc_zone.free = zone_free; + jemalloc_zone.realloc = zone_realloc; + jemalloc_zone.destroy = zone_destroy; + jemalloc_zone.zone_name = "jemalloc_zone"; + jemalloc_zone.batch_malloc = zone_batch_malloc; + jemalloc_zone.batch_free = zone_batch_free; + jemalloc_zone.introspect = &jemalloc_zone_introspect; + jemalloc_zone.version = 9; + jemalloc_zone.memalign = zone_memalign; + jemalloc_zone.free_definite_size = zone_free_definite_size; + jemalloc_zone.pressure_relief = zone_pressure_relief; + + jemalloc_zone_introspect.enumerator = zone_enumerator; + jemalloc_zone_introspect.good_size = zone_good_size; + jemalloc_zone_introspect.check = zone_check; + jemalloc_zone_introspect.print = zone_print; + jemalloc_zone_introspect.log = zone_log; + jemalloc_zone_introspect.force_lock = zone_force_lock; + jemalloc_zone_introspect.force_unlock = zone_force_unlock; + jemalloc_zone_introspect.statistics = zone_statistics; + jemalloc_zone_introspect.zone_locked = zone_locked; + jemalloc_zone_introspect.enable_discharge_checking = NULL; + jemalloc_zone_introspect.disable_discharge_checking = NULL; + jemalloc_zone_introspect.discharge = NULL; +#ifdef __BLOCKS__ + jemalloc_zone_introspect.enumerate_discharged_pointers = NULL; +#else + jemalloc_zone_introspect.enumerate_unavailable_without_blocks = NULL; +#endif + jemalloc_zone_introspect.reinit_lock = zone_reinit_lock; +} + +static malloc_zone_t * +zone_default_get(void) { + malloc_zone_t **zones = NULL; + unsigned int num_zones = 0; + + /* + * On OSX 10.12, malloc_default_zone returns a special zone that is not + * present in the list of registered zones. That zone uses a "lite zone" + * if one is present (apparently enabled when malloc stack logging is + * enabled), or the first registered zone otherwise. In practice this + * means unless malloc stack logging is enabled, the first registered + * zone is the default. So get the list of zones to get the first one, + * instead of relying on malloc_default_zone. + */ + if (KERN_SUCCESS != malloc_get_all_zones(0, NULL, + (vm_address_t**)&zones, &num_zones)) { + /* + * Reset the value in case the failure happened after it was + * set. + */ + num_zones = 0; + } + + if (num_zones) { + return zones[0]; + } + + return malloc_default_zone(); +} + +/* As written, this function can only promote jemalloc_zone. */ +static void +zone_promote(void) { + malloc_zone_t *zone; + + do { + /* + * Unregister and reregister the default zone. 
On OSX >= 10.6, + * unregistering takes the last registered zone and places it + * at the location of the specified zone. Unregistering the + * default zone thus makes the last registered one the default. + * On OSX < 10.6, unregistering shifts all registered zones. + * The first registered zone then becomes the default. + */ + malloc_zone_unregister(default_zone); + malloc_zone_register(default_zone); + + /* + * On OSX 10.6, having the default purgeable zone appear before + * the default zone makes some things crash because it thinks it + * owns the default zone allocated pointers. We thus + * unregister/re-register it in order to ensure it's always + * after the default zone. On OSX < 10.6, there is no purgeable + * zone, so this does nothing. On OSX >= 10.6, unregistering + * replaces the purgeable zone with the last registered zone + * above, i.e. the default zone. Registering it again then puts + * it at the end, obviously after the default zone. + */ + if (purgeable_zone != NULL) { + malloc_zone_unregister(purgeable_zone); + malloc_zone_register(purgeable_zone); + } + + zone = zone_default_get(); + } while (zone != &jemalloc_zone); +} + +JEMALLOC_ATTR(constructor) +void +zone_register(void) { + /* + * If something else replaced the system default zone allocator, don't + * register jemalloc's. + */ + default_zone = zone_default_get(); + if (!default_zone->zone_name || strcmp(default_zone->zone_name, + "DefaultMallocZone") != 0) { + return; + } + + /* + * The default purgeable zone is created lazily by OSX's libc. It uses + * the default zone when it is created for "small" allocations + * (< 15 KiB), but assumes the default zone is a scalable_zone. This + * obviously fails when the default zone is the jemalloc zone, so + * malloc_default_purgeable_zone() is called beforehand so that the + * default purgeable zone is created when the default zone is still + * a scalable_zone. As purgeable zones only exist on >= 10.6, we need + * to check for the existence of malloc_default_purgeable_zone() at + * run time. + */ + purgeable_zone = (malloc_default_purgeable_zone == NULL) ? NULL : + malloc_default_purgeable_zone(); + + /* Register the custom zone. At this point it won't be the default. */ + zone_init(); + malloc_zone_register(&jemalloc_zone); + + /* Promote the custom zone to be default. 
*/ + zone_promote(); +} + +#endif diff --git a/src/duckdb/extension/jemalloc/jemalloc_extension.cpp b/src/duckdb/extension/jemalloc/jemalloc_extension.cpp new file mode 100644 index 000000000..3735cb1d8 --- /dev/null +++ b/src/duckdb/extension/jemalloc/jemalloc_extension.cpp @@ -0,0 +1,142 @@ +#define DUCKDB_EXTENSION_MAIN +#include "jemalloc_extension.hpp" + +#include "duckdb/common/allocator.hpp" +#include "jemalloc/jemalloc.h" +#include "malloc_ncpus.h" + +#include <thread> + +namespace duckdb { + +void JemallocExtension::Load(DuckDB &db) { + // NOP: This extension can only be loaded statically +} + +std::string JemallocExtension::Name() { + return "jemalloc"; +} + +data_ptr_t JemallocExtension::Allocate(PrivateAllocatorData *private_data, idx_t size) { + return data_ptr_cast(duckdb_je_malloc(size)); +} + +void JemallocExtension::Free(PrivateAllocatorData *private_data, data_ptr_t pointer, idx_t size) { + duckdb_je_free(pointer); +} + +data_ptr_t JemallocExtension::Reallocate(PrivateAllocatorData *private_data, data_ptr_t pointer, idx_t old_size, + idx_t size) { + return data_ptr_cast(duckdb_je_realloc(pointer, size)); +} + +static void JemallocCTL(const char *name, void *old_ptr, size_t *old_len, void *new_ptr, size_t new_len) { + if (duckdb_je_mallctl(name, old_ptr, old_len, new_ptr, new_len) != 0) { +#ifdef DEBUG + // We only want to throw an exception here when debugging + throw InternalException("je_mallctl failed for setting \"%s\"", name); +#endif + } +} + +template <class T> +static void SetJemallocCTL(const char *name, T &val) { + JemallocCTL(name, nullptr, nullptr, &val, sizeof(T)); +} + +static void SetJemallocCTL(const char *name) { + JemallocCTL(name, nullptr, nullptr, nullptr, 0); +} + +template <class T> +static T GetJemallocCTL(const char *name) { + T result; + size_t len = sizeof(T); + JemallocCTL(name, &result, &len, nullptr, 0); + return result; +} + +static inline string PurgeArenaString(idx_t arena_idx) { + return StringUtil::Format("arena.%llu.purge", arena_idx); } ... no wait

+int64_t JemallocExtension::DecayDelay() { + return DUCKDB_JEMALLOC_DECAY; +} + +void JemallocExtension::ThreadFlush(idx_t threshold) { + // We only flush after the peak exceeds the threshold + if (GetJemallocCTL<uint64_t>("thread.peak.read") < threshold) { + return; + } + + // Flush thread-local cache + SetJemallocCTL("thread.tcache.flush"); + + // Flush this thread's arena + const auto purge_arena = PurgeArenaString(idx_t(GetJemallocCTL<unsigned>("thread.arena"))); + SetJemallocCTL(purge_arena.c_str()); + + // Reset the peak after flushing + SetJemallocCTL("thread.peak.reset"); +} + +void JemallocExtension::ThreadIdle() { + // Indicate that this thread is idle + SetJemallocCTL("thread.idle"); + + // Reset the peak as well + SetJemallocCTL("thread.peak.reset"); +} + +void JemallocExtension::FlushAll() { + // Flush thread-local cache + SetJemallocCTL("thread.tcache.flush"); + + // Flush all arenas + const auto purge_arena = PurgeArenaString(MALLCTL_ARENAS_ALL); + SetJemallocCTL(purge_arena.c_str()); + + // Reset the peak after flushing + SetJemallocCTL("thread.peak.reset"); +} + +void JemallocExtension::SetBackgroundThreads(bool enable) { +#ifndef __APPLE__ + SetJemallocCTL("background_thread", enable); +#endif +} + +std::string JemallocExtension::Version() const { +#ifdef EXT_VERSION_JEMALLOC + return EXT_VERSION_JEMALLOC; +#else + return ""; +#endif +} + +} // namespace duckdb + +extern "C" { + +unsigned duckdb_malloc_ncpus() { +#ifdef DUCKDB_NO_THREADS + return 1; +#else + return duckdb::NumericCast<unsigned>(std::thread::hardware_concurrency()); +#endif +} + 
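+// Illustrative aside, not part of the original patch: a minimal sketch of how the
+// mallctl wrappers above are intended to compose. "thread.peak.read" and
+// "thread.tcache.flush" are the real jemalloc mallctl names used in this file;
+// SOME_THRESHOLD is a hypothetical constant.
+//
+//   const auto peak = GetJemallocCTL<uint64_t>("thread.peak.read"); // read this thread's peak memory
+//   if (peak > SOME_THRESHOLD) {
+//       SetJemallocCTL("thread.tcache.flush"); // no-argument overload: a pure trigger, no new value
+//   }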
+DUCKDB_EXTENSION_API void jemalloc_init(duckdb::DatabaseInstance &db) { + duckdb::DuckDB db_wrapper(db); + db_wrapper.LoadExtension<duckdb::JemallocExtension>(); +} + +DUCKDB_EXTENSION_API const char *jemalloc_version() { + return duckdb::DuckDB::LibraryVersion(); +} +} + +#ifndef DUCKDB_EXTENSION_MAIN +#error DUCKDB_EXTENSION_MAIN not defined +#endif diff --git a/src/duckdb/extension/json/json_functions/json_structure.cpp b/src/duckdb/extension/json/json_functions/json_structure.cpp index 51652bae4..361ac773f 100644 --- a/src/duckdb/extension/json/json_functions/json_structure.cpp +++ b/src/duckdb/extension/json/json_functions/json_structure.cpp @@ -626,6 +626,8 @@ static double CalculateTypeSimilarity(const LogicalType &merged, const LogicalTy // This can happen for empty structs/maps ("{}"), or in rare cases where an inconsistent struct becomes // consistent when merged, but does not have enough children to be considered a map. return CalculateMapAndStructSimilarity(type, merged, true, max_depth, depth); + } else if (type.id() != LogicalTypeId::STRUCT) { + return -1; } // Only structs can be merged into a struct diff --git a/src/duckdb/src/common/adbc/adbc.cpp b/src/duckdb/src/common/adbc/adbc.cpp index 1d002b578..7323f3b1f 100644 --- a/src/duckdb/src/common/adbc/adbc.cpp +++ b/src/duckdb/src/common/adbc/adbc.cpp @@ -172,6 +172,15 @@ AdbcStatusCode StatementSetSubstraitPlan(struct AdbcStatement *statement, const return ADBC_STATUS_INVALID_ARGUMENT; } auto wrapper = static_cast<DuckDBAdbcStatementWrapper *>(statement->private_data); + if (wrapper->ingestion_stream.release) { + // Release any resources currently held by the ingestion stream before we overwrite it + wrapper->ingestion_stream.release(&wrapper->ingestion_stream); + wrapper->ingestion_stream.release = nullptr; + } + if (wrapper->statement) { + duckdb_destroy_prepare(&wrapper->statement); + wrapper->statement = nullptr; + } wrapper->substrait_plan = static_cast<uint8_t *>(malloc(sizeof(uint8_t) * length)); wrapper->plan_length = length; memcpy(wrapper->substrait_plan, plan, length); @@ -912,6 +921,15 @@ AdbcStatusCode StatementSetSqlQuery(struct AdbcStatement *statement, const char } auto wrapper = static_cast<DuckDBAdbcStatementWrapper *>(statement->private_data); + if (wrapper->ingestion_stream.release) { + // Release any resources currently held by the ingestion stream before we overwrite it + wrapper->ingestion_stream.release(&wrapper->ingestion_stream); + wrapper->ingestion_stream.release = nullptr; + } + if (wrapper->statement) { + duckdb_destroy_prepare(&wrapper->statement); + wrapper->statement = nullptr; + } auto res = duckdb_prepare(wrapper->connection, query, &wrapper->statement); auto error_msg = duckdb_prepare_error(wrapper->statement); return CheckResult(res, error, error_msg); diff --git a/src/duckdb/src/common/file_buffer.cpp b/src/duckdb/src/common/file_buffer.cpp index 7cde4c6f8..b1b1febb3 100644 --- a/src/duckdb/src/common/file_buffer.cpp +++ b/src/duckdb/src/common/file_buffer.cpp @@ -42,19 +42,22 @@ FileBuffer::~FileBuffer() { allocator.FreeData(internal_buffer, internal_size); } -void FileBuffer::ReallocBuffer(size_t new_size) { +void FileBuffer::ReallocBuffer(idx_t new_size) { data_ptr_t new_buffer; if (internal_buffer) { new_buffer = allocator.ReallocateData(internal_buffer, internal_size, new_size); } else { new_buffer = allocator.AllocateData(new_size); } + + // FIXME: should we throw one of our exceptions here? if (!new_buffer) { throw std::bad_alloc(); } internal_buffer = new_buffer; internal_size = new_size; + + // The caller must update these. 
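+	// For example (a sketch, not part of this patch): per the invariant documented in
+	// file_buffer.hpp, size == internal_size - BLOCK_HEADER_SIZE, so a caller would
+	// typically re-derive the user-facing view along the lines of:
+	//   buffer = internal_buffer + BLOCK_HEADER_SIZE;
+	//   size = internal_size - BLOCK_HEADER_SIZE;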
buffer = nullptr; size = 0; } diff --git a/src/duckdb/src/common/random_engine.cpp b/src/duckdb/src/common/random_engine.cpp index ebc0abd43..7741e8f4d 100644 --- a/src/duckdb/src/common/random_engine.cpp +++ b/src/duckdb/src/common/random_engine.cpp @@ -59,7 +59,7 @@ uint32_t RandomEngine::NextRandomInteger32(uint32_t min, uint32_t max) { return min + static_cast<uint32_t>(NextRandom32() * double(max - min)); } -void RandomEngine::SetSeed(uint32_t seed) { +void RandomEngine::SetSeed(uint64_t seed) { random_state->pcg.seed(seed); } diff --git a/src/duckdb/src/execution/join_hashtable.cpp b/src/duckdb/src/execution/join_hashtable.cpp index d5122470e..14abdc61e 100644 --- a/src/duckdb/src/execution/join_hashtable.cpp +++ b/src/duckdb/src/execution/join_hashtable.cpp @@ -1528,18 +1528,28 @@ bool JoinHashTable::PrepareExternalFinalize(const idx_t max_ht_size) { // Create vector with unfinished partition indices auto &partitions = sink_collection->GetPartitions(); + auto min_partition_size = NumericLimits<idx_t>::Maximum(); vector<idx_t> partition_indices; partition_indices.reserve(num_partitions); for (idx_t partition_idx = 0; partition_idx < num_partitions; partition_idx++) { - if (!completed_partitions.RowIsValidUnsafe(partition_idx)) { - partition_indices.push_back(partition_idx); + if (completed_partitions.RowIsValidUnsafe(partition_idx)) { + continue; } + partition_indices.push_back(partition_idx); + // Keep track of min partition size + const auto size = + partitions[partition_idx]->SizeInBytes() + PointerTableSize(partitions[partition_idx]->Count()); + min_partition_size = MinValue(min_partition_size, size); } + // Sort partitions by size, from small to large - std::sort(partition_indices.begin(), partition_indices.end(), [&](const idx_t &lhs, const idx_t &rhs) { + std::stable_sort(partition_indices.begin(), partition_indices.end(), [&](const idx_t &lhs, const idx_t &rhs) { const auto lhs_size = partitions[lhs]->SizeInBytes() + PointerTableSize(partitions[lhs]->Count()); const auto rhs_size = partitions[rhs]->SizeInBytes() + PointerTableSize(partitions[rhs]->Count()); - return lhs_size < rhs_size; + // We divide by min_partition_size, effectively rounding everything down to a multiple of min_partition_size + // This makes it so that minor differences in partition sizes don't mess up the original order + // Retaining as much of the original order as possible reduces I/O (the partition idx determines the eviction queue idx) + return lhs_size / min_partition_size < rhs_size / min_partition_size; }); // Determine which partitions should go next diff --git a/src/duckdb/src/execution/operator/aggregate/physical_window.cpp b/src/duckdb/src/execution/operator/aggregate/physical_window.cpp index a85b74a47..8b8b2a162 100644 --- a/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +++ b/src/duckdb/src/execution/operator/aggregate/physical_window.cpp @@ -2,7 +2,6 @@ #include "duckdb/common/sort/partition_state.hpp" #include "duckdb/function/window/window_aggregate_function.hpp" -#include "duckdb/function/window/window_cumedist_function.hpp" #include "duckdb/function/window/window_executor.hpp" #include "duckdb/function/window/window_rank_function.hpp" #include "duckdb/function/window/window_rownumber_function.hpp" diff --git a/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp b/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp index 7d6e2e3ed..08311cfe3 100644 --- a/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +++ 
b/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp @@ -133,7 +133,7 @@ AdaptiveSnifferResult CSVSniffer::MinimalSniff() { vector<HeaderValue> potential_header; for (idx_t col_idx = 0; col_idx < data_chunk.ColumnCount(); col_idx++) { auto &cur_vector = data_chunk.data[col_idx]; - auto vector_data = FlatVector::GetData<string_t>(cur_vector); + const auto vector_data = FlatVector::GetData<string_t>(cur_vector); auto &validity = FlatVector::Validity(cur_vector); HeaderValue val; if (validity.RowIsValid(0)) { @@ -181,7 +181,7 @@ SnifferResult CSVSniffer::AdaptiveSniff(const CSVSchema &file_schema) { return min_sniff_res.ToSnifferResult(); } -SnifferResult CSVSniffer::SniffCSV(bool force_match) { +SnifferResult CSVSniffer::SniffCSV(const bool force_match) { buffer_manager->sniffing = true; // 1. Dialect Detection DetectDialect(); diff --git a/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp b/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp index 14099df86..16466e3e9 100644 --- a/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +++ b/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp @@ -362,7 +362,8 @@ void CSVSniffer::AnalyzeDialectCandidate(unique_ptr scanner, (single_column_before || ((more_values || more_columns) && !require_more_padding) || (more_than_one_column && require_less_padding) || quoted) && !invalid_padding && comments_are_acceptable) { - if (!candidates.empty() && set_columns.IsSet() && max_columns_found == set_columns.Size()) { + if (!candidates.empty() && set_columns.IsSet() && max_columns_found == set_columns.Size() && + consistent_rows <= best_consistent_rows) { // We have a candidate that fits our requirements better if (candidates.front()->ever_quoted || !scanner->ever_quoted) { return; diff --git a/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp b/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp index 4d69dc975..424468c55 100644 --- a/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +++ b/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp @@ -17,8 +17,8 @@ static string GenerateColumnName(const idx_t total_cols, const idx_t col_number, // Helper function for UTF-8 aware space trimming static string TrimWhitespace(const string &col_name) { utf8proc_int32_t codepoint; - auto str = reinterpret_cast<const utf8proc_uint8_t *>(col_name.c_str()); - idx_t size = col_name.size(); + const auto str = reinterpret_cast<const utf8proc_uint8_t *>(col_name.c_str()); + const idx_t size = col_name.size(); // Find the first character that is not left trimmed idx_t begin = 0; while (begin < size) { @@ -96,6 +96,44 @@ static string NormalizeColumnName(const string &col_name) { return col_name_cleaned; } +static void ReplaceNames(vector<string> &detected_names, CSVStateMachine &state_machine, + unordered_map<idx_t, vector<LogicalType>> &best_sql_types_candidates_per_column_idx, + CSVReaderOptions &options, const vector<HeaderValue> &best_header_row, + CSVErrorHandler &error_handler) { + auto &dialect_options = state_machine.dialect_options; + if (!options.columns_set) { + if (options.file_options.hive_partitioning || options.file_options.union_by_name || options.multi_file_reader) { + // Just do the replacement + for (idx_t i = 0; i < MinValue<idx_t>(detected_names.size(), options.name_list.size()); i++) { + detected_names[i] = options.name_list[i]; + } + return; + } + if (options.name_list.size() > dialect_options.num_cols) { + if (options.null_padding) { + // we increase our types + idx_t col = 0; + for 
(idx_t i = dialect_options.num_cols; i < options.name_list.size(); i++) { + detected_names.push_back(GenerateColumnName(options.name_list.size(), col++)); + best_sql_types_candidates_per_column_idx[i] = {LogicalType::VARCHAR}; + } + + dialect_options.num_cols = options.name_list.size(); + + } else { + // we throw an error + const auto error = CSVError::HeaderSniffingError( + options, best_header_row, options.name_list.size(), + state_machine.dialect_options.state_machine_options.delimiter.GetValue()); + error_handler.Error(error); + } + } + for (idx_t i = 0; i < options.name_list.size(); i++) { + detected_names[i] = options.name_list[i]; + } + } +} + // If our columns were set by the user, we verify if their names match with the first row bool CSVSniffer::DetectHeaderWithSetColumn(ClientContext &context, vector &best_header_row, const SetColumns &set_columns, CSVReaderOptions &options) { @@ -181,11 +219,8 @@ CSVSniffer::DetectHeaderInternal(ClientContext &context, vector &be detected_names.push_back(GenerateColumnName(dialect_options.num_cols, col)); } // If the user provided names, we must replace our header with the user provided names - if (!options.columns_set) { - for (idx_t i = 0; i < MinValue(best_header_row.size(), options.name_list.size()); i++) { - detected_names[i] = options.name_list[i]; - } - } + ReplaceNames(detected_names, state_machine, best_sql_types_candidates_per_column_idx, options, best_header_row, + error_handler); return detected_names; } // information for header detection @@ -199,11 +234,8 @@ CSVSniffer::DetectHeaderInternal(ClientContext &context, vector &be detected_names.push_back(GenerateColumnName(dialect_options.num_cols, col)); } dialect_options.rows_until_header += 1; - if (!options.columns_set) { - for (idx_t i = 0; i < MinValue(detected_names.size(), options.name_list.size()); i++) { - detected_names[i] = options.name_list[i]; - } - } + ReplaceNames(detected_names, state_machine, best_sql_types_candidates_per_column_idx, options, + best_header_row, error_handler); return detected_names; } auto error = @@ -295,16 +327,17 @@ CSVSniffer::DetectHeaderInternal(ClientContext &context, vector &be } // If the user provided names, we must replace our header with the user provided names - if (!options.columns_set) { - for (idx_t i = 0; i < MinValue(detected_names.size(), options.name_list.size()); i++) { - detected_names[i] = options.name_list[i]; - } - } + ReplaceNames(detected_names, state_machine, best_sql_types_candidates_per_column_idx, options, best_header_row, + error_handler); return detected_names; } void CSVSniffer::DetectHeader() { auto &sniffer_state_machine = best_candidate->GetStateMachine(); names = DetectHeaderInternal(buffer_manager->context, best_header_row, sniffer_state_machine, set_columns, best_sql_types_candidates_per_column_idx, options, *error_handler); + for (idx_t i = max_columns_found; i < names.size(); i++) { + detected_types.push_back(LogicalType::VARCHAR); + } + max_columns_found = names.size(); } } // namespace duckdb diff --git a/src/duckdb/src/execution/operator/csv_scanner/state_machine/csv_state_machine_cache.cpp b/src/duckdb/src/execution/operator/csv_scanner/state_machine/csv_state_machine_cache.cpp index 19cae335c..7b42a6d88 100644 --- a/src/duckdb/src/execution/operator/csv_scanner/state_machine/csv_state_machine_cache.cpp +++ b/src/duckdb/src/execution/operator/csv_scanner/state_machine/csv_state_machine_cache.cpp @@ -57,14 +57,14 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op const bool 
multi_byte_delimiter = delimiter_value.size() != 1; - bool enable_unquoted_escape = state_machine_options.rfc_4180.GetValue() == false && - state_machine_options.quote != state_machine_options.escape && - state_machine_options.escape != '\0'; + const bool enable_unquoted_escape = state_machine_options.rfc_4180.GetValue() == false && + state_machine_options.quote != state_machine_options.escape && + state_machine_options.escape != '\0'; // Now set values depending on configuration // 1) Standard/Invalid State - vector<uint8_t> std_inv {static_cast<uint8_t>(CSVState::STANDARD), static_cast<uint8_t>(CSVState::INVALID), - static_cast<uint8_t>(CSVState::STANDARD_NEWLINE)}; - for (auto &state : std_inv) { + const vector<uint8_t> std_inv {static_cast<uint8_t>(CSVState::STANDARD), static_cast<uint8_t>(CSVState::INVALID), + static_cast<uint8_t>(CSVState::STANDARD_NEWLINE)}; + for (const auto &state : std_inv) { if (multi_byte_delimiter) { transition_array[delimiter_first_byte][state] = CSVState::DELIMITER_FIRST_BYTE; } else { @@ -75,7 +75,9 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op if (state == static_cast<uint8_t>(CSVState::STANDARD_NEWLINE)) { transition_array[static_cast<uint8_t>('\n')][state] = CSVState::STANDARD; } else { - transition_array[static_cast<uint8_t>('\n')][state] = CSVState::RECORD_SEPARATOR; + if (!state_machine_options.rfc_4180.GetValue()) { + transition_array[static_cast<uint8_t>('\n')][state] = CSVState::RECORD_SEPARATOR; + } } } else { transition_array[static_cast<uint8_t>('\r')][state] = CSVState::RECORD_SEPARATOR; @@ -96,7 +98,7 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op transition_array[' '][static_cast<uint8_t>(CSVState::DELIMITER)] = CSVState::EMPTY_SPACE; } - vector<uint8_t> delimiter_states { + const vector<uint8_t> delimiter_states { static_cast<uint8_t>(CSVState::DELIMITER), static_cast<uint8_t>(CSVState::DELIMITER_FIRST_BYTE), static_cast<uint8_t>(CSVState::DELIMITER_SECOND_BYTE), static_cast<uint8_t>(CSVState::DELIMITER_THIRD_BYTE)}; diff --git a/src/duckdb/src/execution/operator/csv_scanner/table_function/csv_file_scanner.cpp b/src/duckdb/src/execution/operator/csv_scanner/table_function/csv_file_scanner.cpp index 018641286..efcd73f61 100644 --- a/src/duckdb/src/execution/operator/csv_scanner/table_function/csv_file_scanner.cpp +++ b/src/duckdb/src/execution/operator/csv_scanner/table_function/csv_file_scanner.cpp @@ -60,7 +60,7 @@ void CSVFileScan::SetStart() { } CSVFileScan::CSVFileScan(ClientContext &context, const string &file_path_p, const CSVReaderOptions &options_p, - const idx_t file_idx_p, const ReadCSVData &bind_data, const vector &column_ids, + idx_t file_idx_p, const ReadCSVData &bind_data, const vector &column_ids, CSVSchema &file_schema, bool per_file_single_threaded) : file_path(file_path_p), file_idx(file_idx_p), error_handler(make_shared_ptr<CSVErrorHandler>(options_p.ignore_errors.GetValue())), options(options_p) { diff --git a/src/duckdb/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp b/src/duckdb/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp index 18772aad9..3874e74df 100644 --- a/src/duckdb/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp +++ b/src/duckdb/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp @@ -87,6 +87,15 @@ unique_ptr CSVGlobalState::Next(optional_ptrGetValidationLine()); } if (single_threaded) { + { + lock_guard<mutex> parallel_lock(main_mutex); + if (previous_scanner) { + // Clean up the previous scanner. 
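+					// (Resetting the buffer tracker and the buffer currently in use drops the
+					// last references to the previous file's buffers, so that memory can be
+					// reclaimed before the next file is scanned.)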
+ previous_scanner->buffer_tracker.reset(); + current_buffer_in_use.reset(); + previous_scanner->csv_file_scan->Finish(); + } + } idx_t cur_idx; bool empty_file = false; do { @@ -108,6 +117,7 @@ unique_ptr CSVGlobalState::Next(optional_ptr(context, bind_data.files[cur_idx], bind_data.options, cur_idx, bind_data, column_ids, file_schema, true); empty_file = file_scan->file_size == 0; + if (!empty_file) { lock_guard parallel_lock(main_mutex); file_scans.emplace_back(std::move(file_scan)); @@ -116,11 +126,7 @@ unique_ptr CSVGlobalState::Next(optional_ptr(*file_scans.back()->buffer_manager, current_boundary.GetBufferIdx()); - if (previous_scanner) { - previous_scanner->buffer_tracker.reset(); - current_buffer_in_use.reset(); - previous_scanner->csv_file_scan->Finish(); - } + return make_uniq(scanner_idx++, current_file->buffer_manager, current_file->state_machine, current_file->error_handler, current_file, false, current_boundary); @@ -178,7 +184,7 @@ unique_ptr CSVGlobalState::Next(optional_ptron_disk_file) { return system_threads; } idx_t total_threads = file_scans.front()->file_size / CSVIterator::BYTES_PER_THREAD + 1; diff --git a/src/duckdb/src/execution/sample/reservoir_sample.cpp b/src/duckdb/src/execution/sample/reservoir_sample.cpp index 1e46f228e..ba777b609 100644 --- a/src/duckdb/src/execution/sample/reservoir_sample.cpp +++ b/src/duckdb/src/execution/sample/reservoir_sample.cpp @@ -505,7 +505,7 @@ void ReservoirSample::EvictOverBudgetSamples() { D_ASSERT(num_samples_to_keep <= sample_count); D_ASSERT(stats_sample); D_ASSERT(sample_count == FIXED_SAMPLE_SIZE); - auto new_reservoir_chunk = CreateNewSampleChunk(types, FIXED_SAMPLE_SIZE); + auto new_reservoir_chunk = CreateNewSampleChunk(types, sample_count); // The current selection vector can potentially have 2048 valid mappings. 
// If we need to save a sample with less rows than that, we need to do the following diff --git a/src/duckdb/src/function/table/read_csv.cpp b/src/duckdb/src/function/table/read_csv.cpp index d27c34700..b999573ae 100644 --- a/src/duckdb/src/function/table/read_csv.cpp +++ b/src/duckdb/src/function/table/read_csv.cpp @@ -126,7 +126,9 @@ static unique_ptr ReadCSVBind(ClientContext &context, TableFunctio auto &options = result->options; auto multi_file_reader = MultiFileReader::Create(input.table_function); auto multi_file_list = multi_file_reader->CreateFileList(context, input.inputs[0]); - + if (multi_file_list->GetTotalFileCount() > 1) { + options.multi_file_reader = true; + } options.FromNamedParameters(input.named_parameters, context); options.file_options.AutoDetectHivePartitioning(*multi_file_list, context); diff --git a/src/duckdb/src/function/table/system/duckdb_memory.cpp b/src/duckdb/src/function/table/system/duckdb_memory.cpp index e67fa5106..a1eb044b7 100644 --- a/src/duckdb/src/function/table/system/duckdb_memory.cpp +++ b/src/duckdb/src/function/table/system/duckdb_memory.cpp @@ -27,7 +27,6 @@ static unique_ptr DuckDBMemoryBind(ClientContext &context, TableFu unique_ptr DuckDBMemoryInit(ClientContext &context, TableFunctionInitInput &input) { auto result = make_uniq(); - result->entries = BufferManager::GetBufferManager(context).GetMemoryUsageInfo(); return std::move(result); } diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp index 53720a795..14490baa9 100644 --- a/src/duckdb/src/function/table/version/pragma_version.cpp +++ b/src/duckdb/src/function/table/version/pragma_version.cpp @@ -1,17 +1,17 @@ #ifndef DUCKDB_PATCH_VERSION -#define DUCKDB_PATCH_VERSION "4-dev3741" +#define DUCKDB_PATCH_VERSION "0" #endif #ifndef DUCKDB_MINOR_VERSION -#define DUCKDB_MINOR_VERSION 1 +#define DUCKDB_MINOR_VERSION 0 #endif #ifndef DUCKDB_MAJOR_VERSION -#define DUCKDB_MAJOR_VERSION 1 +#define DUCKDB_MAJOR_VERSION 0 #endif #ifndef DUCKDB_VERSION -#define DUCKDB_VERSION "v1.1.4-dev3741" +#define DUCKDB_VERSION "v0.0.0" #endif #ifndef DUCKDB_SOURCE_ID -#define DUCKDB_SOURCE_ID "ab8c909857" +#define DUCKDB_SOURCE_ID "deadbeeff" #endif #include "duckdb/function/table/system_functions.hpp" #include "duckdb/main/database.hpp" diff --git a/src/duckdb/src/function/window/window_boundaries_state.cpp b/src/duckdb/src/function/window/window_boundaries_state.cpp index 92f14860b..234d0cadc 100644 --- a/src/duckdb/src/function/window/window_boundaries_state.cpp +++ b/src/duckdb/src/function/window/window_boundaries_state.cpp @@ -328,7 +328,9 @@ WindowBoundsSet WindowBoundariesState::GetWindowBounds(const BoundWindowExpressi case ExpressionType::WINDOW_CUME_DIST: result.insert(PARTITION_BEGIN); result.insert(PARTITION_END); - result.insert(PEER_END); + if (wexpr.arg_orders.empty()) { + result.insert(PEER_END); + } break; case ExpressionType::WINDOW_NTILE: case ExpressionType::WINDOW_LEAD: diff --git a/src/duckdb/src/function/window/window_cumedist_function.cpp b/src/duckdb/src/function/window/window_cumedist_function.cpp deleted file mode 100644 index 7090b84a5..000000000 --- a/src/duckdb/src/function/window/window_cumedist_function.cpp +++ /dev/null @@ -1,28 +0,0 @@ -#include "duckdb/function/window/window_cumedist_function.hpp" -#include "duckdb/function/window/window_shared_expressions.hpp" - -namespace duckdb { - -//===--------------------------------------------------------------------===// -// WindowCumeDistExecutor 
-//===--------------------------------------------------------------------===// -WindowCumeDistExecutor::WindowCumeDistExecutor(BoundWindowExpression &wexpr, ClientContext &context, - WindowSharedExpressions &shared) - : WindowExecutor(wexpr, context, shared) { -} - -void WindowCumeDistExecutor::EvaluateInternal(WindowExecutorGlobalState &gstate, WindowExecutorLocalState &lstate, - DataChunk &eval_chunk, Vector &result, idx_t count, idx_t row_idx) const { - auto &lbstate = lstate.Cast<WindowExecutorBoundsState>(); - auto partition_begin = FlatVector::GetData<const idx_t>(lbstate.bounds.data[PARTITION_BEGIN]); - auto partition_end = FlatVector::GetData<const idx_t>(lbstate.bounds.data[PARTITION_END]); - auto peer_end = FlatVector::GetData<const idx_t>(lbstate.bounds.data[PEER_END]); - auto rdata = FlatVector::GetData<double>(result); - for (idx_t i = 0; i < count; ++i, ++row_idx) { - auto denom = static_cast<double>(NumericCast<int64_t>(partition_end[i] - partition_begin[i])); - double cume_dist = denom > 0 ? ((double)(peer_end[i] - partition_begin[i])) / denom : 0; - rdata[i] = cume_dist; - } -} - -} // namespace duckdb diff --git a/src/duckdb/src/function/window/window_rank_function.cpp b/src/duckdb/src/function/window/window_rank_function.cpp index 05a390948..d1299430a 100644 --- a/src/duckdb/src/function/window/window_rank_function.cpp +++ b/src/duckdb/src/function/window/window_rank_function.cpp @@ -242,4 +242,38 @@ void WindowPercentRankExecutor::EvaluateInternal(WindowExecutorGlobalState &gsta } } +//===--------------------------------------------------------------------===// +// WindowCumeDistExecutor +//===--------------------------------------------------------------------===// +WindowCumeDistExecutor::WindowCumeDistExecutor(BoundWindowExpression &wexpr, ClientContext &context, + WindowSharedExpressions &shared) + : WindowPeerExecutor(wexpr, context, shared) { +} + +void WindowCumeDistExecutor::EvaluateInternal(WindowExecutorGlobalState &gstate, WindowExecutorLocalState &lstate, + DataChunk &eval_chunk, Vector &result, idx_t count, idx_t row_idx) const { + auto &gpeer = gstate.Cast<WindowPeerGlobalState>(); + auto &lpeer = lstate.Cast<WindowPeerLocalState>(); + auto partition_begin = FlatVector::GetData<const idx_t>(lpeer.bounds.data[PARTITION_BEGIN]); + auto partition_end = FlatVector::GetData<const idx_t>(lpeer.bounds.data[PARTITION_END]); + auto rdata = FlatVector::GetData<double>(result); + + if (gpeer.token_tree) { + for (idx_t i = 0; i < count; ++i, ++row_idx) { + const auto denom = static_cast<double>(NumericCast<int64_t>(partition_end[i] - partition_begin[i])); + const auto peer_end = gpeer.token_tree->PeerEnd(partition_begin[i], partition_end[i], row_idx); + const auto num = static_cast<double>(peer_end - partition_begin[i]); + rdata[i] = denom > 0 ? (num / denom) : 0; + } + return; + } + + auto peer_end = FlatVector::GetData<const idx_t>(lpeer.bounds.data[PEER_END]); + for (idx_t i = 0; i < count; ++i, ++row_idx) { + const auto denom = static_cast<double>(NumericCast<int64_t>(partition_end[i] - partition_begin[i])); + const auto num = static_cast<double>(peer_end[i] - partition_begin[i]); + rdata[i] = denom > 0 ? 
(num / denom) : 0; + } +} + } // namespace duckdb diff --git a/src/duckdb/src/function/window/window_token_tree.cpp b/src/duckdb/src/function/window/window_token_tree.cpp index b4d819be2..82b5124e4 100644 --- a/src/duckdb/src/function/window/window_token_tree.cpp +++ b/src/duckdb/src/function/window/window_token_tree.cpp @@ -120,4 +120,23 @@ idx_t WindowTokenTree::Rank(const idx_t lower, const idx_t upper, const idx_t ro } } +template <typename TREE> +static idx_t NextPeer(const TREE &tree, const idx_t lower, const idx_t upper, const idx_t row_idx) { + idx_t rank = 0; + // Because tokens are dense, we can find the next peer by adding 1 to the probed token value + const auto needle = tree.LowestLevel()[row_idx] + 1; + tree.AggregateLowerBound(lower, upper, needle, [&](idx_t level, const idx_t run_begin, const idx_t run_pos) { + rank += run_pos - run_begin; + }); + return rank; +} + +idx_t WindowTokenTree::PeerEnd(const idx_t lower, const idx_t upper, const idx_t row_idx) const { + if (mst64) { + return NextPeer(*mst64, lower, upper, row_idx); + } else { + return NextPeer(*mst32, lower, upper, row_idx); + } +} + } // namespace duckdb diff --git a/src/duckdb/src/include/duckdb.h b/src/duckdb/src/include/duckdb.h index e2fdbfc51..5efd2bd6b 100644 --- a/src/duckdb/src/include/duckdb.h +++ b/src/duckdb/src/include/duckdb.h @@ -670,6 +670,8 @@ struct duckdb_extension_access { const void *(*get_api)(duckdb_extension_info info, const char *version); }; +#ifndef DUCKDB_API_EXCLUDE_FUNCTIONS + //===--------------------------------------------------------------------===// // Functions //===--------------------------------------------------------------------===// @@ -4499,6 +4501,8 @@ Destroys the cast function object. */ DUCKDB_API void duckdb_destroy_cast_function(duckdb_cast_function *cast_function); +#endif + #ifdef __cplusplus } #endif diff --git a/src/duckdb/src/include/duckdb/common/arrow/appender/varchar_data.hpp b/src/duckdb/src/include/duckdb/common/arrow/appender/varchar_data.hpp index 12f68e431..6dc37652d 100644 --- a/src/duckdb/src/include/duckdb/common/arrow/appender/varchar_data.hpp +++ b/src/duckdb/src/include/duckdb/common/arrow/appender/varchar_data.hpp @@ -87,7 +87,8 @@ struct ArrowVarcharData { D_ASSERT(append_data.options.arrow_offset_size == ArrowOffsetSize::REGULAR); throw InvalidInputException( "Arrow Appender: The maximum total string size for regular string buffers is " - "%u but the offset of %lu exceeds this.", + "%u but the offset of %lu exceeds this.\n* SET arrow_large_buffer_size=true to use large string " + "buffers", NumericLimits<uint32_t>::Maximum(), current_offset); } offset_data[offset_idx] = UnsafeNumericCast(current_offset); diff --git a/src/duckdb/src/include/duckdb/common/assert.hpp b/src/duckdb/src/include/duckdb/common/assert.hpp index 62721f73f..916f5c1c7 100644 --- a/src/duckdb/src/include/duckdb/common/assert.hpp +++ b/src/duckdb/src/include/duckdb/common/assert.hpp @@ -10,8 +10,19 @@ #pragma once -#if (defined(DUCKDB_USE_STANDARD_ASSERT) || !defined(DEBUG)) && !defined(DUCKDB_FORCE_ASSERT) && !defined(__MVS__) +// clang-format off +#if ( \ + /* Not a debug build */ \ + !defined(DEBUG) && \ + /* FORCE_ASSERT is not set (enables assertions even on release mode when set to true) */ \ + !defined(DUCKDB_FORCE_ASSERT) && \ + /* The project is not compiled for z/OS (MVS) */ \ + !defined(__MVS__) \ +) +// clang-format on +//! On most builds, NDEBUG is defined, turning the assert call into a NO-OP +//! 
Only the 'else' condition is supposed to check the assertions #include <assert.h> #define D_ASSERT assert namespace duckdb { diff --git a/src/duckdb/src/include/duckdb/common/atomic_ptr.hpp b/src/duckdb/src/include/duckdb/common/atomic_ptr.hpp new file mode 100644 index 000000000..da2281d87 --- /dev/null +++ b/src/duckdb/src/include/duckdb/common/atomic_ptr.hpp @@ -0,0 +1,102 @@ +//===----------------------------------------------------------------------===// +// DuckDB +// +// duckdb/common/atomic_ptr.hpp +// +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "duckdb/common/exception.hpp" +#include "duckdb/common/atomic.hpp" +#include "duckdb/common/shared_ptr.hpp" +#include "duckdb/common/unique_ptr.hpp" + +namespace duckdb { + +template <class T, bool SAFE = true> +class atomic_ptr { // NOLINT: mimic std casing +public: + atomic_ptr() noexcept : ptr(nullptr) { + } + atomic_ptr(T *ptr_p) : ptr(ptr_p) { // NOLINT: allow implicit creation from pointer + } + atomic_ptr(T &ref) : ptr(&ref) { // NOLINT: allow implicit creation from reference + } + atomic_ptr(const unique_ptr<T> &ptr_p) : ptr(ptr_p.get()) { // NOLINT: allow implicit creation from unique pointer + } + atomic_ptr(const shared_ptr<T> &ptr_p) : ptr(ptr_p.get()) { // NOLINT: allow implicit creation from shared pointer + } + + void CheckValid(const T *ptr) const { + if (!MemorySafety<SAFE>::ENABLED) { + return; + } + if (!ptr) { + throw InternalException("Attempting to dereference an optional pointer that is not set"); + } + } + + T *GetPointer() { + auto res = ptr.load(); + CheckValid(res); + return res; + } + + const T *GetPointer() const { + auto res = ptr.load(); + CheckValid(res); + return res; + } + + operator bool() const { // NOLINT: allow implicit conversion to bool + return ptr; + } + T &operator*() { + return *GetPointer(); + } + const T &operator*() const { + return *GetPointer(); + } + T *operator->() { + return GetPointer(); + } + const T *operator->() const { + return GetPointer(); + } + T *get() { // NOLINT: mimic std casing + return GetPointer(); + } + const T *get() const { // NOLINT: mimic std casing + return GetPointer(); + } + // this looks dirty - but this is the default behavior of raw pointers + T *get_mutable() const { // NOLINT: mimic std casing + return GetPointer(); + } + + void set(T &ref) { + ptr = &ref; + } + + void reset() { + ptr = nullptr; + } + + bool operator==(const atomic_ptr &rhs) const { + return ptr.load() == rhs.ptr.load(); + } + + bool operator!=(const atomic_ptr &rhs) const { + return ptr.load() != rhs.ptr.load(); + } + +private: + atomic<T *> ptr; +}; + +template <class T> +using unsafe_atomic_ptr = atomic_ptr<T, false>; + +} // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/common/extension_type_info.hpp b/src/duckdb/src/include/duckdb/common/extension_type_info.hpp new file mode 100644 index 000000000..48e12d3c3 --- /dev/null +++ b/src/duckdb/src/include/duckdb/common/extension_type_info.hpp @@ -0,0 +1,19 @@ +#pragma once + +#include "duckdb/common/string.hpp" +#include "duckdb/common/types/value.hpp" +#include "duckdb/common/serializer/serializer.hpp" + +namespace duckdb { + +struct ExtensionTypeInfo { + vector<LogicalTypeModifier> modifiers; + unordered_map<string, Value> properties; + +public: + void Serialize(Serializer &serializer) const; + static unique_ptr<ExtensionTypeInfo> Deserialize(Deserializer &source); + bool Equals(optional_ptr<ExtensionTypeInfo> other_p) const; +}; + +} // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/common/file_buffer.hpp b/src/duckdb/src/include/duckdb/common/file_buffer.hpp index 818b53ed6..8db3e0d0c 100644 
--- a/src/duckdb/src/include/duckdb/common/file_buffer.hpp +++ b/src/duckdb/src/include/duckdb/common/file_buffer.hpp @@ -34,7 +34,8 @@ class FileBuffer { Allocator &allocator; //! The buffer that users can write to data_ptr_t buffer; - //! The size of the portion that users can write to, this is equivalent to internal_size - BLOCK_HEADER_SIZE + //! The user-facing size of the buffer. + //! This is equivalent to internal_size - BLOCK_HEADER_SIZE. uint64_t size; public: @@ -49,13 +50,16 @@ class FileBuffer { return type; } - // Same rules as the constructor. We will add room for a header, in additio to - // the requested user bytes. We will then sector-align the result. + // Same rules as the constructor. We add room for a header, in addition to + // the requested user bytes. We then sector-align the result. void Resize(uint64_t user_size); uint64_t AllocSize() const { return internal_size; } + uint64_t Size() const { + return size; + } data_ptr_t InternalBuffer() { return internal_buffer; } @@ -66,18 +70,19 @@ class FileBuffer { }; MemoryRequirement CalculateMemory(uint64_t user_size); - void Initialize(DebugInitialize info); protected: - //! The type of the buffer + //! The type of the buffer. FileBufferType type; - //! The pointer to the internal buffer that will be read or written, including the buffer header + //! The pointer to the internal buffer that will be read from or written to. + //! This includes the buffer header. data_ptr_t internal_buffer; - //! The aligned size as passed to the constructor. This is the size that is read or written to disk. + //! The aligned size as passed to the constructor. + //! This is the size that is read from or written to disk. uint64_t internal_size; - void ReallocBuffer(size_t malloc_size); + void ReallocBuffer(idx_t new_size); void Init(); }; diff --git a/src/duckdb/src/include/duckdb/common/random_engine.hpp b/src/duckdb/src/include/duckdb/common/random_engine.hpp index 59531e1d9..8a5a3097e 100644 --- a/src/duckdb/src/include/duckdb/common/random_engine.hpp +++ b/src/duckdb/src/include/duckdb/common/random_engine.hpp @@ -36,7 +36,7 @@ class RandomEngine { uint32_t NextRandomInteger(uint32_t min, uint32_t max); uint64_t NextRandomInteger64(); - void SetSeed(uint32_t seed); + void SetSeed(uint64_t seed); static RandomEngine &Get(ClientContext &context); diff --git a/src/duckdb/src/include/duckdb/common/type_util.hpp b/src/duckdb/src/include/duckdb/common/type_util.hpp index c198bafcd..d63f528ba 100644 --- a/src/duckdb/src/include/duckdb/common/type_util.hpp +++ b/src/duckdb/src/include/duckdb/common/type_util.hpp @@ -48,6 +48,14 @@ PhysicalType GetTypeId() { return PhysicalType::INT64; } else if (std::is_same()) { return PhysicalType::INT64; + } else if (std::is_same()) { + return PhysicalType::INT64; + } else if (std::is_same()) { + return PhysicalType::INT64; + } else if (std::is_same()) { + return PhysicalType::INT64; + } else if (std::is_same()) { + return PhysicalType::INT64; } else if (std::is_same()) { return PhysicalType::FLOAT; } else if (std::is_same()) { diff --git a/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_file_scanner.hpp b/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_file_scanner.hpp index 826ef217a..3ebc88ff3 100644 --- a/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_file_scanner.hpp +++ b/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_file_scanner.hpp @@ -45,7 +45,7 @@ class CSVFileScan { const ReadCSVData &bind_data, const vector &column_ids, 
CSVSchema &file_schema); //! Constructor for new CSV Files, we must initialize the buffer manager and the state machine //! Path to this file - CSVFileScan(ClientContext &context, const string &file_path, const CSVReaderOptions &options, const idx_t file_idx, + CSVFileScan(ClientContext &context, const string &file_path, const CSVReaderOptions &options, idx_t file_idx, const ReadCSVData &bind_data, const vector &column_ids, CSVSchema &file_schema, bool per_file_single_threaded); diff --git a/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp b/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp index fc8288f87..0fc506659 100644 --- a/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp +++ b/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp @@ -135,6 +135,8 @@ struct CSVReaderOptions { map write_date_format = {{LogicalTypeId::DATE, Value()}, {LogicalTypeId::TIMESTAMP, Value()}}; //! Whether a type format is specified map has_format = {{LogicalTypeId::DATE, false}, {LogicalTypeId::TIMESTAMP, false}}; + //! If this reader is a multifile reader + bool multi_file_reader = false; void Serialize(Serializer &serializer) const; static CSVReaderOptions Deserialize(Deserializer &deserializer); diff --git a/src/duckdb/src/include/duckdb/function/window/window_cumedist_function.hpp b/src/duckdb/src/include/duckdb/function/window/window_cumedist_function.hpp deleted file mode 100644 index 0ba062d47..000000000 --- a/src/duckdb/src/include/duckdb/function/window/window_cumedist_function.hpp +++ /dev/null @@ -1,24 +0,0 @@ -//===----------------------------------------------------------------------===// -// DuckDB -// -// duckdb/function/window/window_cumedist_function.hpp -// -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include "duckdb/function/window/window_executor.hpp" - -namespace duckdb { - -class WindowCumeDistExecutor : public WindowExecutor { -public: - WindowCumeDistExecutor(BoundWindowExpression &wexpr, ClientContext &context, WindowSharedExpressions &shared); - -protected: - void EvaluateInternal(WindowExecutorGlobalState &gstate, WindowExecutorLocalState &lstate, DataChunk &eval_chunk, - Vector &result, idx_t count, idx_t row_idx) const override; -}; - -} // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/function/window/window_rank_function.hpp b/src/duckdb/src/include/duckdb/function/window/window_rank_function.hpp index e1a0cf486..3e53012cc 100644 --- a/src/duckdb/src/include/duckdb/function/window/window_rank_function.hpp +++ b/src/duckdb/src/include/duckdb/function/window/window_rank_function.hpp @@ -48,4 +48,13 @@ class WindowPercentRankExecutor : public WindowPeerExecutor { Vector &result, idx_t count, idx_t row_idx) const override; }; +class WindowCumeDistExecutor : public WindowPeerExecutor { +public: + WindowCumeDistExecutor(BoundWindowExpression &wexpr, ClientContext &context, WindowSharedExpressions &shared); + +protected: + void EvaluateInternal(WindowExecutorGlobalState &gstate, WindowExecutorLocalState &lstate, DataChunk &eval_chunk, + Vector &result, idx_t count, idx_t row_idx) const override; +}; + } // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/function/window/window_token_tree.hpp b/src/duckdb/src/include/duckdb/function/window/window_token_tree.hpp index b55012552..4df4dab8f 100644 --- a/src/duckdb/src/include/duckdb/function/window/window_token_tree.hpp +++ 
b/src/duckdb/src/include/duckdb/function/window/window_token_tree.hpp @@ -32,6 +32,9 @@ class WindowTokenTree : public WindowMergeSortTree { //! Find the rank of the row within the range idx_t Rank(const idx_t lower, const idx_t upper, const idx_t row_idx) const; + //! Find the next peer after the row and within the range + idx_t PeerEnd(const idx_t lower, const idx_t upper, const idx_t row_idx) const; + //! Peer boundaries. vector deltas; diff --git a/src/duckdb/src/include/duckdb/optimizer/build_probe_side_optimizer.hpp b/src/duckdb/src/include/duckdb/optimizer/build_probe_side_optimizer.hpp index cdb7dbfda..e60938745 100644 --- a/src/duckdb/src/include/duckdb/optimizer/build_probe_side_optimizer.hpp +++ b/src/duckdb/src/include/duckdb/optimizer/build_probe_side_optimizer.hpp @@ -11,7 +11,6 @@ #include "duckdb/common/unordered_set.hpp" #include "duckdb/common/vector.hpp" #include "duckdb/planner/logical_operator.hpp" -#include "duckdb/planner/operator/logical_filter.hpp" namespace duckdb { @@ -32,7 +31,6 @@ class BuildProbeSideOptimizer : LogicalOperatorVisitor { public: explicit BuildProbeSideOptimizer(ClientContext &context, LogicalOperator &op); - void VisitOperator(LogicalOperator &op) override; void VisitExpression(unique_ptr *expression) override {}; diff --git a/src/duckdb/src/include/duckdb_extension.h b/src/duckdb/src/include/duckdb_extension.h index 1c57e3f97..5eca823d5 100644 --- a/src/duckdb/src/include/duckdb_extension.h +++ b/src/duckdb/src/include/duckdb_extension.h @@ -11,8 +11,7 @@ // WARNING: this file is autogenerated by scripts/generate_c_api.py, manual changes will be overwritten // !!!!!!! -// WARNING: this API is not yet stable, this means that this API is only guaranteed to work for this specific DuckDB -// version +// WARNING: this API is not yet stable, meaning it is only guaranteed to work for this specific DuckDB version. #pragma once diff --git a/src/duckdb/src/main/capi/prepared-c.cpp b/src/duckdb/src/main/capi/prepared-c.cpp index a065f41a5..d033083d0 100644 --- a/src/duckdb/src/main/capi/prepared-c.cpp +++ b/src/duckdb/src/main/capi/prepared-c.cpp @@ -73,7 +73,7 @@ duckdb_state duckdb_prepare(duckdb_connection connection, const char *query, auto wrapper = new PreparedStatementWrapper(); Connection *conn = reinterpret_cast(connection); wrapper->statement = conn->Prepare(query); - *out_prepared_statement = (duckdb_prepared_statement)wrapper; + *out_prepared_statement = reinterpret_cast(wrapper); return !wrapper->statement->HasError() ? DuckDBSuccess : DuckDBError; } diff --git a/src/duckdb/src/optimizer/build_probe_side_optimizer.cpp b/src/duckdb/src/optimizer/build_probe_side_optimizer.cpp index 7ee56dc9c..efd21ed20 100644 --- a/src/duckdb/src/optimizer/build_probe_side_optimizer.cpp +++ b/src/duckdb/src/optimizer/build_probe_side_optimizer.cpp @@ -8,6 +8,10 @@ #include "duckdb/planner/operator/logical_get.hpp" #include "duckdb/planner/operator/logical_join.hpp" #include "duckdb/planner/operator/logical_order.hpp" +#include "duckdb/optimizer/column_binding_replacer.hpp" +#include "duckdb/optimizer/optimizer.hpp" +#include "duckdb/planner/operator/logical_cross_product.hpp" +#include "duckdb/planner/operator/logical_projection.hpp" namespace duckdb { @@ -68,7 +72,7 @@ static void FlipChildren(LogicalOperator &op) { return; } case LogicalOperatorType::LOGICAL_CROSS_PRODUCT: { - // don't need to do anything here. 
+ // don't need to do anything here + return; + } + default: @@ -207,6 +211,7 @@ void BuildProbeSideOptimizer::TryFlipJoinChildren(LogicalOperator &op) const { } void BuildProbeSideOptimizer::VisitOperator(LogicalOperator &op) { + // then the current operator switch (op.type) { case LogicalOperatorType::LOGICAL_DELIM_JOIN: { auto &join = op.Cast(); diff --git a/src/duckdb/src/optimizer/join_order/relation_manager.cpp b/src/duckdb/src/optimizer/join_order/relation_manager.cpp index e67a76c04..5e809a4c5 100644 --- a/src/duckdb/src/optimizer/join_order/relation_manager.cpp +++ b/src/duckdb/src/optimizer/join_order/relation_manager.cpp @@ -319,8 +319,8 @@ bool RelationManager::ExtractJoinRelations(JoinOrderOptimizer &optimizer, Logica return can_reorder_left && can_reorder_right; } case LogicalOperatorType::LOGICAL_CROSS_PRODUCT: { - bool can_reorder_right = ExtractJoinRelations(optimizer, *op->children[1], filter_operators, op); bool can_reorder_left = ExtractJoinRelations(optimizer, *op->children[0], filter_operators, op); + bool can_reorder_right = ExtractJoinRelations(optimizer, *op->children[1], filter_operators, op); return can_reorder_left && can_reorder_right; } case LogicalOperatorType::LOGICAL_DUMMY_SCAN: { diff --git a/src/duckdb/src/parser/transform/expression/transform_function.cpp b/src/duckdb/src/parser/transform/expression/transform_function.cpp index fd12b6ce0..7c64a1327 100644 --- a/src/duckdb/src/parser/transform/expression/transform_function.cpp +++ b/src/duckdb/src/parser/transform/expression/transform_function.cpp @@ -119,12 +119,12 @@ static bool IsOrderableWindowFunction(ExpressionType type) { case ExpressionType::WINDOW_PERCENT_RANK: case ExpressionType::WINDOW_ROW_NUMBER: case ExpressionType::WINDOW_NTILE: + case ExpressionType::WINDOW_CUME_DIST: return true; case ExpressionType::WINDOW_LEAD: case ExpressionType::WINDOW_LAG: case ExpressionType::WINDOW_AGGREGATE: case ExpressionType::WINDOW_RANK_DENSE: - case ExpressionType::WINDOW_CUME_DIST: return false; default: throw InternalException("Unknown orderable window type %s", ExpressionTypeToString(type).c_str()); diff --git a/src/duckdb/src/planner/filter/in_filter.cpp b/src/duckdb/src/planner/filter/in_filter.cpp index ed8cde827..9c848292a 100644 --- a/src/duckdb/src/planner/filter/in_filter.cpp +++ b/src/duckdb/src/planner/filter/in_filter.cpp @@ -48,7 +48,7 @@ FilterPropagateResult InFilter::CheckStatistics(BaseStatistics &stats) { string InFilter::ToString(const string &column_name) { string in_list; for (auto &val : values) { - if (in_list.empty()) { + if (!in_list.empty()) { in_list += ", "; } in_list += val.ToSQLString(); diff --git a/src/duckdb/src/planner/subquery/flatten_dependent_join.cpp b/src/duckdb/src/planner/subquery/flatten_dependent_join.cpp index da4336a4e..af79baa83 100644 --- a/src/duckdb/src/planner/subquery/flatten_dependent_join.cpp +++ b/src/duckdb/src/planner/subquery/flatten_dependent_join.cpp @@ -620,6 +620,29 @@ unique_ptr FlattenDependentJoins::PushDownDependentJoinInternal #endif plan->children[0] = PushDownDependentJoin(std::move(plan->children[0])); plan->children[1] = PushDownDependentJoin(std::move(plan->children[1])); + for (idx_t i = 0; i < plan->children.size(); i++) { + if (plan->children[i]->type == LogicalOperatorType::LOGICAL_CROSS_PRODUCT) { + auto proj_index = binder.GenerateTableIndex(); + auto bindings = plan->children[i]->GetColumnBindings(); + plan->children[i]->ResolveOperatorTypes(); + auto types = plan->children[i]->types; + vector<unique_ptr<Expression>> expressions; + 
expressions.reserve(bindings.size()); + D_ASSERT(bindings.size() == types.size()); + + // No column binding replacement is needed because the parent operator is + // a setop which will immediately assign new bindings. + for (idx_t col_idx = 0; col_idx < bindings.size(); col_idx++) { + expressions.push_back(make_uniq<BoundColumnRefExpression>(types[col_idx], bindings[col_idx])); + } + auto proj = make_uniq<LogicalProjection>(proj_index, std::move(expressions)); + proj->children.push_back(std::move(plan->children[i])); + plan->children[i] = std::move(proj); + } + } + + // Here we need to check the children: if they have reorderable bindings, we need to plan a projection + // on top that guarantees the order of the bindings. #ifdef DEBUG D_ASSERT(plan->children[0]->GetColumnBindings().size() == plan->children[1]->GetColumnBindings().size()); plan->children[0]->ResolveOperatorTypes(); diff --git a/src/duckdb/src/storage/serialization/serialize_nodes.cpp b/src/duckdb/src/storage/serialization/serialize_nodes.cpp index 2c7230383..59fc2681d 100644 --- a/src/duckdb/src/storage/serialization/serialize_nodes.cpp +++ b/src/duckdb/src/storage/serialization/serialize_nodes.cpp @@ -211,6 +211,7 @@ void CSVReaderOptions::Serialize(Serializer &serializer) const { serializer.WritePropertyWithDefault<string>(139, "encoding", encoding); serializer.WriteProperty<CSVOption<bool>>(140, "rfc_4180", dialect_options.state_machine_options.rfc_4180); serializer.WriteProperty<CSVOption<string>>(141, "multi_byte_delimiter", GetMultiByteDelimiter()); + serializer.WritePropertyWithDefault<bool>(142, "multi_file_reader", multi_file_reader); } CSVReaderOptions CSVReaderOptions::Deserialize(Deserializer &deserializer) { @@ -295,6 +296,7 @@ CSVReaderOptions CSVReaderOptions::Deserialize(Deserializer &deserializer) { result.dialect_options.rows_until_header = dialect_options_rows_until_header; result.encoding = std::move(encoding); result.dialect_options.state_machine_options.rfc_4180 = dialect_options_state_machine_options_rfc_4180; + deserializer.ReadPropertyWithDefault<bool>(142, "multi_file_reader", result.multi_file_reader); return result; } diff --git a/src/duckdb/src/storage/single_file_block_manager.cpp b/src/duckdb/src/storage/single_file_block_manager.cpp index 00d8aa94b..2b63fa247 100644 --- a/src/duckdb/src/storage/single_file_block_manager.cpp +++ b/src/duckdb/src/storage/single_file_block_manager.cpp @@ -265,7 +265,7 @@ void SingleFileBlockManager::ReadAndChecksum(FileBuffer &block, uint64_t locatio // compute the checksum auto stored_checksum = Load<uint64_t>(block.InternalBuffer()); - auto computed_checksum = Checksum(block.buffer, block.size); + auto computed_checksum = Checksum(block.buffer, block.Size()); // verify the checksum if (stored_checksum != computed_checksum) { @@ -277,7 +277,7 @@ void SingleFileBlockManager::ChecksumAndWrite(FileBuffer &block, uint64_t locatio // compute the checksum and write it to the start of the buffer (if not temp buffer) - uint64_t checksum = Checksum(block.buffer, block.size); + uint64_t checksum = Checksum(block.buffer, block.Size()); Store<uint64_t>(checksum, block.InternalBuffer()); // now write the buffer block.Write(*handle, location); diff --git a/src/duckdb/src/storage/standard_buffer_manager.cpp b/src/duckdb/src/storage/standard_buffer_manager.cpp index 28d82754f..c23ba16e0 100644 --- a/src/duckdb/src/storage/standard_buffer_manager.cpp +++ b/src/duckdb/src/storage/standard_buffer_manager.cpp @@ -469,15 +469,15 @@ void StandardBufferManager::WriteTemporaryBuffer(MemoryTag tag, block_id_t 
block RequireTemporaryDirectory(); // Append to a few grouped files. - if (buffer.size == GetBlockSize()) { - evicted_data_per_tag[uint8_t(tag)] += GetBlockSize(); + if (buffer.AllocSize() == GetBlockAllocSize()) { + evicted_data_per_tag[uint8_t(tag)] += GetBlockAllocSize(); temporary_directory.handle->GetTempFile().WriteTemporaryBuffer(block_id, buffer); return; } // Get the path to write to. auto path = GetTemporaryPath(block_id); - evicted_data_per_tag[uint8_t(tag)] += buffer.size; + evicted_data_per_tag[uint8_t(tag)] += buffer.AllocSize(); // Create the file and write the size followed by the buffer contents. auto &fs = FileSystem::GetFileSystem(db); @@ -528,7 +528,7 @@ void StandardBufferManager::DeleteTemporaryFile(BlockHandle &block) { } // check if we should delete the file from the shared pool of files, or from the general file system if (temporary_directory.handle->GetTempFile().HasTemporaryBuffer(id)) { - evicted_data_per_tag[uint8_t(block.GetMemoryTag())] -= GetBlockSize(); + evicted_data_per_tag[uint8_t(block.GetMemoryTag())] -= GetBlockAllocSize(); temporary_directory.handle->GetTempFile().DeleteTemporaryBuffer(id); return; } diff --git a/src/duckdb/src/storage/temporary_file_manager.cpp b/src/duckdb/src/storage/temporary_file_manager.cpp index 4b6bc2cf2..478f5e338 100644 --- a/src/duckdb/src/storage/temporary_file_manager.cpp +++ b/src/duckdb/src/storage/temporary_file_manager.cpp @@ -224,7 +224,7 @@ unique_ptr TemporaryFileHandle::ReadTemporaryBuffer(idx_t block_inde void TemporaryFileHandle::WriteTemporaryBuffer(FileBuffer &buffer, const idx_t block_index, AllocatedData &compressed_buffer) const { // We group DEFAULT_BLOCK_ALLOC_SIZE blocks into the same file. - D_ASSERT(buffer.size == BufferManager::GetBufferManager(db).GetBlockSize()); + D_ASSERT(buffer.AllocSize() == BufferManager::GetBufferManager(db).GetBlockAllocSize()); if (identifier.size == TemporaryBufferSize::DEFAULT) { buffer.Write(*handle, GetPositionInFile(block_index)); } else { @@ -427,7 +427,7 @@ TemporaryFileManager::TemporaryFileManagerLock::TemporaryFileManagerLock(mutex & void TemporaryFileManager::WriteTemporaryBuffer(block_id_t block_id, FileBuffer &buffer) { // We group DEFAULT_BLOCK_ALLOC_SIZE blocks into the same file. 
-	D_ASSERT(buffer.size == BufferManager::GetBufferManager(db).GetBlockSize());
+	D_ASSERT(buffer.AllocSize() == BufferManager::GetBufferManager(db).GetBlockAllocSize());
 
 	const auto adaptivity_idx = TaskScheduler::GetEstimatedCPUId() % COMPRESSION_ADAPTIVITIES;
 	auto &compression_adaptivity = compression_adaptivities[adaptivity_idx];
diff --git a/src/duckdb/ub_extension_core_functions_aggregate_distributive.cpp b/src/duckdb/ub_extension_core_functions_aggregate_distributive.cpp
index f8c552db1..22a9ac797 100644
--- a/src/duckdb/ub_extension_core_functions_aggregate_distributive.cpp
+++ b/src/duckdb/ub_extension_core_functions_aggregate_distributive.cpp
@@ -1,20 +1,20 @@
-#include "extension/core_functions/aggregate/distributive/skew.cpp"
+#include "extension/core_functions/aggregate/distributive/kurtosis.cpp"
 
-#include "extension/core_functions/aggregate/distributive/bool.cpp"
+#include "extension/core_functions/aggregate/distributive/string_agg.cpp"
 
-#include "extension/core_functions/aggregate/distributive/bitstring_agg.cpp"
+#include "extension/core_functions/aggregate/distributive/sum.cpp"
 
-#include "extension/core_functions/aggregate/distributive/bitagg.cpp"
+#include "extension/core_functions/aggregate/distributive/arg_min_max.cpp"
 
-#include "extension/core_functions/aggregate/distributive/kurtosis.cpp"
+#include "extension/core_functions/aggregate/distributive/approx_count.cpp"
 
-#include "extension/core_functions/aggregate/distributive/sum.cpp"
+#include "extension/core_functions/aggregate/distributive/skew.cpp"
 
-#include "extension/core_functions/aggregate/distributive/arg_min_max.cpp"
+#include "extension/core_functions/aggregate/distributive/bitagg.cpp"
 
-#include "extension/core_functions/aggregate/distributive/product.cpp"
+#include "extension/core_functions/aggregate/distributive/bitstring_agg.cpp"
 
-#include "extension/core_functions/aggregate/distributive/string_agg.cpp"
+#include "extension/core_functions/aggregate/distributive/product.cpp"
 
-#include "extension/core_functions/aggregate/distributive/approx_count.cpp"
+#include "extension/core_functions/aggregate/distributive/bool.cpp"
 
diff --git a/src/duckdb/ub_extension_core_functions_aggregate_holistic.cpp b/src/duckdb/ub_extension_core_functions_aggregate_holistic.cpp
index 2e18fa37e..7ee6f047a 100644
--- a/src/duckdb/ub_extension_core_functions_aggregate_holistic.cpp
+++ b/src/duckdb/ub_extension_core_functions_aggregate_holistic.cpp
@@ -1,12 +1,12 @@
-#include "extension/core_functions/aggregate/holistic/mad.cpp"
+#include "extension/core_functions/aggregate/holistic/approx_top_k.cpp"
 
 #include "extension/core_functions/aggregate/holistic/quantile.cpp"
 
 #include "extension/core_functions/aggregate/holistic/reservoir_quantile.cpp"
 
-#include "extension/core_functions/aggregate/holistic/approximate_quantile.cpp"
+#include "extension/core_functions/aggregate/holistic/mad.cpp"
 
-#include "extension/core_functions/aggregate/holistic/approx_top_k.cpp"
+#include "extension/core_functions/aggregate/holistic/approximate_quantile.cpp"
 
 #include "extension/core_functions/aggregate/holistic/mode.cpp"
diff --git a/src/duckdb/ub_extension_core_functions_aggregate_nested.cpp b/src/duckdb/ub_extension_core_functions_aggregate_nested.cpp
index 8f3f5452f..9d9f036b7 100644
--- a/src/duckdb/ub_extension_core_functions_aggregate_nested.cpp
+++ b/src/duckdb/ub_extension_core_functions_aggregate_nested.cpp
@@ -1,6 +1,6 @@
 #include "extension/core_functions/aggregate/nested/binned_histogram.cpp"
 
-#include "extension/core_functions/aggregate/nested/histogram.cpp"
"extension/core_functions/aggregate/nested/histogram.cpp" - #include "extension/core_functions/aggregate/nested/list.cpp" +#include "extension/core_functions/aggregate/nested/histogram.cpp" + diff --git a/src/duckdb/ub_extension_core_functions_aggregate_regression.cpp b/src/duckdb/ub_extension_core_functions_aggregate_regression.cpp index 3b2713788..a7d5acb11 100644 --- a/src/duckdb/ub_extension_core_functions_aggregate_regression.cpp +++ b/src/duckdb/ub_extension_core_functions_aggregate_regression.cpp @@ -1,14 +1,14 @@ -#include "extension/core_functions/aggregate/regression/regr_avg.cpp" +#include "extension/core_functions/aggregate/regression/regr_sxy.cpp" #include "extension/core_functions/aggregate/regression/regr_intercept.cpp" -#include "extension/core_functions/aggregate/regression/regr_sxx_syy.cpp" - #include "extension/core_functions/aggregate/regression/regr_count.cpp" -#include "extension/core_functions/aggregate/regression/regr_slope.cpp" - #include "extension/core_functions/aggregate/regression/regr_r2.cpp" -#include "extension/core_functions/aggregate/regression/regr_sxy.cpp" +#include "extension/core_functions/aggregate/regression/regr_avg.cpp" + +#include "extension/core_functions/aggregate/regression/regr_slope.cpp" + +#include "extension/core_functions/aggregate/regression/regr_sxx_syy.cpp" diff --git a/src/duckdb/ub_extension_core_functions_scalar_array.cpp b/src/duckdb/ub_extension_core_functions_scalar_array.cpp index 9b9a475b4..e4f63a369 100644 --- a/src/duckdb/ub_extension_core_functions_scalar_array.cpp +++ b/src/duckdb/ub_extension_core_functions_scalar_array.cpp @@ -1,4 +1,4 @@ -#include "extension/core_functions/scalar/array/array_value.cpp" - #include "extension/core_functions/scalar/array/array_functions.cpp" +#include "extension/core_functions/scalar/array/array_value.cpp" + diff --git a/src/duckdb/ub_extension_core_functions_scalar_date.cpp b/src/duckdb/ub_extension_core_functions_scalar_date.cpp index 11b7e8756..614e5e4e8 100644 --- a/src/duckdb/ub_extension_core_functions_scalar_date.cpp +++ b/src/duckdb/ub_extension_core_functions_scalar_date.cpp @@ -1,20 +1,20 @@ -#include "extension/core_functions/scalar/date/make_date.cpp" - -#include "extension/core_functions/scalar/date/date_trunc.cpp" +#include "extension/core_functions/scalar/date/current.cpp" -#include "extension/core_functions/scalar/date/date_part.cpp" +#include "extension/core_functions/scalar/date/age.cpp" #include "extension/core_functions/scalar/date/date_diff.cpp" -#include "extension/core_functions/scalar/date/age.cpp" +#include "extension/core_functions/scalar/date/date_sub.cpp" + +#include "extension/core_functions/scalar/date/to_interval.cpp" #include "extension/core_functions/scalar/date/time_bucket.cpp" -#include "extension/core_functions/scalar/date/date_sub.cpp" +#include "extension/core_functions/scalar/date/date_trunc.cpp" -#include "extension/core_functions/scalar/date/current.cpp" +#include "extension/core_functions/scalar/date/epoch.cpp" -#include "extension/core_functions/scalar/date/to_interval.cpp" +#include "extension/core_functions/scalar/date/date_part.cpp" -#include "extension/core_functions/scalar/date/epoch.cpp" +#include "extension/core_functions/scalar/date/make_date.cpp" diff --git a/src/duckdb/ub_extension_core_functions_scalar_list.cpp b/src/duckdb/ub_extension_core_functions_scalar_list.cpp index 87d225f95..e3ad42759 100644 --- a/src/duckdb/ub_extension_core_functions_scalar_list.cpp +++ b/src/duckdb/ub_extension_core_functions_scalar_list.cpp @@ -1,22 +1,22 @@ 
#include "extension/core_functions/scalar/list/flatten.cpp" -#include "extension/core_functions/scalar/list/list_distance.cpp" +#include "extension/core_functions/scalar/list/list_transform.cpp" -#include "extension/core_functions/scalar/list/array_slice.cpp" +#include "extension/core_functions/scalar/list/range.cpp" -#include "extension/core_functions/scalar/list/list_transform.cpp" +#include "extension/core_functions/scalar/list/list_value.cpp" #include "extension/core_functions/scalar/list/list_filter.cpp" -#include "extension/core_functions/scalar/list/list_reduce.cpp" - -#include "extension/core_functions/scalar/list/list_sort.cpp" +#include "extension/core_functions/scalar/list/list_has_any_or_all.cpp" #include "extension/core_functions/scalar/list/list_aggregates.cpp" -#include "extension/core_functions/scalar/list/list_has_any_or_all.cpp" +#include "extension/core_functions/scalar/list/list_distance.cpp" -#include "extension/core_functions/scalar/list/range.cpp" +#include "extension/core_functions/scalar/list/array_slice.cpp" -#include "extension/core_functions/scalar/list/list_value.cpp" +#include "extension/core_functions/scalar/list/list_sort.cpp" + +#include "extension/core_functions/scalar/list/list_reduce.cpp" diff --git a/src/duckdb/ub_extension_core_functions_scalar_map.cpp b/src/duckdb/ub_extension_core_functions_scalar_map.cpp index b191443c0..52bd226f0 100644 --- a/src/duckdb/ub_extension_core_functions_scalar_map.cpp +++ b/src/duckdb/ub_extension_core_functions_scalar_map.cpp @@ -1,14 +1,14 @@ #include "extension/core_functions/scalar/map/map_keys_values.cpp" -#include "extension/core_functions/scalar/map/cardinality.cpp" - -#include "extension/core_functions/scalar/map/map_concat.cpp" - #include "extension/core_functions/scalar/map/map_extract.cpp" #include "extension/core_functions/scalar/map/map_from_entries.cpp" +#include "extension/core_functions/scalar/map/map_entries.cpp" + #include "extension/core_functions/scalar/map/map.cpp" -#include "extension/core_functions/scalar/map/map_entries.cpp" +#include "extension/core_functions/scalar/map/map_concat.cpp" + +#include "extension/core_functions/scalar/map/cardinality.cpp" diff --git a/src/duckdb/ub_extension_core_functions_scalar_string.cpp b/src/duckdb/ub_extension_core_functions_scalar_string.cpp index b57063a4b..f01d70e21 100644 --- a/src/duckdb/ub_extension_core_functions_scalar_string.cpp +++ b/src/duckdb/ub_extension_core_functions_scalar_string.cpp @@ -1,48 +1,48 @@ -#include "extension/core_functions/scalar/string/hex.cpp" +#include "extension/core_functions/scalar/string/starts_with.cpp" -#include "extension/core_functions/scalar/string/unicode.cpp" +#include "extension/core_functions/scalar/string/jaccard.cpp" -#include "extension/core_functions/scalar/string/translate.cpp" +#include "extension/core_functions/scalar/string/levenshtein.cpp" -#include "extension/core_functions/scalar/string/left_right.cpp" +#include "extension/core_functions/scalar/string/damerau_levenshtein.cpp" -#include "extension/core_functions/scalar/string/hamming.cpp" +#include "extension/core_functions/scalar/string/bar.cpp" -#include "extension/core_functions/scalar/string/damerau_levenshtein.cpp" +#include "extension/core_functions/scalar/string/printf.cpp" -#include "extension/core_functions/scalar/string/ascii.cpp" +#include "extension/core_functions/scalar/string/replace.cpp" -#include "extension/core_functions/scalar/string/starts_with.cpp" +#include "extension/core_functions/scalar/string/hamming.cpp" -#include 
"extension/core_functions/scalar/string/repeat.cpp" +#include "extension/core_functions/scalar/string/instr.cpp" -#include "extension/core_functions/scalar/string/chr.cpp" +#include "extension/core_functions/scalar/string/ascii.cpp" -#include "extension/core_functions/scalar/string/levenshtein.cpp" +#include "extension/core_functions/scalar/string/reverse.cpp" -#include "extension/core_functions/scalar/string/pad.cpp" +#include "extension/core_functions/scalar/string/url_encode.cpp" -#include "extension/core_functions/scalar/string/bar.cpp" +#include "extension/core_functions/scalar/string/parse_path.cpp" -#include "extension/core_functions/scalar/string/replace.cpp" +#include "extension/core_functions/scalar/string/left_right.cpp" #include "extension/core_functions/scalar/string/to_base.cpp" -#include "extension/core_functions/scalar/string/printf.cpp" +#include "extension/core_functions/scalar/string/pad.cpp" -#include "extension/core_functions/scalar/string/format_bytes.cpp" +#include "extension/core_functions/scalar/string/trim.cpp" -#include "extension/core_functions/scalar/string/instr.cpp" +#include "extension/core_functions/scalar/string/format_bytes.cpp" -#include "extension/core_functions/scalar/string/url_encode.cpp" +#include "extension/core_functions/scalar/string/hex.cpp" -#include "extension/core_functions/scalar/string/jaccard.cpp" +#include "extension/core_functions/scalar/string/repeat.cpp" -#include "extension/core_functions/scalar/string/reverse.cpp" +#include "extension/core_functions/scalar/string/translate.cpp" -#include "extension/core_functions/scalar/string/trim.cpp" +#include "extension/core_functions/scalar/string/chr.cpp" -#include "extension/core_functions/scalar/string/parse_path.cpp" +#include "extension/core_functions/scalar/string/unicode.cpp" #include "extension/core_functions/scalar/string/jaro_winkler.cpp" diff --git a/src/duckdb/ub_extension_core_functions_scalar_union.cpp b/src/duckdb/ub_extension_core_functions_scalar_union.cpp index c23d6ebff..fad24f297 100644 --- a/src/duckdb/ub_extension_core_functions_scalar_union.cpp +++ b/src/duckdb/ub_extension_core_functions_scalar_union.cpp @@ -1,6 +1,6 @@ -#include "extension/core_functions/scalar/union/union_value.cpp" - #include "extension/core_functions/scalar/union/union_extract.cpp" +#include "extension/core_functions/scalar/union/union_value.cpp" + #include "extension/core_functions/scalar/union/union_tag.cpp" diff --git a/src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp b/src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp index de69a2f94..1229394c7 100644 --- a/src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp +++ b/src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp @@ -348,17 +348,17 @@ #include "extension/icu/third_party/icu/i18n/wintzimpl.cpp" +#include "extension/icu/third_party/icu/i18n/double-conversion-fast-dtoa.cpp" + #include "extension/icu/third_party/icu/i18n/double-conversion-cached-powers.cpp" +#include "extension/icu/third_party/icu/i18n/double-conversion-strtod.cpp" + #include "extension/icu/third_party/icu/i18n/double-conversion-double-to-string.cpp" +#include "extension/icu/third_party/icu/i18n/double-conversion-bignum-dtoa.cpp" + #include "extension/icu/third_party/icu/i18n/double-conversion-bignum.cpp" #include "extension/icu/third_party/icu/i18n/double-conversion-string-to-double.cpp" -#include "extension/icu/third_party/icu/i18n/double-conversion-strtod.cpp" - -#include "extension/icu/third_party/icu/i18n/double-conversion-fast-dtoa.cpp" - -#include 
"extension/icu/third_party/icu/i18n/double-conversion-bignum-dtoa.cpp" - diff --git a/src/duckdb/ub_src_function_window.cpp b/src/duckdb/ub_src_function_window.cpp index 16e5fa655..fbd3fc960 100644 --- a/src/duckdb/ub_src_function_window.cpp +++ b/src/duckdb/ub_src_function_window.cpp @@ -12,8 +12,6 @@ #include "src/function/window/window_custom_aggregator.cpp" -#include "src/function/window/window_cumedist_function.cpp" - #include "src/function/window/window_distinct_aggregator.cpp" #include "src/function/window/window_executor.cpp" diff --git a/src/main/java/org/duckdb/DuckDBDatabaseMetaData.java b/src/main/java/org/duckdb/DuckDBDatabaseMetaData.java index 2a6f8f5e5..b3a15855e 100644 --- a/src/main/java/org/duckdb/DuckDBDatabaseMetaData.java +++ b/src/main/java/org/duckdb/DuckDBDatabaseMetaData.java @@ -689,8 +689,8 @@ public ResultSet getCatalogs() throws SQLException { public ResultSet getSchemas() throws SQLException { Statement statement = conn.createStatement(); statement.closeOnCompletion(); - return statement.executeQuery( - "SELECT schema_name AS 'TABLE_SCHEM', catalog_name AS 'TABLE_CATALOG' FROM information_schema.schemata ORDER BY \"TABLE_CATALOG\", \"TABLE_SCHEM\""); + return statement.executeQuery("SELECT schema_name AS 'TABLE_SCHEM', catalog_name AS 'TABLE_CATALOG' FROM " + + "information_schema.schemata ORDER BY \"TABLE_CATALOG\", \"TABLE_SCHEM\""); } @Override diff --git a/src/main/java/org/duckdb/DuckDBPreparedStatement.java b/src/main/java/org/duckdb/DuckDBPreparedStatement.java index 1b078f037..b1dfc36f2 100644 --- a/src/main/java/org/duckdb/DuckDBPreparedStatement.java +++ b/src/main/java/org/duckdb/DuckDBPreparedStatement.java @@ -189,8 +189,8 @@ public int executeUpdate() throws SQLException { requireNonBatch(); execute(); if (!(returnsChangedRows || returnsNothing)) { - throw new SQLException( - "executeUpdate() can only be used with queries that return nothing (eg, a DDL statement), or update rows"); + throw new SQLException("executeUpdate() can only be used with queries that return nothing (eg, a DDL " + + "statement), or update rows"); } return getUpdateCountInternal(); } diff --git a/src/test/java/org/duckdb/TestDuckDBJDBC.java b/src/test/java/org/duckdb/TestDuckDBJDBC.java index 1908d89e9..04fe5e730 100644 --- a/src/test/java/org/duckdb/TestDuckDBJDBC.java +++ b/src/test/java/org/duckdb/TestDuckDBJDBC.java @@ -270,60 +270,60 @@ public static void test_enum() throws Exception { "CREATE TYPE enum_long AS ENUM ('enum0' ,'enum1' ,'enum2' ,'enum3' ,'enum4' ,'enum5' ,'enum6'" + ",'enum7' ,'enum8' ,'enum9' ,'enum10' ,'enum11' ,'enum12' ,'enum13' ,'enum14' ,'enum15' ,'enum16' ,'enum17'" - + - ",'enum18' ,'enum19' ,'enum20' ,'enum21' ,'enum22' ,'enum23' ,'enum24' ,'enum25' ,'enum26' ,'enum27' ,'enum28'" - + - ",'enum29' ,'enum30' ,'enum31' ,'enum32' ,'enum33' ,'enum34' ,'enum35' ,'enum36' ,'enum37' ,'enum38' ,'enum39'" - + - ",'enum40' ,'enum41' ,'enum42' ,'enum43' ,'enum44' ,'enum45' ,'enum46' ,'enum47' ,'enum48' ,'enum49' ,'enum50'" - + - ",'enum51' ,'enum52' ,'enum53' ,'enum54' ,'enum55' ,'enum56' ,'enum57' ,'enum58' ,'enum59' ,'enum60' ,'enum61'" - + - ",'enum62' ,'enum63' ,'enum64' ,'enum65' ,'enum66' ,'enum67' ,'enum68' ,'enum69' ,'enum70' ,'enum71' ,'enum72'" - + - ",'enum73' ,'enum74' ,'enum75' ,'enum76' ,'enum77' ,'enum78' ,'enum79' ,'enum80' ,'enum81' ,'enum82' ,'enum83'" - + - ",'enum84' ,'enum85' ,'enum86' ,'enum87' ,'enum88' ,'enum89' ,'enum90' ,'enum91' ,'enum92' ,'enum93' ,'enum94'" + + ",'enum18' ,'enum19' ,'enum20' ,'enum21' ,'enum22' ,'enum23' 
+            + ",'enum28'"
+            + ",'enum29' ,'enum30' ,'enum31' ,'enum32' ,'enum33' ,'enum34' ,'enum35' ,'enum36' ,'enum37' ,'enum38' "
+            + ",'enum39'"
+            + ",'enum40' ,'enum41' ,'enum42' ,'enum43' ,'enum44' ,'enum45' ,'enum46' ,'enum47' ,'enum48' ,'enum49' "
+            + ",'enum50'"
+            + ",'enum51' ,'enum52' ,'enum53' ,'enum54' ,'enum55' ,'enum56' ,'enum57' ,'enum58' ,'enum59' ,'enum60' "
+            + ",'enum61'"
+            + ",'enum62' ,'enum63' ,'enum64' ,'enum65' ,'enum66' ,'enum67' ,'enum68' ,'enum69' ,'enum70' ,'enum71' "
+            + ",'enum72'"
+            + ",'enum73' ,'enum74' ,'enum75' ,'enum76' ,'enum77' ,'enum78' ,'enum79' ,'enum80' ,'enum81' ,'enum82' "
+            + ",'enum83'"
+            + ",'enum84' ,'enum85' ,'enum86' ,'enum87' ,'enum88' ,'enum89' ,'enum90' ,'enum91' ,'enum92' ,'enum93' "
+            + ",'enum94'"
             + ",'enum95' ,'enum96' ,'enum97' ,'enum98' ,'enum99' ,'enum100' ,'enum101' ,'enum102' ,'enum103' ,'enum104' "
-            + ",'enum105' ,'enum106' ,'enum107' ,'enum108' ,'enum109' ,'enum110' ,'enum111' ,'enum112' ,'enum113' ,'enum114'"
-            + ",'enum115' ,'enum116' ,'enum117' ,'enum118' ,'enum119' ,'enum120' ,'enum121' ,'enum122' ,'enum123' ,'enum124'"
-            + ",'enum125' ,'enum126' ,'enum127' ,'enum128' ,'enum129' ,'enum130' ,'enum131' ,'enum132' ,'enum133' ,'enum134'"
-            + ",'enum135' ,'enum136' ,'enum137' ,'enum138' ,'enum139' ,'enum140' ,'enum141' ,'enum142' ,'enum143' ,'enum144'"
-            + ",'enum145' ,'enum146' ,'enum147' ,'enum148' ,'enum149' ,'enum150' ,'enum151' ,'enum152' ,'enum153' ,'enum154'"
-            + ",'enum155' ,'enum156' ,'enum157' ,'enum158' ,'enum159' ,'enum160' ,'enum161' ,'enum162' ,'enum163' ,'enum164'"
-            + ",'enum165' ,'enum166' ,'enum167' ,'enum168' ,'enum169' ,'enum170' ,'enum171' ,'enum172' ,'enum173' ,'enum174'"
-            + ",'enum175' ,'enum176' ,'enum177' ,'enum178' ,'enum179' ,'enum180' ,'enum181' ,'enum182' ,'enum183' ,'enum184'"
-            + ",'enum185' ,'enum186' ,'enum187' ,'enum188' ,'enum189' ,'enum190' ,'enum191' ,'enum192' ,'enum193' ,'enum194'"
-            + ",'enum195' ,'enum196' ,'enum197' ,'enum198' ,'enum199' ,'enum200' ,'enum201' ,'enum202' ,'enum203' ,'enum204'"
-            + ",'enum205' ,'enum206' ,'enum207' ,'enum208' ,'enum209' ,'enum210' ,'enum211' ,'enum212' ,'enum213' ,'enum214'"
-            + ",'enum215' ,'enum216' ,'enum217' ,'enum218' ,'enum219' ,'enum220' ,'enum221' ,'enum222' ,'enum223' ,'enum224'"
-            + ",'enum225' ,'enum226' ,'enum227' ,'enum228' ,'enum229' ,'enum230' ,'enum231' ,'enum232' ,'enum233' ,'enum234'"
-            + ",'enum235' ,'enum236' ,'enum237' ,'enum238' ,'enum239' ,'enum240' ,'enum241' ,'enum242' ,'enum243' ,'enum244'"
-            + ",'enum245' ,'enum246' ,'enum247' ,'enum248' ,'enum249' ,'enum250' ,'enum251' ,'enum252' ,'enum253' ,'enum254'"
-            + ",'enum255' ,'enum256' ,'enum257' ,'enum258' ,'enum259' ,'enum260' ,'enum261' ,'enum262' ,'enum263' ,'enum264'"
-            + ",'enum265' ,'enum266' ,'enum267' ,'enum268' ,'enum269' ,'enum270' ,'enum271' ,'enum272' ,'enum273' ,'enum274'"
-            + ",'enum275' ,'enum276' ,'enum277' ,'enum278' ,'enum279' ,'enum280' ,'enum281' ,'enum282' ,'enum283' ,'enum284'"
-            + ",'enum285' ,'enum286' ,'enum287' ,'enum288' ,'enum289' ,'enum290' ,'enum291' ,'enum292' ,'enum293' ,'enum294'"
+            + ",'enum105' ,'enum106' ,'enum107' ,'enum108' ,'enum109' ,'enum110' ,'enum111' ,'enum112' ,'enum113' "
+            + ",'enum114'"
+            + ",'enum115' ,'enum116' ,'enum117' ,'enum118' ,'enum119' ,'enum120' ,'enum121' ,'enum122' ,'enum123' "
+            + ",'enum124'"
+            + ",'enum125' ,'enum126' ,'enum127' ,'enum128' ,'enum129' ,'enum130' ,'enum131' ,'enum132' ,'enum133' "
+            + ",'enum134'"
+            + ",'enum135' ,'enum136' ,'enum137' ,'enum138' ,'enum139' ,'enum140' ,'enum141' ,'enum142' ,'enum143' "
+            + ",'enum144'"
+            + ",'enum145' ,'enum146' ,'enum147' ,'enum148' ,'enum149' ,'enum150' ,'enum151' ,'enum152' ,'enum153' "
+            + ",'enum154'"
+            + ",'enum155' ,'enum156' ,'enum157' ,'enum158' ,'enum159' ,'enum160' ,'enum161' ,'enum162' ,'enum163' "
+            + ",'enum164'"
+            + ",'enum165' ,'enum166' ,'enum167' ,'enum168' ,'enum169' ,'enum170' ,'enum171' ,'enum172' ,'enum173' "
+            + ",'enum174'"
+            + ",'enum175' ,'enum176' ,'enum177' ,'enum178' ,'enum179' ,'enum180' ,'enum181' ,'enum182' ,'enum183' "
+            + ",'enum184'"
+            + ",'enum185' ,'enum186' ,'enum187' ,'enum188' ,'enum189' ,'enum190' ,'enum191' ,'enum192' ,'enum193' "
+            + ",'enum194'"
+            + ",'enum195' ,'enum196' ,'enum197' ,'enum198' ,'enum199' ,'enum200' ,'enum201' ,'enum202' ,'enum203' "
+            + ",'enum204'"
+            + ",'enum205' ,'enum206' ,'enum207' ,'enum208' ,'enum209' ,'enum210' ,'enum211' ,'enum212' ,'enum213' "
+            + ",'enum214'"
+            + ",'enum215' ,'enum216' ,'enum217' ,'enum218' ,'enum219' ,'enum220' ,'enum221' ,'enum222' ,'enum223' "
+            + ",'enum224'"
+            + ",'enum225' ,'enum226' ,'enum227' ,'enum228' ,'enum229' ,'enum230' ,'enum231' ,'enum232' ,'enum233' "
+            + ",'enum234'"
+            + ",'enum235' ,'enum236' ,'enum237' ,'enum238' ,'enum239' ,'enum240' ,'enum241' ,'enum242' ,'enum243' "
+            + ",'enum244'"
+            + ",'enum245' ,'enum246' ,'enum247' ,'enum248' ,'enum249' ,'enum250' ,'enum251' ,'enum252' ,'enum253' "
+            + ",'enum254'"
+            + ",'enum255' ,'enum256' ,'enum257' ,'enum258' ,'enum259' ,'enum260' ,'enum261' ,'enum262' ,'enum263' "
+            + ",'enum264'"
+            + ",'enum265' ,'enum266' ,'enum267' ,'enum268' ,'enum269' ,'enum270' ,'enum271' ,'enum272' ,'enum273' "
+            + ",'enum274'"
+            + ",'enum275' ,'enum276' ,'enum277' ,'enum278' ,'enum279' ,'enum280' ,'enum281' ,'enum282' ,'enum283' "
+            + ",'enum284'"
+            + ",'enum285' ,'enum286' ,'enum287' ,'enum288' ,'enum289' ,'enum290' ,'enum291' ,'enum292' ,'enum293' "
+            + ",'enum294'"
             + ",'enum295' ,'enum296' ,'enum297' ,'enum298' ,'enum299');");
 
         stmt.execute("CREATE TABLE t2 (id INT, e1 enum_long);");
@@ -926,12 +926,11 @@ public static void test_duckdb_getObject_with_class() throws Exception {
                      + " flt FLOAT, dbl DOUBLE, dte DATE, tme TIME, ts TIMESTAMP, dec16 DECIMAL(3,1),"
                      + " dec32 DECIMAL(9,8), dec64 DECIMAL(16,1), dec128 DECIMAL(30,10), tint TINYINT, utint UTINYINT,"
                      + " usint USMALLINT, uint UINTEGER, ubig UBIGINT, hin HUGEINT, uhin UHUGEINT, blo BLOB)");
-        stmt.execute(
-            "INSERT INTO b VALUES ('varchary', true, 6, 42, 666, 42.666, 666.42,"
-            +
-            " '1970-01-02', '01:00:34', '1970-01-03 03:42:23', 42.2, 1.23456789, 987654321012345.6, 111112222233333.44444, "
-            + " -4, 200, 50001, 4000111222, 18446744073709551615, 18446744073709551616, "
-            + " 170141183460469231731687303715884105728, 'yeah'::BLOB)");
+        stmt.execute("INSERT INTO b VALUES ('varchary', true, 6, 42, 666, 42.666, 666.42,"
+                     + " '1970-01-02', '01:00:34', '1970-01-03 03:42:23', 42.2, 1.23456789, 987654321012345.6, "
+                     + "111112222233333.44444, "
+                     + " -4, 200, 50001, 4000111222, 18446744073709551615, 18446744073709551616, "
+                     + " 170141183460469231731687303715884105728, 'yeah'::BLOB)");
 
         PreparedStatement ps = conn.prepareStatement("SELECT * FROM b");
         ResultSet rs = ps.executeQuery();
@@ -1035,8 +1034,8 @@ public static void test_multiple_statements_exception() throws Exception {
     public static void test_bigdecimal() throws Exception {
         Connection conn = DriverManager.getConnection(JDBC_URL);
         Statement stmt = conn.createStatement();
-        stmt.execute(
-            "CREATE TABLE q (id DECIMAL(3,0), dec16 DECIMAL(4,1), dec32 DECIMAL(9,4), dec64 DECIMAL(18,7), dec128 DECIMAL(38,10))");
+        stmt.execute("CREATE TABLE q (id DECIMAL(3,0), dec16 DECIMAL(4,1), dec32 DECIMAL(9,4), dec64 DECIMAL(18,7), "
+                     + "dec128 DECIMAL(38,10))");
 
         PreparedStatement ps1 =
             conn.prepareStatement("INSERT INTO q (id, dec16, dec32, dec64, dec128) VALUES (?, ?, ?, ?, ?)");
@@ -1285,8 +1284,8 @@ public static void test_big_data() throws Exception {
             stmt.execute("INSERT INTO a VALUES (" + i + ")");
         }
 
-        ResultSet rs = stmt.executeQuery(
-            "SELECT CAST(i AS SMALLINT), CAST(i AS INTEGER), CAST(i AS BIGINT), CAST(i AS FLOAT), CAST(i AS DOUBLE), CAST(i as STRING), NULL FROM a");
+        ResultSet rs = stmt.executeQuery("SELECT CAST(i AS SMALLINT), CAST(i AS INTEGER), CAST(i AS BIGINT), CAST(i "
+                                         + "AS FLOAT), CAST(i AS DOUBLE), CAST(i as STRING), NULL FROM a");
         int count = 0;
         while (rs.next()) {
             for (int col = 1; col <= 6; col++) {
@@ -1419,7 +1418,8 @@ public static void test_prepare_types() throws Exception {
         Connection conn = DriverManager.getConnection(JDBC_URL);
 
         PreparedStatement ps = conn.prepareStatement(
-            "SELECT CAST(? AS BOOLEAN) c1, CAST(? AS TINYINT) c2, CAST(? AS SMALLINT) c3, CAST(? AS INTEGER) c4, CAST(? AS BIGINT) c5, CAST(? AS FLOAT) c6, CAST(? AS DOUBLE) c7, CAST(? AS STRING) c8");
+            "SELECT CAST(? AS BOOLEAN) c1, CAST(? AS TINYINT) c2, CAST(? AS SMALLINT) c3, CAST(? AS INTEGER) c4, "
+            + "CAST(? AS BIGINT) c5, CAST(? AS FLOAT) c6, CAST(? AS DOUBLE) c7, CAST(? AS STRING) c8");
 
         ps.setBoolean(1, true);
         ps.setByte(2, (byte) 42);
         ps.setShort(3, (short) 43);
@@ -1522,8 +1522,8 @@ public static void test_prepare_insert() throws Exception {
         }
         pStmt1.close();
 
-        conn.createStatement().executeUpdate(
-            "create table ctstable2 (KEY_ID int, COF_NAME varchar(32), PRICE float, TYPE_ID int, primary key(KEY_ID) )");
+        conn.createStatement().executeUpdate("create table ctstable2 (KEY_ID int, COF_NAME varchar(32), PRICE float, "
+                                             + "TYPE_ID int, primary key(KEY_ID) )");
 
         PreparedStatement pStmt = conn.prepareStatement("insert into ctstable2 values(?, ?, ?, ?)");
         for (int i = 1; i <= 10; i++) {
@@ -1640,8 +1640,9 @@ public static void test_hugeint() throws Exception {
         Connection conn = DriverManager.getConnection(JDBC_URL);
         Statement stmt = conn.createStatement();
 
-        ResultSet rs = stmt.executeQuery(
-            "SELECT 42::hugeint hi1, -42::hugeint hi2, 454564646545646546545646545::hugeint hi3, -454564646545646546545646545::hugeint hi4");
+        ResultSet rs =
+            stmt.executeQuery("SELECT 42::hugeint hi1, -42::hugeint hi2, 454564646545646546545646545::hugeint hi3, "
+                              + "-454564646545646546545646545::hugeint hi4");
         assertTrue(rs.next());
         assertEquals(rs.getObject("hi1"), new BigInteger("42"));
         assertEquals(rs.getObject("hi2"), new BigInteger("-42"));
@@ -1663,8 +1664,8 @@ public static void test_temporal_types() throws Exception {
         Connection conn = DriverManager.getConnection(JDBC_URL);
         Statement stmt = conn.createStatement();
 
-        ResultSet rs = stmt.executeQuery(
-            "SELECT '2019-11-26 21:11:00'::timestamp ts, '2019-11-26'::date dt, interval '5 days' iv, '21:11:00'::time te");
+        ResultSet rs = stmt.executeQuery("SELECT '2019-11-26 21:11:00'::timestamp ts, '2019-11-26'::date dt, "
+                                         + "interval '5 days' iv, '21:11:00'::time te");
         assertTrue(rs.next());
         assertEquals(rs.getObject("ts"), Timestamp.valueOf("2019-11-26 21:11:00"));
         assertEquals(rs.getTimestamp("ts"), Timestamp.valueOf("2019-11-26 21:11:00"));
@@ -2995,8 +2996,8 @@ public static void test_unsigned_integers() throws Exception {
         DuckDBConnection conn = DriverManager.getConnection(JDBC_URL).unwrap(DuckDBConnection.class);
         Statement stmt = conn.createStatement();
 
-        ResultSet rs = stmt.executeQuery(
-            "SELECT 201::utinyint uint8, 40001::usmallint uint16, 4000000001::uinteger uint32, 18446744073709551615::ubigint uint64");
+        ResultSet rs = stmt.executeQuery("SELECT 201::utinyint uint8, 40001::usmallint uint16, 4000000001::uinteger "
+                                         + "uint32, 18446744073709551615::ubigint uint64");
 
         assertTrue(rs.next());
         assertEquals(rs.getShort("uint8"), Short.valueOf((short) 201));
@@ -3450,9 +3451,8 @@ public static void test_supports_catalogs_in_table_definitions() throws Exception {
             statement.execute(String.format("CREATE TABLE %s (id int)", QUALIFIED_TABLE_NAME));
         } catch (SQLException ex) {
             if (supportsCatalogsInTableDefinitions) {
-                fail(
-                    "supportsCatalogsInTableDefinitions is true but CREATE TABLE in attached database is not allowed. " +
-                    ex.getMessage());
+                fail("supportsCatalogsInTableDefinitions is true but CREATE TABLE in attached database is not "
+                     + "allowed. " + ex.getMessage());
                 ex.printStackTrace();
             }
         }
@@ -3465,9 +3465,8 @@ public static void test_supports_catalogs_in_table_definitions() throws Exception {
             statement.execute(String.format("DROP TABLE %s", QUALIFIED_TABLE_NAME));
         } catch (SQLException ex) {
             if (supportsCatalogsInTableDefinitions) {
-                fail(
-                    "supportsCatalogsInTableDefinitions is true but DROP TABLE in attached database is not allowed. " +
-                    ex.getMessage());
+                fail("supportsCatalogsInTableDefinitions is true but DROP TABLE in attached database is not "
+                     + "allowed. " + ex.getMessage());
                 ex.printStackTrace();
             }
         }
@@ -4023,11 +4022,15 @@ static Map mapOf(Object... pairs) {
         correct_answer_map.put("uint", asList(0L, 4294967295L, null));
         correct_answer_map.put("ubigint", asList(BigInteger.ZERO, new BigInteger("18446744073709551615"), null));
         correct_answer_map.put(
-            "varint",
-            asList(
-                "-179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368",
-                "179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368",
-                null));
+            "varint", asList("-17976931348623157081452742373170435679807056752584499659891747680315726078002"
+                                 + "853876058955863276687817154045895351438246423432132688946418276846754670353751"
+                                 + "698604991057655128207624549009038932894407586850845513394230458323690322294816"
+                                 + "5808559332123348274797826204144723168738177180919299881250404026184124858368",
+                             "179769313486231570814527423731704356798070567525844996598917476803157260780028"
+                                 + "538760589558632766878171540458953514382464234321326889464182768467546703537516"
+                                 + "986049910576551282076245490090389328944075868508455133942304583236903222948165"
+                                 + "808559332123348274797826204144723168738177180919299881250404026184124858368",
+                             null));
         correct_answer_map.put("time", asList(LocalTime.of(0, 0), LocalTime.parse("23:59:59.999999"), null));
         correct_answer_map.put("float", asList(-3.4028234663852886e+38f, 3.4028234663852886e+38f, null));
         correct_answer_map.put("double", asList(-1.7976931348623157e+308d, 1.7976931348623157e+308d, null));
@@ -4048,8 +4051,7 @@ static Map mapOf(Object... pairs) {
         correct_answer_map.put("medium_enum", asList("enum_0", "enum_299", null));
         correct_answer_map.put("large_enum", asList("enum_0", "enum_69999", null));
         correct_answer_map.put("struct", asList(abnull, ducks, null));
-        correct_answer_map.put("map",
-                               asList(mapOf(), mapOf("key1", "🦆🦆🦆🦆🦆🦆", "key2", "goose"), null));
+        correct_answer_map.put("map", asList(mapOf(), mapOf("key1", "🦆🦆🦆🦆🦆🦆", "key2", "goose"), null));
         correct_answer_map.put("union", asList("Frank", (short) 5, null));
         correct_answer_map.put(
             "time_tz", asList(OffsetTime.parse("00:00+15:59:59"), OffsetTime.parse("23:59:59.999999-15:59:59"), null));
@@ -4098,12 +4100,11 @@ static Map mapOf(Object... pairs) {
     public static void test_all_types() throws Exception {
         Logger logger = Logger.getAnonymousLogger();
 
-        String sql =
-            "select * EXCLUDE(time, time_tz)"
-            + "\n  , CASE WHEN time = '24:00:00'::TIME THEN '23:59:59.999999'::TIME ELSE time END AS time"
-            +
-            "\n  , CASE WHEN time_tz = '24:00:00-15:59:59'::TIMETZ THEN '23:59:59.999999-15:59:59'::TIMETZ ELSE time_tz END AS time_tz"
-            + "\nfrom test_all_types()";
+        String sql = "select * EXCLUDE(time, time_tz)"
+                     + "\n  , CASE WHEN time = '24:00:00'::TIME THEN '23:59:59.999999'::TIME ELSE time END AS time"
+                     + "\n  , CASE WHEN time_tz = '24:00:00-15:59:59'::TIMETZ THEN "
+                     + "'23:59:59.999999-15:59:59'::TIMETZ ELSE time_tz END AS time_tz"
+                     + "\nfrom test_all_types()";
 
         try (Connection conn = DriverManager.getConnection(JDBC_URL);
              PreparedStatement stmt = conn.prepareStatement(sql)) {
diff --git a/vendor.py b/vendor.py
index c34a52e01..d1002198a 100644
--- a/vendor.py
+++ b/vendor.py
@@ -2,6 +2,7 @@
 import sys
 import json
 import pickle
+import platform
 import argparse
 
 parser = argparse.ArgumentParser(description='Inlines DuckDB Sources')
@@ -13,10 +14,18 @@
 
 args = parser.parse_args()
 
-
 # list of extensions to bundle
 extensions = ['core_functions', 'parquet', 'icu', 'json']
 
+# Conditionally include jemalloc
+is_android = hasattr(sys, 'getandroidapilevel')
+is_pyodide = 'PYODIDE' in os.environ
+use_jemalloc = (
+    not is_android and not is_pyodide and platform.system() == 'Linux' and platform.architecture()[0] == '64bit'
+)
+if use_jemalloc:
+    extensions.append('jemalloc')
+
 # path to target
 basedir = os.getcwd()
 target_dir = os.path.join(basedir, 'src', 'duckdb')
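The CSVReaderOptions hunk above uses the standard backward-compatibility idiom: the new field (id 142, "multi_file_reader") is both written and read "with default", so payloads serialized before the field existed still deserialize, falling back to the default instead of failing. A toy illustration of that pattern in plain Python (not DuckDB's serializer; the default value of False here is an assumption):

# Toy model of "read property with default" (not DuckDB's serializer).
ASSUMED_DEFAULT_MULTI_FILE_READER = False  # hypothetical default for field 142

def read_with_default(payload, field_id, default):
    # Older payloads omit the key entirely; fall back instead of failing.
    return payload.get(field_id, default)

old_payload = {139: 'utf-8'}              # written before field 142 existed
new_payload = {139: 'utf-8', 142: True}   # written by the patched code

assert read_with_default(old_payload, 142, ASSUMED_DEFAULT_MULTI_FILE_READER) is False
assert read_with_default(new_payload, 142, ASSUMED_DEFAULT_MULTI_FILE_READER) is True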
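The buffer-manager hunks swap buffer.size/GetBlockSize() for buffer.AllocSize()/GetBlockAllocSize(), presumably so the eviction accounting stays symmetric: whatever is added to evicted_data_per_tag when a temporary buffer is written must be subtracted when it is deleted, and the allocation size (payload plus header) is the quantity that is stable on both paths. A minimal sketch of that invariant, with entirely hypothetical sizes and class names:

# Minimal sketch of symmetric eviction accounting (hypothetical model).
BLOCK_HEADER = 8  # e.g. a uint64 checksum stored in front of the payload

class TempBuffer:
    def __init__(self, payload):
        self.size = payload              # usable payload bytes

    def alloc_size(self):
        return self.size + BLOCK_HEADER  # bytes actually reserved

evicted = 0
buf = TempBuffer(256 * 1024)
evicted += buf.alloc_size()  # write path: account the full allocation
evicted -= buf.alloc_size()  # delete path: subtract the same quantity
assert evicted == 0          # the counter balances only if both sides agree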
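Finally, the vendor.py change gates the jemalloc extension on the build platform: it is bundled only on 64-bit Linux, and never under Android (whose CPython exposes sys.getandroidapilevel) or Pyodide (which sets the PYODIDE environment variable). The same checks as a self-contained script, for trying the gate outside vendor.py:

import os
import platform
import sys

def should_bundle_jemalloc():
    # Android's CPython exposes sys.getandroidapilevel(); Pyodide exports PYODIDE.
    is_android = hasattr(sys, 'getandroidapilevel')
    is_pyodide = 'PYODIDE' in os.environ
    # jemalloc is only vendored on 64-bit Linux builds.
    return (not is_android and not is_pyodide and platform.system() == 'Linux'
            and platform.architecture()[0] == '64bit')

if __name__ == '__main__':
    print('bundle jemalloc:', should_bundle_jemalloc())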