ZFS ARC: tunable kmem pressure and fragmentation thresholds.

What this patch does:

  * kmem.h, opensolaris_kmem.c: add kmem_max_contig_free().  With the
    kmem_map lock held it reports kmem_map->root->max_free, or
    max_offset - min_offset when kmem_map->root is NULL.

  * arc.c: add five loader tunables, also exported as read-only sysctls:
    vfs.zfs.kmem_reclaim_thresh, vfs.zfs.kmem_target,
    vfs.zfs.kmem_slow_growth_thresh, vfs.zfs.kmem_fragment_thresh and
    vfs.zfs.kmem_fragment_target.  A value of 0 selects a default that
    arc_init() derives from kmem_size() (6/8, 5/8, 4/8, 1/16 and 1/8
    respectively); arc_init() then clamps the values against each other.

  * arc.c: add the low_kmem_count and fragmented_kmem_count kstats, bumped
    by arc_reclaim_needed() when kmem_used() exceeds the reclaim threshold
    or kmem_max_contig_free() drops below the fragment threshold.

  * arc.c: add arc_reclaim_pages(), called on every pass of the reclaim
    thread loop.  When kmem usage is above kmem_target, or the value from
    kmem_max_contig_free() is below kmem_fragment_target, it adds a page
    count to vm_pageout_deficit and calls pagedaemon_wakeup().

  * arc.c: arc_adapt() returns without growing arc_c while kmem_used() is
    above kmem_target, and grows by bytes / 2 instead of bytes while it is
    above kmem_slow_growth_thresh.

  * arc.c: arc_init() starts arc_c at arc_c_min instead of arc_c_max and
    sets the default arc_meta_limit to 2/3 of arc_c_max instead of 1/4.

  * arc.c: arc_shrink() also frees the amount by which arc_c exceeds
    arc_c_max.  The amount added to to_free is (arc_c - arc_c_max); with
    the operands the other way round the result is negative (or wraps, for
    unsigned operands) because the guard guarantees arc_c > arc_c_max.

NOTE(review): this copy was rebuilt from a version whose line breaks and
#include targets had been lost.  Leading whitespace and blank context
lines were inferred from the hunk line counts, and the include names in
the first arc.c hunk (sys/kstat.h, sys/sdt.h, vm/vm_pageout.h) are
assumed.  Confirm against the tree, or apply with "patch -l".

Index: sys/cddl/compat/opensolaris/sys/kmem.h
===================================================================
--- sys/cddl/compat/opensolaris/sys/kmem.h	(revision 187)
+++ sys/cddl/compat/opensolaris/sys/kmem.h	(working copy)
@@ -60,6 +60,7 @@
 void zfs_kmem_free(void *buf, size_t size);
 uint64_t kmem_size(void);
 uint64_t kmem_used(void);
+uint64_t kmem_max_contig_free(void);
 kmem_cache_t *kmem_cache_create(char *name, size_t bufsize, size_t align,
     int (*constructor)(void *, void *, int), void (*destructor)(void *, void *),
     void (*reclaim)(void *) __unused, void *private, vmem_t *vmp, int cflags);
Index: sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c
===================================================================
--- sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c	(revision 187)
+++ sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c	(working copy)
@@ -125,7 +125,21 @@
 
 	return ((uint64_t)kmem_map->size);
 }
 
+uint64_t
+kmem_max_contig_free(void)
+{
+	uint64_t max_free;
+	vm_map_lock(kmem_map);
+	if(kmem_map->root == NULL)
+		max_free = kmem_map->max_offset - kmem_map->min_offset;
+	else
+		max_free = kmem_map->root->max_free;
+	vm_map_unlock(kmem_map);
+
+	return max_free;
+}
+
 static int
 kmem_std_constructor(void *mem, int size __unused, void *private, int flags)
 {
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	(revision 209)
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	(working copy)
@@ -131,6 +131,8 @@
 #include <sys/kstat.h>
 #include <sys/sdt.h>
 
+#include <vm/vm_pageout.h>
+
 static kmutex_t		arc_reclaim_thr_lock;
 static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
 static uint8_t		arc_thread_exit;
@@ -170,11 +172,21 @@
 uint64_t zfs_arc_min;
 uint64_t zfs_arc_meta_limit = 0;
 int zfs_mdcomp_disable = 0;
+uint64_t zfs_kmem_reclaim_thresh = 0;
+uint64_t zfs_kmem_target = 0;
+uint64_t zfs_kmem_slow_growth_thresh = 0;
+uint64_t zfs_kmem_fragment_thresh = 0;
+uint64_t zfs_kmem_fragment_target = 0;
 
 TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
 TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
 TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
 TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable);
+TUNABLE_QUAD("vfs.zfs.kmem_reclaim_thresh", &zfs_kmem_reclaim_thresh);
+TUNABLE_QUAD("vfs.zfs.kmem_target", &zfs_kmem_target);
+TUNABLE_QUAD("vfs.zfs.kmem_slow_growth_thresh", &zfs_kmem_slow_growth_thresh);
+TUNABLE_QUAD("vfs.zfs.kmem_fragment_thresh", &zfs_kmem_fragment_thresh);
+TUNABLE_QUAD("vfs.zfs.kmem_fragment_target", &zfs_kmem_fragment_target);
 SYSCTL_DECL(_vfs_zfs);
 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
     "Maximum ARC size");
@@ -182,6 +194,23 @@
     "Minimum ARC size");
 SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN,
     &zfs_mdcomp_disable, 0, "Disable metadata compression");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, kmem_reclaim_thresh, CTLFLAG_RDTUN,
+    &zfs_kmem_reclaim_thresh, 0,
+    "Max allowed kmem usage before aggressive reclamation.");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, kmem_target, CTLFLAG_RDTUN,
+    &zfs_kmem_target, 0,
+    "Desired maximum kmem usage. Should be less than kmem_reclaim_thresh.");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, kmem_slow_growth_thresh, CTLFLAG_RDTUN,
+    &zfs_kmem_slow_growth_thresh, 0,
+    "Maximum kmem allowed before arc growth is slowed. "
+    "Should be less than kmem_target.");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, kmem_fragment_thresh, CTLFLAG_RDTUN,
+    &zfs_kmem_fragment_thresh, 0,
+    "Min allowed kmem fragment size before aggressive reclamation.");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, kmem_fragment_target, CTLFLAG_RDTUN,
+    &zfs_kmem_fragment_target, 0,
+    "Desired minimum kmem fragment size. "
+    "Should be greater than kmem_fragment_thresh.");
 
 /*
  * Note that buffers can be in one of 6 states:
@@ -277,6 +306,8 @@
 	kstat_named_t arcstat_l2_size;
 	kstat_named_t arcstat_l2_hdr_size;
 	kstat_named_t arcstat_memory_throttle_count;
+	kstat_named_t arcstat_low_kmem_count;
+	kstat_named_t arcstat_fragmented_kmem_count;
 } arc_stats_t;
 
 static arc_stats_t arc_stats = {
@@ -325,7 +356,9 @@
 	{ "l2_io_error",		KSTAT_DATA_UINT64 },
 	{ "l2_size",			KSTAT_DATA_UINT64 },
 	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
-	{ "memory_throttle_count",	KSTAT_DATA_UINT64 }
+	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
+	{ "low_kmem_count",		KSTAT_DATA_UINT64 },
+	{ "fragmented_kmem_count",	KSTAT_DATA_UINT64 }
 };
 
 #define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)
@@ -1780,6 +1813,10 @@
 #else
 		to_free = arc_c >> arc_shrink_shift;
 #endif
+
+		if(arc_c > arc_c_max)
+			to_free += arc_c - arc_c_max;
+
 		if (arc_c > arc_c_min + to_free)
 			atomic_add_64(&arc_c, -to_free);
 		else
@@ -1855,8 +1892,15 @@
 		return (1);
 #endif
 #else
-	if (kmem_used() > (kmem_size() * 3) / 4)
+	if (kmem_used() > zfs_kmem_reclaim_thresh) {
+		ARCSTAT_BUMP(arcstat_low_kmem_count);
 		return (1);
+	}
+
+	if (kmem_max_contig_free() < zfs_kmem_fragment_thresh) {
+		ARCSTAT_BUMP(arcstat_fragmented_kmem_count);
+		return (1);
+	}
 #endif
 
 #else
@@ -1867,6 +1911,37 @@
 }
 
 static void
+arc_reclaim_pages(void)
+{
+#ifdef _KERNEL
+	size_t kmem_bytes;
+	size_t kmem_frag;
+	size_t kmem_need_pages = 0;
+
+	kmem_bytes = kmem_used();
+	kmem_frag = kmem_max_contig_free();
+
+	if (kmem_bytes > zfs_kmem_target) {
+		kmem_need_pages = (kmem_bytes - zfs_kmem_target);
+		kmem_need_pages /= PAGE_SIZE;
+		kmem_need_pages += 1;
+	}
+	else if (kmem_frag < zfs_kmem_fragment_target) {
+		kmem_need_pages = (zfs_kmem_fragment_target - kmem_frag) * 2;
+		kmem_need_pages /= PAGE_SIZE;
+		kmem_need_pages += 1;
+	}
+
+	if (kmem_need_pages > 0) {
+		atomic_add_int(&vm_pageout_deficit, kmem_need_pages);
+
+		/* Do not call VM_WAIT here as it can deadlock */
+		pagedaemon_wakeup();
+	}
+#endif
+}
+
+static void
 arc_kmem_reap_now(arc_reclaim_strategy_t strat)
 {
 #ifdef ZIO_USE_UMA
@@ -1927,6 +2002,8 @@
 
 	mutex_enter(&arc_reclaim_thr_lock);
 	while (arc_thread_exit == 0) {
+		arc_reclaim_pages();
+
 		if (arc_reclaim_needed()) {
 
 			if (arc_no_grow) {
@@ -1964,6 +2041,10 @@
 		    (2 * arc_c < arc_size +
 		    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size))
 			arc_adjust();
+#ifdef _KERNEL
+		else if(kmem_used() > zfs_kmem_target)
+			arc_adjust();
+#endif
 
 		if (arc_eviction_list != NULL)
 			arc_do_user_evicts();
@@ -2034,12 +2115,24 @@
 	if (arc_c >= arc_c_max)
 		return;
 
+#ifdef _KERNEL
+	if(kmem_used() > zfs_kmem_target)
+		return;
+#endif
+
 	/*
 	 * If we're within (2 * maxblocksize) bytes of the target
 	 * cache size, increment the target cache size
 	 */
 	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
+#ifdef _KERNEL
+		if(kmem_used() > zfs_kmem_slow_growth_thresh)
+			atomic_add_64(&arc_c, (int64_t)(bytes / 2));
+		else
+			atomic_add_64(&arc_c, (int64_t)bytes);
+#else
 		atomic_add_64(&arc_c, (int64_t)bytes);
+#endif
 		if (arc_c > arc_c_max)
 			arc_c = arc_c_max;
 		else if (state == arc_anon)
@@ -3439,11 +3532,11 @@
 	if (zfs_arc_min >= 64<<18 && zfs_arc_min <= arc_c_max)
 		arc_c_min = zfs_arc_min;
 #endif
-	arc_c = arc_c_max;
+	arc_c = arc_c_min;
 	arc_p = (arc_c >> 1);
 
-	/* limit meta-data to 1/4 of the arc capacity */
-	arc_meta_limit = arc_c_max / 4;
+	/* limit meta-data to 2/3 of the arc capacity */
+	arc_meta_limit = (arc_c_max * 2) / 3;
 
 	/* Allow the tunable to override if it is reasonable */
 	if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
@@ -3529,6 +3622,39 @@
 		zfs_write_limit_shift = 0;
 	mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
 
+	if(zfs_kmem_reclaim_thresh == 0)
+		zfs_kmem_reclaim_thresh = (kmem_size() * 6) / 8;
+	if(zfs_kmem_target == 0)
+		zfs_kmem_target = (kmem_size() * 5) / 8;
+	if(zfs_kmem_slow_growth_thresh == 0)
+		zfs_kmem_slow_growth_thresh = (kmem_size() * 4) / 8;
+	if(zfs_kmem_fragment_thresh == 0)
+		zfs_kmem_fragment_thresh = (kmem_size() * 1) / 16;
+	if(zfs_kmem_fragment_target == 0)
+		zfs_kmem_fragment_target = (kmem_size() * 1) / 8;
+
+	/*
+	 * Restrict kmem reclaim threshold to greater than one half kmem size
+	 * and 4MB less than kmem max size.
+	 */
+	zfs_kmem_reclaim_thresh = MIN((kmem_size() - (1 << 22)),
+	    zfs_kmem_reclaim_thresh);
+	zfs_kmem_reclaim_thresh = MAX((kmem_size() / 2),
+	    zfs_kmem_reclaim_thresh);
+	zfs_kmem_target = MIN(zfs_kmem_reclaim_thresh, zfs_kmem_target);
+	zfs_kmem_slow_growth_thresh = MIN(zfs_kmem_target,
+	    zfs_kmem_slow_growth_thresh);
+
+	/*
+	 * Restrict kmem fragment threshold to between one half kmem size and
+	 * 4MB.
+	 */
+	zfs_kmem_fragment_thresh = MIN(kmem_size() / 2,
+	    zfs_kmem_fragment_thresh);
+	zfs_kmem_fragment_thresh = MAX((1 << 22), zfs_kmem_fragment_thresh);
+	zfs_kmem_fragment_target = MAX(zfs_kmem_fragment_thresh,
+	    zfs_kmem_fragment_target);
+
 #ifdef _KERNEL
 	/* Warn about ZFS memory and address space requirements. */
 	if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {