diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 936da10c526055c29ff7affd91ce5f83a8c28873..e52cd57bb5127dcc655251c85657c44c1990ace3 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -203,11 +203,22 @@ PMD-mappable transparent hugepage::
 	cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size
 
 The kernel tries to use huge, PMD-mappable page on read page fault for
-file exec mapping if CONFIG_READ_ONLY_THP_FOR_FS enabled. It's possible
-to enabled the feature by writing 1 or disablt by writing 0::
+file exec mappings if CONFIG_READ_ONLY_THP_FOR_FS is enabled, or a non-PMD
+sized page (e.g. 64K on arm64) otherwise. BIT0 controls PMD-sized THP and
+BIT1 controls mTHP; enable or disable each by setting the corresponding bit::
 
-	echo 0x0 >/sys/kernel/mm/transparent_hugepage/thp_exec_enabled
 	echo 0x1 >/sys/kernel/mm/transparent_hugepage/thp_exec_enabled
+	echo 0x2 >/sys/kernel/mm/transparent_hugepage/thp_exec_enabled
+	echo 0x3 >/sys/kernel/mm/transparent_hugepage/thp_exec_enabled
+
+The kernel can also align mappings to a larger size other than the THP
+size, e.g. 64K on arm64. BIT0 controls file mappings and BIT1 controls
+anonymous mappings. This is disabled by default and can be enabled by
+writing the corresponding bit to 1::
+
+	echo 0x1 >/sys/kernel/mm/transparent_hugepage/thp_mapping_align
+	echo 0x2 >/sys/kernel/mm/transparent_hugepage/thp_mapping_align
+	echo 0x3 >/sys/kernel/mm/transparent_hugepage/thp_mapping_align
 
 khugepaged will be automatically started when one or more hugepage
 sizes are enabled (either by directly setting "always" or "madvise",
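As a quick illustration of the two knobs documented above, here is a minimal userspace sketch (not part of the patch) that reads them and then sets both bits of each; it assumes a kernel with this series applied, the sysfs paths shown above, and root privileges:

/*
 * Illustrative only: poke the thp_exec_enabled and thp_mapping_align knobs.
 * Paths follow the documentation above; requires root and a patched kernel.
 */
#include <stdio.h>

static int write_knob(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	if (fputs(val, f) == EOF)
		perror(path);
	return fclose(f);
}

static void show_knob(const char *path)
{
	char buf[32] = "";
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%s = %s", path, buf);
	fclose(f);
}

int main(void)
{
	const char *exec = "/sys/kernel/mm/transparent_hugepage/thp_exec_enabled";
	const char *align = "/sys/kernel/mm/transparent_hugepage/thp_mapping_align";

	show_knob(exec);
	show_knob(align);
	/* BIT0 | BIT1 on both knobs, as documented above */
	write_knob(exec, "0x3");
	write_knob(align, "0x3");
	show_knob(exec);
	show_knob(align);
	return 0;
}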
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 07948fe59b9d98fb7051818178d583c2d17d3fca..8d68d00de0a4dc903f74e10ee373a686951cd98c 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1147,6 +1147,18 @@ static inline void update_mmu_cache_range(struct vm_fault *vmf,
  */
 #define arch_wants_old_prefaulted_pte	cpu_has_hw_af
 
+/*
+ * Request exec memory is read into pagecache in at least 64K folios. The
+ * trade-off here is performance improvement due to storing translations more
+ * efficiently in the iTLB vs the potential for read amplification due to
+ * reading data from disk that won't be used. The latter is independent of base
+ * page size, so we set a page-size independent block size of 64K. This size can
+ * be contpte-mapped when 4K base pages are in use (16 pages into 1 iTLB entry),
+ * and HPA can coalesce it (4 pages into 1 TLB entry) when 16K base pages are in
+ * use.
+ */
+#define arch_wants_exec_folio_order() ilog2(SZ_64K >> PAGE_SHIFT)
+
 static inline bool pud_sect_supported(void)
 {
 	return PAGE_SIZE == SZ_4K;
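The macro above reduces to ilog2(SZ_64K >> PAGE_SHIFT). A standalone sketch (userspace, with ilog2() mocked) of what that evaluates to for the base page sizes mentioned in the comment:

/*
 * Standalone sketch, not kernel code: the folio order requested for exec
 * memory at different base page sizes. ilog2() is a local stand-in.
 */
#include <stdio.h>

#define SZ_64K (64 * 1024UL)

static int ilog2(unsigned long v)	/* stand-in for the kernel helper */
{
	int r = -1;

	while (v) {
		v >>= 1;
		r++;
	}
	return r;
}

int main(void)
{
	unsigned long page_sizes[] = { 4096, 16384, 65536 };

	for (int i = 0; i < 3; i++) {
		unsigned long psz = page_sizes[i];
		int order = ilog2(SZ_64K / psz);

		/* order 4 -> 16 x 4K pages, order 2 -> 4 x 16K pages, order 0 -> 1 x 64K page */
		printf("PAGE_SIZE %3luK: order %d (%lu pages per 64K folio)\n",
		       psz / 1024, order, 1UL << order);
	}
	return 0;
}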
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 67b0e2212ca0bf736b8f7b65a7d4efef28e5a18e..27d4eff7994106e4a03dfc92e7dd4d908a47f9d9 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -314,7 +314,8 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
 	if (ret <= 0)
 		goto out;
 
-	if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP))
+	if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP) &&
+	    iov_iter_count(from) > PAGE_SIZE)
 		ret = ext4_iomap_buffered_write(iocb, from);
 	else
 		ret = generic_perform_write(iocb, from);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 93a9dd03cb5c9c9e22e0ea5de944656c85742ef6..2c0e61f531f1cadc2c64142b3d576670afc02696 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3955,6 +3955,128 @@ static int ext4_iomap_writepages(struct address_space *mapping,
 	return ret;
 }
 
+static int ext4_iomap_write_begin(struct file *file,
+				  struct address_space *mapping, loff_t pos,
+				  unsigned len, struct page **pagep,
+				  void **fsdata)
+{
+	struct inode *inode = mapping->host;
+	struct iomap_iter iter = {
+		.inode = inode,
+		.flags = IOMAP_WRITE,
+	};
+	int ret = 0, retries = 0;
+	struct folio *folio;
+	bool delalloc;
+
+	if (unlikely(ext4_forced_shutdown(inode->i_sb)))
+		return -EIO;
+
+	trace_ext4_iomap_write_begin(inode, pos, len);
+
+	delalloc = test_opt(inode->i_sb, DELALLOC) &&
+		   !ext4_nonda_switch(inode->i_sb);
+	*fsdata = delalloc ? (void *)0 : (void *)FALL_BACK_TO_NONDELALLOC;
+
+retry:
+	iter.pos = pos;
+	iter.len = len;
+
+	folio = iomap_get_folio(&iter, pos, len);
+	if (IS_ERR(folio))
+		return PTR_ERR(folio);
+
+	WARN_ON_ONCE(pos + len > folio_pos(folio) + folio_size(folio));
+
+	if (folio_test_dirty(folio) && (i_blocks_per_folio(inode, folio) == 1))
+		goto out;
+
+	do {
+		int length;
+
+		ret = __ext4_iomap_buffered_io_begin(inode, iter.pos, iter.len,
+				iter.flags, &iter.iomap, NULL, delalloc);
+		if (ret)
+			goto out;
+
+		WARN_ON_ONCE(iter.iomap.offset > iter.pos);
+		WARN_ON_ONCE(iter.iomap.length == 0);
+		WARN_ON_ONCE(iter.iomap.offset + iter.iomap.length <= iter.pos);
+
+		length = iomap_length(&iter);
+		ret = __iomap_write_begin(&iter, iter.pos, length, folio);
+		if (ret)
+			goto out;
+
+		iter.pos += length;
+		iter.len -= length;
+	} while (iter.len);
+
+out:
+	if (ret < 0) {
+		folio_unlock(folio);
+		folio_put(folio);
+
+		/*
+		 * __ext4_iomap_buffered_io_begin() may have instantiated
+		 * a few blocks outside i_size. Trim these off again. Don't
+		 * need i_size_read because we hold inode lock.
+		 */
+		if (pos + len > inode->i_size)
+			ext4_truncate_failed_write(inode);
+
+		if (ret == -ENOSPC &&
+		    ext4_should_retry_alloc(inode->i_sb, &retries))
+			goto retry;
+
+		/* The folio was released above, don't touch it on failure. */
+		return ret;
+	}
+
+	*pagep = folio_file_page(folio, pos >> PAGE_SHIFT);
+	return ret;
+}
+
+static int ext4_iomap_write_end(struct file *file,
+				struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+	int write_mode = (int)(unsigned long)fsdata;
+	struct folio *folio = page_folio(page);
+	loff_t old_size = inode->i_size;
+	size_t written;
+
+	trace_ext4_iomap_write_end(inode, pos, len, copied);
+
+	written = __iomap_write_end(inode, pos, len, copied, folio) ?
+		  copied : 0;
+
+	/*
+	 * Update the in-memory inode size after copying the data into
+	 * the page cache. It's important to update i_size while still
+	 * holding folio lock, because folio writeout could otherwise
+	 * come in and zero beyond i_size.
+	 */
+	if (pos + written > old_size)
+		i_size_write(inode, pos + written);
+
+	folio_unlock(folio);
+	folio_put(folio);
+
+	if (old_size < pos)
+		pagecache_isize_extended(inode, old_size, pos);
+
+	/*
+	 * For delalloc, if we have pre-allocated more blocks and copied
+	 * less, we will have delalloc extents allocated outside i_size,
+	 * so drop the unused pre-allocated blocks to prevent the
+	 * writeback path from allocating blocks for them.
+	 */
+	if (unlikely(!written) && write_mode != FALL_BACK_TO_NONDELALLOC)
+		ext4_truncate_failed_write(inode);
+
+	return written;
+}
+
 /*
  * For data=journal mode, folio should be marked dirty only when it was
  * writeably mapped. When that happens, it was already attached to the
@@ -4048,6 +4170,8 @@ static const struct address_space_operations ext4_iomap_aops = {
 	.read_folio = ext4_iomap_read_folio,
 	.readahead = ext4_iomap_readahead,
 	.writepages = ext4_iomap_writepages,
+	.write_begin = ext4_iomap_write_begin,
+	.write_end = ext4_iomap_write_end,
 	.dirty_folio = iomap_dirty_folio,
 	.bmap = ext4_bmap,
 	.invalidate_folio = iomap_invalidate_folio,
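A rough userspace exercise (not part of the patch) for the two buffered-write paths touched above: a write of at most PAGE_SIZE goes through generic_perform_write() and therefore the new ->write_begin/->write_end, while a larger write takes ext4_iomap_buffered_write(). The file path below is a placeholder and must sit on an ext4 mount with the buffered iomap path enabled:

/*
 * Exercise both buffered write paths and verify the data round-trips.
 * "/mnt/ext4/iomap-write-test" is a placeholder path.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/mnt/ext4/iomap-write-test";	/* placeholder */
	char small[1024], large[64 * 1024], check[sizeof(large)];
	int fd = open(path, O_CREAT | O_TRUNC | O_RDWR, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(small, 'a', sizeof(small));
	memset(large, 'b', sizeof(large));

	/* <= PAGE_SIZE: lands in ext4_iomap_write_begin/_end */
	if (pwrite(fd, small, sizeof(small), 0) != (ssize_t)sizeof(small))
		perror("small write");
	/* > PAGE_SIZE: takes the iomap buffered write path */
	if (pwrite(fd, large, sizeof(large), sizeof(small)) != (ssize_t)sizeof(large))
		perror("large write");

	fsync(fd);

	if (pread(fd, check, sizeof(large), sizeof(small)) != (ssize_t)sizeof(large) ||
	    memcmp(check, large, sizeof(large)))
		fprintf(stderr, "data mismatch\n");
	else
		printf("both write paths verified\n");

	close(fd);
	return 0;
}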
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 292143ce354c3ef10e7088d562cebac81a527eb5..0ef4b804e18eff9ef5677d4d9418884f07ff46a1 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1738,7 +1738,8 @@ enum {
 	Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard,
 	Opt_init_itable, Opt_noinit_itable, Opt_max_dir_size_kb,
 	Opt_nojournal_checksum, Opt_nombcache,
-	Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan, Opt_buffered_iomap,
+	Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan,
+	Opt_buffered_iomap, Opt_nobuffered_iomap,
 	Opt_errors, Opt_data, Opt_data_err, Opt_jqfmt, Opt_dax_type,
 #ifdef CONFIG_EXT4_DEBUG
 	Opt_fc_debug_max_replay, Opt_fc_debug_force
@@ -1882,6 +1883,7 @@ static const struct fs_parameter_spec ext4_param_specs[] = {
 					Opt_no_prefetch_block_bitmaps),
 	fsparam_s32	("mb_optimize_scan",	Opt_mb_optimize_scan),
 	fsparam_flag	("buffered_iomap",	Opt_buffered_iomap),
+	fsparam_flag	("nobuffered_iomap",	Opt_nobuffered_iomap),
 	fsparam_string	("check",		Opt_removed),	/* mount option from ext2/3 */
 	fsparam_flag	("nocheck",		Opt_removed),	/* mount option from ext2/3 */
 	fsparam_flag	("reservation",		Opt_removed),	/* mount option from ext2/3 */
@@ -1978,6 +1980,8 @@ static const struct mount_opts {
 	 MOPT_SET},
 	{Opt_buffered_iomap, EXT4_MOUNT2_BUFFERED_IOMAP,
 	 MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY},
+	{Opt_nobuffered_iomap, EXT4_MOUNT2_BUFFERED_IOMAP,
+	 MOPT_CLEAR | MOPT_2 | MOPT_EXT4_ONLY},
 #ifdef CONFIG_EXT4_DEBUG
 	{Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT,
 	 MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY},
@@ -2464,11 +2468,6 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
 			return -EINVAL;
 		}
 		return 0;
-	case Opt_buffered_iomap:
-		ext4_msg(NULL, KERN_WARNING,
-			 "iomap for buffered enabled. Warning: EXPERIMENTAL, use at your own risk");
-		ctx_set_mount_opt2(ctx, EXT4_MOUNT2_BUFFERED_IOMAP);
-		return 0;
 	}
 
 	/*
@@ -2908,12 +2907,6 @@ static int ext4_check_opt_consistency(struct fs_context *fc,
 		    !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE))) {
 			goto fail_dax_change_remount;
 		}
-
-		if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_BUFFERED_IOMAP) &&
-		    !test_opt2(sb, BUFFERED_IOMAP)) {
-			ext4_msg(NULL, KERN_ERR, "can't enable iomap for buffered IO on remount");
-			return -EINVAL;
-		}
 	}
 
 	return ext4_check_quota_consistency(fc, sb);
@@ -4481,6 +4474,10 @@ static void ext4_set_def_opts(struct super_block *sb,
 
 	if (sb->s_blocksize == PAGE_SIZE)
 		set_opt(sb, DIOREAD_NOLOCK);
+
+	/* Use iomap for the buffered IO path on 4k page size */
+	if (PAGE_SIZE == SZ_4K)
+		set_opt2(sb, BUFFERED_IOMAP);
 }
 
 static int ext4_handle_clustersize(struct super_block *sb)
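Since BUFFERED_IOMAP now defaults to on for 4K page size, the new "nobuffered_iomap" option is the way to opt out at mount time. A hedged sketch using mount(2); the device and mount point below are placeholders:

/*
 * Mount an ext4 filesystem with the iomap buffered IO path disabled.
 * "/dev/vdb1" and "/mnt/ext4" are placeholders.
 */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	if (mount("/dev/vdb1", "/mnt/ext4", "ext4", 0, "nobuffered_iomap")) {
		perror("mount");
		return 1;
	}
	printf("mounted with nobuffered_iomap\n");
	return 0;
}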
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index fee0bb9b5d7583c6118986d732a2cac496db1f3a..dbd56c36ce2b9e2c57fecefa81212b89ea9195ca 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -665,7 +665,7 @@ static int iomap_read_folio_sync(loff_t block_start, struct folio *folio,
 	return submit_bio_wait(&bio);
 }
 
-static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
+int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
 		size_t len, struct folio *folio)
 {
 	const struct iomap *srcmap = iomap_iter_srcmap(iter);
@@ -727,6 +727,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(__iomap_write_begin);
 
 static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos,
 		size_t len)
@@ -825,7 +826,7 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
 	return status;
 }
 
-static bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
+bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
 		size_t copied, struct folio *folio)
 {
 	flush_dcache_folio(folio);
@@ -848,6 +849,7 @@ static bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
 	filemap_dirty_folio(inode->i_mapping, folio);
 	return true;
 }
+EXPORT_SYMBOL_GPL(__iomap_write_end);
 
 static void iomap_write_end_inline(const struct iomap_iter *iter,
 		struct folio *folio, loff_t pos, size_t copied)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index abf2340a2d18a646fa37c14c86d63a9fb74b1d71..8fdf17e80359cfa7328e6e214bf1b91f18b3c118 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -51,6 +51,9 @@ enum transparent_hugepage_flag {
 	TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
 	TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG,
 	TRANSPARENT_HUGEPAGE_FILE_EXEC_THP_FLAG,
+	TRANSPARENT_HUGEPAGE_FILE_EXEC_MTHP_FLAG,
+	TRANSPARENT_HUGEPAGE_FILE_MAPPING_ALIGN_FLAG,
+	TRANSPARENT_HUGEPAGE_ANON_MAPPING_ALIGN_FLAG,
 };
 
 struct kobject;
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 6fc1c858013d1e4dda4ed38fa4083acf25d16d36..daf0a86ba37789d80cdd1d9d68a4553320fb81fb 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -258,6 +258,10 @@ static inline const struct iomap *iomap_iter_srcmap(const struct iomap_iter *i)
 
 ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
 		const struct iomap_ops *ops);
+int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
+		size_t len, struct folio *folio);
+bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
+		size_t copied, struct folio *folio);
 int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
 		struct iomap *iomap, loff_t pos, loff_t length, ssize_t written,
 		int (*punch)(struct inode *inode, loff_t pos, loff_t length));
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index ecc561d49d5b000219609454c1f969df9ceaec24..a0fafb8e7005a18b7dcec6d6a9b4d1e861d8ec0d 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -435,6 +435,18 @@ static inline bool arch_has_hw_pte_young(void)
 }
 #endif
 
+#ifndef arch_wants_exec_folio_order
+/*
+ * Returns preferred minimum folio order for executable file-backed memory. Must
+ * be in range [0, PMD_ORDER]. Negative value implies that the HW has no
+ * preference and mm will not special-case executable memory in the pagecache.
+ */
+static inline int arch_wants_exec_folio_order(void)
+{
+	return -1;
+}
+#endif
+
 #ifndef arch_check_zapped_pte
 static inline void arch_check_zapped_pte(struct vm_area_struct *vma,
 					 pte_t pte)
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 588991b57c127e804f1fecfba3ea861777eeebc2..d500568daeb1ecb9251d702cba22f284276f3a5e 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -389,6 +389,13 @@ DEFINE_EVENT(ext4__write_begin, ext4_da_write_begin,
 	TP_ARGS(inode, pos, len)
 );
 
+DEFINE_EVENT(ext4__write_begin, ext4_iomap_write_begin,
+
+	TP_PROTO(struct inode *inode, loff_t pos, unsigned int len),
+
+	TP_ARGS(inode, pos, len)
+);
+
 DECLARE_EVENT_CLASS(ext4__write_end,
 	TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
 			unsigned int copied),
@@ -441,6 +448,14 @@ DEFINE_EVENT(ext4__write_end, ext4_da_write_end,
 	TP_ARGS(inode, pos, len, copied)
 );
 
+DEFINE_EVENT(ext4__write_end, ext4_iomap_write_end,
+
+	TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
+		 unsigned int copied),
+
+	TP_ARGS(inode, pos, len, copied)
+);
+
 TRACE_EVENT(ext4_writepages,
 	TP_PROTO(struct inode *inode, struct writeback_control *wbc),
diff --git a/mm/filemap.c b/mm/filemap.c
index a274d2c5e232f97671a78af6135737675546e33b..d3c813429bf212487d30f74f62729fe192784b6b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -46,6 +46,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include "internal.h"
@@ -3141,6 +3142,10 @@ static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio,
 	(transparent_hugepage_flags & \
 	 (1<= 0) {
+		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
+		ra->size = 1UL << order;
+		ra->async_size = 0;
+		ractl._index &= ~((unsigned long)ra->size - 1);
+		page_cache_ra_order(&ractl, ra, order);
+		return fpin;
+		}
+	}
 #endif
 
 	/* If we don't want any read-ahead, don't bother */
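Part of the mm/filemap.c hunk above was lost to formatting, but the surviving lines size the readahead window to 1 << order pages and round the start index down to that boundary. A standalone illustration of that arithmetic, with an arbitrary example index:

/*
 * Standalone illustration of the surviving readahead arithmetic above:
 * readahead size is 1 << order pages, start index rounded down to it.
 */
#include <stdio.h>

int main(void)
{
	int order = 4;				/* 64K worth of 4K pages */
	unsigned long ra_size = 1UL << order;
	unsigned long fault_index = 0x1234;	/* arbitrary faulting page index */
	unsigned long start = fault_index & ~(ra_size - 1);

	printf("fault index %#lx -> read %lu pages starting at index %#lx\n",
	       fault_index, ra_size, start);
	return 0;
}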
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0c61e7c7c2c13cc881455463dbdf35e0711f90e3..8cb3e014a881fde45f3424aca7f63ac257790f06 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -426,30 +426,106 @@ static struct kobj_attribute hpage_pmd_size_attr =
 	__ATTR_RO(hpage_pmd_size);
 
 #ifdef CONFIG_READ_ONLY_THP_FOR_FS
+#define FILE_EXEC_THP_ENABLE	BIT(0)
+#else
+#define FILE_EXEC_THP_ENABLE	0
+#endif
+
+#define FILE_EXEC_MTHP_ENABLE	BIT(1)
+#define FILE_EXEC_THP_ALL	(FILE_EXEC_THP_ENABLE | FILE_EXEC_MTHP_ENABLE)
+
+static void thp_flag_set(enum transparent_hugepage_flag flag, bool enable)
+{
+	if (enable)
+		set_bit(flag, &transparent_hugepage_flags);
+	else
+		clear_bit(flag, &transparent_hugepage_flags);
+}
+
 static ssize_t thp_exec_enabled_show(struct kobject *kobj,
 		struct kobj_attribute *attr, char *buf)
 {
-	return single_hugepage_flag_show(kobj, attr, buf,
-			TRANSPARENT_HUGEPAGE_FILE_EXEC_THP_FLAG);
+	unsigned long val = 0;
+
+#ifdef CONFIG_READ_ONLY_THP_FOR_FS
+	if (test_bit(TRANSPARENT_HUGEPAGE_FILE_EXEC_THP_FLAG,
+		     &transparent_hugepage_flags))
+		val |= FILE_EXEC_THP_ENABLE;
+#endif
+
+	if (test_bit(TRANSPARENT_HUGEPAGE_FILE_EXEC_MTHP_FLAG,
+		     &transparent_hugepage_flags))
+		val |= FILE_EXEC_MTHP_ENABLE;
+
+	return sysfs_emit(buf, "0x%lx\n", val);
 }
 static ssize_t thp_exec_enabled_store(struct kobject *kobj,
 		struct kobj_attribute *attr, const char *buf, size_t count)
 {
-	size_t ret = single_hugepage_flag_store(kobj, attr, buf, count,
-			TRANSPARENT_HUGEPAGE_FILE_EXEC_THP_FLAG);
-	if (ret > 0) {
-		int err = start_stop_khugepaged();
+	unsigned long val;
+	int ret;
 
-		if (err)
-			ret = err;
-	}
+	ret = kstrtoul(buf, 16, &val);
+	if (ret < 0)
+		return ret;
+	if (val & ~FILE_EXEC_THP_ALL)
+		return -EINVAL;
 
-	return ret;
+#ifdef CONFIG_READ_ONLY_THP_FOR_FS
+	thp_flag_set(TRANSPARENT_HUGEPAGE_FILE_EXEC_THP_FLAG,
+		     val & FILE_EXEC_THP_ENABLE);
+	ret = start_stop_khugepaged();
+	if (ret)
+		return ret;
+#endif
+	thp_flag_set(TRANSPARENT_HUGEPAGE_FILE_EXEC_MTHP_FLAG,
+		     val & FILE_EXEC_MTHP_ENABLE);
+
+	return count;
 }
 static struct kobj_attribute thp_exec_enabled_attr =
 	__ATTR_RW(thp_exec_enabled);
-#endif
 
+#define FILE_MAPPING_ALIGN	BIT(0)
+#define ANON_MAPPING_ALIGN	BIT(1)
+#define THP_MAPPING_ALIGN_ALL	(FILE_MAPPING_ALIGN | ANON_MAPPING_ALIGN)
+
+static ssize_t thp_mapping_align_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	unsigned long val = 0;
+
+	if (test_bit(TRANSPARENT_HUGEPAGE_FILE_MAPPING_ALIGN_FLAG,
+		     &transparent_hugepage_flags))
+		val |= FILE_MAPPING_ALIGN;
+
+	if (test_bit(TRANSPARENT_HUGEPAGE_ANON_MAPPING_ALIGN_FLAG,
+		     &transparent_hugepage_flags))
+		val |= ANON_MAPPING_ALIGN;
+
+	return sysfs_emit(buf, "0x%lx\n", val);
+}
+static ssize_t thp_mapping_align_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	unsigned long val;
+	int ret;
+
+	ret = kstrtoul(buf, 16, &val);
+	if (ret < 0)
+		return ret;
+	if (val & ~THP_MAPPING_ALIGN_ALL)
+		return -EINVAL;
+
+	thp_flag_set(TRANSPARENT_HUGEPAGE_FILE_MAPPING_ALIGN_FLAG,
+		     val & FILE_MAPPING_ALIGN);
+	thp_flag_set(TRANSPARENT_HUGEPAGE_ANON_MAPPING_ALIGN_FLAG,
+		     val & ANON_MAPPING_ALIGN);
+
+	return count;
+}
+static struct kobj_attribute thp_mapping_align_attr =
+	__ATTR_RW(thp_mapping_align);
 
 static struct attribute *hugepage_attr[] = {
 	&enabled_attr.attr,
@@ -459,9 +535,8 @@ static struct attribute *hugepage_attr[] = {
 #ifdef CONFIG_SHMEM
 	&shmem_enabled_attr.attr,
 #endif
-#ifdef CONFIG_READ_ONLY_THP_FOR_FS
 	&thp_exec_enabled_attr.attr,
-#endif
+	&thp_mapping_align_attr.attr,
 	NULL,
 };
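Both store handlers above follow the same pattern: parse the value as hex and reject anything outside the known bits. A small userspace model of that parse/validate step, using the thp_mapping_align bits as the example:

/*
 * Userspace model of the hex parse + bitmask validation used by the two
 * sysfs store handlers above. The bit names mirror the patch.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define FILE_MAPPING_ALIGN	(1UL << 0)
#define ANON_MAPPING_ALIGN	(1UL << 1)
#define THP_MAPPING_ALIGN_ALL	(FILE_MAPPING_ALIGN | ANON_MAPPING_ALIGN)

static int parse_knob(const char *buf, unsigned long *val)
{
	char *end;

	errno = 0;
	*val = strtoul(buf, &end, 16);
	if (errno || end == buf)
		return -EINVAL;
	if (*val & ~THP_MAPPING_ALIGN_ALL)
		return -EINVAL;	/* unknown bit set */
	return 0;
}

int main(void)
{
	const char *inputs[] = { "0x1", "0x3", "0x4" };

	for (int i = 0; i < 3; i++) {
		unsigned long val;
		int ret = parse_knob(inputs[i], &val);

		printf("%s -> %s\n", inputs[i], ret ? "rejected" : "accepted");
	}
	return 0;
}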
@@ -853,6 +928,65 @@ static unsigned long __thp_get_unmapped_area(struct file *filp,
 	return ret;
 }
 
+#define transparent_hugepage_file_mapping_align_enabled() \
+	(transparent_hugepage_flags & \
+	 (1<f_mapping;
+	if (!mapping || !mapping_large_folio_support(mapping))
+		return false;
+
+	return true;
+}
+
+static bool anon_mapping_align_enabled(int order)
+{
+	unsigned long mask;
+
+	if (!transparent_hugepage_anon_mapping_align_enabled())
+		return 0;
+
+	mask = READ_ONCE(huge_anon_orders_always) |
+	       READ_ONCE(huge_anon_orders_madvise);
+
+	if (hugepage_global_enabled())
+		mask |= READ_ONCE(huge_anon_orders_inherit);
+
+	mask = BIT(order) & mask;
+	if (!mask)
+		return false;
+
+	return true;
+}
+
+static unsigned long folio_get_unmapped_area(struct file *filp, unsigned long addr,
+		unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+	int order = arch_wants_exec_folio_order();
+
+	if (order < 0)
+		return 0;
+
+	if (file_mapping_align_enabled(filp) || anon_mapping_align_enabled(order))
+		return __thp_get_unmapped_area(filp, addr, len, pgoff, flags,
+					       PAGE_SIZE << order);
+	return 0;
+}
+
 unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
 		unsigned long len, unsigned long pgoff, unsigned long flags)
 {
@@ -863,6 +997,10 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
 	if (ret)
 		return ret;
 
+	ret = folio_get_unmapped_area(filp, addr, len, off, flags);
+	if (ret)
+		return ret;
+
 	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
 }
 EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
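A quick userspace check (not part of the patch) of the mapping-alignment effect: with BIT0 of thp_mapping_align set and an architecture that provides arch_wants_exec_folio_order() (64K on arm64), a sufficiently large file mapping is expected to come back 64K-aligned; without the feature the program simply reports whatever alignment it got. The file path is a placeholder:

/*
 * Map a file and report whether the address thp_get_unmapped_area() handed
 * back is 64K aligned. "/mnt/ext4/some-large-file" is a placeholder.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/mnt/ext4/some-large-file";	/* placeholder */
	int fd = open(path, O_RDONLY);
	void *addr;

	if (fd < 0) {
		perror("open");
		return 1;
	}

	addr = mmap(NULL, 1024 * 1024, PROT_READ, MAP_PRIVATE, fd, 0);
	if (addr == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	printf("mapping at %p is %s64K aligned\n", addr,
	       ((unsigned long)addr & (64 * 1024 - 1)) ? "NOT " : "");

	munmap(addr, 1024 * 1024);
	close(fd);
	return 0;
}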