diff --git a/src/cmd/compile/internal/test/inl_test.go b/src/cmd/compile/internal/test/inl_test.go index c30be67731e6e40e423ce01ae94f68e447dbb9fa..3bc569f194b283e8ebcf74a7f3ae5bedecf5b223 100644 --- a/src/cmd/compile/internal/test/inl_test.go +++ b/src/cmd/compile/internal/test/inl_test.go @@ -67,16 +67,18 @@ func TestIntendedInlining(t *testing.T) { // GC-related ones "cgoInRange", "gclinkptr.ptr", + "gcUsesSpanInlineMarkBits", "guintptr.ptr", "heapBitsSlice", "markBits.isMarked", "muintptr.ptr", "puintptr.ptr", + "spanHeapBitsRange", "spanOf", "spanOfUnchecked", "typePointers.nextFast", - "(*gcWork).putFast", - "(*gcWork).tryGetFast", + "(*gcWork).putObjFast", + "(*gcWork).tryGetObjFast", "(*guintptr).set", "(*markBits).advance", "(*mspan).allocBitsForIndex", diff --git a/src/cmd/internal/objabi/pkgspecial.go b/src/cmd/internal/objabi/pkgspecial.go index 871c28f58829ab4e7505b15b84ff4612e6d2b30a..dd1e73410d9610f072180f7cbcc801a5b2ca4543 100644 --- a/src/cmd/internal/objabi/pkgspecial.go +++ b/src/cmd/internal/objabi/pkgspecial.go @@ -50,6 +50,7 @@ var runtimePkgs = []string{ "internal/runtime/atomic", "internal/runtime/exithook", + "internal/runtime/gc", "internal/runtime/maps", "internal/runtime/math", "internal/runtime/sys", diff --git a/src/go/build/deps_test.go b/src/go/build/deps_test.go index e3e01077c18b17471979a6bba7ee75e3daf33e43..0e91dd7a254b6e46dbb764c84206a3efdb486158 100644 --- a/src/go/build/deps_test.go +++ b/src/go/build/deps_test.go @@ -92,6 +92,7 @@ var depsRules = ` < internal/runtime/syscall < internal/runtime/atomic < internal/runtime/exithook + < internal/runtime/gc < internal/runtime/math < internal/runtime/maps < runtime diff --git a/src/internal/goexperiment/exp_greenteagc_off.go b/src/internal/goexperiment/exp_greenteagc_off.go new file mode 100644 index 0000000000000000000000000000000000000000..d374d02ecc02adb14c3dd0d6a2a4e7cb5de28f56 --- /dev/null +++ b/src/internal/goexperiment/exp_greenteagc_off.go @@ -0,0 +1,8 @@ +// Code generated by mkconsts.go. DO NOT EDIT. + +//go:build !goexperiment.greenteagc + +package goexperiment + +const GreenTeaGC = false +const GreenTeaGCInt = 0 \ No newline at end of file diff --git a/src/internal/goexperiment/exp_greenteagc_on.go b/src/internal/goexperiment/exp_greenteagc_on.go new file mode 100644 index 0000000000000000000000000000000000000000..901618f9cad3df1f12f0d3594fa9df7b5b66fc73 --- /dev/null +++ b/src/internal/goexperiment/exp_greenteagc_on.go @@ -0,0 +1,8 @@ +// Code generated by mkconsts.go. DO NOT EDIT. + +//go:build goexperiment.greenteagc + +package goexperiment + +const GreenTeaGC = true +const GreenTeaGCInt = 1 \ No newline at end of file diff --git a/src/internal/goexperiment/flags.go b/src/internal/goexperiment/flags.go index ac85fc800092a40a2090e8f719e5c012b54a682f..a1b37f3fbe0951f63c5963e3f5f66650fbfb166e 100644 --- a/src/internal/goexperiment/flags.go +++ b/src/internal/goexperiment/flags.go @@ -129,6 +129,9 @@ type Flags struct { // Synctest enables the testing/synctest package. Synctest bool + // GreenTeaGC enables the Green Tea GC implementation. + GreenTeaGC bool + // Kunpeng malloc prefetch optimization. PrefetchMalloc bool } diff --git a/src/internal/runtime/gc/malloc.go b/src/internal/runtime/gc/malloc.go new file mode 100644 index 0000000000000000000000000000000000000000..c69fc2a35146683e6b4780bcd6bb1308bb7b5e31 --- /dev/null +++ b/src/internal/runtime/gc/malloc.go @@ -0,0 +1,50 @@ +// Copyright 2025 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gc
+
+import "internal/goarch"
+
+const (
+	ptrBits = 8 * goarch.PtrSize
+
+	// A malloc header is functionally a single type pointer, but
+	// we need to use 8 here to ensure 8-byte alignment of allocations
+	// on 32-bit platforms. It's wasteful, but a lot of code relies on
+	// 8-byte alignment for 8-byte atomics.
+	MallocHeaderSize = 8
+
+	// The minimum object size that has a malloc header, exclusive.
+	//
+	// The size of this value controls overheads from the malloc header.
+	// The minimum size is bound by writeHeapBitsSmall, which assumes that the
+	// pointer bitmap for objects of a size smaller than this doesn't cross
+	// more than one pointer-word boundary. This sets an upper-bound on this
+	// value at the number of bits in a uintptr, multiplied by the pointer
+	// size in bytes.
+	//
+	// We choose a value here that has a natural cutover point in terms of memory
+	// overheads. This value just happens to be the maximum possible value this
+	// can be.
+	//
+	// A span with heap bits in it will have 128 bytes of heap bits on 64-bit
+	// platforms, and 256 bytes of heap bits on 32-bit platforms. The first size
+	// class where malloc headers match this overhead for 64-bit platforms is
+	// 512 bytes (8 KiB / 512 bytes * 8 bytes-per-header = 128 bytes of overhead).
+	// On 32-bit platforms, this same point is the 256 byte size class
+	// (8 KiB / 256 bytes * 8 bytes-per-header = 256 bytes of overhead).
+	//
+	// Guaranteed to be exactly at a size class boundary. The reason this value is
+	// an exclusive minimum is subtle. Suppose we're allocating a 504-byte object
+	// and it's rounded up to 512 bytes for the size class. If minSizeForMallocHeader
+	// is 512 and an inclusive minimum, then a comparison against minSizeForMallocHeader
+	// by the two values would produce different results. In other words, the comparison
+	// would not be invariant to size-class rounding. Eschewing this property means a
+	// more complex check or possibly storing additional state to determine whether a
+	// span has malloc headers.
+	MinSizeForMallocHeader = goarch.PtrSize * ptrBits
+
+	// PageSize is the increment in which spans are managed.
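A quick standalone check of the arithmetic in the comment above (illustrative only, assuming a 64-bit platform where goarch.PtrSize is 8 and spans are one 8 KiB page):

package main

import "fmt"

func main() {
	const ptrSize = 8     // goarch.PtrSize on 64-bit (assumption for this example)
	const pageSize = 8192 // one span page

	// MinSizeForMallocHeader = goarch.PtrSize * ptrBits = 8 * 64 = 512 on 64-bit.
	minSizeForMallocHeader := ptrSize * (8 * ptrSize)

	// Heap bits at the end of a span: one bit per pointer-word of the span.
	heapBitsOverhead := pageSize / ptrSize / 8 // 128 bytes

	// Malloc-header overhead for a span of 512-byte objects.
	headerOverhead := (pageSize / minSizeForMallocHeader) * 8 // 16 objects * 8 bytes = 128 bytes

	fmt.Println(minSizeForMallocHeader, heapBitsOverhead, headerOverhead) // 512 128 128
}

512 bytes is exactly where the two overheads meet, which is the cutover point the comment describes.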
+ PageSize = 1 << PageShift +) \ No newline at end of file diff --git a/src/runtime/mksizeclasses.go b/src/internal/runtime/gc/mksizeclasses.go similarity index 91% rename from src/runtime/mksizeclasses.go rename to src/internal/runtime/gc/mksizeclasses.go index bb06ba1eddc32caa9c6f77343c2ce9f07a03e542..0e72fa0fa4373273585e5da65bdec24c59f4e0fc 100644 --- a/src/runtime/mksizeclasses.go +++ b/src/internal/runtime/gc/mksizeclasses.go @@ -289,29 +289,29 @@ func maxObjsPerSpan(classes []class) int { func printClasses(w io.Writer, classes []class) { fmt.Fprintln(w, "const (") - fmt.Fprintf(w, "minHeapAlign = %d\n", minHeapAlign) - fmt.Fprintf(w, "_MaxSmallSize = %d\n", maxSmallSize) - fmt.Fprintf(w, "smallSizeDiv = %d\n", smallSizeDiv) - fmt.Fprintf(w, "smallSizeMax = %d\n", smallSizeMax) - fmt.Fprintf(w, "largeSizeDiv = %d\n", largeSizeDiv) - fmt.Fprintf(w, "_NumSizeClasses = %d\n", len(classes)) - fmt.Fprintf(w, "_PageShift = %d\n", pageShift) - fmt.Fprintf(w, "maxObjsPerSpan = %d\n", maxObjsPerSpan(classes)) + fmt.Fprintf(w, "MinHeapAlign = %d\n", minHeapAlign) + fmt.Fprintf(w, "MaxSmallSize = %d\n", maxSmallSize) + fmt.Fprintf(w, "SmallSizeDiv = %d\n", smallSizeDiv) + fmt.Fprintf(w, "SmallSizeMax = %d\n", smallSizeMax) + fmt.Fprintf(w, "LargeSizeDiv = %d\n", largeSizeDiv) + fmt.Fprintf(w, "NumSizeClasses = %d\n", len(classes)) + fmt.Fprintf(w, "PageShift = %d\n", pageShift) + fmt.Fprintf(w, "MaxObjsPerSpan = %d\n", maxObjsPerSpan(classes)) fmt.Fprintln(w, ")") - fmt.Fprint(w, "var class_to_size = [_NumSizeClasses]uint16 {") + fmt.Fprint(w, "var SizeClassToSize = [NumSizeClasses]uint16 {") for _, c := range classes { fmt.Fprintf(w, "%d,", c.size) } fmt.Fprintln(w, "}") - fmt.Fprint(w, "var class_to_allocnpages = [_NumSizeClasses]uint8 {") + fmt.Fprint(w, "var SizeClassToNPages = [NumSizeClasses]uint8 {") for _, c := range classes { fmt.Fprintf(w, "%d,", c.npages) } fmt.Fprintln(w, "}") - fmt.Fprint(w, "var class_to_divmagic = [_NumSizeClasses]uint32 {") + fmt.Fprint(w, "var SizeClassToDivMagic = [NumSizeClasses]uint32 {") for _, c := range classes { if c.size == 0 { fmt.Fprintf(w, "0,") @@ -332,7 +332,7 @@ func printClasses(w io.Writer, classes []class) { } } } - fmt.Fprint(w, "var size_to_class8 = [smallSizeMax/smallSizeDiv+1]uint8 {") + fmt.Fprint(w, "var SizeToSizeClass8 = [SmallSizeMax/SmallSizeDiv+1]uint8 {") for _, v := range sc { fmt.Fprintf(w, "%d,", v) } @@ -349,9 +349,9 @@ func printClasses(w io.Writer, classes []class) { } } } - fmt.Fprint(w, "var size_to_class128 = [(_MaxSmallSize-smallSizeMax)/largeSizeDiv+1]uint8 {") + fmt.Fprint(w, "var SizeToSizeClass128 = [(MaxSmallSize-SmallSizeMax)/LargeSizeDiv+1]uint8 {") for _, v := range sc { fmt.Fprintf(w, "%d,", v) } fmt.Fprintln(w, "}") -} +} \ No newline at end of file diff --git a/src/internal/runtime/gc/scan.go b/src/internal/runtime/gc/scan.go new file mode 100644 index 0000000000000000000000000000000000000000..7f730b627941a4d1972bc93938e5a85e309af9fe --- /dev/null +++ b/src/internal/runtime/gc/scan.go @@ -0,0 +1,15 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gc + +import "internal/goarch" + +// ObjMask is a bitmap where each bit corresponds to an object in a span. +// +// It is sized to accomodate all size classes. +type ObjMask [MaxObjsPerSpan / (goarch.PtrSize * 8)]uintptr + +// PtrMask is a bitmap where each bit represents a pointer-word in a single runtime page. 
+type PtrMask [PageSize / goarch.PtrSize / (goarch.PtrSize * 8)]uintptr \ No newline at end of file diff --git a/src/runtime/sizeclasses.go b/src/internal/runtime/gc/sizeclasses.go similarity index 59% rename from src/runtime/sizeclasses.go rename to src/internal/runtime/gc/sizeclasses.go index bbcaa9e983fd042daafdb97d2a139fc8afe06e9b..1d2caa1404e66ad69cc97ff3ed16c452490aba64 100644 --- a/src/runtime/sizeclasses.go +++ b/src/internal/runtime/gc/sizeclasses.go @@ -1,7 +1,7 @@ // Code generated by mksizeclasses.go; DO NOT EDIT. //go:generate go run mksizeclasses.go -package runtime +package gc // class bytes/obj bytes/span objects tail waste max waste min align // 1 8 8192 1024 0 87.50% 8 @@ -82,18 +82,18 @@ package runtime // 8192 13 32768 const ( - minHeapAlign = 8 - _MaxSmallSize = 32768 - smallSizeDiv = 8 - smallSizeMax = 1024 - largeSizeDiv = 128 - _NumSizeClasses = 68 - _PageShift = 13 - maxObjsPerSpan = 1024 + MinHeapAlign = 8 + MaxSmallSize = 32768 + SmallSizeDiv = 8 + SmallSizeMax = 1024 + LargeSizeDiv = 128 + NumSizeClasses = 68 + PageShift = 13 + MaxObjsPerSpan = 1024 ) -var class_to_size = [_NumSizeClasses]uint16{0, 8, 16, 24, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 576, 640, 704, 768, 896, 1024, 1152, 1280, 1408, 1536, 1792, 2048, 2304, 2688, 3072, 3200, 3456, 4096, 4864, 5376, 6144, 6528, 6784, 6912, 8192, 9472, 9728, 10240, 10880, 12288, 13568, 14336, 16384, 18432, 19072, 20480, 21760, 24576, 27264, 28672, 32768} -var class_to_allocnpages = [_NumSizeClasses]uint8{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 3, 2, 3, 1, 3, 2, 3, 4, 5, 6, 1, 7, 6, 5, 4, 3, 5, 7, 2, 9, 7, 5, 8, 3, 10, 7, 4} -var class_to_divmagic = [_NumSizeClasses]uint32{0, ^uint32(0)/8 + 1, ^uint32(0)/16 + 1, ^uint32(0)/24 + 1, ^uint32(0)/32 + 1, ^uint32(0)/48 + 1, ^uint32(0)/64 + 1, ^uint32(0)/80 + 1, ^uint32(0)/96 + 1, ^uint32(0)/112 + 1, ^uint32(0)/128 + 1, ^uint32(0)/144 + 1, ^uint32(0)/160 + 1, ^uint32(0)/176 + 1, ^uint32(0)/192 + 1, ^uint32(0)/208 + 1, ^uint32(0)/224 + 1, ^uint32(0)/240 + 1, ^uint32(0)/256 + 1, ^uint32(0)/288 + 1, ^uint32(0)/320 + 1, ^uint32(0)/352 + 1, ^uint32(0)/384 + 1, ^uint32(0)/416 + 1, ^uint32(0)/448 + 1, ^uint32(0)/480 + 1, ^uint32(0)/512 + 1, ^uint32(0)/576 + 1, ^uint32(0)/640 + 1, ^uint32(0)/704 + 1, ^uint32(0)/768 + 1, ^uint32(0)/896 + 1, ^uint32(0)/1024 + 1, ^uint32(0)/1152 + 1, ^uint32(0)/1280 + 1, ^uint32(0)/1408 + 1, ^uint32(0)/1536 + 1, ^uint32(0)/1792 + 1, ^uint32(0)/2048 + 1, ^uint32(0)/2304 + 1, ^uint32(0)/2688 + 1, ^uint32(0)/3072 + 1, ^uint32(0)/3200 + 1, ^uint32(0)/3456 + 1, ^uint32(0)/4096 + 1, ^uint32(0)/4864 + 1, ^uint32(0)/5376 + 1, ^uint32(0)/6144 + 1, ^uint32(0)/6528 + 1, ^uint32(0)/6784 + 1, ^uint32(0)/6912 + 1, ^uint32(0)/8192 + 1, ^uint32(0)/9472 + 1, ^uint32(0)/9728 + 1, ^uint32(0)/10240 + 1, ^uint32(0)/10880 + 1, ^uint32(0)/12288 + 1, ^uint32(0)/13568 + 1, ^uint32(0)/14336 + 1, ^uint32(0)/16384 + 1, ^uint32(0)/18432 + 1, ^uint32(0)/19072 + 1, ^uint32(0)/20480 + 1, ^uint32(0)/21760 + 1, ^uint32(0)/24576 + 1, ^uint32(0)/27264 + 1, ^uint32(0)/28672 + 1, ^uint32(0)/32768 + 1} -var size_to_class8 = [smallSizeMax/smallSizeDiv + 1]uint8{0, 1, 2, 3, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 24, 24, 24, 24, 25, 25, 25, 25, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 28, 28, 
28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32} -var size_to_class128 = [(_MaxSmallSize-smallSizeMax)/largeSizeDiv + 1]uint8{32, 33, 34, 35, 36, 37, 37, 38, 38, 39, 39, 40, 40, 40, 41, 41, 41, 42, 43, 43, 44, 44, 44, 44, 44, 45, 45, 45, 45, 45, 45, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 49, 50, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 54, 54, 54, 54, 55, 55, 55, 55, 55, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 58, 58, 58, 58, 58, 58, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 61, 61, 61, 61, 61, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67} +var SizeClassToSize = [NumSizeClasses]uint16{0, 8, 16, 24, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 576, 640, 704, 768, 896, 1024, 1152, 1280, 1408, 1536, 1792, 2048, 2304, 2688, 3072, 3200, 3456, 4096, 4864, 5376, 6144, 6528, 6784, 6912, 8192, 9472, 9728, 10240, 10880, 12288, 13568, 14336, 16384, 18432, 19072, 20480, 21760, 24576, 27264, 28672, 32768} +var SizeClassToNPages = [NumSizeClasses]uint8{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 3, 2, 3, 1, 3, 2, 3, 4, 5, 6, 1, 7, 6, 5, 4, 3, 5, 7, 2, 9, 7, 5, 8, 3, 10, 7, 4} +var SizeClassToDivMagic = [NumSizeClasses]uint32{0, ^uint32(0)/8 + 1, ^uint32(0)/16 + 1, ^uint32(0)/24 + 1, ^uint32(0)/32 + 1, ^uint32(0)/48 + 1, ^uint32(0)/64 + 1, ^uint32(0)/80 + 1, ^uint32(0)/96 + 1, ^uint32(0)/112 + 1, ^uint32(0)/128 + 1, ^uint32(0)/144 + 1, ^uint32(0)/160 + 1, ^uint32(0)/176 + 1, ^uint32(0)/192 + 1, ^uint32(0)/208 + 1, ^uint32(0)/224 + 1, ^uint32(0)/240 + 1, ^uint32(0)/256 + 1, ^uint32(0)/288 + 1, ^uint32(0)/320 + 1, ^uint32(0)/352 + 1, ^uint32(0)/384 + 1, ^uint32(0)/416 + 1, ^uint32(0)/448 + 1, ^uint32(0)/480 + 1, ^uint32(0)/512 + 1, ^uint32(0)/576 + 1, ^uint32(0)/640 + 1, ^uint32(0)/704 + 1, ^uint32(0)/768 + 1, ^uint32(0)/896 + 1, ^uint32(0)/1024 + 1, ^uint32(0)/1152 + 1, ^uint32(0)/1280 + 1, ^uint32(0)/1408 + 1, ^uint32(0)/1536 + 1, ^uint32(0)/1792 + 1, ^uint32(0)/2048 + 1, ^uint32(0)/2304 + 1, ^uint32(0)/2688 + 1, ^uint32(0)/3072 + 1, ^uint32(0)/3200 + 1, ^uint32(0)/3456 + 1, ^uint32(0)/4096 + 1, ^uint32(0)/4864 + 1, ^uint32(0)/5376 + 1, ^uint32(0)/6144 + 1, ^uint32(0)/6528 + 1, ^uint32(0)/6784 + 1, ^uint32(0)/6912 + 1, ^uint32(0)/8192 + 1, ^uint32(0)/9472 + 1, ^uint32(0)/9728 + 1, ^uint32(0)/10240 + 1, ^uint32(0)/10880 + 1, ^uint32(0)/12288 + 1, ^uint32(0)/13568 + 1, ^uint32(0)/14336 + 1, ^uint32(0)/16384 + 1, ^uint32(0)/18432 + 1, ^uint32(0)/19072 + 1, ^uint32(0)/20480 + 1, ^uint32(0)/21760 + 1, ^uint32(0)/24576 + 1, ^uint32(0)/27264 + 1, ^uint32(0)/28672 + 1, ^uint32(0)/32768 + 1} +var SizeToSizeClass8 = [SmallSizeMax/SmallSizeDiv + 1]uint8{0, 1, 2, 3, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 
16, 16, 17, 17, 18, 18, 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 24, 24, 24, 24, 25, 25, 25, 25, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32} +var SizeToSizeClass128 = [(MaxSmallSize-SmallSizeMax)/LargeSizeDiv + 1]uint8{32, 33, 34, 35, 36, 37, 37, 38, 38, 39, 39, 40, 40, 40, 41, 41, 41, 42, 43, 43, 44, 44, 44, 44, 44, 45, 45, 45, 45, 45, 45, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 49, 50, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 54, 54, 54, 54, 55, 55, 55, 55, 55, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 58, 58, 58, 58, 58, 58, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 61, 61, 61, 61, 61, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67} \ No newline at end of file diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index 29341dd2b351f34a0b615e39f71fea51994245ab..0ac7d25a9c99977b013bfaa98d6560e503f044d2 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -11,6 +11,7 @@ import ( "internal/goarch" "internal/goos" "internal/runtime/atomic" + "internal/runtime/gc" "internal/runtime/sys" "unsafe" ) @@ -363,7 +364,7 @@ func ReadMemStatsSlow() (base, slow MemStats) { slow.Mallocs = 0 slow.Frees = 0 slow.HeapReleased = 0 - var bySize [_NumSizeClasses]struct { + var bySize [gc.NumSizeClasses]struct { Mallocs, Frees uint64 } @@ -391,11 +392,11 @@ func ReadMemStatsSlow() (base, slow MemStats) { // Collect per-sizeclass free stats. 
var smallFree uint64 - for i := 0; i < _NumSizeClasses; i++ { + for i := 0; i < gc.NumSizeClasses; i++ { slow.Frees += m.smallFreeCount[i] bySize[i].Frees += m.smallFreeCount[i] bySize[i].Mallocs += m.smallFreeCount[i] - smallFree += m.smallFreeCount[i] * uint64(class_to_size[i]) + smallFree += m.smallFreeCount[i] * uint64(gc.SizeClassToSize[i]) } slow.Frees += m.tinyAllocCount + m.largeFreeCount slow.Mallocs += slow.Frees @@ -1231,6 +1232,7 @@ func AllocMSpan() *MSpan { systemstack(func() { lock(&mheap_.lock) s = (*mspan)(mheap_.spanalloc.alloc()) + s.init(0, 0) unlock(&mheap_.lock) }) return (*MSpan)(s) @@ -1254,6 +1256,30 @@ func MSpanCountAlloc(ms *MSpan, bits []byte) int { return result } +type MSpanQueue mSpanQueue + +func (q *MSpanQueue) Size() int { + return (*mSpanQueue)(q).n +} + +func (q *MSpanQueue) Push(s *MSpan) { + (*mSpanQueue)(q).push((*mspan)(s)) +} + +func (q *MSpanQueue) Pop() *MSpan { + s := (*mSpanQueue)(q).pop() + return (*MSpan)(s) +} + +func (q *MSpanQueue) TakeAll(p *MSpanQueue) { + (*mSpanQueue)(q).takeAll((*mSpanQueue)(p)) +} + +func (q *MSpanQueue) PopN(n int) MSpanQueue { + p := (*mSpanQueue)(q).popN(n) + return (MSpanQueue)(p) +} + const ( TimeHistSubBucketBits = timeHistSubBucketBits TimeHistNumSubBuckets = timeHistNumSubBuckets diff --git a/src/runtime/gc_test.go b/src/runtime/gc_test.go index 00280ed1b53cab943e09af8db8442a80034fa59d..e084460b8e6416a9e9e8b178b7215faaa197387a 100644 --- a/src/runtime/gc_test.go +++ b/src/runtime/gc_test.go @@ -875,3 +875,196 @@ func TestWeakToStrongMarkTermination(t *testing.T) { t.Errorf("gcMarkDone restarted") } } + +func TestMSpanQueue(t *testing.T) { + expectSize := func(t *testing.T, q *runtime.MSpanQueue, want int) { + t.Helper() + if got := q.Size(); got != want { + t.Errorf("expected size %d, got %d", want, got) + } + } + expectMSpan := func(t *testing.T, got, want *runtime.MSpan, op string) { + t.Helper() + if got != want { + t.Errorf("expected mspan %p from %s, got %p", want, op, got) + } + } + makeSpans := func(t *testing.T, n int) ([]*runtime.MSpan, func()) { + t.Helper() + spans := make([]*runtime.MSpan, 0, n) + for range cap(spans) { + spans = append(spans, runtime.AllocMSpan()) + } + return spans, func() { + for i, s := range spans { + runtime.FreeMSpan(s) + spans[i] = nil + } + } + } + t.Run("Empty", func(t *testing.T) { + var q runtime.MSpanQueue + expectSize(t, &q, 0) + expectMSpan(t, q.Pop(), nil, "pop") + }) + t.Run("PushPop", func(t *testing.T) { + s := runtime.AllocMSpan() + defer runtime.FreeMSpan(s) + + var q runtime.MSpanQueue + q.Push(s) + expectSize(t, &q, 1) + expectMSpan(t, q.Pop(), s, "pop") + expectMSpan(t, q.Pop(), nil, "pop") + }) + t.Run("PushPopPushPop", func(t *testing.T) { + s0 := runtime.AllocMSpan() + defer runtime.FreeMSpan(s0) + s1 := runtime.AllocMSpan() + defer runtime.FreeMSpan(s1) + + var q runtime.MSpanQueue + + // Push and pop s0. + q.Push(s0) + expectSize(t, &q, 1) + expectMSpan(t, q.Pop(), s0, "pop") + expectMSpan(t, q.Pop(), nil, "pop") + + // Push and pop s1. 
+ q.Push(s1) + expectSize(t, &q, 1) + expectMSpan(t, q.Pop(), s1, "pop") + expectMSpan(t, q.Pop(), nil, "pop") + }) + t.Run("PushPushPopPop", func(t *testing.T) { + s0 := runtime.AllocMSpan() + defer runtime.FreeMSpan(s0) + s1 := runtime.AllocMSpan() + defer runtime.FreeMSpan(s1) + + var q runtime.MSpanQueue + q.Push(s0) + expectSize(t, &q, 1) + q.Push(s1) + expectSize(t, &q, 2) + expectMSpan(t, q.Pop(), s0, "pop") + expectMSpan(t, q.Pop(), s1, "pop") + expectMSpan(t, q.Pop(), nil, "pop") + }) + t.Run("EmptyTakeAll", func(t *testing.T) { + var q runtime.MSpanQueue + var p runtime.MSpanQueue + expectSize(t, &p, 0) + expectSize(t, &q, 0) + p.TakeAll(&q) + expectSize(t, &p, 0) + expectSize(t, &q, 0) + expectMSpan(t, q.Pop(), nil, "pop") + expectMSpan(t, p.Pop(), nil, "pop") + }) + t.Run("Push4TakeAll", func(t *testing.T) { + spans, free := makeSpans(t, 4) + defer free() + + var q runtime.MSpanQueue + for i, s := range spans { + expectSize(t, &q, i) + q.Push(s) + expectSize(t, &q, i+1) + } + + var p runtime.MSpanQueue + p.TakeAll(&q) + expectSize(t, &p, 4) + for i := range p.Size() { + expectMSpan(t, p.Pop(), spans[i], "pop") + } + expectSize(t, &p, 0) + expectMSpan(t, q.Pop(), nil, "pop") + expectMSpan(t, p.Pop(), nil, "pop") + }) + t.Run("Push4Pop3", func(t *testing.T) { + spans, free := makeSpans(t, 4) + defer free() + + var q runtime.MSpanQueue + for i, s := range spans { + expectSize(t, &q, i) + q.Push(s) + expectSize(t, &q, i+1) + } + p := q.PopN(3) + expectSize(t, &p, 3) + expectSize(t, &q, 1) + for i := range p.Size() { + expectMSpan(t, p.Pop(), spans[i], "pop") + } + expectMSpan(t, q.Pop(), spans[len(spans)-1], "pop") + expectSize(t, &p, 0) + expectSize(t, &q, 0) + expectMSpan(t, q.Pop(), nil, "pop") + expectMSpan(t, p.Pop(), nil, "pop") + }) + t.Run("Push4Pop0", func(t *testing.T) { + spans, free := makeSpans(t, 4) + defer free() + + var q runtime.MSpanQueue + for i, s := range spans { + expectSize(t, &q, i) + q.Push(s) + expectSize(t, &q, i+1) + } + p := q.PopN(0) + expectSize(t, &p, 0) + expectSize(t, &q, 4) + for i := range q.Size() { + expectMSpan(t, q.Pop(), spans[i], "pop") + } + expectSize(t, &p, 0) + expectSize(t, &q, 0) + expectMSpan(t, q.Pop(), nil, "pop") + expectMSpan(t, p.Pop(), nil, "pop") + }) + t.Run("Push4Pop4", func(t *testing.T) { + spans, free := makeSpans(t, 4) + defer free() + + var q runtime.MSpanQueue + for i, s := range spans { + expectSize(t, &q, i) + q.Push(s) + expectSize(t, &q, i+1) + } + p := q.PopN(4) + expectSize(t, &p, 4) + expectSize(t, &q, 0) + for i := range p.Size() { + expectMSpan(t, p.Pop(), spans[i], "pop") + } + expectSize(t, &p, 0) + expectMSpan(t, q.Pop(), nil, "pop") + expectMSpan(t, p.Pop(), nil, "pop") + }) + t.Run("Push4Pop5", func(t *testing.T) { + spans, free := makeSpans(t, 4) + defer free() + + var q runtime.MSpanQueue + for i, s := range spans { + expectSize(t, &q, i) + q.Push(s) + expectSize(t, &q, i+1) + } + p := q.PopN(5) + expectSize(t, &p, 4) + expectSize(t, &q, 0) + for i := range p.Size() { + expectMSpan(t, p.Pop(), spans[i], "pop") + } + expectSize(t, &p, 0) + expectMSpan(t, q.Pop(), nil, "pop") + expectMSpan(t, p.Pop(), nil, "pop") + }) +} diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go index 8f2ae34f4da3170cf176997902eb8744328d4344..a29eab747804981f39fdc85c7a8f07d2bc8ba0ef 100644 --- a/src/runtime/heapdump.go +++ b/src/runtime/heapdump.go @@ -14,6 +14,7 @@ package runtime import ( "internal/abi" "internal/goarch" + "internal/runtime/gc" "unsafe" ) @@ -471,7 +472,7 @@ func dumproots() { // Bit vector of free 
marks. // Needs to be as big as the largest number of objects per span. -var freemark [_PageSize / 8]bool +var freemark [pageSize / 8]bool func dumpobjs() { // To protect mheap_.allspans. @@ -483,7 +484,7 @@ func dumpobjs() { } p := s.base() size := s.elemsize - n := (s.npages << _PageShift) / size + n := (s.npages << gc.PageShift) / size if n > uintptr(len(freemark)) { throw("freemark array doesn't have enough entries") } diff --git a/src/runtime/lock_spinbit.go b/src/runtime/lock_spinbit.go index 7e84f3e1c2153224b449deb8624a64f0621f0a6d..8a6c9582cc717bcff660bf263a4c83181fff46b1 100644 --- a/src/runtime/lock_spinbit.go +++ b/src/runtime/lock_spinbit.go @@ -9,6 +9,7 @@ package runtime import ( "internal/goarch" "internal/runtime/atomic" + "internal/runtime/gc" "unsafe" ) @@ -60,7 +61,7 @@ const ( mutexSpinning = 0x100 mutexStackLocked = 0x200 mutexMMask = 0x3FF - mutexMOffset = mallocHeaderSize // alignment of heap-allocated Ms (those other than m0) + mutexMOffset = gc.MallocHeaderSize // alignment of heap-allocated Ms (those other than m0) mutexActiveSpinCount = 4 mutexActiveSpinSize = 30 @@ -90,7 +91,7 @@ type mWaitList struct { // lockVerifyMSize confirms that we can recreate the low bits of the M pointer. func lockVerifyMSize() { - size := roundupsize(unsafe.Sizeof(m{}), false) + mallocHeaderSize + size := roundupsize(unsafe.Sizeof(m{}), false) + gc.MallocHeaderSize if size&mutexMMask != 0 { print("M structure uses sizeclass ", size, "/", hex(size), " bytes; ", "incompatible with mutex flag mask ", hex(mutexMMask), "\n") diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index a8cac5b0b2c273fc2ce19c2a6edaa18b2d937f35..bd083ccc24a80a79bd70fdf2a9a4d62c919cf401 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -105,6 +105,7 @@ import ( "internal/goexperiment" "internal/goos" "internal/runtime/atomic" + "internal/runtime/gc" "internal/runtime/math" "internal/runtime/sys" "unsafe" @@ -113,13 +114,13 @@ import ( const ( maxTinySize = _TinySize tinySizeClass = _TinySizeClass - maxSmallSize = _MaxSmallSize - - pageShift = _PageShift - pageSize = _PageSize - - _PageSize = 1 << _PageShift - _PageMask = _PageSize - 1 + maxSmallSize = gc.MaxSmallSize + pageSize = 1 << gc.PageShift + pageMask = pageSize - 1 + // Unused. Left for viewcore. + _PageSize = pageSize + minSizeForMallocHeader = gc.MinSizeForMallocHeader + mallocHeaderSize = gc.MallocHeaderSize // _64bit = 1 on 64-bit systems, 0 on 32-bit systems _64bit = 1 << (^uintptr(0) >> 63) / 2 @@ -372,7 +373,7 @@ var ( ) func mallocinit() { - if class_to_size[_TinySizeClass] != _TinySize { + if gc.SizeClassToSize[tinySizeClass] != maxTinySize { throw("bad TinySizeClass") } @@ -433,11 +434,11 @@ func mallocinit() { // span sizes are one page. Some code relies on this. minSizeForMallocHeaderIsSizeClass := false sizeClassesUpToMinSizeForMallocHeaderAreOnePage := true - for i := 0; i < len(class_to_size); i++ { - if class_to_allocnpages[i] > 1 { + for i := 0; i < len(gc.SizeClassToSize); i++ { + if gc.SizeClassToNPages[i] > 1 { sizeClassesUpToMinSizeForMallocHeaderAreOnePage = false } - if minSizeForMallocHeader == uintptr(class_to_size[i]) { + if gc.MinSizeForMallocHeader == uintptr(gc.SizeClassToSize[i]) { minSizeForMallocHeaderIsSizeClass = true break } @@ -450,7 +451,7 @@ func mallocinit() { } // Check that the pointer bitmap for all small sizes without a malloc header // fits in a word. 
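For context on how the exported tables are consumed, the small-allocation size-to-class lookup used further down (in mallocgcSmallNoscan and mallocgcSmallScanHeader) boils down to the sketch below. It mirrors runtime-internal code and won't compile outside the runtime; divRoundUp is the runtime's round-up division helper, reproduced here:

// Sketch of the two-table lookup over the generated tables in
// internal/runtime/gc/sizeclasses.go. Sizes up to SmallSizeMax-8 resolve in
// 8-byte steps via SizeToSizeClass8; the remaining small sizes resolve in
// 128-byte steps via SizeToSizeClass128.
func sizeToClass(size uintptr) uint8 {
	if size <= gc.SmallSizeMax-8 {
		return gc.SizeToSizeClass8[divRoundUp(size, gc.SmallSizeDiv)]
	}
	return gc.SizeToSizeClass128[divRoundUp(size-gc.SmallSizeMax, gc.LargeSizeDiv)]
}

func divRoundUp(n, a uintptr) uintptr { return (n + a - 1) / a }

For example, a 33-byte noscan request resolves to class 5 and is rounded up to gc.SizeClassToSize[5] = 48 bytes.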
- if minSizeForMallocHeader/goarch.PtrSize > 8*goarch.PtrSize { + if gc.MinSizeForMallocHeader/goarch.PtrSize > 8*goarch.PtrSize { throw("max pointer/scan bitmap size for headerless objects is too large") } @@ -1048,7 +1049,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { // Actually do the allocation. var x unsafe.Pointer var elemsize uintptr - if size <= maxSmallSize-mallocHeaderSize { + if size <= maxSmallSize-gc.MallocHeaderSize { if typ == nil || !typ.Pointers() { if size < maxTinySize { x, elemsize = mallocgcTiny(size, typ, needzero) @@ -1075,8 +1076,8 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { // Poison the space between the end of the requested size of x // and the end of the slot. Unpoison the requested allocation. frag := elemsize - size - if typ != nil && typ.Pointers() && !heapBitsInSpan(elemsize) && size <= maxSmallSize-mallocHeaderSize { - frag -= mallocHeaderSize + if typ != nil && typ.Pointers() && !heapBitsInSpan(elemsize) && size <= maxSmallSize-gc.MallocHeaderSize { + frag -= gc.MallocHeaderSize } asanpoison(unsafe.Add(x, size-asanRZ), asanRZ) asanunpoison(x, size-asanRZ) @@ -1276,12 +1277,12 @@ func mallocgcSmallNoscan(size uintptr, typ *_type, needzero bool) (unsafe.Pointe checkGCTrigger := false c := getMCache(mp) var sizeclass uint8 - if size <= smallSizeMax-8 { - sizeclass = size_to_class8[divRoundUp(size, smallSizeDiv)] + if size <= gc.SmallSizeMax-8 { + sizeclass = gc.SizeToSizeClass8[divRoundUp(size, gc.SmallSizeDiv)] } else { - sizeclass = size_to_class128[divRoundUp(size-smallSizeMax, largeSizeDiv)] + sizeclass = gc.SizeToSizeClass128[divRoundUp(size-gc.SmallSizeMax, gc.LargeSizeDiv)] } - size = uintptr(class_to_size[sizeclass]) + size = uintptr(gc.SizeClassToSize[sizeclass]) spc := makeSpanClass(sizeclass, true) span := c.alloc[spc] v := nextFreeFast(span) @@ -1364,7 +1365,7 @@ func mallocgcSmallScanNoHeader(size uintptr, typ *_type, needzero bool) (unsafe. checkGCTrigger := false c := getMCache(mp) - sizeclass := size_to_class8[divRoundUp(size, smallSizeDiv)] + sizeclass := gc.SizeToSizeClass8[divRoundUp(size, gc.SmallSizeDiv)] spc := makeSpanClass(sizeclass, false) span := c.alloc[spc] v := nextFreeFast(span) @@ -1382,7 +1383,7 @@ func mallocgcSmallScanNoHeader(size uintptr, typ *_type, needzero bool) (unsafe. 
} else { c.scanAlloc += heapSetTypeNoHeader(uintptr(x), size, typ, span) } - size = uintptr(class_to_size[sizeclass]) + size = uintptr(gc.SizeClassToSize[sizeclass]) // Ensure that the stores above that initialize x to // type-safe memory and set the heap bits occur before @@ -1455,14 +1456,14 @@ func mallocgcSmallScanHeader(size uintptr, typ *_type, needzero bool) (unsafe.Po checkGCTrigger := false c := getMCache(mp) - size += mallocHeaderSize + size += gc.MallocHeaderSize var sizeclass uint8 - if size <= smallSizeMax-8 { - sizeclass = size_to_class8[divRoundUp(size, smallSizeDiv)] + if size <= gc.SmallSizeMax-8 { + sizeclass = gc.SizeToSizeClass8[divRoundUp(size, gc.SmallSizeDiv)] } else { - sizeclass = size_to_class128[divRoundUp(size-smallSizeMax, largeSizeDiv)] + sizeclass = gc.SizeToSizeClass128[divRoundUp(size-gc.SmallSizeMax, gc.LargeSizeDiv)] } - size = uintptr(class_to_size[sizeclass]) + size = uintptr(gc.SizeClassToSize[sizeclass]) spc := makeSpanClass(sizeclass, false) span := c.alloc[spc] v := nextFreeFast(span) @@ -1474,8 +1475,8 @@ func mallocgcSmallScanHeader(size uintptr, typ *_type, needzero bool) (unsafe.Po memclrNoHeapPointers(x, size) } header := (**_type)(x) - x = add(x, mallocHeaderSize) - c.scanAlloc += heapSetTypeSmallHeader(uintptr(x), size-mallocHeaderSize, typ, header, span) + x = add(x, gc.MallocHeaderSize) + c.scanAlloc += heapSetTypeSmallHeader(uintptr(x), size-gc.MallocHeaderSize, typ, header, span) // Ensure that the stores above that initialize x to // type-safe memory and set the heap bits occur before @@ -1934,7 +1935,7 @@ func persistentalloc1(size, align uintptr, sysStat *sysMemStat) *notInHeap { if align&(align-1) != 0 { throw("persistentalloc: align is not a power of 2") } - if align > _PageSize { + if align > pageSize { throw("persistentalloc: align is too large") } } else { diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go index 148b2d788ef899f0b698ddd9e3aefaa22f97be53..3c5af8b1bfa0e877f071629027af32d19aacd07d 100644 --- a/src/runtime/mbitmap.go +++ b/src/runtime/mbitmap.go @@ -58,49 +58,13 @@ package runtime import ( "internal/abi" "internal/goarch" + "internal/goexperiment" "internal/runtime/atomic" + "internal/runtime/gc" "internal/runtime/sys" "unsafe" ) -const ( - // A malloc header is functionally a single type pointer, but - // we need to use 8 here to ensure 8-byte alignment of allocations - // on 32-bit platforms. It's wasteful, but a lot of code relies on - // 8-byte alignment for 8-byte atomics. - mallocHeaderSize = 8 - - // The minimum object size that has a malloc header, exclusive. - // - // The size of this value controls overheads from the malloc header. - // The minimum size is bound by writeHeapBitsSmall, which assumes that the - // pointer bitmap for objects of a size smaller than this doesn't cross - // more than one pointer-word boundary. This sets an upper-bound on this - // value at the number of bits in a uintptr, multiplied by the pointer - // size in bytes. - // - // We choose a value here that has a natural cutover point in terms of memory - // overheads. This value just happens to be the maximum possible value this - // can be. - // - // A span with heap bits in it will have 128 bytes of heap bits on 64-bit - // platforms, and 256 bytes of heap bits on 32-bit platforms. The first size - // class where malloc headers match this overhead for 64-bit platforms is - // 512 bytes (8 KiB / 512 bytes * 8 bytes-per-header = 128 bytes of overhead). 
- // On 32-bit platforms, this same point is the 256 byte size class - // (8 KiB / 256 bytes * 8 bytes-per-header = 256 bytes of overhead). - // - // Guaranteed to be exactly at a size class boundary. The reason this value is - // an exclusive minimum is subtle. Suppose we're allocating a 504-byte object - // and its rounded up to 512 bytes for the size class. If minSizeForMallocHeader - // is 512 and an inclusive minimum, then a comparison against minSizeForMallocHeader - // by the two values would produce different results. In other words, the comparison - // would not be invariant to size-class rounding. Eschewing this property means a - // more complex check or possibly storing additional state to determine whether a - // span has malloc headers. - minSizeForMallocHeader = goarch.PtrSize * ptrBits -) - // heapBitsInSpan returns true if the size of an object implies its ptr/scalar // data is stored at the end of the span, and is accessible via span.heapBits. // @@ -112,7 +76,7 @@ const ( func heapBitsInSpan(userSize uintptr) bool { // N.B. minSizeForMallocHeader is an exclusive minimum so that this function is // invariant under size-class rounding on its input. - return userSize <= minSizeForMallocHeader + return userSize <= gc.MinSizeForMallocHeader } // typePointers is an iterator over the pointers in a heap object. @@ -189,7 +153,7 @@ func (span *mspan) typePointersOfUnchecked(addr uintptr) typePointers { if spc.sizeclass() != 0 { // Pull the allocation header from the first word of the object. typ = *(**_type)(unsafe.Pointer(addr)) - addr += mallocHeaderSize + addr += gc.MallocHeaderSize } else { typ = span.largeType if typ == nil { @@ -544,6 +508,9 @@ func (s *mspan) initHeapBits() { b := s.heapBits() clear(b) } + if goexperiment.GreenTeaGC && gcUsesSpanInlineMarkBits(s.elemsize) { + s.initInlineMarkBits() + } } // heapBits returns the heap ptr/scalar bits stored at the end of the span for @@ -567,7 +534,7 @@ func (span *mspan) heapBits() []uintptr { if span.spanclass.noscan() { throw("heapBits called for noscan") } - if span.elemsize > minSizeForMallocHeader { + if span.elemsize > gc.MinSizeForMallocHeader { throw("heapBits called for span class that should have a malloc header") } } @@ -576,22 +543,32 @@ func (span *mspan) heapBits() []uintptr { // Nearly every span with heap bits is exactly one page in size. Arenas are the only exception. if span.npages == 1 { // This will be inlined and constant-folded down. - return heapBitsSlice(span.base(), pageSize) + return heapBitsSlice(span.base(), pageSize, span.elemsize) } - return heapBitsSlice(span.base(), span.npages*pageSize) + return heapBitsSlice(span.base(), span.npages*pageSize, span.elemsize) } // Helper for constructing a slice for the span's heap bits. 
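To visualize what spanHeapBitsRange just below computes: the ptr/scalar heap bits still sit at the end of the span, but when the Green Tea experiment is on and the span qualifies for inline mark bits, the 128-byte spanInlineMarkBits (63+1+63+1 bytes, per the struct later in this patch) occupies the very end and the heap bits shift down by that much. A standalone sketch of the resulting layout for a one-page span, assuming a 64-bit platform and a made-up base address:

package main

import "fmt"

func main() {
	const (
		ptrSize            = 8    // 64-bit (assumption)
		pageSize           = 8192 // gc.PageSize
		inlineMarkBitsSize = 128  // size of spanInlineMarkBits
	)
	spanBase := uintptr(0x0c000000) // hypothetical span base

	heapBitsSize := uintptr(pageSize / ptrSize / 8) // 128 bytes of ptr/scalar bits
	heapBitsBase := spanBase + pageSize - heapBitsSize
	heapBitsBase -= inlineMarkBitsSize // only when gcUsesSpanInlineMarkBits(elemsize)
	inlineBase := spanBase + pageSize - inlineMarkBitsSize

	fmt.Printf("objects:          [%#x, %#x)\n", spanBase, heapBitsBase)
	fmt.Printf("heap bits:        [%#x, %#x)\n", heapBitsBase, inlineBase)
	fmt.Printf("inline mark bits: [%#x, %#x)\n", inlineBase, spanBase+pageSize)
}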
// //go:nosplit -func heapBitsSlice(spanBase, spanSize uintptr) []uintptr { - bitmapSize := spanSize / goarch.PtrSize / 8 +func heapBitsSlice(spanBase, spanSize, elemsize uintptr) []uintptr { + base, bitmapSize := spanHeapBitsRange(spanBase, spanSize, elemsize) elems := int(bitmapSize / goarch.PtrSize) var sl notInHeapSlice - sl = notInHeapSlice{(*notInHeap)(unsafe.Pointer(spanBase + spanSize - bitmapSize)), elems, elems} + sl = notInHeapSlice{(*notInHeap)(unsafe.Pointer(base)), elems, elems} return *(*[]uintptr)(unsafe.Pointer(&sl)) } +//go:nosplit +func spanHeapBitsRange(spanBase, spanSize, elemsize uintptr) (base, size uintptr) { + size = spanSize / goarch.PtrSize / 8 + base = spanBase + spanSize - size + if goexperiment.GreenTeaGC && gcUsesSpanInlineMarkBits(elemsize) { + base -= unsafe.Sizeof(spanInlineMarkBits{}) + } + return +} + // heapBitsSmallForAddr loads the heap bits for the object stored at addr from span.heapBits. // // addr must be the base pointer of an object in the span. heapBitsInSpan(span.elemsize) @@ -599,9 +576,8 @@ func heapBitsSlice(spanBase, spanSize uintptr) []uintptr { // //go:nosplit func (span *mspan) heapBitsSmallForAddr(addr uintptr) uintptr { - spanSize := span.npages * pageSize - bitmapSize := spanSize / goarch.PtrSize / 8 - hbits := (*byte)(unsafe.Pointer(span.base() + spanSize - bitmapSize)) + hbitsBase, _ := spanHeapBitsRange(span.base(), span.npages*pageSize, span.elemsize) + hbits := (*byte)(unsafe.Pointer(hbitsBase)) // These objects are always small enough that their bitmaps // fit in a single word, so just load the word or two we need. @@ -667,7 +643,8 @@ func (span *mspan) writeHeapBitsSmall(x, dataSize uintptr, typ *_type) (scanSize // Since we're never writing more than one uintptr's worth of bits, we're either going // to do one or two writes. - dst := unsafe.Pointer(span.base() + pageSize - pageSize/goarch.PtrSize/8) + dstBase, _ := spanHeapBitsRange(span.base(), pageSize, span.elemsize) + dst := unsafe.Pointer(dstBase) o := (x - span.base()) / goarch.PtrSize i := o / ptrBits j := o % ptrBits @@ -1155,15 +1132,6 @@ func markBitsForAddr(p uintptr) markBits { return s.markBitsForIndex(objIndex) } -func (s *mspan) markBitsForIndex(objIndex uintptr) markBits { - bytep, mask := s.gcmarkBits.bitp(objIndex) - return markBits{bytep, mask, objIndex} -} - -func (s *mspan) markBitsForBase() markBits { - return markBits{&s.gcmarkBits.x, uint8(1), 0} -} - // isMarked reports whether mark bit m is set. func (m markBits) isMarked() bool { return *m.bytep&m.mask != 0 diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go index 44d737b19cf7da187db6cefd58b1f64c7e0a283c..440120cdfe8ceafdab636fdd9ab14637f2247614 100644 --- a/src/runtime/mcache.go +++ b/src/runtime/mcache.go @@ -6,6 +6,7 @@ package runtime import ( "internal/runtime/atomic" + "internal/runtime/gc" "internal/runtime/sys" "unsafe" ) @@ -218,18 +219,18 @@ func (c *mcache) refill(spc spanClass) { // allocLarge allocates a span for a large object. func (c *mcache) allocLarge(size uintptr, noscan bool) *mspan { - if size+_PageSize < size { + if size+pageSize < size { throw("out of memory") } - npages := size >> _PageShift - if size&_PageMask != 0 { + npages := size >> gc.PageShift + if size&pageMask != 0 { npages++ } // Deduct credit for this span allocation and sweep if // necessary. mHeap_Alloc will also sweep npages, so this only // pays the debt down to npage pages. 
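The npages computation in allocLarge above is a plain round-up to whole pages, now written in terms of gc.PageShift and the new pageMask; a minimal standalone equivalent:

package main

import "fmt"

func main() {
	const pageShift = 13 // gc.PageShift
	const pageSize = 1 << pageShift
	const pageMask = pageSize - 1

	for _, size := range []uintptr{1, pageSize, pageSize + 1, 100 << 10} {
		npages := size >> pageShift
		if size&pageMask != 0 {
			npages++
		}
		fmt.Printf("size %6d -> %2d pages\n", size, npages)
	}
}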
- deductSweepCredit(npages*_PageSize, npages) + deductSweepCredit(npages*pageSize, npages) spc := makeSpanClass(0, noscan) s := mheap_.alloc(npages, spc) diff --git a/src/runtime/mcentral.go b/src/runtime/mcentral.go index 08ff0a5c5d0f072f4106d4a7593a88c2b0532fd0..5821c3592dbc42617da5f013130c90c5bb7b97b3 100644 --- a/src/runtime/mcentral.go +++ b/src/runtime/mcentral.go @@ -14,6 +14,7 @@ package runtime import ( "internal/runtime/atomic" + "internal/runtime/gc" "internal/runtime/sys" ) @@ -80,7 +81,7 @@ func (c *mcentral) fullSwept(sweepgen uint32) *spanSet { // Allocate a span to use in an mcache. func (c *mcentral) cacheSpan() *mspan { // Deduct credit for this span allocation and sweep if necessary. - spanBytes := uintptr(class_to_allocnpages[c.spanclass.sizeclass()]) * _PageSize + spanBytes := uintptr(gc.SizeClassToNPages[c.spanclass.sizeclass()]) * pageSize deductSweepCredit(spanBytes, 0) traceDone := false @@ -248,18 +249,15 @@ func (c *mcentral) uncacheSpan(s *mspan) { // grow allocates a new empty span from the heap and initializes it for c's size class. func (c *mcentral) grow() *mspan { - npages := uintptr(class_to_allocnpages[c.spanclass.sizeclass()]) - size := uintptr(class_to_size[c.spanclass.sizeclass()]) + npages := uintptr(gc.SizeClassToNPages[c.spanclass.sizeclass()]) + size := uintptr(gc.SizeClassToSize[c.spanclass.sizeclass()]) s := mheap_.alloc(npages, c.spanclass) if s == nil { return nil } - // Use division by multiplication and shifts to quickly compute: - // n := (npages << _PageShift) / size - n := s.divideByElemSize(npages << _PageShift) - s.limit = s.base() + size*n + s.limit = s.base() + size*uintptr(s.nelems) s.initHeapBits() return s } diff --git a/src/runtime/metrics.go b/src/runtime/metrics.go index 417f1071bb7007fa36d1d910b68f03781527c181..949a2d42bd0da0cd7568d41b8a69d29c7537ddcb 100644 --- a/src/runtime/metrics.go +++ b/src/runtime/metrics.go @@ -8,6 +8,7 @@ package runtime import ( "internal/godebugs" + "internal/runtime/gc" "unsafe" ) @@ -62,12 +63,12 @@ func initMetrics() { return } - sizeClassBuckets = make([]float64, _NumSizeClasses, _NumSizeClasses+1) + sizeClassBuckets = make([]float64, gc.NumSizeClasses, gc.NumSizeClasses+1) // Skip size class 0 which is a stand-in for large objects, but large // objects are tracked separately (and they actually get placed in // the last bucket, not the first). sizeClassBuckets[0] = 1 // The smallest allocation is 1 byte in size. - for i := 1; i < _NumSizeClasses; i++ { + for i := 1; i < gc.NumSizeClasses; i++ { // Size classes have an inclusive upper-bound // and exclusive lower bound (e.g. 48-byte size class is // (32, 48]) whereas we want and inclusive lower-bound @@ -79,7 +80,7 @@ func initMetrics() { // value up to 2^53 and size classes are relatively small // (nowhere near 2^48 even) so this will give us exact // boundaries. 
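The bucket boundaries assigned just below convert each size class's inclusive upper bound into the histogram's inclusive lower bound by adding one. A self-contained illustration using the first few entries of gc.SizeClassToSize:

package main

import "fmt"

func main() {
	// First few entries of gc.SizeClassToSize (see sizeclasses.go above).
	classToSize := []uint16{0, 8, 16, 24, 32, 48, 64}

	buckets := make([]float64, len(classToSize))
	buckets[0] = 1 // the smallest possible allocation is 1 byte
	for i := 1; i < len(classToSize); i++ {
		buckets[i] = float64(classToSize[i] + 1)
	}
	// The 48-byte class covers (32, 48]; as a histogram bucket it becomes
	// [33, 49), i.e. [buckets[4], buckets[5]).
	fmt.Println(buckets) // [1 9 17 25 33 49 65]
}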
- sizeClassBuckets[i] = float64(class_to_size[i] + 1) + sizeClassBuckets[i] = float64(gc.SizeClassToSize[i] + 1) } sizeClassBuckets = append(sizeClassBuckets, float64Inf()) @@ -615,8 +616,8 @@ func (a *heapStatsAggregate) compute() { nf := a.smallFreeCount[i] a.totalAllocs += na a.totalFrees += nf - a.totalAllocated += na * uint64(class_to_size[i]) - a.totalFreed += nf * uint64(class_to_size[i]) + a.totalAllocated += na * uint64(gc.SizeClassToSize[i]) + a.totalFreed += nf * uint64(gc.SizeClassToSize[i]) } a.inObjects = a.totalAllocated - a.totalFreed a.numObjects = a.totalAllocs - a.totalFrees diff --git a/src/runtime/mfinal.go b/src/runtime/mfinal.go index 4962a63a4146cec6442b217ce1273c0f2439f85a..aad356c6651d9855620fd1fed44700483a53da64 100644 --- a/src/runtime/mfinal.go +++ b/src/runtime/mfinal.go @@ -10,6 +10,7 @@ import ( "internal/abi" "internal/goarch" "internal/runtime/atomic" + "internal/runtime/gc" "internal/runtime/sys" "unsafe" ) @@ -466,7 +467,7 @@ func SetFinalizer(obj any, finalizer any) { // Move base forward if we've got an allocation header. if !span.spanclass.noscan() && !heapBitsInSpan(span.elemsize) && span.spanclass.sizeclass() != 0 { - base += mallocHeaderSize + base += gc.MallocHeaderSize } if uintptr(e.data) != base { diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go index b86466794226e9a87ed57f22092862cbdbb16890..4f68ebf1f3eb9845f259c16ec260adc7282c8e7b 100644 --- a/src/runtime/mgc.go +++ b/src/runtime/mgc.go @@ -130,7 +130,9 @@ package runtime import ( "internal/cpu" + "internal/goarch" "internal/runtime/atomic" + "internal/runtime/gc" "unsafe" ) @@ -330,8 +332,15 @@ type workType struct { busy mSpanList } + _ cpu.CacheLinePad // prevents false-sharing between wbufSpans and spanq + + // Global queue of spans to scan. + // + // Only used if goexperiment.GreenTeaGC. + spanq spanQueue + // Restore 64-bit alignment on 32-bit. - _ uint32 + // _ uint32 // bytesMarked is the number of bytes marked this cycle. This // includes bytes blackened in scanned objects, noscan objects @@ -703,6 +712,10 @@ func gcStart(trigger gcTrigger) { println("runtime: p", p.id, "flushGen", fg, "!= sweepgen", mheap_.sweepgen) throw("p mcache not flushed") } + // Initialize ptrBuf if necessary. + if p.gcw.ptrBuf == nil { + p.gcw.ptrBuf = (*[gc.PageSize / goarch.PtrSize]uintptr)(persistentalloc(gc.PageSize, goarch.PtrSize, &memstats.gcMiscSys)) + } } gcBgMarkStartWorkers() @@ -1210,6 +1223,9 @@ func gcMarkTermination(stw worldStop) { // // Also, flush the pinner cache, to avoid leaking that memory // indefinitely. 
+ if debug.gctrace > 1 { + clear(memstats.lastScanStats[:]) + } forEachP(waitReasonFlushProcCaches, func(pp *p) { pp.mcache.prepareForSweep() if pp.status == _Pidle { @@ -1219,6 +1235,16 @@ func gcMarkTermination(stw worldStop) { unlock(&mheap_.lock) }) } + if debug.gctrace > 1 { + for i := range pp.gcw.stats { + memstats.lastScanStats[i].spansDenseScanned += pp.gcw.stats[i].spansDenseScanned + memstats.lastScanStats[i].spanObjsDenseScanned += pp.gcw.stats[i].spanObjsDenseScanned + memstats.lastScanStats[i].spansSparseScanned += pp.gcw.stats[i].spansSparseScanned + memstats.lastScanStats[i].spanObjsSparseScanned += pp.gcw.stats[i].spanObjsSparseScanned + memstats.lastScanStats[i].sparseObjsScanned += pp.gcw.stats[i].sparseObjsScanned + } + clear(pp.gcw.stats[:]) + } pp.pinnerCache = nil }) if sl.valid { @@ -1276,6 +1302,40 @@ func gcMarkTermination(stw worldStop) { print(" (forced)") } print("\n") + if debug.gctrace > 1 { + var ( + spansDenseScanned uint64 + spanObjsDenseScanned uint64 + spansSparseScanned uint64 + spanObjsSparseScanned uint64 + sparseObjsScanned uint64 + ) + for _, stats := range memstats.lastScanStats { + spansDenseScanned += stats.spansDenseScanned + spanObjsDenseScanned += stats.spanObjsDenseScanned + spansSparseScanned += stats.spansSparseScanned + spanObjsSparseScanned += stats.spanObjsSparseScanned + sparseObjsScanned += stats.sparseObjsScanned + } + totalObjs := sparseObjsScanned + spanObjsSparseScanned + spanObjsDenseScanned + totalSpans := spansSparseScanned + spansDenseScanned + print("scan: total ", sparseObjsScanned, "+", spanObjsSparseScanned, "+", spanObjsDenseScanned, "=", totalObjs, " objs") + print(", ", spansSparseScanned, "+", spansDenseScanned, "=", totalSpans, " spans\n") + for i, stats := range memstats.lastScanStats { + if stats == (sizeClassScanStats{}) { + continue + } + totalObjs := stats.sparseObjsScanned + stats.spanObjsSparseScanned + stats.spanObjsDenseScanned + totalSpans := stats.spansSparseScanned + stats.spansDenseScanned + if i == 0 { + print("scan: class L ") + } else { + print("scan: class ", gc.SizeClassToSize[i], "B ") + } + print(stats.sparseObjsScanned, "+", stats.spanObjsSparseScanned, "+", stats.spanObjsDenseScanned, "=", totalObjs, " objs") + print(", ", stats.spansSparseScanned, "+", stats.spansDenseScanned, "=", totalSpans, " spans\n") + } + } printunlock() } @@ -1568,7 +1628,7 @@ func gcMarkWorkAvailable(p *p) bool { if p != nil && !p.gcw.empty() { return true } - if !work.full.empty() { + if !work.full.empty() || !work.spanq.empty() { return true // global work available } if work.markrootNext < work.markrootJobs { @@ -1587,8 +1647,8 @@ func gcMark(startTime int64) { work.tstart = startTime // Check that there's no marking work remaining. 
- if work.full != 0 || work.markrootNext < work.markrootJobs { - print("runtime: full=", hex(work.full), " next=", work.markrootNext, " jobs=", work.markrootJobs, " nDataRoots=", work.nDataRoots, " nBSSRoots=", work.nBSSRoots, " nSpanRoots=", work.nSpanRoots, " nStackRoots=", work.nStackRoots, "\n") + if work.full != 0 || work.markrootNext < work.markrootJobs || !work.spanq.empty() { + print("runtime: full=", hex(work.full), " next=", work.markrootNext, " jobs=", work.markrootJobs, " nDataRoots=", work.nDataRoots, " nBSSRoots=", work.nBSSRoots, " nSpanRoots=", work.nSpanRoots, " nStackRoots=", work.nStackRoots, " spanq.n=", work.spanq.size(), "\n") panic("non-empty mark queue after concurrent mark") } diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go index 823b2bd7df9a0474ebcfedac7a299e6042758460..f1e104e47497495248baf1c714a72a86412f30e4 100644 --- a/src/runtime/mgcmark.go +++ b/src/runtime/mgcmark.go @@ -9,6 +9,7 @@ package runtime import ( "internal/abi" "internal/goarch" + "internal/goexperiment" "internal/runtime/atomic" "internal/runtime/sys" "unsafe" @@ -1187,6 +1188,14 @@ func gcDrain(gcw *gcWork, flags gcDrainFlags) { if check != nil && check() { goto done } + + // Spin up a new worker if requested. + if goexperiment.GreenTeaGC && gcw.mayNeedWorker { + gcw.mayNeedWorker = false + if gcphase == _GCmark { + gcController.enlistWorker() + } + } } } @@ -1210,22 +1219,38 @@ func gcDrain(gcw *gcWork, flags gcDrainFlags) { gcw.balance() } - b := gcw.tryGetFast() - if b == 0 { - b = gcw.tryGet() - if b == 0 { - // Flush the write barrier - // buffer; this may create - // more work. - wbBufFlush() - b = gcw.tryGet() + // See mgcwork.go for the rationale behind the order in which we check these queues. + var b uintptr + var s objptr + if b = gcw.tryGetObjFast(); b == 0 { + if s = gcw.tryGetSpan(false); s == 0 { + if b = gcw.tryGetObj(); b == 0 { + // Flush the write barrier + // buffer; this may create + // more work. + wbBufFlush() + if b = gcw.tryGetObj(); b == 0 { + s = gcw.tryGetSpan(true) + } + } } } - if b == 0 { + if b != 0 { + scanobject(b, gcw) + } else if s != 0 { + scanSpan(s, gcw) + } else { // Unable to get work. break } - scanobject(b, gcw) + + // Spin up a new worker if requested. + if goexperiment.GreenTeaGC && gcw.mayNeedWorker { + gcw.mayNeedWorker = false + if gcphase == _GCmark { + gcController.enlistWorker() + } + } // Flush background scan work credit to the global // account if we've accumulated enough locally so @@ -1290,38 +1315,53 @@ func gcDrainN(gcw *gcWork, scanWork int64) int64 { gcw.balance() } - b := gcw.tryGetFast() - if b == 0 { - b = gcw.tryGet() - if b == 0 { - // Flush the write barrier buffer; - // this may create more work. - wbBufFlush() - b = gcw.tryGet() - } - } - - if b == 0 { - // Try to do a root job. - if work.markrootNext < work.markrootJobs { - job := atomic.Xadd(&work.markrootNext, +1) - 1 - if job < work.markrootJobs { - workFlushed += markroot(gcw, job, false) - continue + // See mgcwork.go for the rationale behind the order in which we check these queues. + var b uintptr + var s objptr + if b = gcw.tryGetObjFast(); b == 0 { + if s = gcw.tryGetSpan(false); s == 0 { + if b = gcw.tryGetObj(); b == 0 { + // Flush the write barrier + // buffer; this may create + // more work. + wbBufFlush() + if b = gcw.tryGetObj(); b == 0 { + // Try to do a root job. 
+ if work.markrootNext < work.markrootJobs { + job := atomic.Xadd(&work.markrootNext, +1) - 1 + if job < work.markrootJobs { + workFlushed += markroot(gcw, job, false) + continue + } + } + s = gcw.tryGetSpan(true) + } } } - // No heap or root jobs. + } + if b != 0 { + scanobject(b, gcw) + } else if s != 0 { + scanSpan(s, gcw) + } else { + // Unable to get work. break } - scanobject(b, gcw) - // Flush background scan work credit. if gcw.heapScanWork >= gcCreditSlack { gcController.heapScanWork.Add(gcw.heapScanWork) workFlushed += gcw.heapScanWork gcw.heapScanWork = 0 } + + // Spin up a new worker if requested. + if goexperiment.GreenTeaGC && gcw.mayNeedWorker { + gcw.mayNeedWorker = false + if gcphase == _GCmark { + gcController.enlistWorker() + } + } } // Unlike gcDrain, there's no need to flush remaining work @@ -1359,10 +1399,14 @@ func scanblock(b0, n0 uintptr, ptrmask *uint8, gcw *gcWork, stk *stackScanState) // Same work as in scanobject; see comments there. p := *(*uintptr)(unsafe.Pointer(b + i)) if p != 0 { - if obj, span, objIndex := findObject(p, b, i); obj != 0 { - greyobject(obj, b, i, span, gcw, objIndex) - } else if stk != nil && p >= stk.stack.lo && p < stk.stack.hi { + if stk != nil && p >= stk.stack.lo && p < stk.stack.hi { stk.putPtr(p, false) + } else { + if !tryDeferToSpanScan(p, gcw) { + if obj, span, objIndex := findObject(p, b, i); obj != 0 { + greyobject(obj, b, i, span, gcw, objIndex) + } + } } } } @@ -1412,8 +1456,8 @@ func scanobject(b uintptr, gcw *gcWork) { // so we'll drop out immediately when we go to // scan those. for oblet := b + maxObletBytes; oblet < s.base()+s.elemsize; oblet += maxObletBytes { - if !gcw.putFast(oblet) { - gcw.put(oblet) + if !gcw.putObjFast(oblet) { + gcw.putObj(oblet) } } } @@ -1459,13 +1503,18 @@ func scanobject(b uintptr, gcw *gcWork) { // heap. In this case, we know the object was // just allocated and hence will be marked by // allocation itself. - if obj, span, objIndex := findObject(obj, b, addr-b); obj != 0 { - greyobject(obj, b, addr-b, span, gcw, objIndex) + if !tryDeferToSpanScan(obj, gcw) { + if obj, span, objIndex := findObject(obj, b, addr-b); obj != 0 { + greyobject(obj, b, addr-b, span, gcw, objIndex) + } } } } gcw.bytesMarked += uint64(n) gcw.heapScanWork += int64(scanSize) + if debug.gctrace > 1 { + gcw.stats[s.spanclass.sizeclass()].sparseObjsScanned++ + } } // scanConservative scans block [b, b+n) conservatively, treating any @@ -1559,7 +1608,9 @@ func scanConservative(b, n uintptr, ptrmask *uint8, gcw *gcWork, state *stackSca // val points to an allocated object. Mark it. obj := span.base() + idx*span.elemsize - greyobject(obj, b, i, span, gcw, idx) + if !tryDeferToSpanScan(obj, gcw) { + greyobject(obj, b, i, span, gcw, idx) + } } } @@ -1569,9 +1620,11 @@ func scanConservative(b, n uintptr, ptrmask *uint8, gcw *gcWork, state *stackSca // //go:nowritebarrier func shade(b uintptr) { - if obj, span, objIndex := findObject(b, 0, 0); obj != 0 { - gcw := &getg().m.p.ptr().gcw - greyobject(obj, 0, 0, span, gcw, objIndex) + gcw := &getg().m.p.ptr().gcw + if !tryDeferToSpanScan(b, gcw) { + if obj, span, objIndex := findObject(b, 0, 0); obj != 0 { + greyobject(obj, 0, 0, span, gcw, objIndex) + } } } @@ -1629,8 +1682,8 @@ func greyobject(obj, base, off uintptr, span *mspan, gcw *gcWork, objIndex uintp // some benefit on platforms with inclusive shared caches. sys.Prefetch(obj) // Queue the obj for scanning. 
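The tryDeferToSpanScan calls threaded through scanblock, scanobject, scanConservative, and shade above are the heart of the change: instead of greying each small object individually, the pointer is recorded in its span's inline mark bits and the span itself is queued for batched scanning. Its body lies outside this hunk; the sketch below only illustrates that flow (gcw.putSpan is a hypothetical name for the span-queueing step, and the real code uses atomic bit operations), not the actual implementation:

// Illustrative sketch only; see mgcmark_greenteagc.go for the real logic.
func tryDeferToSpanScanSketch(p uintptr, gcw *gcWork) bool {
	s := spanOfHeap(p)
	if s == nil || !gcUsesSpanInlineMarkBits(s.elemsize) {
		return false // fall back to findObject/greyobject
	}
	// Record the object in the span's inline "marks" (pre-mark) bits.
	idx := s.objIndex(p)
	bits := s.markBitsForIndex(idx)
	if bits.isMarked() {
		return true // already recorded; the span is queued or scanned
	}
	bits.setMarked()
	// First marker since the span was last drained: take ownership and
	// queue the whole span so its objects can be scanned as a batch.
	if s.inlineMarkBits().tryAcquire() {
		gcw.putSpan(s) // hypothetical; the real code enqueues onto gcw's span queue
	}
	return true
}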
- if !gcw.putFast(obj) { - gcw.put(obj) + if !gcw.putObjFast(obj) { + gcw.putObj(obj) } } @@ -1700,6 +1753,10 @@ func gcmarknewobject(span *mspan, obj uintptr) { // Mark object. objIndex := span.objIndex(obj) span.markBitsForIndex(objIndex).setMarked() + if goexperiment.GreenTeaGC && gcUsesSpanInlineMarkBits(span.elemsize) { + // No need to scan the new object. + span.scannedBitsForIndex(objIndex).setMarked() + } // Mark span. arena, pageIdx, pageMask := pageIndexOf(span.base()) @@ -1722,8 +1779,10 @@ func gcMarkTinyAllocs() { if c == nil || c.tiny == 0 { continue } - _, span, objIndex := findObject(c.tiny, 0, 0) gcw := &p.gcw - greyobject(c.tiny, 0, 0, span, gcw, objIndex) + if !tryDeferToSpanScan(c.tiny, gcw) { + _, span, objIndex := findObject(c.tiny, 0, 0) + greyobject(c.tiny, 0, 0, span, gcw, objIndex) + } } } diff --git a/src/runtime/mgcmark_greenteagc.go b/src/runtime/mgcmark_greenteagc.go new file mode 100644 index 0000000000000000000000000000000000000000..e17f1bb705d987f69775b8d0421052037ea7b37e --- /dev/null +++ b/src/runtime/mgcmark_greenteagc.go @@ -0,0 +1,765 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Green Tea mark algorithm +// +// The core idea behind Green Tea is simple: achieve better locality during +// mark/scan by delaying scanning so that we can accumulate objects to scan +// within the same span, then scan the objects that have accumulated on the +// span all together. +// +// By batching objects this way, we increase the chance that adjacent objects +// will be accessed, amortize the cost of accessing object metadata, and create +// better opportunities for prefetching. We can take this even further and +// optimize the scan loop by size class (not yet completed) all the way to the +// point of applying SIMD techniques to really tear through the heap. +// +// Naturally, this depends on being able to create opportunties to batch objects +// together. The basic idea here is to have two sets of mark bits. One set is the +// regular set of mark bits ("marks"), while the other essentially says that the +// objects have been scanned already ("scans"). When we see a pointer for the first +// time we set its mark and enqueue its span. We track these spans in work queues +// with a FIFO policy, unlike workbufs which have a LIFO policy. Empirically, a +// FIFO policy appears to work best for accumulating objects to scan on a span. +// Later, when we dequeue the span, we find both the union and intersection of the +// mark and scan bitsets. The union is then written back into the scan bits, while +// the intersection is used to decide which objects need scanning, such that the GC +// is still precise. +// +// Below is the bulk of the implementation, focusing on the worst case +// for locality, small objects. Specifically, those that are smaller than +// a few cache lines in size and whose metadata is stored the same way (at the +// end of the span). + +//go:build goexperiment.greenteagc + +package runtime + +import ( + "internal/cpu" + "internal/goarch" + "internal/runtime/atomic" + "internal/runtime/gc" + "internal/runtime/sys" + "unsafe" +) + +const doubleCheckGreenTea = false + +// spanInlineMarkBits are mark bits that are inlined into the span +// itself. gcUsesSpanInlineMarkBits may be used to check if objects +// of a particular size use inline mark bits. +// +// Inline mark bits are a little bit more than just mark bits. 
They
+// consist of two parts: scans and marks. Marks are like pre-mark
+// bits. They're set once a pointer to an object is discovered for
+// the first time. The marks allow us to scan many objects in bulk
+// if we queue the whole span for scanning. Before we scan such objects
+// in bulk, we copy the marks to the scans, computing a diff along the
+// way. The resulting bitmap tells us which objects we should scan.
+//
+// The inlineMarkBits also hold state sufficient for scanning any
+// object in the span, as well as state for acquiring ownership of
+// the span for queuing. This avoids the need to look at the mspan when
+// scanning.
+type spanInlineMarkBits struct {
+	scans [63]uint8         // scanned bits.
+	owned spanScanOwnership // see the comment on spanScanOwnership.
+	marks [63]uint8         // mark bits.
+	class spanClass
+}
+
+// spanScanOwnership indicates whether some thread has acquired
+// the span for scanning, and whether there has been one or more
+// attempts to acquire the span. The latter information helps to
+// fast-track span scans that only apply to a single mark, skipping
+// the relatively costly merge-and-diff process for scans and marks
+// by allowing one to just set the mark directly.
+type spanScanOwnership uint8
+
+const (
+	spanScanUnowned  spanScanOwnership = 0         // Indicates the span is not acquired for scanning.
+	spanScanOneMark                    = 1 << iota // Indicates that only one mark bit is set relative to the scan bits.
+	spanScanManyMark                               // Indicates one or more mark bits may be set relative to the scan bits.
+	// "ManyMark" need not be exactly the value it has. In practice we just
+	// want to distinguish "none" from "one" from "many," so a comparison is
+	// sufficient (as opposed to a bit test) to check between these cases.
+)
+
+// load atomically loads from a pointer to a spanScanOwnership.
+func (o *spanScanOwnership) load() spanScanOwnership {
+	return spanScanOwnership(atomic.Load8((*uint8)(unsafe.Pointer(o))))
+}
+
+func (o *spanScanOwnership) or(v spanScanOwnership) spanScanOwnership {
+	// N.B. We round down the address and use Or32 because Or8 doesn't
+	// return a result, and it's strictly necessary for this protocol.
+	//
+	// Making Or8 return a result, while making the code look nicer, would
+	// not be strictly better on any supported platform, as an Or8 that
+	// returns a result is not a common instruction. On many platforms it
+	// would be implemented exactly as it is here, and since Or8 is
+	// exclusively used in the runtime and a hot function, we want to keep
+	// using its no-result version elsewhere for performance.
+	o32 := (*uint32)(unsafe.Pointer(uintptr(unsafe.Pointer(o)) &^ 0b11))
+	off := (uintptr(unsafe.Pointer(o)) & 0b11) * 8
+	if goarch.BigEndian {
+		off = 32 - off - 8
+	}
+	return spanScanOwnership(atomic.Or32(o32, uint32(v)<<off) >> off)
+}
+
+func (imb *spanInlineMarkBits) init(class spanClass) {
+	*imb = spanInlineMarkBits{}
+	imb.class = class
+}
+
+// tryAcquire attempts to acquire the span for scanning. On success, the caller
+// must queue the span for scanning or scan the span immediately.
+func (imb *spanInlineMarkBits) tryAcquire() bool {
+	switch imb.owned.load() {
+	case spanScanUnowned:
+		// Try to mark the span as having only one object marked.
+		if imb.owned.or(spanScanOneMark) == spanScanUnowned {
+			return true
+		}
+		// If we didn't see an old value of spanScanUnowned, then we must
+		// have raced with someone else and seen spanScanOneMark or greater.
+		// Fall through and try to set spanScanManyMark.
+ fallthrough + case spanScanOneMark: + // We may be the first to set *any* bit on owned. In such a case, + // we still need to make sure the span is queued. + return imb.owned.or(spanScanManyMark) == spanScanUnowned + } + return false +} + +// release releases the span for scanning, allowing another thread to queue the span. +// +// Returns an upper bound on the number of mark bits set since the span was queued. The +// upper bound is described as "one" (spanScanOneMark) or "many" (spanScanManyMark, with or +// without spanScanOneMark). If the return value indicates only one mark bit was set, the +// caller can be certain that it was the same mark bit that caused the span to get queued. +// Take note of the fact that this is *only* an upper-bound. In particular, it may still +// turn out that only one mark bit was set, even if the return value indicates "many". +func (imb *spanInlineMarkBits) release() spanScanOwnership { + return spanScanOwnership(atomic.Xchg8((*uint8)(unsafe.Pointer(&imb.owned)), uint8(spanScanUnowned))) +} + +// spanInlineMarkBitsFromBase returns the spanInlineMarkBits for a span whose start address is base. +// +// The span must be gcUsesSpanInlineMarkBits(span.elemsize). +func spanInlineMarkBitsFromBase(base uintptr) *spanInlineMarkBits { + return (*spanInlineMarkBits)(unsafe.Pointer(base + gc.PageSize - unsafe.Sizeof(spanInlineMarkBits{}))) +} + +// initInlineMarkBits initializes the inlineMarkBits stored at the end of the span. +func (s *mspan) initInlineMarkBits() { + if doubleCheckGreenTea && !gcUsesSpanInlineMarkBits(s.elemsize) { + throw("expected span with inline mark bits") + } + s.inlineMarkBits().init(s.spanclass) +} + +// mergeInlineMarks merges the span's inline mark bits into dst. +// +// gcUsesSpanInlineMarkBits(s.elemsize) must be true. +func (s *mspan) mergeInlineMarks(dst *gcBits) { + if doubleCheckGreenTea && !gcUsesSpanInlineMarkBits(s.elemsize) { + throw("expected span with inline mark bits") + } + bytes := divRoundUp(uintptr(s.nelems), 8) + imb := s.inlineMarkBits() + _ = imb.marks[bytes-1] + for i := uintptr(0); i < bytes; i++ { + *dst.bytep(i) |= imb.marks[i] + } + if doubleCheckGreenTea && !s.spanclass.noscan() && imb.marks != imb.scans { + throw("marks don't match scans for span with pointer") + } +} + +// inlineMarkBits returns the inline mark bits for the span. +// +// gcUsesSpanInlineMarkBits(s.elemsize) must be true. +func (s *mspan) inlineMarkBits() *spanInlineMarkBits { + if doubleCheckGreenTea && !gcUsesSpanInlineMarkBits(s.elemsize) { + throw("expected span with inline mark bits") + } + return spanInlineMarkBitsFromBase(s.base()) +} + +func (s *mspan) markBitsForIndex(objIndex uintptr) (bits markBits) { + if gcUsesSpanInlineMarkBits(s.elemsize) { + bits.bytep = &s.inlineMarkBits().marks[objIndex/8] + } else { + bits.bytep = s.gcmarkBits.bytep(objIndex / 8) + } + bits.mask = uint8(1) << (objIndex % 8) + bits.index = objIndex + return +} + +func (s *mspan) markBitsForBase() markBits { + if gcUsesSpanInlineMarkBits(s.elemsize) { + return markBits{&s.inlineMarkBits().marks[0], uint8(1), 0} + } + return markBits{&s.gcmarkBits.x, uint8(1), 0} +} + +// scannedBitsForIndex returns a markBits representing the scanned bit +// for objIndex in the inline mark bits. +func (s *mspan) scannedBitsForIndex(objIndex uintptr) markBits { + return markBits{&s.inlineMarkBits().scans[objIndex/8], uint8(1) << (objIndex % 8), objIndex} +} + +// gcUsesSpanInlineMarkBits returns true if a span holding objects of a certain size +// has inline mark bits. 
size must be the span's elemsize. +// +// nosplit because this is called from gcmarknewobject, which is nosplit. +// +//go:nosplit +func gcUsesSpanInlineMarkBits(size uintptr) bool { + return heapBitsInSpan(size) && size >= 16 +} + +// tryQueueOnSpan tries to queue p on the span it points to, if it +// points to a small object span (gcUsesSpanQueue size). +func tryDeferToSpanScan(p uintptr, gcw *gcWork) bool { + if useCheckmark { + return false + } + + // Quickly to see if this is a span that has inline mark bits. + ha := heapArenaOf(p) + if ha == nil { + return false + } + pageIdx := ((p / pageSize) / 8) % uintptr(len(ha.pageInUse)) + pageMask := byte(1 << ((p / pageSize) % 8)) + if ha.pageUseSpanInlineMarkBits[pageIdx]&pageMask == 0 { + return false + } + + // Find the object's index from the span class info stored in the inline mark bits. + base := alignDown(p, gc.PageSize) + q := spanInlineMarkBitsFromBase(base) + objIndex := uint16((uint64(p-base) * uint64(gc.SizeClassToDivMagic[q.class.sizeclass()])) >> 32) + + // Set mark bit. + idx, mask := objIndex/8, uint8(1)<<(objIndex%8) + if atomic.Load8(&q.marks[idx])&mask != 0 { + return true + } + atomic.Or8(&q.marks[idx], mask) + + // Fast-track noscan objects. + if q.class.noscan() { + gcw.bytesMarked += uint64(gc.SizeClassToSize[q.class.sizeclass()]) + return true + } + + // Queue up the pointer (as a representative for its span). + if q.tryAcquire() { + if gcw.spanq.put(makeObjPtr(base, objIndex)) { + if gcphase == _GCmark { + gcw.mayNeedWorker = true + } + gcw.flushedWork = true + } + } + return true +} + +// tryGetSpan attempts to get an entire span to scan. +func (w *gcWork) tryGetSpan(slow bool) objptr { + if s := w.spanq.get(); s != 0 { + return s + } + + if slow { + // Check the global span queue. + if s := work.spanq.get(w); s != 0 { + return s + } + + // Attempt to steal spans to scan from other Ps. + return spanQueueSteal(w) + } + return 0 +} + +// spanQueue is a concurrent safe queue of mspans. Each mspan is represented +// as an objptr whose spanBase is the base address of the span. +type spanQueue struct { + avail atomic.Bool // optimization to check emptiness w/o the lock + _ cpu.CacheLinePad // prevents false-sharing between lock and avail + lock mutex + q mSpanQueue +} + +func (q *spanQueue) empty() bool { + return !q.avail.Load() +} + +func (q *spanQueue) size() int { + return q.q.n +} + +// putBatch adds a whole batch of spans to the queue. +func (q *spanQueue) putBatch(batch []objptr) { + var list mSpanQueue + for _, p := range batch { + s := spanOfUnchecked(p.spanBase()) + s.scanIdx = p.objIndex() + list.push(s) + } + + lock(&q.lock) + if q.q.n == 0 { + q.avail.Store(true) + } + q.q.takeAll(&list) + unlock(&q.lock) +} + +// get tries to take a span off the queue. +// +// Returns a non-zero objptr on success. Also, moves additional +// spans to gcw's local span queue. +func (q *spanQueue) get(gcw *gcWork) objptr { + if q.empty() { + return 0 + } + lock(&q.lock) + if q.q.n == 0 { + unlock(&q.lock) + return 0 + } + n := q.q.n/int(gomaxprocs) + 1 + if n > q.q.n { + n = q.q.n + } + if max := len(gcw.spanq.ring) / 2; n > max { + n = max + } + newQ := q.q.popN(n) + if q.q.n == 0 { + q.avail.Store(false) + } + unlock(&q.lock) + + s := newQ.pop() + for newQ.n > 0 { + s := newQ.pop() + gcw.spanq.put(makeObjPtr(s.base(), s.scanIdx)) + } + return makeObjPtr(s.base(), s.scanIdx) +} + +// localSpanQueue is a P-local ring buffer of objptrs that represent spans. +// Accessed without a lock. +// +// Multi-consumer, single-producer. 
The only producer is the P that owns this +// queue, but any other P may consume from it. +// +// This is based on the scheduler runqueues. If making changes there, consider +// also making them here. +type localSpanQueue struct { + head atomic.Uint32 + tail atomic.Uint32 + ring [256]objptr +} + +// put adds s to the queue. Returns true if put flushed to the global queue +// because it was full. +func (q *localSpanQueue) put(s objptr) (flushed bool) { + for { + h := q.head.Load() // synchronize with consumers + t := q.tail.Load() + if t-h < uint32(len(q.ring)) { + q.ring[t%uint32(len(q.ring))] = s + q.tail.Store(t + 1) // Makes the item avail for consumption. + return false + } + if q.putSlow(s, h, t) { + return true + } + // The queue is not full, now the put above must succeed. + } +} + +// putSlow is a helper for put to move spans to the global queue. +// Returns true on success, false on failure (nothing moved). +func (q *localSpanQueue) putSlow(s objptr, h, t uint32) bool { + var batch [len(q.ring)/2 + 1]objptr + + // First, grab a batch from local queue. + n := t - h + n = n / 2 + if n != uint32(len(q.ring)/2) { + throw("localSpanQueue.putSlow: queue is not full") + } + for i := uint32(0); i < n; i++ { + batch[i] = q.ring[(h+i)%uint32(len(q.ring))] + } + if !q.head.CompareAndSwap(h, h+n) { // Commits consume. + return false + } + batch[n] = s + + work.spanq.putBatch(batch[:]) + return true +} + +// get attempts to take a span off the queue. Might fail if the +// queue is empty. May be called by multiple threads, but callers +// are better off using stealFrom to amortize the cost of stealing. +// This method is intended for use by the owner of this queue. +func (q *localSpanQueue) get() objptr { + for { + h := q.head.Load() + t := q.tail.Load() + if t == h { + return 0 + } + s := q.ring[h%uint32(len(q.ring))] + if q.head.CompareAndSwap(h, h+1) { + return s + } + } +} + +func (q *localSpanQueue) empty() bool { + h := q.head.Load() + t := q.tail.Load() + return t == h +} + +// stealFrom takes spans from q2 and puts them into q1. One span is removed +// from the stolen spans and returned on success. Failure to steal returns a +// zero objptr. +func (q1 *localSpanQueue) stealFrom(q2 *localSpanQueue) objptr { + writeHead := q1.tail.Load() + + var n uint32 + for { + h := q2.head.Load() // load-acquire, synchronize with other consumers + t := q2.tail.Load() // load-acquire, synchronize with the producer + n = t - h + n = n - n/2 + if n == 0 { + return 0 + } + if n > uint32(len(q2.ring)/2) { // read inconsistent h and t + continue + } + for i := uint32(0); i < n; i++ { + c := q2.ring[(h+i)%uint32(len(q2.ring))] + q1.ring[(writeHead+i)%uint32(len(q1.ring))] = c + } + if q2.head.CompareAndSwap(h, h+n) { + break + } + } + n-- + c := q1.ring[(writeHead+n)%uint32(len(q1.ring))] + if n == 0 { + return c + } + h := q1.head.Load() + if writeHead-h+n >= uint32(len(q1.ring)) { + throw("localSpanQueue.stealFrom: queue overflow") + } + q1.tail.Store(writeHead + n) + return c +} + +// drain moves all spans in the queue to the global queue. +// +// Returns true if anything was moved. +func (q *localSpanQueue) drain() bool { + var batch [len(q.ring)]objptr + + var n uint32 + for { + var h uint32 + for { + h = q.head.Load() + t := q.tail.Load() + n = t - h + if n == 0 { + return false + } + if n <= uint32(len(q.ring)) { + break + } + // Read inconsistent h and t. + } + for i := uint32(0); i < n; i++ { + batch[i] = q.ring[(h+i)%uint32(len(q.ring))] + } + if q.head.CompareAndSwap(h, h+n) { // Commits consume. 
+ break + } + } + if !q.empty() { + throw("drained local span queue, but not empty") + } + + work.spanq.putBatch(batch[:n]) + return true +} + +// spanQueueSteal attempts to steal a span from another P's local queue. +// +// Returns a non-zero objptr on success. +func spanQueueSteal(gcw *gcWork) objptr { + pp := getg().m.p.ptr() + + for enum := stealOrder.start(cheaprand()); !enum.done(); enum.next() { + p2 := allp[enum.position()] + if pp == p2 { + continue + } + if s := gcw.spanq.stealFrom(&p2.gcw.spanq); s != 0 { + return s + } + } + return 0 +} + +// objptr consists of a span base and the index of the object in the span. +type objptr uintptr + +// makeObjPtr creates an objptr from a span base address and an object index. +func makeObjPtr(spanBase uintptr, objIndex uint16) objptr { + if doubleCheckGreenTea && spanBase&((1< 1 { + gcw.stats[spanclass.sizeclass()].spansSparseScanned++ + gcw.stats[spanclass.sizeclass()].spanObjsSparseScanned++ + } + b := spanBase + uintptr(objIndex)*elemsize + scanObjectSmall(spanBase, b, elemsize, gcw) + return + } + + // Compute nelems. + divMagic := uint64(gc.SizeClassToDivMagic[spanclass.sizeclass()]) + usableSpanSize := uint64(gc.PageSize - unsafe.Sizeof(spanInlineMarkBits{})) + if !spanclass.noscan() { + usableSpanSize -= gc.PageSize / goarch.PtrSize / 8 + } + nelems := uint16((usableSpanSize * divMagic) >> 32) + + // Grey objects and return if there's nothing else to do. + var toScan gc.ObjMask + objsMarked := spanSetScans(spanBase, nelems, imb, &toScan) + if objsMarked == 0 { + return + } + gcw.bytesMarked += uint64(objsMarked) * uint64(elemsize) + if debug.gctrace > 1 { + gcw.stats[spanclass.sizeclass()].spansDenseScanned++ + gcw.stats[spanclass.sizeclass()].spanObjsDenseScanned += uint64(objsMarked) + } + scanObjectsSmall(spanBase, elemsize, nelems, gcw, &toScan) +} + +// spanSetScans sets any unset mark bits that have their mark bits set in the inline mark bits. +// +// toScan is populated with bits indicating whether a particular mark bit was set. +// +// Returns the number of objects marked, which could be zero. +func spanSetScans(spanBase uintptr, nelems uint16, imb *spanInlineMarkBits, toScan *gc.ObjMask) int { + arena, pageIdx, pageMask := pageIndexOf(spanBase) + if arena.pageMarks[pageIdx]&pageMask == 0 { + atomic.Or8(&arena.pageMarks[pageIdx], pageMask) + } + + bytes := divRoundUp(uintptr(nelems), 8) + objsMarked := 0 + + // Careful: these two structures alias since ObjMask is much bigger + // than marks or scans. We do these unsafe shenanigans so that we can + // access the marks and scans by uintptrs rather than by byte. + imbMarks := (*gc.ObjMask)(unsafe.Pointer(&imb.marks)) + imbScans := (*gc.ObjMask)(unsafe.Pointer(&imb.scans)) + + // Iterate over one uintptr-sized chunks at a time, computing both + // the union and intersection of marks and scans. Store the union + // into scans, and the intersection into toScan. + for i := uintptr(0); i < bytes; i += goarch.PtrSize { + scans := atomic.Loaduintptr(&imbScans[i/goarch.PtrSize]) + marks := imbMarks[i/goarch.PtrSize] + scans = bswapIfBigEndian(scans) + marks = bswapIfBigEndian(marks) + if i/goarch.PtrSize == 64/goarch.PtrSize-1 { + scans &^= 0xff << ((goarch.PtrSize - 1) * 8) // mask out owned + marks &^= 0xff << ((goarch.PtrSize - 1) * 8) // mask out class + } + toGrey := marks &^ scans + toScan[i/goarch.PtrSize] = toGrey + + // If there's anything left to grey, do it. 
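+		// Concrete example (one 8-bit chunk, for illustration): if marks is
+		// 0b1011 and scans is 0b0001, then toGrey = marks &^ scans = 0b1010,
+		// i.e. the two objects marked since this span was last scanned.
+		// Or-ing toGrey back into the scan bits below records them as
+		// scanned, so the next pass over this span only picks up objects
+		// marked after this point.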
+ if toGrey != 0 { + toGrey = bswapIfBigEndian(toGrey) + if goarch.PtrSize == 4 { + atomic.Or32((*uint32)(unsafe.Pointer(&imbScans[i/goarch.PtrSize])), uint32(toGrey)) + } else { + atomic.Or64((*uint64)(unsafe.Pointer(&imbScans[i/goarch.PtrSize])), uint64(toGrey)) + } + } + objsMarked += sys.OnesCount64(uint64(toGrey)) + } + return objsMarked +} + +func scanObjectSmall(spanBase, b, objSize uintptr, gcw *gcWork) { + ptrBits := heapBitsSmallForAddrInline(spanBase, b, objSize) + gcw.heapScanWork += int64(sys.Len64(uint64(ptrBits)) * goarch.PtrSize) + nptrs := 0 + n := sys.OnesCount64(uint64(ptrBits)) + for range n { + k := sys.TrailingZeros64(uint64(ptrBits)) + ptrBits &^= 1 << k + addr := b + uintptr(k)*goarch.PtrSize + + // Prefetch addr since we're about to use it. This point for prefetching + // was chosen empirically. + sys.Prefetch(addr) + + // N.B. ptrBuf is always large enough to hold pointers for an entire 1-page span. + gcw.ptrBuf[nptrs] = addr + nptrs++ + } + + // Process all the pointers we just got. + for _, p := range gcw.ptrBuf[:nptrs] { + p = *(*uintptr)(unsafe.Pointer(p)) + if p == 0 { + continue + } + if !tryDeferToSpanScan(p, gcw) { + if obj, span, objIndex := findObject(p, 0, 0); obj != 0 { + greyobject(obj, 0, 0, span, gcw, objIndex) + } + } + } +} + +func scanObjectsSmall(base, objSize uintptr, elems uint16, gcw *gcWork, scans *gc.ObjMask) { + nptrs := 0 + for i, bits := range scans { + if i*(goarch.PtrSize*8) > int(elems) { + break + } + n := sys.OnesCount64(uint64(bits)) + for range n { + j := sys.TrailingZeros64(uint64(bits)) + bits &^= 1 << j + + b := base + uintptr(i*(goarch.PtrSize*8)+j)*objSize + ptrBits := heapBitsSmallForAddrInline(base, b, objSize) + gcw.heapScanWork += int64(sys.Len64(uint64(ptrBits)) * goarch.PtrSize) + + n := sys.OnesCount64(uint64(ptrBits)) + for range n { + k := sys.TrailingZeros64(uint64(ptrBits)) + ptrBits &^= 1 << k + addr := b + uintptr(k)*goarch.PtrSize + + // Prefetch addr since we're about to use it. This point for prefetching + // was chosen empirically. + sys.Prefetch(addr) + + // N.B. ptrBuf is always large enough to hold pointers for an entire 1-page span. + gcw.ptrBuf[nptrs] = addr + nptrs++ + } + } + } + + // Process all the pointers we just got. + for _, p := range gcw.ptrBuf[:nptrs] { + p = *(*uintptr)(unsafe.Pointer(p)) + if p == 0 { + continue + } + if !tryDeferToSpanScan(p, gcw) { + if obj, span, objIndex := findObject(p, 0, 0); obj != 0 { + greyobject(obj, 0, 0, span, gcw, objIndex) + } + } + } +} + +func heapBitsSmallForAddrInline(spanBase, addr, elemsize uintptr) uintptr { + hbitsBase, _ := spanHeapBitsRange(spanBase, gc.PageSize, elemsize) + hbits := (*byte)(unsafe.Pointer(hbitsBase)) + + // These objects are always small enough that their bitmaps + // fit in a single word, so just load the word or two we need. + // + // Mirrors mspan.writeHeapBitsSmall. + // + // We should be using heapBits(), but unfortunately it introduces + // both bounds checks panics and throw which causes us to exceed + // the nosplit limit in quite a few cases. + i := (addr - spanBase) / goarch.PtrSize / ptrBits + j := (addr - spanBase) / goarch.PtrSize % ptrBits + bits := elemsize / goarch.PtrSize + word0 := (*uintptr)(unsafe.Pointer(addb(hbits, goarch.PtrSize*(i+0)))) + word1 := (*uintptr)(unsafe.Pointer(addb(hbits, goarch.PtrSize*(i+1)))) + + var read uintptr + if j+bits > ptrBits { + // Two reads. + bits0 := ptrBits - j + bits1 := bits - bits0 + read = *word0 >> j + read |= (*word1 & ((1 << bits1) - 1)) << bits0 + } else { + // One read. 
+ read = (*word0 >> j) & ((1 << bits) - 1) + } + return read +} \ No newline at end of file diff --git a/src/runtime/mgcmark_nogreenteagc.go b/src/runtime/mgcmark_nogreenteagc.go new file mode 100644 index 0000000000000000000000000000000000000000..8e1841f5669e047b49639b1629d4e59cd3d90a36 --- /dev/null +++ b/src/runtime/mgcmark_nogreenteagc.go @@ -0,0 +1,80 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !goexperiment.greenteagc + +package runtime + +func (s *mspan) markBitsForIndex(objIndex uintptr) markBits { + bytep, mask := s.gcmarkBits.bitp(objIndex) + return markBits{bytep, mask, objIndex} +} + +func (s *mspan) markBitsForBase() markBits { + return markBits{&s.gcmarkBits.x, uint8(1), 0} +} + +func tryDeferToSpanScan(p uintptr, gcw *gcWork) bool { + return false +} + +func (s *mspan) initInlineMarkBits() { +} + +func (s *mspan) mergeInlineMarks(to *gcBits) { + throw("unimplemented") +} + +func gcUsesSpanInlineMarkBits(_ uintptr) bool { + return false +} + +func (s *mspan) inlineMarkBits() *spanInlineMarkBits { + return nil +} + +func (s *mspan) scannedBitsForIndex(objIndex uintptr) markBits { + throw("unimplemented") + return markBits{} +} + +type spanInlineMarkBits struct { +} + +func (q *spanInlineMarkBits) tryAcquire() bool { + return false +} + +type spanQueue struct { + _ uint32 // To match alignment padding requirements for atomically-accessed variables in workType. +} + +func (q *spanQueue) empty() bool { + return true +} + +func (q *spanQueue) size() int { + return 0 +} + +type localSpanQueue struct { +} + +func (q *localSpanQueue) drain() bool { + return false +} + +func (q *localSpanQueue) empty() bool { + return true +} + +type objptr uintptr + +func (w *gcWork) tryGetSpan(steal bool) objptr { + return 0 +} + +func scanSpan(p objptr, gcw *gcWork) { + throw("unimplemented") +} \ No newline at end of file diff --git a/src/runtime/mgcpacer.go b/src/runtime/mgcpacer.go index 20630c3f9a6d7123ecf7e4f71ef10c42da10d306..0baa61230bbf766da59d7859e3dbbf06e200b2fa 100644 --- a/src/runtime/mgcpacer.go +++ b/src/runtime/mgcpacer.go @@ -678,21 +678,42 @@ func (c *gcControllerState) endCycle(now int64, procs int, userForced bool) { // another P if there are spare worker slots. It is used by putfull // when more work is made available. // +// If goexperiment.GreenTeaGC, the caller must not hold a G's scan bit, +// otherwise this could cause a deadlock. This is already enforced by +// the static lock ranking. +// //go:nowritebarrier func (c *gcControllerState) enlistWorker() { - // If there are idle Ps, wake one so it will run an idle worker. - // NOTE: This is suspected of causing deadlocks. See golang.org/issue/19112. + needDedicated := c.dedicatedMarkWorkersNeeded.Load() > 0 + + // Create new workers from idle Ps with goexperiment.GreenTeaGC. // - // if sched.npidle.Load() != 0 && sched.nmspinning.Load() == 0 { - // wakep() - // return - // } - - // There are no idle Ps. If we need more dedicated workers, - // try to preempt a running P so it will switch to a worker. - if c.dedicatedMarkWorkersNeeded.Load() <= 0 { + // Note: with Green Tea, this places a requirement on enlistWorker + // that it must not be called while a G's scan bit is held. + if goexperiment.GreenTeaGC { + needIdle := c.needIdleMarkWorker() + + // If we're all full on dedicated and idle workers, nothing + // to do. 
+ if !needDedicated && !needIdle { + return + } + + // If there are idle Ps, wake one so it will run a worker + // (the scheduler will already prefer to spin up a new + // dedicated worker over an idle one). + if sched.npidle.Load() != 0 && sched.nmspinning.Load() == 0 { + wakep() + return + } + } + + // If we still need more dedicated workers, try to preempt a running P + // so it will switch to a worker. + if !needDedicated { return } + // Pick a random other P to preempt. if gomaxprocs <= 1 { return diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go index b6890bac47ec715a88957c9fc0c4c98488961232..1a9c3b3e5f9069b7c24e3bc3328d317efea554da 100644 --- a/src/runtime/mgcsweep.go +++ b/src/runtime/mgcsweep.go @@ -517,7 +517,7 @@ func (sl *sweepLocked) sweep(preserve bool) bool { trace := traceAcquire() if trace.ok() { - trace.GCSweepSpan(s.npages * _PageSize) + trace.GCSweepSpan(s.npages * pageSize) traceRelease(trace) } @@ -640,6 +640,11 @@ func (sl *sweepLocked) sweep(preserve bool) bool { } } + // Copy over the inline mark bits if necessary. + if gcUsesSpanInlineMarkBits(s.elemsize) { + s.mergeInlineMarks(s.gcmarkBits) + } + // Check for zombie objects. if s.freeindex < s.nelems { // Everything < freeindex is allocated and hence @@ -689,6 +694,11 @@ func (sl *sweepLocked) sweep(preserve bool) bool { // Initialize alloc bits cache. s.refillAllocCache(0) + // Reset the object queue, if we have one. + if gcUsesSpanInlineMarkBits(s.elemsize) { + s.initInlineMarkBits() + } + // The span must be in our exclusive ownership until we update sweepgen, // check for potential races. if state := s.state.get(); state != mSpanInUse || s.sweepgen != sweepgen-1 { @@ -981,9 +991,9 @@ func gcPaceSweeper(trigger uint64) { // concurrent sweep are less likely to leave pages // unswept when GC starts. heapDistance -= 1024 * 1024 - if heapDistance < _PageSize { + if heapDistance < pageSize { // Avoid setting the sweep ratio extremely high - heapDistance = _PageSize + heapDistance = pageSize } pagesSwept := mheap_.pagesSwept.Load() pagesInUse := mheap_.pagesInUse.Load() diff --git a/src/runtime/mgcwork.go b/src/runtime/mgcwork.go index 2d66fa400231de62d2015bbb6fdd592420f07ebd..fd00127daa6dab20ea36f1e58144b6b138dc8acc 100644 --- a/src/runtime/mgcwork.go +++ b/src/runtime/mgcwork.go @@ -6,7 +6,9 @@ package runtime import ( "internal/goarch" + "internal/goexperiment" "internal/runtime/atomic" + "internal/runtime/gc" "internal/runtime/sys" "unsafe" ) @@ -32,13 +34,37 @@ func init() { // Garbage collector work pool abstraction. // // This implements a producer/consumer model for pointers to grey -// objects. A grey object is one that is marked and on a work -// queue. A black object is marked and not on a work queue. +// objects. +// +// For objects in workbufs, a grey object is one that is marked and +// on a work queue. A black object is marked and not on a work queue. +// +// For objects in the span queue, a grey object is one that is marked +// and has an unset scan bit. A black object is marked and has its scan +// bit set. (Green Tea GC only.) // // Write barriers, root discovery, stack scanning, and object scanning // produce pointers to grey objects. Scanning consumes pointers to // grey objects, thus blackening them, and then scans them, // potentially producing new pointers to grey objects. +// +// Work queues must be prioritized in the following order wherever work +// is processed. 
+// +// +----------------------------------------------------------+ +// | Priority | Work queue | Restrictions | Function | +// |----------------------------------------------------------| +// | 1 | Workbufs | P-local | tryGetObjFast | +// | 2 | Span queue | P-local | tryGetSpan(false) | [greenteagc] +// | 3 | Workbufs | None | tryGetObj | +// | 4 | Span queue | None | tryGetSpan(true) | [greenteagc] +// +----------------------------------------------------------+ +// +// The rationale behind this ordering comes from two insights: +// 1. It's always preferable to look for P-local work first to avoid hammering on +// global lists. +// 2. It's always preferable to scan individual objects first to increase the +// likelihood that spans will accumulate more objects to scan. // A gcWork provides the interface to produce and consume work for the // garbage collector. @@ -74,6 +100,15 @@ type gcWork struct { // Invariant: Both wbuf1 and wbuf2 are nil or neither are. wbuf1, wbuf2 *workbuf + // spanq is a queue of spans to process. + // + // Only used if goexperiment.GreenTeaGC. + spanq localSpanQueue + + // ptrBuf is a temporary buffer used by span scanning. + ptrBuf *[pageSize / goarch.PtrSize]uintptr + + // Bytes marked (blackened) on this gcWork. This is aggregated // into work.bytesMarked by dispose. bytesMarked uint64 @@ -88,6 +123,15 @@ type gcWork struct { // termination check. Specifically, this indicates that this // gcWork may have communicated work to another gcWork. flushedWork bool + + // mayNeedWorker is a hint that we may need to spin up a new + // worker, and that gcDrain* should call enlistWorker. This flag + // is set only if goexperiment.GreenTeaGC. If !goexperiment.GreenTeaGC, + // enlistWorker is called directly instead. + mayNeedWorker bool + + // stats are scan stats broken down by size class. + stats [gc.NumSizeClasses]sizeClassScanStats } // Most of the methods of gcWork are go:nowritebarrierrec because the @@ -106,11 +150,11 @@ func (w *gcWork) init() { w.wbuf2 = wbuf2 } -// put enqueues a pointer for the garbage collector to trace. +// putObj enqueues a pointer for the garbage collector to trace. // obj must point to the beginning of a heap object or an oblet. // //go:nowritebarrierrec -func (w *gcWork) put(obj uintptr) { +func (w *gcWork) putObj(obj uintptr) { flushed := false wbuf := w.wbuf1 // Record that this may acquire the wbufSpans or heap lock to @@ -141,15 +185,19 @@ func (w *gcWork) put(obj uintptr) { // the end of put so that w is in a consistent state, since // enlistWorker may itself manipulate w. if flushed && gcphase == _GCmark { - gcController.enlistWorker() + if goexperiment.GreenTeaGC { + w.mayNeedWorker = true + } else { + gcController.enlistWorker() + } } } -// putFast does a put and reports whether it can be done quickly +// putObjFast does a put and reports whether it can be done quickly // otherwise it returns false and the caller needs to call put. // //go:nowritebarrierrec -func (w *gcWork) putFast(obj uintptr) bool { +func (w *gcWork) putObjFast(obj uintptr) bool { wbuf := w.wbuf1 if wbuf == nil || wbuf.nobj == len(wbuf.obj) { return false @@ -160,11 +208,11 @@ func (w *gcWork) putFast(obj uintptr) bool { return true } -// putBatch performs a put on every pointer in obj. See put for +// putObjBatch performs a put on every pointer in obj. See put for // constraints on these pointers. 
// //go:nowritebarrierrec -func (w *gcWork) putBatch(obj []uintptr) { +func (w *gcWork) putObjBatch(obj []uintptr) { if len(obj) == 0 { return } @@ -190,18 +238,22 @@ func (w *gcWork) putBatch(obj []uintptr) { } if flushed && gcphase == _GCmark { - gcController.enlistWorker() + if goexperiment.GreenTeaGC { + w.mayNeedWorker = true + } else { + gcController.enlistWorker() + } } } -// tryGet dequeues a pointer for the garbage collector to trace. +// tryGetObj dequeues a pointer for the garbage collector to trace. // // If there are no pointers remaining in this gcWork or in the global // queue, tryGet returns 0. Note that there may still be pointers in // other gcWork instances or other caches. // //go:nowritebarrierrec -func (w *gcWork) tryGet() uintptr { +func (w *gcWork) tryGetObj() uintptr { wbuf := w.wbuf1 if wbuf == nil { w.init() @@ -226,12 +278,12 @@ func (w *gcWork) tryGet() uintptr { return wbuf.obj[wbuf.nobj] } -// tryGetFast dequeues a pointer for the garbage collector to trace +// tryGetObjFast dequeues a pointer for the garbage collector to trace // if one is readily available. Otherwise it returns 0 and // the caller is expected to call tryGet(). // //go:nowritebarrierrec -func (w *gcWork) tryGetFast() uintptr { +func (w *gcWork) tryGetObjFast() uintptr { wbuf := w.wbuf1 if wbuf == nil || wbuf.nobj == 0 { return 0 @@ -267,6 +319,9 @@ func (w *gcWork) dispose() { } w.wbuf2 = nil } + if w.spanq.drain() { + w.flushedWork = true + } if w.bytesMarked != 0 { // dispose happens relatively infrequently. If this // atomic becomes a problem, we should first try to @@ -301,7 +356,11 @@ func (w *gcWork) balance() { } // We flushed a buffer to the full list, so wake a worker. if gcphase == _GCmark { - gcController.enlistWorker() + if goexperiment.GreenTeaGC { + w.mayNeedWorker = true + } else { + gcController.enlistWorker() + } } } @@ -309,7 +368,7 @@ func (w *gcWork) balance() { // //go:nowritebarrierrec func (w *gcWork) empty() bool { - return w.wbuf1 == nil || (w.wbuf1.nobj == 0 && w.wbuf2.nobj == 0) + return (w.wbuf1 == nil || (w.wbuf1.nobj == 0 && w.wbuf2.nobj == 0)) && w.spanq.empty() } // Internally, the GC work pool is kept in arrays in work buffers. diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index e058dd848925a496e9d7c92952f688fff60286d7..33c9d9b82ff6835385421c12e9255775ced1c444 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -11,7 +11,9 @@ package runtime import ( "internal/cpu" "internal/goarch" + "internal/goexperiment" "internal/runtime/atomic" + "internal/runtime/gc" "internal/runtime/sys" "unsafe" ) @@ -293,6 +295,10 @@ type heapArena struct { // during marking. pageSpecials [pagesPerArena / 8]uint8 + // pageUseSpanDartboard is a bitmap that indicates which spans are + // heap spans and also gcUsesSpanDartboard. + pageUseSpanInlineMarkBits [pagesPerArena / 8]uint8 + // checkmarks stores the debug.gccheckmark state. It is only // used if debug.gccheckmark > 0. checkmarks *checkmarksMap @@ -392,13 +398,6 @@ func (b *mSpanStateBox) get() mSpanState { return mSpanState(b.s.Load()) } -// mSpanList heads a linked list of spans. -type mSpanList struct { - _ sys.NotInHeap - first *mspan // first span in list, or nil if none - last *mspan // last span in list, or nil if none -} - type mspan struct { _ sys.NotInHeap next *mspan // next span in list, or nil if none @@ -437,6 +436,12 @@ type mspan struct { // mallocgc, and issue 54596). freeIndexForScan uint16 + // Temporary storage for the object index that caused this span to + // be queued for scanning. 
+ // + // Used only with goexperiment.GreenTeaGC. + scanIdx uint16 + // Cache of the allocBits at freeindex. allocCache is shifted // such that the lowest bit corresponds to the bit freeindex. // allocCache holds the complement of allocBits, thus allowing @@ -500,7 +505,7 @@ func (s *mspan) base() uintptr { } func (s *mspan) layout() (size, n, total uintptr) { - total = s.npages << _PageShift + total = s.npages << gc.PageShift size = s.elemsize if size > 0 { n = total / size @@ -562,7 +567,7 @@ func recordspan(vh unsafe.Pointer, p unsafe.Pointer) { type spanClass uint8 const ( - numSpanClasses = _NumSizeClasses << 1 + numSpanClasses = gc.NumSizeClasses << 1 tinySpanClass = spanClass(tinySizeClass<<1 | 1) ) @@ -742,6 +747,27 @@ func pageIndexOf(p uintptr) (arena *heapArena, pageIdx uintptr, pageMask uint8) return } +// heapArenaOf returns the heap arena for p, if one exists. +func heapArenaOf(p uintptr) *heapArena { + ri := arenaIndex(p) + if arenaL1Bits == 0 { + // If there's no L1, then ri.l1() can't be out of bounds but ri.l2() can. + if ri.l2() >= uint(len(mheap_.arenas[0])) { + return nil + } + } else { + // If there's an L1, then ri.l1() can be out of bounds but ri.l2() can't. + if ri.l1() >= uint(len(mheap_.arenas)) { + return nil + } + } + l2 := mheap_.arenas[ri.l1()] + if arenaL1Bits != 0 && l2 == nil { // Should never happen if there's no L1. + return nil + } + return l2[ri.l2()] +} + // Initialize the heap. func (h *mheap) init() { lockInit(&h.lock, lockRankMheap) @@ -1409,14 +1435,27 @@ func (h *mheap) initSpan(s *mspan, typ spanAllocType, spanclass spanClass, base, s.nelems = 1 s.divMul = 0 } else { - s.elemsize = uintptr(class_to_size[sizeclass]) - if !s.spanclass.noscan() && heapBitsInSpan(s.elemsize) { - // Reserve space for the pointer/scan bitmap at the end. - s.nelems = uint16((nbytes - (nbytes / goarch.PtrSize / 8)) / s.elemsize) + s.elemsize = uintptr(gc.SizeClassToSize[sizeclass]) + if goexperiment.GreenTeaGC { + var reserve uintptr + if gcUsesSpanInlineMarkBits(s.elemsize) { + // Reserve space for the inline mark bits. + reserve += unsafe.Sizeof(spanInlineMarkBits{}) + } + if heapBitsInSpan(s.elemsize) && !s.spanclass.noscan() { + // Reserve space for the pointer/scan bitmap at the end. + reserve += nbytes / goarch.PtrSize / 8 + } + s.nelems = uint16((nbytes - reserve) / s.elemsize) } else { - s.nelems = uint16(nbytes / s.elemsize) + if !s.spanclass.noscan() && heapBitsInSpan(s.elemsize) { + // Reserve space for the pointer/scan bitmap at the end. + s.nelems = uint16((nbytes - (nbytes / goarch.PtrSize / 8)) / s.elemsize) + } else { + s.nelems = uint16(nbytes / s.elemsize) + } } - s.divMul = class_to_divmagic[sizeclass] + s.divMul = gc.SizeClassToDivMagic[sizeclass] } // Initialize mark and allocation structures. @@ -1462,6 +1501,11 @@ func (h *mheap) initSpan(s *mspan, typ spanAllocType, spanclass spanClass, base, arena, pageIdx, pageMask := pageIndexOf(s.base()) atomic.Or8(&arena.pageInUse[pageIdx], pageMask) + // Mark packed span. + if gcUsesSpanInlineMarkBits(s.elemsize) { + atomic.Or8(&arena.pageUseSpanInlineMarkBits[pageIdx], pageMask) + } + // Update related page sweeper stats. h.pagesInUse.Add(npages) } @@ -1575,13 +1619,13 @@ func (h *mheap) freeSpan(s *mspan) { if msanenabled { // Tell msan that this entire span is no longer in use. base := unsafe.Pointer(s.base()) - bytes := s.npages << _PageShift + bytes := s.npages << gc.PageShift msanfree(base, bytes) } if asanenabled { // Tell asan that this entire span is no longer in use. 
base := unsafe.Pointer(s.base()) - bytes := s.npages << _PageShift + bytes := s.npages << gc.PageShift asanpoison(base, bytes) } h.freeSpanLocked(s, spanAllocHeap) @@ -1637,6 +1681,11 @@ func (h *mheap) freeSpanLocked(s *mspan, typ spanAllocType) { // Clear in-use bit in arena page bitmap. arena, pageIdx, pageMask := pageIndexOf(s.base()) atomic.And8(&arena.pageInUse[pageIdx], ^pageMask) + + // Clear small heap span bit if necessary. + if gcUsesSpanInlineMarkBits(s.elemsize) { + atomic.And8(&arena.pageUseSpanInlineMarkBits[pageIdx], ^pageMask) + } default: throw("mheap.freeSpanLocked - invalid span state") } @@ -1728,6 +1777,13 @@ func (span *mspan) inList() bool { return span.list != nil } +// mSpanList heads a linked list of spans. +type mSpanList struct { + _ sys.NotInHeap + first *mspan // first span in list, or nil if none + last *mspan // last span in list, or nil if none +} + // Initialize an empty doubly-linked list. func (list *mSpanList) init() { list.first = nil @@ -1819,6 +1875,87 @@ func (list *mSpanList) takeAll(other *mSpanList) { other.first, other.last = nil, nil } +// mSpanQueue is like an mSpanList but is FIFO instead of LIFO and may +// be allocated on the stack. (mSpanList can be visible from the mspan +// itself, so it is marked as not-in-heap). +type mSpanQueue struct { + head, tail *mspan + n int +} + +// push adds s to the end of the queue. +func (q *mSpanQueue) push(s *mspan) { + if s.next != nil { + throw("span already on list") + } + if q.tail == nil { + q.tail, q.head = s, s + } else { + q.tail.next = s + q.tail = s + } + q.n++ +} + +// pop removes a span from the head of the queue, if any. +func (q *mSpanQueue) pop() *mspan { + if q.head == nil { + return nil + } + s := q.head + q.head = s.next + s.next = nil + if q.head == nil { + q.tail = nil + } + q.n-- + return s +} + +// takeAll removes all the spans from q2 and adds them to the end of q1, in order. +func (q1 *mSpanQueue) takeAll(q2 *mSpanQueue) { + if q2.head == nil { + return + } + if q1.head == nil { + *q1 = *q2 + } else { + q1.tail.next = q2.head + q1.tail = q2.tail + q1.n += q2.n + } + q2.tail = nil + q2.head = nil + q2.n = 0 +} + +// popN removes n spans from the head of the queue and returns them as a new queue. +func (q *mSpanQueue) popN(n int) mSpanQueue { + var newQ mSpanQueue + if n <= 0 { + return newQ + } + if n >= q.n { + newQ = *q + q.tail = nil + q.head = nil + q.n = 0 + return newQ + } + s := q.head + for range n - 1 { + s = s.next + } + q.n -= n + newQ.head = q.head + newQ.tail = s + newQ.n = n + q.head = s.next + s.next = nil + return newQ +} + + const ( // _KindSpecialFinalizer is for tracking finalizers. _KindSpecialFinalizer = 1 diff --git a/src/runtime/mpagealloc.go b/src/runtime/mpagealloc.go index 46d3ebacaf8af73ff1ffe82a77b20bd8542da2bc..d81092c35db04e3503b06a12518b7c43d97a1185 100644 --- a/src/runtime/mpagealloc.go +++ b/src/runtime/mpagealloc.go @@ -49,6 +49,7 @@ package runtime import ( "internal/runtime/atomic" + "internal/runtime/gc" "unsafe" ) @@ -58,7 +59,7 @@ const ( pallocChunkPages = 1 << logPallocChunkPages pallocChunkBytes = pallocChunkPages * pageSize logPallocChunkPages = 9 - logPallocChunkBytes = logPallocChunkPages + pageShift + logPallocChunkBytes = logPallocChunkPages + gc.PageShift // The number of radix bits for each level. 
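For reference, the page and chunk constants touched in this hunk relate as follows. A small sketch with concrete numbers, assuming the usual 8 KiB runtime page (gc.PageShift = 13); the names here are local stand-ins, not the runtime's declarations.

package gcsketch

// Derived palloc sizes under the assumption of an 8 KiB GC page.
const (
	pageShift           = 13             // stands in for gc.PageShift
	pageSize            = 1 << pageShift // 8192 bytes per page
	logPallocChunkPages = 9
	pallocChunkPages    = 1 << logPallocChunkPages // 512 pages per chunk
	logPallocChunkBytes = logPallocChunkPages + pageShift
	pallocChunkBytes    = 1 << logPallocChunkBytes // 512 * 8 KiB = 4 MiB per chunk
)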
 //
diff --git a/src/runtime/msize.go b/src/runtime/msize.go
index 64d1531ab098fd4a582b3e858b92966ef396adf1..a90dda7dab9dbfe34404668dc097308b8c66e98b 100644
--- a/src/runtime/msize.go
+++ b/src/runtime/msize.go
@@ -9,21 +9,23 @@
 
 package runtime
 
+import "internal/runtime/gc"
+
 // Returns size of the memory block that mallocgc will allocate if you ask for the size,
 // minus any inline space for metadata.
 func roundupsize(size uintptr, noscan bool) (reqSize uintptr) {
 	reqSize = size
-	if reqSize <= maxSmallSize-mallocHeaderSize {
+	if reqSize <= maxSmallSize-gc.MallocHeaderSize {
 		// Small object.
-		if !noscan && reqSize > minSizeForMallocHeader { // !noscan && !heapBitsInSpan(reqSize)
-			reqSize += mallocHeaderSize
+		if !noscan && reqSize > gc.MinSizeForMallocHeader { // !noscan && !heapBitsInSpan(reqSize)
+			reqSize += gc.MallocHeaderSize
 		}
 		// (reqSize - size) is either mallocHeaderSize or 0. We need to subtract mallocHeaderSize
 		// from the result if we have one, since mallocgc will add it back in.
-		if reqSize <= smallSizeMax-8 {
-			return uintptr(class_to_size[size_to_class8[divRoundUp(reqSize, smallSizeDiv)]]) - (reqSize - size)
+		if reqSize <= gc.SmallSizeMax-8 {
+			return uintptr(gc.SizeClassToSize[gc.SizeToSizeClass8[divRoundUp(reqSize, gc.SmallSizeDiv)]]) - (reqSize - size)
 		}
-		return uintptr(class_to_size[size_to_class128[divRoundUp(reqSize-smallSizeMax, largeSizeDiv)]]) - (reqSize - size)
+		return uintptr(gc.SizeClassToSize[gc.SizeToSizeClass128[divRoundUp(reqSize-gc.SmallSizeMax, gc.LargeSizeDiv)]]) - (reqSize - size)
 	}
 	// Large object. Align reqSize up to the next page. Check for overflow.
 	reqSize += pageSize - 1
diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go
index c10ca402217cfb301b644f1a17b3684bc4a169af..b98131b791bc5288001b10d97cb0db1c8159db93 100644
--- a/src/runtime/mstats.go
+++ b/src/runtime/mstats.go
@@ -8,6 +8,7 @@ package runtime
 
 import (
 	"internal/runtime/atomic"
+	"internal/runtime/gc"
 	"unsafe"
 )
@@ -43,9 +44,20 @@ type mstats struct {
 	last_gc_nanotime uint64 // last gc (monotonic time)
 	lastHeapInUse    uint64 // heapInUse at mark termination of the previous GC
 
+	lastScanStats [gc.NumSizeClasses]sizeClassScanStats
+
 	enablegc bool
 }
 
+type sizeClassScanStats struct {
+	spansDenseScanned     uint64
+	spanObjsDenseScanned  uint64
+	spansSparseScanned    uint64
+	spanObjsSparseScanned uint64
+	sparseObjsScanned     uint64
+}
+
 var memstats mstats
 
 // A MemStats records statistics about the memory allocator.
@@ -397,23 +409,23 @@ func readmemstats_m(stats *MemStats) {
 	nFree := consStats.largeFreeCount
 
 	// Collect per-sizeclass stats.
-	var bySize [_NumSizeClasses]struct {
+	var bySize [gc.NumSizeClasses]struct {
 		Size    uint32
 		Mallocs uint64
 		Frees   uint64
 	}
 	for i := range bySize {
-		bySize[i].Size = uint32(class_to_size[i])
+		bySize[i].Size = uint32(gc.SizeClassToSize[i])
 
 		// Malloc stats.
 		a := consStats.smallAllocCount[i]
-		totalAlloc += a * uint64(class_to_size[i])
+		totalAlloc += a * uint64(gc.SizeClassToSize[i])
 		nMalloc += a
 		bySize[i].Mallocs = a
 
 		// Free stats.
f := consStats.smallFreeCount[i] - totalFree += f * uint64(class_to_size[i]) + totalFree += f * uint64(gc.SizeClassToSize[i]) nFree += f bySize[i].Frees = f } @@ -681,10 +693,10 @@ type heapStatsDelta struct { tinyAllocCount uint64 // number of tiny allocations largeAlloc uint64 // bytes allocated for large objects largeAllocCount uint64 // number of large object allocations - smallAllocCount [_NumSizeClasses]uint64 // number of allocs for small objects + smallAllocCount [gc.NumSizeClasses]uint64 // number of allocs for small objects largeFree uint64 // bytes freed for large objects (>maxSmallSize) largeFreeCount uint64 // number of frees for large objects (>maxSmallSize) - smallFreeCount [_NumSizeClasses]uint64 // number of frees for small objects (<=maxSmallSize) + smallFreeCount [gc.NumSizeClasses]uint64 // number of frees for small objects (<=maxSmallSize) // NOTE: This struct must be a multiple of 8 bytes in size because it // is stored in an array. If it's not, atomic accesses to the above diff --git a/src/runtime/mwbbuf.go b/src/runtime/mwbbuf.go index b998d2b2bdf5f9a14ff3c0d053ba60ee316d56f1..537d5585920ceda7daa698141c39cb483e08ce46 100644 --- a/src/runtime/mwbbuf.go +++ b/src/runtime/mwbbuf.go @@ -237,6 +237,9 @@ func wbBufFlush1(pp *p) { // path to reduce the rate of flushes? continue } + if tryDeferToSpanScan(ptr, gcw) { + continue + } obj, span, objIndex := findObject(ptr, 0, 0) if obj == 0 { continue @@ -264,7 +267,7 @@ func wbBufFlush1(pp *p) { } // Enqueue the greyed objects. - gcw.putBatch(ptrs[:pos]) + gcw.putObjBatch(ptrs[:pos]) pp.wbBuf.reset() } diff --git a/src/runtime/stack.go b/src/runtime/stack.go index 8f11f54ccefd1ddb26b5627c5b029337dadc46de..9707b10876aca8327ef1a3bc9c4a31b85f70e881 100644 --- a/src/runtime/stack.go +++ b/src/runtime/stack.go @@ -10,6 +10,7 @@ import ( "internal/goarch" "internal/goos" "internal/runtime/atomic" + "internal/runtime/gc" "internal/runtime/sys" "unsafe" ) @@ -161,11 +162,11 @@ type stackpoolItem struct { // Global pool of large stack spans. var stackLarge struct { lock mutex - free [heapAddrBits - pageShift]mSpanList // free lists by log_2(s.npages) + free [heapAddrBits - gc.PageShift]mSpanList // free lists by log_2(s.npages) } func stackinit() { - if _StackCacheSize&_PageMask != 0 { + if _StackCacheSize&pageMask != 0 { throw("cache size must be a multiple of page size") } for i := range stackpool { @@ -196,7 +197,7 @@ func stackpoolalloc(order uint8) gclinkptr { lockWithRankMayAcquire(&mheap_.lock, lockRankMheap) if s == nil { // no free stacks. Allocate another span worth. - s = mheap_.allocManual(_StackCacheSize>>_PageShift, spanAllocStack) + s = mheap_.allocManual(_StackCacheSize>>gc.PageShift, spanAllocStack) if s == nil { throw("out of memory") } @@ -390,7 +391,7 @@ func stackalloc(n uint32) stack { v = unsafe.Pointer(x) } else { var s *mspan - npage := uintptr(n) >> _PageShift + npage := uintptr(n) >> gc.PageShift log2npage := stacklog2(npage) // Try to get a stack from the large stack cache. diff --git a/src/runtime/traceallocfree.go b/src/runtime/traceallocfree.go index 84188a55c45bad08569b8c0eaffe7ed88285b85e..119288fb810db27be871ae66dea458cadb58c58b 100644 --- a/src/runtime/traceallocfree.go +++ b/src/runtime/traceallocfree.go @@ -8,6 +8,7 @@ package runtime import ( "internal/abi" + "internal/runtime/gc" "internal/runtime/sys" ) @@ -37,7 +38,7 @@ func traceSnapshotMemory(gen uintptr) { // Emit info. 
w.varint(uint64(trace.minPageHeapAddr)) w.varint(uint64(pageSize)) - w.varint(uint64(minHeapAlign)) + w.varint(uint64(gc.MinHeapAlign)) w.varint(uint64(fixedStack)) // Finish writing the batch. @@ -128,7 +129,7 @@ func (tl traceLocker) HeapObjectFree(addr uintptr) { // traceHeapObjectID creates a trace ID for a heap object at address addr. func traceHeapObjectID(addr uintptr) traceArg { - return traceArg(uint64(addr)-trace.minPageHeapAddr) / minHeapAlign + return traceArg(uint64(addr)-trace.minPageHeapAddr) / gc.MinHeapAlign } // GoroutineStackExists records that a goroutine stack already exists at address base with the provided size.
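Taken as a whole, the new scan path implements the two-bitmap scheme described at the top of mgcmark_greenteagc.go: marks accumulate while a span sits on a queue, and a later pass scans only the objects whose mark bit is newly set. The following self-contained sketch shows that bookkeeping for a single toy span of up to 64 objects; toySpan, mark, takeToScan, and scanAll are invented names, and the real code operates on the 63-byte inline bitmaps with atomics and per-P queues.

package gcsketch

import "math/bits"

// toySpan models the per-span state: one word of mark bits and one of scan bits.
type toySpan struct {
	marks uint64 // set when a pointer to the object is first seen
	scans uint64 // set once the object has been (or is about to be) scanned
}

// mark records that object i was reached. It returns true the first time,
// which is when the real runtime would also try to queue the span.
func (s *toySpan) mark(i uint) bool {
	bit := uint64(1) << i
	if s.marks&bit != 0 {
		return false
	}
	s.marks |= bit
	return true
}

// takeToScan returns the set of objects marked since the last scan pass and
// folds them into scans, mirroring spanSetScans' union/diff step.
func (s *toySpan) takeToScan() uint64 {
	toScan := s.marks &^ s.scans
	s.scans |= toScan
	return toScan
}

// scanAll visits each object in toScan, lowest index first, the way
// scanObjectsSmall walks set bits with TrailingZeros.
func scanAll(toScan uint64, visit func(i int)) {
	for toScan != 0 {
		i := bits.TrailingZeros64(toScan)
		toScan &^= 1 << i
		visit(i)
	}
}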