Make the ARM Zero/Move lowering rules honour config.noDuffDevice.

First hunk (Zero): the DUFFZERO rule now additionally requires
!config.noDuffDevice, and the LoweredZero loop rule also matches
whenever config.noDuffDevice is set, so the sizes the Duff rule no
longer accepts are lowered to the loop form instead.

Second hunk (Move): the same change for DUFFCOPY and LoweredMove.
The logLargeCopy(v, s) call stays the last term of both conditions.

NOTE(review): where config.noDuffDevice gets set for the target is
not visible in this patch; confirm against the ssa config code.
NOTE(review): with the flag set the loop rules match any size;
presumably the small fixed-size rules earlier in ARM.rules still
match first. Confirm against the full rules file.

Keep commentary up here only: the hunk context lines must match the
upstream file exactly and the line counts in the hunk headers are
fixed, so nothing may be added inside the hunks.

Index: src/cmd/compile/internal/ssa/_gen/ARM.rules
--- src/cmd/compile/internal/ssa/_gen/ARM.rules.orig
+++ src/cmd/compile/internal/ssa/_gen/ARM.rules
@@ -297,12 +297,12 @@
 // 4 and 128 are magic constants, see runtime/mkduff.go
 (Zero [s] {t} ptr mem)
 	&& s%4 == 0 && s > 4 && s <= 512
-	&& t.Alignment()%4 == 0 =>
+	&& t.Alignment()%4 == 0 && !config.noDuffDevice =>
 	(DUFFZERO [4 * (128 - s/4)] ptr (MOVWconst [0]) mem)
 
 // Large zeroing uses a loop
 (Zero [s] {t} ptr mem)
-	&& s > 512 || t.Alignment()%4 != 0 =>
+	&& (s > 512 || config.noDuffDevice) || t.Alignment()%4 != 0 =>
 	(LoweredZero [t.Alignment()]
 		ptr
 		(ADDconst <ptr.Type> ptr [int32(s-moveSize(t.Alignment(), config))])
@@ -337,12 +337,12 @@
 // 8 and 128 are magic constants, see runtime/mkduff.go
 (Move [s] {t} dst src mem)
 	&& s%4 == 0 && s > 4 && s <= 512
-	&& t.Alignment()%4 == 0 && logLargeCopy(v, s) =>
+	&& t.Alignment()%4 == 0 && !config.noDuffDevice && logLargeCopy(v, s) =>
 	(DUFFCOPY [8 * (128 - s/4)] dst src mem)
 
 // Large move uses a loop
 (Move [s] {t} dst src mem)
-	&& (s > 512 || t.Alignment()%4 != 0) && logLargeCopy(v, s) =>
+	&& ((s > 512 || config.noDuffDevice) || t.Alignment()%4 != 0) && logLargeCopy(v, s) =>
 	(LoweredMove [t.Alignment()]
 		dst
 		src
