diff --git a/tests/nightly.sh b/tests/nightly.sh
index 91b19f52b5..6631bb0a46 100755
--- a/tests/nightly.sh
+++ b/tests/nightly.sh
@@ -102,3 +102,25 @@ mkdir -p tests/testing-results
 docker run --rm -u $UID:$GUID --volume "${gem5_root}":"${gem5_root}" -w \
     "${gem5_root}" gcr.io/gem5-test/gcn-gpu:latest build/GCN3_X86/gem5.opt \
     configs/example/apu_se.py -n3 -c square
+
+# get HeteroSync (-N: only re-download when the remote copy is newer)
+wget -qN http://dist.gem5.org/dist/develop/test-progs/heterosync/gcn3/allSyncPrims-1kernel
+
+# run HeteroSync sleepMutex -- 16 WGs (4 per CU in default config), each doing
+# 10 Ld/St per thread and 4 iterations of the critical section. This is a
+# reasonable moderate contention case for the default 4 CU GPU config and
+# helps ensure GPU atomics are tested.
+docker run --rm -u $UID:$GUID --volume "${gem5_root}":"${gem5_root}" -w \
+    "${gem5_root}" gcr.io/gem5-test/gcn-gpu:latest build/GCN3_X86/gem5.opt \
+    configs/example/apu_se.py -n3 -c allSyncPrims-1kernel \
+    --options="sleepMutex 10 16 4"
+
+# run HeteroSync lfTreeBarrUniq -- similar setup to sleepMutex above -- 16 WGs
+# accessing unique data and then joining a lock-free barrier, 10 Ld/St per
+# thread, 4 iterations of critical section.  Again this is representative of a
+# moderate contention case for the default 4 CU GPU config and helps ensure GPU
+# atomics are tested.
+docker run --rm -u $UID:$GUID --volume "${gem5_root}":"${gem5_root}" -w \
+    "${gem5_root}" gcr.io/gem5-test/gcn-gpu:latest build/GCN3_X86/gem5.opt \
+    configs/example/apu_se.py -n3 -c allSyncPrims-1kernel \
+    --options="lfTreeBarrUniq 10 16 4"