From 00f8fc1e5f657f016f1440f9a65e2604c193b895 Mon Sep 17 00:00:00 2001
From: James Schloss <jrs.schloss@gmail.com>
Date: Mon, 12 Sep 2022 10:43:12 +0200
Subject: [PATCH 1/7] adding preliminary AMDGPU support

---
 Project.toml                         |  1 +
 src/Molly.jl                         |  9 +++
 src/chain_rules.jl                   |  4 +-
 src/gradients.jl                     | 27 ++++----
 src/interactions/implicit_solvent.jl | 10 +++
 src/neighbors.jl                     |  4 ++
 src/setup.jl                         | 66 +++++++++++++------
 src/types.jl                         | 47 ++++++++------
 src/zygote.jl                        | 12 ++--
 test/basic.jl                        |  5 +-
 test/minimization.jl                 | 40 ++++++------
 test/protein.jl                      | 94 ++++++++++++++--------------
 test/runtests.jl                     | 28 ++++++++-
 test/simulation.jl                   | 80 ++++++++++++-----------
 test/zygote.jl                       | 50 +++++++++------
 15 files changed, 297 insertions(+), 180 deletions(-)

diff --git a/Project.toml b/Project.toml
index 8d4add60..5d45d075 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,6 +4,7 @@ authors = ["Joe G Greener <jgreener@hotmail.co.uk>"]
 version = "0.13.0"
 
 [deps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 AtomsBase = "a963bdd2-2df7-4f54-a1ee-49d51e6be12a"
 BioStructures = "de9282ab-8554-53be-b2d6-f6c222edabfc"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
diff --git a/src/Molly.jl b/src/Molly.jl
index 2ab59c2e..a5989930 100644
--- a/src/Molly.jl
+++ b/src/Molly.jl
@@ -7,6 +7,15 @@ import Chemfiles
 using Colors
 using Combinatorics
 using CUDA
+if has_cuda_gpu()
+    CUDA.allowscalar(false)
+end
+
+using AMDGPU
+if has_rocm_gpu()
+    AMDGPU.allowscalar(false)
+end
+
 using DataStructures
 using Distances
 using Distributions
diff --git a/src/chain_rules.jl b/src/chain_rules.jl
index 2cec54fc..263c043c 100644
--- a/src/chain_rules.jl
+++ b/src/chain_rules.jl
@@ -109,7 +109,7 @@ function ChainRulesCore.rrule(::typeof(unsafe_getindex), arr, inds)
 end
 
 # Not faster on CPU
-function ChainRulesCore.rrule(::typeof(getindices_i), arr::CuArray, neighbors)
+function ChainRulesCore.rrule(::typeof(getindices_i), arr::AT, neighbors) where AT <: Union{CuArray, ROCArray}
     Y = getindices_i(arr, neighbors)
     @views @inbounds function getindices_i_pullback(Ȳ)
         return NoTangent(), accumulate_bounds(Ȳ, neighbors.atom_bounds_i), nothing
@@ -117,7 +117,7 @@ function ChainRulesCore.rrule(::typeof(getindices_i), arr::CuArray, neighbors)
     return Y, getindices_i_pullback
 end
 
-function ChainRulesCore.rrule(::typeof(getindices_j), arr::CuArray, neighbors)
+function ChainRulesCore.rrule(::typeof(getindices_j), arr::AT, neighbors) where AT <: Union{CuArray, ROCArray}
     Y = getindices_j(arr, neighbors)
     @views @inbounds function getindices_j_pullback(Ȳ)
         return NoTangent(), accumulate_bounds(Ȳ[neighbors.sortperm_j], neighbors.atom_bounds_j), nothing
diff --git a/src/gradients.jl b/src/gradients.jl
index 5faf4f01..f2481ec9 100644
--- a/src/gradients.jl
+++ b/src/gradients.jl
@@ -88,9 +88,10 @@ Allows gradients for individual parameters to be tracked.
 Returns atoms, pairwise interactions, specific interaction lists and general
 interactions.
 """
-function inject_gradients(sys, params_dic, gpu::Bool=isa(sys.coords, CuArray))
+function inject_gradients(sys, params_dic, gpu::Bool=isa(sys.coords, Union{CuArray, ROCArray}))
+    AT = isa(sys.coords, ROCArray) ? ROCArray : CuArray # GPU array constructor for this system
     if gpu
-        atoms_grad = CuArray(inject_atom.(Array(sys.atoms), sys.atoms_data, (params_dic,)))
+        atoms_grad = AT(inject_atom.(Array(sys.atoms), sys.atoms_data, (params_dic,)))
     else
         atoms_grad = inject_atom.(sys.atoms, sys.atoms_data, (params_dic,))
     end
@@ -100,7 +101,7 @@ function inject_gradients(sys, params_dic, gpu::Bool=isa(sys.coords, CuArray))
         pis_grad = sys.pairwise_inters
     end
     if length(sys.specific_inter_lists) > 0
-        sis_grad = inject_interaction_list.(sys.specific_inter_lists, (params_dic,), gpu)
+        sis_grad = inject_interaction_list.(sys.specific_inter_lists, (params_dic,), gpu, AT)
     else
         sis_grad = sys.specific_inter_lists
     end
@@ -127,36 +128,40 @@ function inject_atom(at, at_data, params_dic)
     )
 end
 
-function inject_interaction_list(inter::InteractionList1Atoms, params_dic, gpu)
+function inject_interaction_list(inter::InteractionList1Atoms, params_dic, gpu,
+                                 AT)
     if gpu
-        inters_grad = CuArray(inject_interaction.(Array(inter.inters), inter.types, (params_dic,)))
+        inters_grad = AT(inject_interaction.(Array(inter.inters), inter.types, (params_dic,)))
     else
         inters_grad = inject_interaction.(inter.inters, inter.types, (params_dic,))
     end
     InteractionList1Atoms(inter.is, inter.types, inters_grad)
 end
 
-function inject_interaction_list(inter::InteractionList2Atoms, params_dic, gpu)
+function inject_interaction_list(inter::InteractionList2Atoms, params_dic, gpu,
+                                 AT)
     if gpu
-        inters_grad = CuArray(inject_interaction.(Array(inter.inters), inter.types, (params_dic,)))
+        inters_grad = AT(inject_interaction.(Array(inter.inters), inter.types, (params_dic,)))
     else
         inters_grad = inject_interaction.(inter.inters, inter.types, (params_dic,))
     end
     InteractionList2Atoms(inter.is, inter.js, inter.types, inters_grad)
 end
 
-function inject_interaction_list(inter::InteractionList3Atoms, params_dic, gpu)
+function inject_interaction_list(inter::InteractionList3Atoms, params_dic, gpu,
+                                 AT)
     if gpu
-        inters_grad = CuArray(inject_interaction.(Array(inter.inters), inter.types, (params_dic,)))
+        inters_grad = AT(inject_interaction.(Array(inter.inters), inter.types, (params_dic,)))
     else
         inters_grad = inject_interaction.(inter.inters, inter.types, (params_dic,))
     end
     InteractionList3Atoms(inter.is, inter.js, inter.ks, inter.types, inters_grad)
 end
 
-function inject_interaction_list(inter::InteractionList4Atoms, params_dic, gpu)
+function inject_interaction_list(inter::InteractionList4Atoms, params_dic, gpu,
+                                 AT)
     if gpu
-        inters_grad = CuArray(inject_interaction.(Array(inter.inters), inter.types, (params_dic,)))
+        inters_grad = AT(inject_interaction.(Array(inter.inters), inter.types, (params_dic,)))
     else
         inters_grad = inject_interaction.(inter.inters, inter.types, (params_dic,))
     end
diff --git a/src/interactions/implicit_solvent.jl b/src/interactions/implicit_solvent.jl
index a2635648..49e44fea 100644
--- a/src/interactions/implicit_solvent.jl
+++ b/src/interactions/implicit_solvent.jl
@@ -410,6 +410,10 @@ function ImplicitSolventOBC(atoms::AbstractArray{Atom{T, M, D, E}},
         or = CuArray(offset_radii)
         sor = CuArray(scaled_offset_radii)
         is, js = CuArray(inds_i), CuArray(inds_j)
+    elseif isa(atoms, ROCArray) # AMD GPU: mirror the CuArray branch
+        or = ROCArray(offset_radii)
+        sor = ROCArray(scaled_offset_radii)
+        is, js = ROCArray(inds_i), ROCArray(inds_j)
     else
         or = offset_radii
         sor = scaled_offset_radii
@@ -555,6 +559,12 @@ function ImplicitSolventGBN2(atoms::AbstractArray{Atom{T, M, D, E}},
         is, js = CuArray(inds_i), CuArray(inds_j)
         d0s, m0s = CuArray(table_d0), CuArray(table_m0)
         αs, βs, γs = CuArray(αs_cpu), CuArray(βs_cpu), CuArray(γs_cpu)
+    elseif isa(atoms, ROCArray)
+        or = ROCArray(offset_radii)
+        sor = ROCArray(scaled_offset_radii)
+        is, js = ROCArray(inds_i), ROCArray(inds_j)
+        d0s, m0s = ROCArray(table_d0), ROCArray(table_m0)
+        αs, βs, γs = ROCArray(αs_cpu), ROCArray(βs_cpu), ROCArray(γs_cpu)
     else
         or = offset_radii
         sor = scaled_offset_radii
diff --git a/src/neighbors.jl b/src/neighbors.jl
index bcf6d912..3e1e99f2 100644
--- a/src/neighbors.jl
+++ b/src/neighbors.jl
@@ -114,6 +114,10 @@ function DistanceVecNeighborFinder(;
         is = CuArray(hcat([collect(1:n_atoms) for i in 1:n_atoms]...))
         js = CuArray(permutedims(is, (2, 1)))
         m14 = CuArray(matrix_14)
+    elseif isa(nb_matrix, ROCArray) # AMD GPU: mirror the CuArray branch
+        is = ROCArray(hcat([collect(1:n_atoms) for i in 1:n_atoms]...))
+        js = ROCArray(permutedims(is, (2, 1)))
+        m14 = ROCArray(matrix_14)
     else
         is = hcat([collect(1:n_atoms) for i in 1:n_atoms]...)
         js = permutedims(is, (2, 1))
diff --git a/src/setup.jl b/src/setup.jl
index 3ed9ea0f..fb0de7c3 100644
--- a/src/setup.jl
+++ b/src/setup.jl
@@ -13,6 +13,29 @@ export
     is_heavy_atom,
     add_position_restraints
 
+# Resolve the default Array Type (AT) when unspecified; `gpu` is the caller's GPU flag
+function find_array_type(AT, gpu::Bool=false)
+    if AT == AbstractArray
+        if !gpu
+            AT = Array
+        elseif has_cuda_gpu()
+            has_rocm_gpu() && @warn("Both AMD and NVIDIA gpus available!
+                  Defaulting to CuArray...
+                  If you would like to use your AMD GPU, please specify " *
+                  "System(...; AT = ROCArray)")
+            AT = CuArray
+        elseif has_rocm_gpu()
+            AT = ROCArray
+        else
+            throw(ArgumentError("gpu=true but no CUDA or ROCm GPU is available"))
+        end
+    elseif AT != Array && AT != CuArray && AT != ROCArray
+        @warn("Array Type " * string(AT) * " not available! " *
+              "Please use Array, CuArray, or ROCArray.")
+    end
+    return AT
+end
+
 """
     place_atoms(n_atoms, boundary; min_dist=nothing, max_attempts=100)
 
@@ -372,9 +395,12 @@ function System(coord_file::AbstractString,
                 implicit_solvent=nothing,
                 center_coords::Bool=true,
                 rename_terminal_res::Bool=true,
-                kappa=0.0u"nm^-1")
+                kappa=0.0u"nm^-1",
+                AT = AbstractArray)
     T = typeof(force_field.weight_14_coulomb)
 
+    AT = find_array_type(AT, gpu)
+
     # Chemfiles uses zero-based indexing, be careful
     trajectory = Chemfiles.Trajectory(coord_file)
     frame = Chemfiles.read(trajectory)
@@ -721,26 +747,25 @@ function System(coord_file::AbstractString,
     specific_inter_array = []
     if length(bonds.is) > 0
         push!(specific_inter_array, InteractionList2Atoms(
-            bonds.is, bonds.js, bonds.types,
-            gpu ? CuArray([bonds.inters...]) : [bonds.inters...],
+            bonds.is, bonds.js, bonds.types, AT([bonds.inters...]),
         ))
     end
     if length(angles.is) > 0
         push!(specific_inter_array, InteractionList3Atoms(
             angles.is, angles.js, angles.ks, angles.types,
-            gpu ? CuArray([angles.inters...]) : [angles.inters...],
+            AT([angles.inters...]),
         ))
     end
     if length(torsions.is) > 0
         push!(specific_inter_array, InteractionList4Atoms(
             torsions.is, torsions.js, torsions.ks, torsions.ls, torsions.types,
-            gpu ? CuArray(torsion_inters_pad) : torsion_inters_pad,
+            AT(torsion_inters_pad),
         ))
     end
     if length(impropers.is) > 0
         push!(specific_inter_array, InteractionList4Atoms(
             impropers.is, impropers.js, impropers.ks, impropers.ls, impropers.types,
-            gpu ? CuArray(improper_inters_pad) : improper_inters_pad,
+            AT(improper_inters_pad),
         ))
     end
     specific_inter_lists = tuple(specific_inter_array...)
@@ -771,8 +796,8 @@ function System(coord_file::AbstractString,
     atoms = [atoms...]
     if gpu_diff_safe
         neighbor_finder = DistanceVecNeighborFinder(
-            nb_matrix=gpu ? CuArray(nb_matrix) : nb_matrix,
-            matrix_14=gpu ? CuArray(matrix_14) : matrix_14,
+            nb_matrix=AT(nb_matrix),
+            matrix_14=AT(matrix_14),
             n_steps=10,
             dist_cutoff=T(dist_neighbors),
         )
@@ -787,8 +812,8 @@ function System(coord_file::AbstractString,
         )
     end
     if gpu
-        atoms = CuArray(atoms)
-        coords = CuArray(coords)
+        atoms = AT(atoms)
+        coords = AT(coords)
     end
 
     if isnothing(velocities)
@@ -845,7 +870,11 @@ function System(T::Type,
                 gpu_diff_safe::Bool=gpu,
                 dist_cutoff=units ? 1.0u"nm" : 1.0,
                 dist_neighbors=units ? 1.2u"nm" : 1.2,
-                center_coords::Bool=true)
+                center_coords::Bool=true,
+                AT = AbstractArray)
+
+    AT = find_array_type(AT, gpu)
+
     # Read force field and topology file
     atomtypes = Dict{String, Atom}()
     bondtypes = Dict{String, HarmonicBond}()
@@ -1108,20 +1137,19 @@ function System(T::Type,
     specific_inter_array = []
     if length(bonds.is) > 0
         push!(specific_inter_array, InteractionList2Atoms(
-            bonds.is, bonds.js, bonds.types,
-            gpu ? CuArray([bonds.inters...]) : [bonds.inters...],
+            bonds.is, bonds.js, bonds.types, AT([bonds.inters...]),
         ))
     end
     if length(angles.is) > 0
         push!(specific_inter_array, InteractionList3Atoms(
             angles.is, angles.js, angles.ks, angles.types,
-            gpu ? CuArray([angles.inters...]) : [angles.inters...],
+            AT([angles.inters...]),
         ))
     end
     if length(torsions.is) > 0
         push!(specific_inter_array, InteractionList4Atoms(
             torsions.is, torsions.js, torsions.ks, torsions.ls, torsions.types,
-            gpu ? CuArray([torsions.inters...]) : [torsions.inters...],
+            AT([torsions.inters...]),
         ))
     end
     specific_inter_lists = tuple(specific_inter_array...)
@@ -1130,8 +1158,8 @@ function System(T::Type,
 
     if gpu_diff_safe
         neighbor_finder = DistanceVecNeighborFinder(
-            nb_matrix=gpu ? CuArray(nb_matrix) : nb_matrix,
-            matrix_14=gpu ? CuArray(matrix_14) : matrix_14,
+            nb_matrix=AT(nb_matrix),
+            matrix_14=AT(matrix_14),
             n_steps=10,
             dist_cutoff=T(dist_neighbors),
         )
@@ -1146,8 +1174,8 @@ function System(T::Type,
         )
     end
     if gpu
-        atoms = CuArray(atoms)
-        coords = CuArray(coords)
+        atoms = AT(atoms)
+        coords = AT(coords)
     end
 
     if isnothing(velocities)
diff --git a/src/types.jl b/src/types.jl
index b90c7493..b2d594fe 100644
--- a/src/types.jl
+++ b/src/types.jl
@@ -299,7 +299,8 @@ interface described there.
 - `k::K=Unitful.k`: the Boltzmann constant, which may be modified in some
     simulations.
 - `gpu_diff_safe::Bool`: whether to use the code path suitable for the
-    GPU and taking gradients. Defaults to `isa(coords, CuArray)`.
+    GPU and taking gradients. Defaults to
+    `isa(coords, Union{CuArray, ROCArray})`.
 """
 mutable struct System{D, G, T, CU, A, AD, PI, SI, GI, CN, C, V, B, NF, L, F, E, K} <: AbstractSystem{D}
     atoms::A
@@ -333,11 +334,12 @@ function System(;
                 force_units=u"kJ * mol^-1 * nm^-1",
                 energy_units=u"kJ * mol^-1",
                 k=Unitful.k,
-                gpu_diff_safe=isa(coords, CuArray))
+                gpu_diff_safe=isa(coords, Union{CuArray, ROCArray}))
+    AT = Union{CuArray, ROCArray} # Any supported GPU array type, used by the checks below
     D = n_dimensions(boundary)
     G = gpu_diff_safe
     T = float_type(boundary)
-    CU = isa(coords, CuArray)
+    CU = isa(coords, AT)
     A = typeof(atoms)
     AD = typeof(atoms_data)
     PI = typeof(pairwise_inters)
@@ -372,16 +374,16 @@ function System(;
         throw(ArgumentError("There are $(length(atoms)) atoms but $(length(atoms_data)) atom data entries"))
     end
 
-    if isa(atoms, CuArray) && !isa(coords, CuArray)
+    if isa(atoms, AT) && !isa(coords, AT)
         throw(ArgumentError("The atoms are on the GPU but the coordinates are not"))
     end
-    if isa(coords, CuArray) && !isa(atoms, CuArray)
+    if isa(coords, AT) && !isa(atoms, AT)
         throw(ArgumentError("The coordinates are on the GPU but the atoms are not"))
     end
-    if isa(atoms, CuArray) && !isa(vels, CuArray)
+    if isa(atoms, AT) && !isa(vels, AT)
         throw(ArgumentError("The atoms are on the GPU but the velocities are not"))
     end
-    if isa(vels, CuArray) && !isa(atoms, CuArray)
+    if isa(vels, AT) && !isa(atoms, AT)
         throw(ArgumentError("The velocities are on the GPU but the atoms are not"))
     end
 
@@ -389,9 +391,9 @@ function System(;
     K = typeof(k_converted)
 
     return System{D, G, T, CU, A, AD, PI, SI, GI, CN, C, V, B, NF, L, F, E, K}(
-                    atoms, atoms_data, pairwise_inters, specific_inter_lists,
-                    general_inters, constraints, coords, vels, boundary, neighbor_finder,
-                    loggers, force_units, energy_units, k_converted)
+                  atoms, atoms_data, pairwise_inters, specific_inter_lists,
+                  general_inters, constraints, coords, vels, boundary, neighbor_finder,
+                  loggers, force_units, energy_units, k_converted)
 end
 
 """
@@ -456,7 +458,8 @@ of replicas and the neighbor finder should be set up to be same. This can be don
 - `k::K=Unitful.k`: the Boltzmann constant, which may be modified in some
     simulations.
 - `gpu_diff_safe::Bool`: whether to use the code path suitable for the
-    GPU and taking gradients. Defaults to `isa(replica_coords[1], CuArray)`.
+    GPU and taking gradients. Defaults to
+    `isa(replica_coords[1], Union{CuArray, ROCArray})`.
 """
 mutable struct ReplicaSystem{D, G, T, CU, A, AD, RS, B, EL, F, E, K} <: AbstractSystem{D}
     atoms::A
@@ -491,11 +494,13 @@ function ReplicaSystem(;
                         force_units=u"kJ * mol^-1 * nm^-1",
                         energy_units=u"kJ * mol^-1",
                         k=Unitful.k,
-                        gpu_diff_safe=isa(replica_coords[1], CuArray))
+                        gpu_diff_safe=isa(replica_coords[1],
+                                          Union{CuArray, ROCArray}))
+    AT = Union{CuArray, ROCArray} # Any supported GPU array type, used by the checks below
     D = n_dimensions(boundary)
     G = gpu_diff_safe
     T = float_type(boundary)
-    CU = isa(replica_coords[1], CuArray)
+    CU = isa(replica_coords[1], AT)
     A = typeof(atoms)
     AD = typeof(atoms_data)
     C = typeof(replica_coords[1])
@@ -585,25 +590,25 @@ function ReplicaSystem(;
         throw(ArgumentError("There are $(length(atoms)) atoms but $(length(atoms_data)) atom data entries"))
     end
 
-    n_cuarray = sum(y -> isa(y, CuArray), replica_coords)
+    n_cuarray = sum(y -> isa(y, AT), replica_coords)
     if !(n_cuarray == n_replicas || n_cuarray == 0)
         throw(ArgumentError("The coordinates for $n_cuarray out of $n_replicas replicas are on GPU"))
     end
-    if isa(atoms, CuArray) && n_cuarray != n_replicas
+    if isa(atoms, AT) && n_cuarray != n_replicas
         throw(ArgumentError("The atoms are on the GPU but the coordinates are not"))
     end
-    if n_cuarray == n_replicas && !isa(atoms, CuArray)
+    if n_cuarray == n_replicas && !isa(atoms, AT)
         throw(ArgumentError("The coordinates are on the GPU but the atoms are not"))
     end
 
-    n_cuarray = sum(y -> isa(y, CuArray), replica_velocities)
+    n_cuarray = sum(y -> isa(y, AT), replica_velocities)
     if !(n_cuarray == n_replicas || n_cuarray == 0)
         throw(ArgumentError("The velocities for $n_cuarray out of $n_replicas replicas are on GPU"))
     end
-    if isa(atoms, CuArray) && n_cuarray != n_replicas
+    if isa(atoms, AT) && n_cuarray != n_replicas
         throw(ArgumentError("The atoms are on the GPU but the velocities are not"))
     end
-    if n_cuarray == n_replicas && !isa(atoms, CuArray)
+    if n_cuarray == n_replicas && !isa(atoms, AT)
         throw(ArgumentError("The velocities are on the GPU but the atoms are not"))
     end
 
@@ -654,7 +659,9 @@ masses(s::Union{System, ReplicaSystem}) = mass.(s.atoms)
 
 # Move an array to the GPU depending on whether the system is on the GPU
 move_array(arr, ::System{D, G, T, false}) where {D, G, T} = arr
-move_array(arr, ::System{D, G, T, true }) where {D, G, T} = CuArray(arr)
+# Choose the backend from the system's coordinates so that CPU arrays can be moved too
+move_array(arr, sys::System{D, G, T, true }) where {D, G, T} =
+    isa(sys.coords, ROCArray) ? ROCArray(arr) : CuArray(arr)
 
 AtomsBase.species_type(s::Union{System, ReplicaSystem}) = eltype(s.atoms)
 
diff --git a/src/zygote.jl b/src/zygote.jl
index ae2a14f7..37efb5cb 100644
--- a/src/zygote.jl
+++ b/src/zygote.jl
@@ -10,6 +10,8 @@ Zygote.accum(x::AbstractArray{<:SVector}, ys::AbstractArray{<:SizedVector}...) =
 
 Zygote.accum(x::Vector{<:SVector} , y::CuArray{<:SVector}) = Zygote.accum(CuArray(x), y)
 Zygote.accum(x::CuArray{<:SVector}, y::Vector{<:SVector} ) = Zygote.accum(x, CuArray(y))
+Zygote.accum(x::Vector{<:SVector} , y::ROCArray{<:SVector}) = Zygote.accum(ROCArray(x), y)
+Zygote.accum(x::ROCArray{<:SVector}, y::Vector{<:SVector} ) = Zygote.accum(x, ROCArray(y))
 
 Zygote.accum(x::SVector{D, T}, y::T) where {D, T} = x .+ y
 
@@ -136,12 +138,12 @@ end
 # Slower version than in Zygote but doesn't give wrong gradients on the GPU for repeated indices
 # Here we just move it to the CPU then move it back
 # See https://github.com/FluxML/Zygote.jl/pull/1131
-Zygote.∇getindex(x::CuArray, inds::Tuple{AbstractArray{<:Integer}}) = dy -> begin
+Zygote.∇getindex(x::AT, inds::Tuple{AbstractArray{<:Integer}}) where AT <: Union{CuArray, ROCArray} = dy -> begin
     inds1_cpu = Array(inds[1])
     dx = zeros(eltype(dy), length(x))
     dxv = view(dx, inds1_cpu)
     dxv .= Zygote.accum.(dxv, Zygote._droplike(Array(dy), dxv))
-    return Zygote._project(x, CuArray(dx)), nothing
+    return Zygote._project(x, AT(dx)), nothing
 end
 
 # Extend to add extra empty partials before (B) and after (A) the SVector partials
@@ -163,15 +165,15 @@ end
 sized_to_static(v::SizedVector{3, T, Vector{T}}) where {T} = SVector{3, T}(v[1], v[2], v[3])
 sized_to_static(v::SizedVector{2, T, Vector{T}}) where {T} = SVector{2, T}(v[1], v[2])
 
-function modify_grad(ȳ_in::AbstractArray{SizedVector{D, T, Vector{T}}}, arg::CuArray) where {D, T}
-    CuArray(sized_to_static.(ȳ_in))
+function modify_grad(ȳ_in::AbstractArray{SizedVector{D, T, Vector{T}}}, arg::AT) where {D, T, AT <: Union{CuArray, ROCArray}}
+    AT(sized_to_static.(ȳ_in))
 end
 
 function modify_grad(ȳ_in::AbstractArray{SizedVector{D, T, Vector{T}}}, arg) where {D, T}
     sized_to_static.(ȳ_in)
 end
 
-modify_grad(ȳ_in, arg::CuArray) = CuArray(ȳ_in)
+modify_grad(ȳ_in, arg::AT) where AT <: Union{CuArray, ROCArray} = AT(ȳ_in)
 modify_grad(ȳ_in, arg) = ȳ_in
 
 # Dualize a value with extra partials
diff --git a/test/basic.jl b/test/basic.jl
index d89c78e4..be55ef96 100644
--- a/test/basic.jl
+++ b/test/basic.jl
@@ -189,9 +189,12 @@ end
     coords_1 = SVector{3, Float64}.(eachcol(cm_1)) / 10 * u"nm"
     coords_2 = SVector{3, Float64}.(eachcol(cm_2)) / 10 * u"nm"
     @test rmsd(coords_1, coords_2) ≈ 2.54859467758795u"Å"
-    if run_gpu_tests
+    if run_cuda_tests
         @test rmsd(CuArray(coords_1), CuArray(coords_2)) ≈ 2.54859467758795u"Å"
     end
+    if run_rocm_tests
+        @test rmsd(ROCArray(coords_1), ROCArray(coords_2)) ≈ 2.54859467758795u"Å"
+    end
 
     bb_atoms = BioStructures.collectatoms(struc[1], BioStructures.backboneselector)
     coords = SVector{3, Float64}.(eachcol(BioStructures.coordarray(bb_atoms))) / 10 * u"nm"
diff --git a/test/minimization.jl b/test/minimization.jl
index 0979ea15..c77095ba 100644
--- a/test/minimization.jl
+++ b/test/minimization.jl
@@ -43,25 +43,27 @@
                     atol=1e-4u"kJ * mol^-1")
 
     if run_gpu_tests
-        coords = CuArray([
-            SVector(1.0, 1.0, 1.0)u"nm",
-            SVector(1.6, 1.0, 1.0)u"nm",
-            SVector(1.4, 1.6, 1.0)u"nm",
-        ])
-        sys = System(
-            atoms=CuArray([Atom(σ=(0.4 / (2 ^ (1 / 6)))u"nm", ϵ=1.0u"kJ * mol^-1") for i in 1:3]),
-            pairwise_inters=(LennardJones(),),
-            coords=coords,
-            boundary=CubicBoundary(5.0u"nm", 5.0u"nm", 5.0u"nm"),
-        )
-        sim = SteepestDescentMinimizer(tol=1.0u"kJ * mol^-1 * nm^-1")
+        for AT in gpu_array_types
+            coords = AT([
+                SVector(1.0, 1.0, 1.0)u"nm",
+                SVector(1.6, 1.0, 1.0)u"nm",
+                SVector(1.4, 1.6, 1.0)u"nm",
+            ])
+            sys = System(
+                atoms=AT([Atom(σ=(0.4 / (2 ^ (1 / 6)))u"nm", ϵ=1.0u"kJ * mol^-1") for i in 1:3]),
+                pairwise_inters=(LennardJones(),),
+                coords=coords,
+                boundary=CubicBoundary(5.0u"nm", 5.0u"nm", 5.0u"nm"),
+            )
+            sim = SteepestDescentMinimizer(tol=1.0u"kJ * mol^-1 * nm^-1")
     
-        simulate!(sys, sim)
-        dists = distances(sys.coords, sys.boundary)
-        dists_flat = dists[triu(trues(3, 3), 1)]
-        @test all(x -> isapprox(x, 0.4u"nm"; atol=1e-3u"nm"), dists_flat)
-        neighbors = find_neighbors(sys)
-        @test isapprox(potential_energy(sys, neighbors), -3.0u"kJ * mol^-1";
-                        atol=1e-4u"kJ * mol^-1")
+            simulate!(sys, sim)
+            dists = distances(sys.coords, sys.boundary)
+            dists_flat = dists[triu(trues(3, 3), 1)]
+            @test all(x -> isapprox(x, 0.4u"nm"; atol=1e-3u"nm"), dists_flat)
+            neighbors = find_neighbors(sys)
+            @test isapprox(potential_energy(sys, neighbors), -3.0u"kJ * mol^-1";
+                            atol=1e-4u"kJ * mol^-1")
+        end
     end
 end
diff --git a/test/protein.jl b/test/protein.jl
index f74c9543..d93e9475 100644
--- a/test/protein.jl
+++ b/test/protein.jl
@@ -161,53 +161,55 @@ end
 
     # Test the same simulation on the GPU
     if run_gpu_tests
-        sys = System(
-            joinpath(data_dir, "6mrr_equil.pdb"),
-            ff;
-            velocities=CuArray(deepcopy(velocities_start)),
-            gpu=true,
-            center_coords=false,
-        )
-        @test kinetic_energy(sys) ≈ 65521.87288132431u"kJ * mol^-1"
-        @test temperature(sys) ≈ 329.3202932884933u"K"
-
-        neighbors = find_neighbors(sys)
-        @test isapprox(potential_energy(sys, neighbors), E_openmm; atol=1e-5u"kJ * mol^-1")
-
-        simulate!(sys, simulator, n_steps)
-
-        coords_diff = Array(sys.coords) .- wrap_coords.(coords_openmm, (sys.boundary,))
-        vels_diff = Array(sys.velocities) .- vels_openmm
-        @test maximum(maximum(abs.(v)) for v in coords_diff) < 1e-9u"nm"
-        @test maximum(maximum(abs.(v)) for v in vels_diff  ) < 1e-6u"nm * ps^-1"
-
-        sys_nounits = System(
-            joinpath(data_dir, "6mrr_equil.pdb"),
-            ff_nounits;
-            velocities=CuArray(deepcopy(ustrip_vec.(velocities_start))),
-            units=false,
-            gpu=true,
-            center_coords=false,
-        )
-        @test kinetic_energy(sys_nounits)u"kJ * mol^-1" ≈ 65521.87288132431u"kJ * mol^-1"
-        @test temperature(sys_nounits)u"K" ≈ 329.3202932884933u"K"
+        for AT in gpu_array_types
+            sys = System(
+                joinpath(data_dir, "6mrr_equil.pdb"),
+                ff;
+                velocities=AT(deepcopy(velocities_start)),
+                gpu=true,
+                center_coords=false,
+            )
+            @test kinetic_energy(sys) ≈ 65521.87288132431u"kJ * mol^-1"
+            @test temperature(sys) ≈ 329.3202932884933u"K"
 
-        neighbors_nounits = find_neighbors(sys_nounits)
-        @test isapprox(potential_energy(sys_nounits, neighbors_nounits) * u"kJ * mol^-1",
-                        E_openmm; atol=1e-5u"kJ * mol^-1")
-
-        simulate!(sys_nounits, simulator_nounits, n_steps)
-
-        coords_diff = Array(sys_nounits.coords * u"nm") .- wrap_coords.(coords_openmm, (sys.boundary,))
-        vels_diff = Array(sys_nounits.velocities * u"nm * ps^-1") .- vels_openmm
-        @test maximum(maximum(abs.(v)) for v in coords_diff) < 1e-9u"nm"
-        @test maximum(maximum(abs.(v)) for v in vels_diff  ) < 1e-6u"nm * ps^-1"
-
-        params_dic_gpu = extract_parameters(sys_nounits, ff_nounits)
-        @test params_dic == params_dic_gpu
-        atoms_grad, pis_grad, sis_grad, gis_grad = inject_gradients(sys_nounits, params_dic_gpu)
-        @test atoms_grad == sys_nounits.atoms
-        @test pis_grad == sys_nounits.pairwise_inters
+            neighbors = find_neighbors(sys)
+            @test isapprox(potential_energy(sys, neighbors), E_openmm; atol=1e-5u"kJ * mol^-1")
+
+            simulate!(sys, simulator, n_steps)
+
+            coords_diff = Array(sys.coords) .- wrap_coords.(coords_openmm, (sys.boundary,))
+            vels_diff = Array(sys.velocities) .- vels_openmm
+            @test maximum(maximum(abs.(v)) for v in coords_diff) < 1e-9u"nm"
+            @test maximum(maximum(abs.(v)) for v in vels_diff  ) < 1e-6u"nm * ps^-1"
+
+            sys_nounits = System(
+                joinpath(data_dir, "6mrr_equil.pdb"),
+                ff_nounits;
+                velocities=AT(deepcopy(ustrip_vec.(velocities_start))),
+                units=false,
+                gpu=true,
+                center_coords=false,
+            )
+            @test kinetic_energy(sys_nounits)u"kJ * mol^-1" ≈ 65521.87288132431u"kJ * mol^-1"
+            @test temperature(sys_nounits)u"K" ≈ 329.3202932884933u"K"
+
+            neighbors_nounits = find_neighbors(sys_nounits)
+            @test isapprox(potential_energy(sys_nounits, neighbors_nounits) * u"kJ * mol^-1",
+                            E_openmm; atol=1e-5u"kJ * mol^-1")
+
+            simulate!(sys_nounits, simulator_nounits, n_steps)
+
+            coords_diff = Array(sys_nounits.coords * u"nm") .- wrap_coords.(coords_openmm, (sys.boundary,))
+            vels_diff = Array(sys_nounits.velocities * u"nm * ps^-1") .- vels_openmm
+            @test maximum(maximum(abs.(v)) for v in coords_diff) < 1e-9u"nm"
+            @test maximum(maximum(abs.(v)) for v in vels_diff  ) < 1e-6u"nm * ps^-1"
+
+            params_dic_gpu = extract_parameters(sys_nounits, ff_nounits)
+            @test params_dic == params_dic_gpu
+            atoms_grad, pis_grad, sis_grad, gis_grad = inject_gradients(sys_nounits, params_dic_gpu)
+            @test atoms_grad == sys_nounits.atoms
+            @test pis_grad == sys_nounits.pairwise_inters
+        end
     end
 end
 
diff --git a/test/runtests.jl b/test/runtests.jl
index 557e5a1e..37e6d96a 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -42,16 +42,38 @@ else
     @warn "The parallel tests will not be run as Julia is running on 1 thread"
 end
 
-run_gpu_tests = CUDA.functional()
-if run_gpu_tests
+run_cuda_tests = CUDA.functional()
+if run_cuda_tests
     device!(parse(Int, DEVICE))
     @info "The GPU tests will be run on device $DEVICE"
 else
-    @warn "The GPU tests will not be run as a CUDA-enabled device is not available"
+    @warn "The CUDA tests will not be run as a CUDA-enabled device is not available"
 end
 
 CUDA.allowscalar(false) # Check that we never do scalar indexing on the GPU
 
+run_rocm_tests = AMDGPU.functional()
+if run_rocm_tests
+    device!(parse(Int, DEVICE))
+    @info "The GPU tests will be run on device $DEVICE"
+else
+    @warn "The ROCM tests will not be run as a ROCM-enabled device is not " *
+          "available"
+end
+
+AMDGPU.allowscalar(false)
+
+run_gpu_tests = run_cuda_tests || run_rocm_tests
+gpu_array_types = []
+if run_gpu_tests
+    if run_cuda_tests
+        push!(gpu_array_types, CuArray)
+    end
+    if run_rocm_tests # ROCArray is only usable when the ROCm backend is functional
+        push!(gpu_array_types, ROCArray)
+    end
+end
+
 data_dir = normpath(@__DIR__, "..", "data")
 ff_dir = joinpath(data_dir, "force_fields")
 
diff --git a/test/simulation.jl b/test/simulation.jl
index a990606f..b4efcd6f 100644
--- a/test/simulation.jl
+++ b/test/simulation.jl
@@ -384,35 +384,37 @@ end
 
 @testset "Position restraints" begin
     gpu_list = run_gpu_tests ? (false, true) : (false,)
-    for gpu in gpu_list
-        n_atoms = 10
-        n_atoms_res = n_atoms ÷ 2
-        n_steps = 2_000
-        boundary = CubicBoundary(2.0u"nm", 2.0u"nm", 2.0u"nm")
-        starting_coords = place_atoms(n_atoms, boundary; min_dist=0.3u"nm")
-        atoms = [Atom(charge=0.0, mass=10.0u"u", σ=0.2u"nm", ϵ=0.2u"kJ * mol^-1") for i in 1:n_atoms]
-        atoms_data = [AtomData(atom_type=(i <= n_atoms_res ? "A1" : "A2")) for i in 1:n_atoms]
-        sim = Langevin(dt=0.001u"ps", temperature=300.0u"K", friction=1.0u"ps^-1")
-
-        sys = System(
-            atoms=gpu ? CuArray(atoms) : atoms,
-            atoms_data=atoms_data,
-            pairwise_inters=(LennardJones(),),
-            coords=gpu ? CuArray(deepcopy(starting_coords)) : deepcopy(starting_coords),
-            boundary=boundary,
-            loggers=(coords=CoordinateLogger(100),),
-        )
+    for AT in (Array, gpu_array_types...) # always cover the CPU, even when no GPU backend is usable
+        for gpu in (AT !== Array,) # single pass: gpu is true only for device array types
+            n_atoms = 10
+            n_atoms_res = n_atoms ÷ 2
+            n_steps = 2_000
+            boundary = CubicBoundary(2.0u"nm", 2.0u"nm", 2.0u"nm")
+            starting_coords = place_atoms(n_atoms, boundary; min_dist=0.3u"nm")
+            atoms = [Atom(charge=0.0, mass=10.0u"u", σ=0.2u"nm", ϵ=0.2u"kJ * mol^-1") for i in 1:n_atoms]
+            atoms_data = [AtomData(atom_type=(i <= n_atoms_res ? "A1" : "A2")) for i in 1:n_atoms]
+            sim = Langevin(dt=0.001u"ps", temperature=300.0u"K", friction=1.0u"ps^-1")
+
+            sys = System(
+                atoms=gpu ? AT(atoms) : atoms,
+                atoms_data=atoms_data,
+                pairwise_inters=(LennardJones(),),
+                coords=gpu ? AT(deepcopy(starting_coords)) : deepcopy(starting_coords),
+                boundary=boundary,
+                loggers=(coords=CoordinateLogger(100),),
+            )
 
-        atom_selector(at, at_data) = at_data.atom_type == "A1"
+            atom_selector(at, at_data) = at_data.atom_type == "A1"
 
-        sys_res = add_position_restraints(sys, 100_000.0u"kJ * mol^-1 * nm^-2";
-                                          atom_selector=atom_selector)
+            sys_res = add_position_restraints(sys, 100_000.0u"kJ * mol^-1 * nm^-2";
+                                              atom_selector=atom_selector)
 
-        @time simulate!(sys_res, sim, n_steps)
+            @time simulate!(sys_res, sim, n_steps)
 
-        dists = norm.(vector.(starting_coords, Array(sys_res.coords), (boundary,)))
-        @test maximum(dists[1:n_atoms_res]) < 0.1u"nm"
-        @test median(dists[(n_atoms_res + 1):end]) > 0.2u"nm"
+            dists = norm.(vector.(starting_coords, Array(sys_res.coords), (boundary,)))
+            @test maximum(dists[1:n_atoms_res]) < 0.1u"nm"
+            @test median(dists[(n_atoms_res + 1):end]) > 0.2u"nm"
+        end
     end
 end
 
@@ -736,7 +738,7 @@ end
     starting_coords_f32 = [Float32.(c) for c in starting_coords]
     starting_velocities_f32 = [Float32.(c) for c in starting_velocities]
 
-    function test_sim(nl::Bool, parallel::Bool, gpu_diff_safe::Bool, f32::Bool, gpu::Bool)
+    function test_sim(nl::Bool, parallel::Bool, gpu_diff_safe::Bool, f32::Bool, gpu::Bool; AT = Array)
         n_atoms = 400
         n_steps = 200
         atom_mass = f32 ? 10.0f0u"u" : 10.0u"u"
@@ -749,7 +751,7 @@ end
             InteractionList2Atoms(collect(1:2:n_atoms),
             collect(2:2:n_atoms),
             fill("", length(bonds)),
-            gpu ? CuArray(bonds) : bonds,
+            gpu ? AT(bonds) : bonds,
         ),)
 
         neighbor_finder = NoNeighborFinder()
@@ -758,7 +760,7 @@ end
         if nl
             if gpu_diff_safe
                 neighbor_finder = DistanceVecNeighborFinder(
-                    nb_matrix=gpu ? CuArray(trues(n_atoms, n_atoms)) : trues(n_atoms, n_atoms),
+                    nb_matrix=gpu ? AT(trues(n_atoms, n_atoms)) : trues(n_atoms, n_atoms),
                     n_steps=10,
                     dist_cutoff=f32 ? 1.5f0u"nm" : 1.5u"nm",
                 )
@@ -774,9 +776,9 @@ end
         show(devnull, neighbor_finder)
 
         if gpu
-            coords = CuArray(deepcopy(f32 ? starting_coords_f32 : starting_coords))
-            velocities = CuArray(deepcopy(f32 ? starting_velocities_f32 : starting_velocities))
-            atoms = CuArray([Atom(charge=f32 ? 0.0f0 : 0.0, mass=atom_mass, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
+            coords = AT(deepcopy(f32 ? starting_coords_f32 : starting_coords))
+            velocities = AT(deepcopy(f32 ? starting_velocities_f32 : starting_velocities))
+            atoms = AT([Atom(charge=f32 ? 0.0f0 : 0.0, mass=atom_mass, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
                                   ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms])
         else
             coords = deepcopy(f32 ? starting_coords_f32 : starting_coords)
@@ -820,10 +822,18 @@ end
         push!(runs, ("in-place NL parallel", [true , true , false, false, false]))
     end
     if run_gpu_tests
-        push!(runs, ("out-of-place gpu"       , [false, false, true , false, true ]))
-        push!(runs, ("out-of-place gpu f32"   , [false, false, true , true , true ]))
-        push!(runs, ("out-of-place gpu NL"    , [true , false, true , false, true ]))
-        push!(runs, ("out-of-place gpu f32 NL", [true , false, true , true , true ]))
+        if run_cuda_tests
+            push!(runs, ("out-of-place gpu"       , [false, false, true , false, true, AT = CuArray]))
+            push!(runs, ("out-of-place gpu f32"   , [false, false, true , true , true, AT = CuArray]))
+            push!(runs, ("out-of-place gpu NL"    , [true , false, true , false, true, AT = CuArray]))
+            push!(runs, ("out-of-place gpu f32 NL", [true , false, true , true , true, AT = CuArray]))
+        end
+        if run_rocm_tests
+            push!(runs, ("out-of-place gpu"       , [false, false, true , false, true, AT = ROCArray]))
+            push!(runs, ("out-of-place gpu f32"   , [false, false, true , true , true, AT = ROCArray]))
+            push!(runs, ("out-of-place gpu NL"    , [true , false, true , false, true, AT = ROCArray]))
+            push!(runs, ("out-of-place gpu f32 NL", [true , false, true , true , true, AT = ROCArray]))
+        end
     end
 
     final_coords_ref, E_start_ref = test_sim(runs[1][2]...)
diff --git a/test/zygote.jl b/test/zygote.jl
index bda9beb3..a5a48485 100644
--- a/test/zygote.jl
+++ b/test/zygote.jl
@@ -37,7 +37,7 @@
     end
 
     function test_grad(gpu::Bool, forward::Bool, f32::Bool, pis::Bool,
-                        sis::Bool, obc2::Bool, gbn2::Bool)
+                        sis::Bool, obc2::Bool, gbn2::Bool; AT = Array)
         n_atoms = 50
         n_steps = 100
         atom_mass = f32 ? 10.0f0 : 10.0
@@ -75,7 +75,7 @@
             collect(16:30),
             collect(31:45),
             fill("", 15),
-            gpu ? CuArray(angles_inner) : angles_inner,
+            gpu ? AT(angles_inner) : angles_inner,
         )
         torsions_inner = [PeriodicTorsion(
                 periodicities=[1, 2, 3],
@@ -89,12 +89,12 @@
             collect(21:30),
             collect(31:40),
             fill("", 10),
-            gpu ? CuArray(torsions_inner) : torsions_inner,
+            gpu ? AT(torsions_inner) : torsions_inner,
         )
         atoms_setup = [Atom(charge=f32 ? 0.0f0 : 0.0, σ=f32 ? 0.0f0 : 0.0) for i in 1:n_atoms]
         if obc2
             imp_obc2 = ImplicitSolventOBC(
-                gpu ? CuArray(atoms_setup) : atoms_setup,
+                gpu ? AT(atoms_setup) : atoms_setup,
                 [AtomData(element="O") for i in 1:n_atoms],
                 InteractionList2Atoms(bond_is, bond_js, [""], nothing);
                 use_OBC2=true,
@@ -102,7 +102,7 @@
             general_inters = (imp_obc2,)
         elseif gbn2
             imp_gbn2 = ImplicitSolventGBN2(
-                gpu ? CuArray(atoms_setup) : atoms_setup,
+                gpu ? AT(atoms_setup) : atoms_setup,
                 [AtomData(element="O") for i in 1:n_atoms],
                 InteractionList2Atoms(bond_is, bond_js, [""], nothing),
             )
@@ -111,7 +111,7 @@
             general_inters = ()
         end
         neighbor_finder = DistanceVecNeighborFinder(
-            nb_matrix=gpu ? CuArray(trues(n_atoms, n_atoms)) : trues(n_atoms, n_atoms),
+            nb_matrix=gpu ? AT(trues(n_atoms, n_atoms)) : trues(n_atoms, n_atoms),
             n_steps=10,
             dist_cutoff=f32 ? 1.5f0 : 1.5,
         )
@@ -128,18 +128,18 @@
                 bond_is,
                 bond_js,
                 fill("", length(bonds_inner)),
-                gpu ? CuArray(bonds_inner) : bonds_inner,
+                gpu ? AT(bonds_inner) : bonds_inner,
             )
             cs = deepcopy(forward ? coords_dual : coords)
             vs = deepcopy(forward ? velocities_dual : velocities)
 
             s = System(
-                atoms=gpu ? CuArray(atoms) : atoms,
+                atoms=gpu ? AT(atoms) : atoms,
                 pairwise_inters=pairwise_inters,
                 specific_inter_lists=sis ? (bonds, angles, torsions) : (),
                 general_inters=general_inters,
-                coords=gpu ? CuArray(cs) : cs,
-                velocities=gpu ? CuArray(vs) : vs,
+                coords=gpu ? AT(cs) : cs,
+                velocities=gpu ? AT(vs) : vs,
                 boundary=boundary,
                 neighbor_finder=neighbor_finder,
                 gpu_diff_safe=true,
@@ -165,15 +165,27 @@
         ("cpu gbn2"        , [false, false, false, true , true , false, true ], 0.1 , 0.25),
         ("cpu gbn2 forward", [false, true , false, true , true , false, true ], 0.02, 0.02),
     ]
-    if run_gpu_tests #                    gpu    fwd    f32    pis    sis    obc2   gbn2
-        push!(runs, ("gpu"             , [true , false, false, true , true , false, false], 0.25, 20.0))
-        push!(runs, ("gpu forward"     , [true , true , false, true , true , false, false], 0.01, 0.01))
-        push!(runs, ("gpu f32"         , [true , false, true , true , true , false, false], 0.5 , 50.0))
-        push!(runs, ("gpu nospecific"  , [true , false, false, true , false, false, false], 0.25, 0.0 ))
-        push!(runs, ("gpu nopairwise"  , [true , false, false, false, true , false, false], 0.0 , 10.0))
-        push!(runs, ("gpu obc2"        , [true , false, false, true , true , true , false], 0.25, 20.0))
-        push!(runs, ("gpu gbn2"        , [true , false, false, true , true , false, true ], 0.25, 20.0))
-        push!(runs, ("gpu gbn2 forward", [true , true , false, true , true , false, true ], 0.02, 0.02))
+    if run_gpu_tests #                         gpu    fwd    f32    pis    sis    obc2   gbn2
+        if run_cuda_tests
+            push!(runs, ("cuda"             , [true , false, false, true , true , false, false, AT = CuArray], 0.25, 20.0))
+            push!(runs, ("cuda forward"     , [true , true , false, true , true , false, false, AT = CuArray], 0.01, 0.01))
+            push!(runs, ("cuda f32"         , [true , false, true , true , true , false, false, AT = CuArray], 0.5 , 50.0))
+            push!(runs, ("cuda nospecific"  , [true , false, false, true , false, false, false, AT = CuArray], 0.25, 0.0 ))
+            push!(runs, ("cuda nopairwise"  , [true , false, false, false, true , false, false, AT = CuArray], 0.0 , 10.0))
+            push!(runs, ("cuda obc2"        , [true , false, false, true , true , true , false, AT = CuArray], 0.25, 20.0))
+            push!(runs, ("cuda gbn2"        , [true , false, false, true , true , false, true , AT = CuArray], 0.25, 20.0))
+            push!(runs, ("cuda gbn2 forward", [true , true , false, true , true , false, true , AT = CuArray], 0.02, 0.02))
+        end
+        if run_rocm_tests
+            push!(runs, ("rocm"             , [true , false, false, true , true , false, false, AT = ROCArray], 0.25, 20.0))
+            push!(runs, ("rocm forward"     , [true , true , false, true , true , false, false, AT = ROCArray], 0.01, 0.01))
+            push!(runs, ("rocm f32"         , [true , false, true , true , true , false, false, AT = ROCArray], 0.5 , 50.0))
+            push!(runs, ("rocm nospecific"  , [true , false, false, true , false, false, false, AT = ROCArray], 0.25, 0.0 ))
+            push!(runs, ("rocm nopairwise"  , [true , false, false, false, true , false, false, AT = ROCArray], 0.0 , 10.0))
+            push!(runs, ("rocm obc2"        , [true , false, false, true , true , true , false, AT = ROCArray], 0.25, 20.0))
+            push!(runs, ("rocm gbn2"        , [true , false, false, true , true , false, true , AT = ROCArray], 0.25, 20.0))
+            push!(runs, ("rocm gbn2 forward", [true , true , false, true , true , false, true , AT = ROCArray], 0.02, 0.02))
+        end
     end
 
     for (name, args, tol_σ, tol_k) in runs

From a3481fb8eab40d2347baa15edb0b12fa637c849f Mon Sep 17 00:00:00 2001
From: James Schloss <jrs.schloss@gmail.com>
Date: Wed, 14 Sep 2022 13:30:45 +0200
Subject: [PATCH 2/7] it compiles and runs basic examples on CPU and GPU. Now
 for tests

---
 Project.toml     |  1 +
 src/gradients.jl |  4 ++--
 src/setup.jl     | 39 +++++++++++++++------------------------
 src/types.jl     | 37 ++++++++++++++++++++-----------------
 src/zygote.jl    |  8 +++++---
 test/runtests.jl |  5 +++--
 6 files changed, 46 insertions(+), 48 deletions(-)

diff --git a/Project.toml b/Project.toml
index 5d45d075..6994e0bd 100644
--- a/Project.toml
+++ b/Project.toml
@@ -33,6 +33,7 @@ UnitfulChainRules = "f31437dd-25a7-4345-875f-756556e6935d"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [compat]
+AMDGPU = "0.4"
 AtomsBase = "0.2"
 BioStructures = "1"
 CUDA = "3"
diff --git a/src/gradients.jl b/src/gradients.jl
index f2481ec9..798cd7f9 100644
--- a/src/gradients.jl
+++ b/src/gradients.jl
@@ -88,8 +88,8 @@ Allows gradients for individual parameters to be tracked.
 Returns atoms, pairwise interactions, specific interaction lists and general
 interactions.
 """
-function inject_gradients(sys, params_dic,
-                          gpu::Bool=isa(sys.coords, AT)) where AT <: Union{CuArray, ROCArray}
+function inject_gradients(sys, params_dic; AT = find_array_type(sys.coords),
+                          gpu::Bool = (AT <: Union{CuArray, ROCArray}))
     if gpu
         atoms_grad = AT(inject_atom.(Array(sys.atoms), sys.atoms_data, (params_dic,)))
     else
diff --git a/src/setup.jl b/src/setup.jl
index fb0de7c3..786cc3c3 100644
--- a/src/setup.jl
+++ b/src/setup.jl
@@ -14,24 +14,19 @@ export
     add_position_restraints
 
 # Creating default Array Type (AT) for users who did not specify
-function find_array_type(AT)
-    if AT == AbstractArray
-        if !gpu
-            AT = Array
-        elseif has_rocm_gpu() && has_cuda_gpu()
-            @warn("Both AMD and NVIDIA gpus available!
-                  Defaulting to CuArray...
-                  If you would like to use your AMD GPU, please specify " *
-                  "System(...; AT = ROCArray)")
-            AT = CuArray
-        elseif has_cuda_gpu()
-            AT = CuArray
-        elseif has_rocm_gpu()
-            AT = ROCArray
-        end
-    elseif AT != Array && AT != CuArray && AT != ROCArray
-        @warn("Array Type " * string(typeof(AT)) * " not available! " *
-              "Please use Array, CuArray, or ROCArray.")
+function configure_array_type(gpu)
+    gpu || return Array
+    # CUDA takes priority when both vendors are present; fail loudly when neither is.
+    if has_cuda_gpu()
+        has_rocm_gpu() && @warn("Both AMD and NVIDIA gpus available!\n"*
+              "Defaulting to CuArray...\n"*
+              "If you would like to use your AMD GPU, please specify " *
+              "System(...; AT = ROCArray)")
+        AT = CuArray
+    elseif has_rocm_gpu()
+        AT = ROCArray
+    else
+        throw(ArgumentError("gpu=true but no CUDA or ROCm GPU is available"))
     end
     return AT
 end
@@ -396,11 +391,9 @@ function System(coord_file::AbstractString,
                 center_coords::Bool=true,
                 rename_terminal_res::Bool=true,
                 kappa=0.0u"nm^-1",
-                AT = AbstractArray)
+                AT = configure_array_type(gpu))
     T = typeof(force_field.weight_14_coulomb)
 
-    AT = find_array_type(AT)
-
     # Chemfiles uses zero-based indexing, be careful
     trajectory = Chemfiles.Trajectory(coord_file)
     frame = Chemfiles.read(trajectory)
@@ -871,9 +864,7 @@ function System(T::Type,
                 dist_cutoff=units ? 1.0u"nm" : 1.0,
                 dist_neighbors=units ? 1.2u"nm" : 1.2,
                 center_coords::Bool=true,
-                AT = AbstractArray)
-
-    AT = find_array_type(AT)
+                AT = configure_array_type(gpu))
 
     # Read force field and topology file
     atomtypes = Dict{String, Atom}()
diff --git a/src/types.jl b/src/types.jl
index b2d594fe..3e898287 100644
--- a/src/types.jl
+++ b/src/types.jl
@@ -319,6 +319,10 @@ mutable struct System{D, G, T, CU, A, AD, PI, SI, GI, CN, C, V, B, NF, L, F, E,
     k::K
 end
 
+function find_array_type(a)
+    return typeof(a).name.wrapper
+end
+
 function System(;
                 atoms,
                 atoms_data=[],
@@ -334,12 +338,12 @@ function System(;
                 force_units=u"kJ * mol^-1 * nm^-1",
                 energy_units=u"kJ * mol^-1",
                 k=Unitful.k,
-                gpu_diff_safe=isa(coords, AT)) where AT <: Union{CuArray,
-                                                                 ROCArray}
+                AT=find_array_type(coords),
+                gpu_diff_safe=(AT <: Union{CuArray, ROCArray}))
     D = n_dimensions(boundary)
     G = gpu_diff_safe
     T = float_type(boundary)
-    CU = isa(coords, AT)
+    CU = AT <: Union{CuArray, ROCArray}
     A = typeof(atoms)
     AD = typeof(atoms_data)
     PI = typeof(pairwise_inters)
@@ -375,16 +379,16 @@ function System(;
     end
 
     if isa(atoms, AT) && !isa(coords, AT)
-        throw(ArgumentError("The atoms are on the GPU but the coordinates are not"))
+        throw(ArgumentError("The atoms and coordinates are on different devices!"))
     end
     if isa(coords, AT) && !isa(atoms, AT)
-        throw(ArgumentError("The coordinates are on the GPU but the atoms are not"))
+        throw(ArgumentError("The coordinates and atoms are on different devices!"))
     end
     if isa(atoms, AT) && !isa(vels, AT)
-        throw(ArgumentError("The atoms are on the GPU but the velocities are not"))
+        throw(ArgumentError("The atoms and velocities are on different devices!"))
     end
     if isa(vels, AT) && !isa(atoms, AT)
-        throw(ArgumentError("The velocities are on the GPU but the atoms are not"))
+        throw(ArgumentError("The velocities and atoms are on different devices!"))
     end
 
     k_converted = convert_k_units(T, k, energy_units)
@@ -494,13 +498,12 @@ function ReplicaSystem(;
                         force_units=u"kJ * mol^-1 * nm^-1",
                         energy_units=u"kJ * mol^-1",
                         k=Unitful.k,
-                        gpu_diff_safe=isa(replica_coords[1],
-                                          AT)) where AT <: Union{CuArray,
-                                                                 ROCArray}
+                        AT=find_array_type(replica_coords[1]),
+                        gpu_diff_safe =  (AT <: Union{CuArray, ROCArray}))
     D = n_dimensions(boundary)
     G = gpu_diff_safe
     T = float_type(boundary)
-    CU = isa(replica_coords[1], AT)
+    CU = AT <: Union{CuArray, ROCArray}
     A = typeof(atoms)
     AD = typeof(atoms_data)
     C = typeof(replica_coords[1])
@@ -592,24 +595,24 @@ function ReplicaSystem(;
 
     n_cuarray = sum(y -> isa(y, AT), replica_coords)
     if !(n_cuarray == n_replicas || n_cuarray == 0)
-        throw(ArgumentError("The coordinates for $n_cuarray out of $n_replicas replicas are on GPU"))
+        throw(ArgumentError("The coordinates for $n_cuarray out of $n_replicas replicas are on a different device!"))
     end
     if isa(atoms, AT) && n_cuarray != n_replicas
-        throw(ArgumentError("The atoms are on the GPU but the coordinates are not"))
+        throw(ArgumentError("The atoms and coordinates are on different devices!"))
     end
     if n_cuarray == n_replicas && !isa(atoms, AT)
-        throw(ArgumentError("The coordinates are on the GPU but the atoms are not"))
+        throw(ArgumentError("The coordinates and atoms are on different devices!"))
     end
 
     n_cuarray = sum(y -> isa(y, AT), replica_velocities)
     if !(n_cuarray == n_replicas || n_cuarray == 0)
-        throw(ArgumentError("The velocities for $n_cuarray out of $n_replicas replicas are on GPU"))
+        throw(ArgumentError("The velocities for $n_cuarray out of $n_replicas replicas are on a different device!"))
     end
     if isa(atoms, AT) && n_cuarray != n_replicas
-        throw(ArgumentError("The atoms are on the GPU but the velocities are not"))
+        throw(ArgumentError("The atoms and velocities are on different devices!"))
     end
     if n_cuarray == n_replicas && !isa(atoms, AT)
-        throw(ArgumentError("The velocities are on the GPU but the atoms are not"))
+        throw(ArgumentError("The velocities and atoms are on different devices!"))
     end
 
     k_converted = convert_k_units(T, k, energy_units)
diff --git a/src/zygote.jl b/src/zygote.jl
index 37efb5cb..33adbf1d 100644
--- a/src/zygote.jl
+++ b/src/zygote.jl
@@ -138,11 +138,12 @@ end
 # Slower version than in Zygote but doesn't give wrong gradients on the GPU for repeated indices
 # Here we just move it to the CPU then move it back
 # See https://github.com/FluxML/Zygote.jl/pull/1131
-Zygote.∇getindex(x::AT, inds::Tuple{AbstractArray{<:Integer}}) where AT <: Union{CuArray, ROCArray} = dy -> begin
+Zygote.∇getindex(x::Union{CuArray, ROCArray}, inds::Tuple{AbstractArray{<:Integer}}) = dy -> begin
     inds1_cpu = Array(inds[1])
     dx = zeros(eltype(dy), length(x))
     dxv = view(dx, inds1_cpu)
     dxv .= Zygote.accum.(dxv, Zygote._droplike(Array(dy), dxv))
+    AT = find_array_type(x)
     return Zygote._project(x, AT(dx)), nothing
 end
 
@@ -165,7 +166,8 @@ end
 sized_to_static(v::SizedVector{3, T, Vector{T}}) where {T} = SVector{3, T}(v[1], v[2], v[3])
 sized_to_static(v::SizedVector{2, T, Vector{T}}) where {T} = SVector{2, T}(v[1], v[2])
 
-function modify_grad(ȳ_in::AbstractArray{SizedVector{D, T, Vector{T}}}, arg::AT) where {D, T, AT <: Union{CuArray, ROCArray}}
+function modify_grad(ȳ_in::AbstractArray{SizedVector{D, T, Vector{T}}}, arg::Union{CuArray, ROCArray}) where {D, T}
+    AT = find_array_type(arg)
     AT(sized_to_static.(ȳ_in))
 end
 
@@ -173,7 +175,7 @@ function modify_grad(ȳ_in::AbstractArray{SizedVector{D, T, Vector{T}}}, arg) wh
     sized_to_static.(ȳ_in)
 end
 
-modify_grad(ȳ_in, arg::AT) where AT <: Union{CuArray, ROCArray} = AT(ȳ_in)
+modify_grad(ȳ_in, arg::AT) where AT <: Union{CuArray, ROCArray} = find_array_type(arg)(ȳ_in)
 modify_grad(ȳ_in, arg) = ȳ_in
 
 # Dualize a value with extra partials
diff --git a/test/runtests.jl b/test/runtests.jl
index 37e6d96a..376565c4 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -2,6 +2,7 @@ using Molly
 using Aqua
 import BioStructures # Imported to avoid clashing names
 using CUDA
+using AMDGPU
 using FiniteDifferences
 using ForwardDiff
 using Zygote
@@ -54,8 +55,8 @@ CUDA.allowscalar(false) # Check that we never do scalar indexing on the GPU
 
 run_rocm_tests = AMDGPU.functional()
 if run_rocm_tests
-    device!(parse(Int, DEVICE))
-    @info "The GPU tests will be run on device $DEVICE"
+    AMDGPU.default_device_id!(parse(Int, DEVICE)+1)
+    @info "The GPU tests will be run on device " * string(DEVICE + 1)
 else
     @warn "The ROCM tests will not be run as a ROCM-enabled device is not availa
 ble"

From b42d77aa9e6b5fac32921ee5b8f0393d69b972fd Mon Sep 17 00:00:00 2001
From: James Schloss <jrs.schloss@gmail.com>
Date: Thu, 15 Sep 2022 10:27:29 +0200
Subject: [PATCH 3/7] fixing tests

---
 test/runtests.jl   |  2 +-
 test/simulation.jl | 44 ++++++++++++++---------------
 test/zygote.jl     | 70 +++++++++++++++++++++++-----------------------
 3 files changed, 58 insertions(+), 58 deletions(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 376565c4..3d2646b7 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -56,7 +56,7 @@ CUDA.allowscalar(false) # Check that we never do scalar indexing on the GPU
 run_rocm_tests = AMDGPU.functional()
 if run_rocm_tests
     AMDGPU.default_device_id!(parse(Int, DEVICE)+1)
-    @info "The GPU tests will be run on device " * string(DEVICE + 1)
+    @info "The GPU tests will be run on device " * string(parse(Int, DEVICE) + 1)
 else
     @warn "The ROCM tests will not be run as a ROCM-enabled device is not availa
 ble"
diff --git a/test/simulation.jl b/test/simulation.jl
index b4efcd6f..fcbc0e7d 100644
--- a/test/simulation.jl
+++ b/test/simulation.jl
@@ -738,7 +738,7 @@ end
     starting_coords_f32 = [Float32.(c) for c in starting_coords]
     starting_velocities_f32 = [Float32.(c) for c in starting_velocities]
 
-    function test_sim(nl::Bool, parallel::Bool, gpu_diff_safe::Bool, f32::Bool, gpu::Bool; AT = Array)
+    function test_sim(nl::Bool, parallel::Bool, gpu_diff_safe::Bool, f32::Bool, gpu::Bool, array_type)
         n_atoms = 400
         n_steps = 200
         atom_mass = f32 ? 10.0f0u"u" : 10.0u"u"
@@ -751,7 +751,7 @@ end
             InteractionList2Atoms(collect(1:2:n_atoms),
             collect(2:2:n_atoms),
             fill("", length(bonds)),
-            gpu ? AT(bonds) : bonds,
+            gpu ? array_type(bonds) : bonds,
         ),)
 
         neighbor_finder = NoNeighborFinder()
@@ -760,7 +760,7 @@ end
         if nl
             if gpu_diff_safe
                 neighbor_finder = DistanceVecNeighborFinder(
-                    nb_matrix=gpu ? AT(trues(n_atoms, n_atoms)) : trues(n_atoms, n_atoms),
+                    nb_matrix=gpu ? array_type(trues(n_atoms, n_atoms)) : trues(n_atoms, n_atoms),
                     n_steps=10,
                     dist_cutoff=f32 ? 1.5f0u"nm" : 1.5u"nm",
                 )
@@ -776,9 +776,9 @@ end
         show(devnull, neighbor_finder)
 
         if gpu
-            coords = AT(deepcopy(f32 ? starting_coords_f32 : starting_coords))
-            velocities = AT(deepcopy(f32 ? starting_velocities_f32 : starting_velocities))
-            atoms = AT([Atom(charge=f32 ? 0.0f0 : 0.0, mass=atom_mass, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
+            coords = array_type(deepcopy(f32 ? starting_coords_f32 : starting_coords))
+            velocities = array_type(deepcopy(f32 ? starting_velocities_f32 : starting_velocities))
+            atoms = array_type([Atom(charge=f32 ? 0.0f0 : 0.0, mass=atom_mass, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
                                   ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms])
         else
             coords = deepcopy(f32 ? starting_coords_f32 : starting_coords)
@@ -810,29 +810,29 @@ end
     end
 
     runs = [
-        ("in-place"        , [false, false, false, false, false]),
-        ("in-place NL"     , [true , false, false, false, false]),
-        ("in-place f32"    , [false, false, false, true , false]),
-        ("out-of-place"    , [false, false, true , false, false]),
-        ("out-of-place NL" , [true , false, true , false, false]),
-        ("out-of-place f32", [false, false, true , true , false]),
+        ("in-place"        , [false, false, false, false, false, Array]),
+        ("in-place NL"     , [true , false, false, false, false, Array]),
+        ("in-place f32"    , [false, false, false, true , false, Array]),
+        ("out-of-place"    , [false, false, true , false, false, Array]),
+        ("out-of-place NL" , [true , false, true , false, false, Array]),
+        ("out-of-place f32", [false, false, true , true , false, Array]),
     ]
     if run_parallel_tests
-        push!(runs, ("in-place parallel"   , [false, true , false, false, false]))
-        push!(runs, ("in-place NL parallel", [true , true , false, false, false]))
+        push!(runs, ("in-place parallel"   , [false, true , false, false, false, Array]))
+        push!(runs, ("in-place NL parallel", [true , true , false, false, false, Array]))
     end
     if run_gpu_tests
         if run_cuda_tests
-            push!(runs, ("out-of-place gpu"       , [false, false, true , false, true, AT = CuArray]))
-            push!(runs, ("out-of-place gpu f32"   , [false, false, true , true , true, AT = CuArray]))
-            push!(runs, ("out-of-place gpu NL"    , [true , false, true , false, true, AT = CuArray]))
-            push!(runs, ("out-of-place gpu f32 NL", [true , false, true , true , true, AT = CuArray]))
+            push!(runs, ("out-of-place gpu"       , [false, false, true , false, true, CuArray]))
+            push!(runs, ("out-of-place gpu f32"   , [false, false, true , true , true, CuArray]))
+            push!(runs, ("out-of-place gpu NL"    , [true , false, true , false, true, CuArray]))
+            push!(runs, ("out-of-place gpu f32 NL", [true , false, true , true , true, CuArray]))
         end
         if run_rocm_tests
-            push!(runs, ("out-of-place gpu"       , [false, false, true , false, true, AT = ROCArray]))
-            push!(runs, ("out-of-place gpu f32"   , [false, false, true , true , true, AT = ROCArray]))
-            push!(runs, ("out-of-place gpu NL"    , [true , false, true , false, true, AT = ROCArray]))
-            push!(runs, ("out-of-place gpu f32 NL", [true , false, true , true , true, AT = ROCArray]))
+            push!(runs, ("out-of-place gpu"       , [false, false, true , false, true, ROCArray]))
+            push!(runs, ("out-of-place gpu f32"   , [false, false, true , true , true, ROCArray]))
+            push!(runs, ("out-of-place gpu NL"    , [true , false, true , false, true, ROCArray]))
+            push!(runs, ("out-of-place gpu f32 NL", [true , false, true , true , true, ROCArray]))
         end
     end
 
diff --git a/test/zygote.jl b/test/zygote.jl
index a5a48485..4b4465ff 100644
--- a/test/zygote.jl
+++ b/test/zygote.jl
@@ -37,7 +37,7 @@
     end
 
     function test_grad(gpu::Bool, forward::Bool, f32::Bool, pis::Bool,
-                        sis::Bool, obc2::Bool, gbn2::Bool; AT = Array)
+                        sis::Bool, obc2::Bool, gbn2::Bool, array_type)
         n_atoms = 50
         n_steps = 100
         atom_mass = f32 ? 10.0f0 : 10.0
@@ -75,7 +75,7 @@
             collect(16:30),
             collect(31:45),
             fill("", 15),
-            gpu ? AT(angles_inner) : angles_inner,
+            gpu ? array_type(angles_inner) : angles_inner,
         )
         torsions_inner = [PeriodicTorsion(
                 periodicities=[1, 2, 3],
@@ -89,12 +89,12 @@
             collect(21:30),
             collect(31:40),
             fill("", 10),
-            gpu ? AT(torsions_inner) : torsions_inner,
+            gpu ? array_type(torsions_inner) : torsions_inner,
         )
         atoms_setup = [Atom(charge=f32 ? 0.0f0 : 0.0, σ=f32 ? 0.0f0 : 0.0) for i in 1:n_atoms]
         if obc2
             imp_obc2 = ImplicitSolventOBC(
-                gpu ? AT(atoms_setup) : atoms_setup,
+                gpu ? array_type(atoms_setup) : atoms_setup,
                 [AtomData(element="O") for i in 1:n_atoms],
                 InteractionList2Atoms(bond_is, bond_js, [""], nothing);
                 use_OBC2=true,
@@ -102,7 +102,7 @@
             general_inters = (imp_obc2,)
         elseif gbn2
             imp_gbn2 = ImplicitSolventGBN2(
-                gpu ? AT(atoms_setup) : atoms_setup,
+                gpu ? array_type(atoms_setup) : atoms_setup,
                 [AtomData(element="O") for i in 1:n_atoms],
                 InteractionList2Atoms(bond_is, bond_js, [""], nothing),
             )
@@ -111,7 +111,7 @@
             general_inters = ()
         end
         neighbor_finder = DistanceVecNeighborFinder(
-            nb_matrix=gpu ? AT(trues(n_atoms, n_atoms)) : trues(n_atoms, n_atoms),
+            nb_matrix=gpu ? array_type(trues(n_atoms, n_atoms)) : trues(n_atoms, n_atoms),
             n_steps=10,
             dist_cutoff=f32 ? 1.5f0 : 1.5,
         )
@@ -128,18 +128,18 @@
                 bond_is,
                 bond_js,
                 fill("", length(bonds_inner)),
-                gpu ? AT(bonds_inner) : bonds_inner,
+                gpu ? array_type(bonds_inner) : bonds_inner,
             )
             cs = deepcopy(forward ? coords_dual : coords)
             vs = deepcopy(forward ? velocities_dual : velocities)
 
             s = System(
-                atoms=gpu ? AT(atoms) : atoms,
+                atoms=gpu ? array_type(atoms) : atoms,
                 pairwise_inters=pairwise_inters,
                 specific_inter_lists=sis ? (bonds, angles, torsions) : (),
                 general_inters=general_inters,
-                coords=gpu ? AT(cs) : cs,
-                velocities=gpu ? AT(vs) : vs,
+                coords=gpu ? array_type(cs) : cs,
+                velocities=gpu ? array_type(vs) : vs,
                 boundary=boundary,
                 neighbor_finder=neighbor_finder,
                 gpu_diff_safe=true,
@@ -150,41 +150,41 @@
             simulate!(s, simulator, n_steps)
 
             return mean_min_separation(s.coords, boundary)
-        end
+        enu
 
         return loss
     end
 
     runs = [ #                gpu    fwd    f32    pis    sis    obc2   gbn2
-        ("cpu"             , [false, false, false, true , true , false, false], 0.1 , 0.25),
-        ("cpu forward"     , [false, true , false, true , true , false, false], 0.01, 0.01),
-        ("cpu f32"         , [false, false, true , true , true , false, false], 0.2 , 10.0),
-        ("cpu nospecific"  , [false, false, false, true , false, false, false], 0.1 , 0.0 ),
-        ("cpu nopairwise"  , [false, false, false, false, true , false, false], 0.0 , 0.25),
-        ("cpu obc2"        , [false, false, false, true , true , true , false], 0.1 , 0.25),
-        ("cpu gbn2"        , [false, false, false, true , true , false, true ], 0.1 , 0.25),
-        ("cpu gbn2 forward", [false, true , false, true , true , false, true ], 0.02, 0.02),
+        ("cpu"             , [false, false, false, true , true , false, false, Array], 0.1 , 0.25),
+        ("cpu forward"     , [false, true , false, true , true , false, false, Array], 0.01, 0.01),
+        ("cpu f32"         , [false, false, true , true , true , false, false, Array], 0.2 , 10.0),
+        ("cpu nospecific"  , [false, false, false, true , false, false, false, Array], 0.1 , 0.0 ),
+        ("cpu nopairwise"  , [false, false, false, false, true , false, false, Array], 0.0 , 0.25),
+        ("cpu obc2"        , [false, false, false, true , true , true , false, Array], 0.1 , 0.25),
+        ("cpu gbn2"        , [false, false, false, true , true , false, true , Array], 0.1 , 0.25),
+        ("cpu gbn2 forward", [false, true , false, true , true , false, true , Array], 0.02, 0.02),
     ]
     if run_gpu_tests #                         gpu    fwd    f32    pis    sis    obc2   gbn2
         if run_cuda_tests
-            push!(runs, ("cuda"             , [true , false, false, true , true , false, false, AT = CuArray], 0.25, 20.0))
-            push!(runs, ("cuda forward"     , [true , true , false, true , true , false, false, AT = CuArray], 0.01, 0.01))
-            push!(runs, ("cuda f32"         , [true , false, true , true , true , false, false, AT = CuArray], 0.5 , 50.0))
-            push!(runs, ("cuda nospecific"  , [true , false, false, true , false, false, false, AT = CuArray], 0.25, 0.0 ))
-            push!(runs, ("cuda nopairwise"  , [true , false, false, false, true , false, false, AT = CuArray], 0.0 , 10.0))
-            push!(runs, ("cuda obc2"        , [true , false, false, true , true , true , false, AT = CuArray], 0.25, 20.0))
-            push!(runs, ("cuda gbn2"        , [true , false, false, true , true , false, true , AT = CuArray], 0.25, 20.0))
-            push!(runs, ("cuda gbn2 forward", [true , true , false, true , true , false, true , AT = CuArray], 0.02, 0.02))
+            push!(runs, ("cuda"             , [true , false, false, true , true , false, false, CuArray], 0.25, 20.0))
+            push!(runs, ("cuda forward"     , [true , true , false, true , true , false, false, CuArray], 0.01, 0.01))
+            push!(runs, ("cuda f32"         , [true , false, true , true , true , false, false, CuArray], 0.5 , 50.0))
+            push!(runs, ("cuda nospecific"  , [true , false, false, true , false, false, false, CuArray], 0.25, 0.0 ))
+            push!(runs, ("cuda nopairwise"  , [true , false, false, false, true , false, false, CuArray], 0.0 , 10.0))
+            push!(runs, ("cuda obc2"        , [true , false, false, true , true , true , false, CuArray], 0.25, 20.0))
+            push!(runs, ("cuda gbn2"        , [true , false, false, true , true , false, true , CuArray], 0.25, 20.0))
+            push!(runs, ("cuda gbn2 forward", [true , true , false, true , true , false, true , CuArray], 0.02, 0.02))
         end
         if run_rocm_tests
-            push!(runs, ("rocm"             , [true , false, false, true , true , false, false, AT = ROCArray], 0.25, 20.0))
-            push!(runs, ("rocm forward"     , [true , true , false, true , true , false, false, AT = ROCArray], 0.01, 0.01))
-            push!(runs, ("rocm f32"         , [true , false, true , true , true , false, false, AT = ROCArray], 0.5 , 50.0))
-            push!(runs, ("rocm nospecific"  , [true , false, false, true , false, false, false, AT = ROCArray], 0.25, 0.0 ))
-            push!(runs, ("rocm nopairwise"  , [true , false, false, false, true , false, false, AT = ROCArray], 0.0 , 10.0))
-            push!(runs, ("rocm obc2"        , [true , false, false, true , true , true , false, AT = ROCArray], 0.25, 20.0))
-            push!(runs, ("rocm gbn2"        , [true , false, false, true , true , false, true , AT = ROCArray], 0.25, 20.0))
-            push!(runs, ("rocm gbn2 forward", [true , true , false, true , true , false, true , AT = ROCArray], 0.02, 0.02))
+            push!(runs, ("rocm"             , [true , false, false, true , true , false, false, ROCArray], 0.25, 20.0))
+            push!(runs, ("rocm forward"     , [true , true , false, true , true , false, false, ROCArray], 0.01, 0.01))
+            push!(runs, ("rocm f32"         , [true , false, true , true , true , false, false, ROCArray], 0.5 , 50.0))
+            push!(runs, ("rocm nospecific"  , [true , false, false, true , false, false, false, ROCArray], 0.25, 0.0 ))
+            push!(runs, ("rocm nopairwise"  , [true , false, false, false, true , false, false, ROCArray], 0.0 , 10.0))
+            push!(runs, ("rocm obc2"        , [true , false, false, true , true , true , false, ROCArray], 0.25, 20.0))
+            push!(runs, ("rocm gbn2"        , [true , false, false, true , true , false, true , ROCArray], 0.25, 20.0))
+            push!(runs, ("rocm gbn2 forward", [true , true , false, true , true , false, true , ROCArray], 0.02, 0.02))
         end
     end
 

From a1db5246a55acb9db2d326e7a7be9d8c01836b86 Mon Sep 17 00:00:00 2001
From: James Schloss <jrs.schloss@gmail.com>
Date: Thu, 15 Sep 2022 05:32:26 -0400
Subject: [PATCH 4/7] typo in runtest

---
 test/runtests.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 3d2646b7..0604d7e3 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -70,7 +70,7 @@ if run_gpu_tests
     if run_cuda_tests
         push!(gpu_array_types, CuArray)
     end
-    if run_cuda_tests
+    if run_rocm_tests
         push!(gpu_array_types, ROCArray)
     end
 end
@@ -90,8 +90,8 @@ if GROUP == "All"
         undefined_exports=false,
     )
 
-    include("basic.jl")
-    include("interactions.jl")
+    #include("basic.jl")
+    #include("interactions.jl")
     include("minimization.jl")
     include("simulation.jl")
     include("agent.jl")

From 7243fdf54db9e4c834bb493b4292686793283e0f Mon Sep 17 00:00:00 2001
From: James Schloss <jrs.schloss@gmail.com>
Date: Thu, 15 Sep 2022 11:53:10 +0200
Subject: [PATCH 5/7] new attempt at move_array

---
 src/types.jl     | 6 +++---
 test/runtests.jl | 7 +++----
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/types.jl b/src/types.jl
index 3e898287..43693010 100644
--- a/src/types.jl
+++ b/src/types.jl
@@ -662,9 +662,9 @@ masses(s::Union{System, ReplicaSystem}) = mass.(s.atoms)
 
 # Move an array to the GPU depending on whether the system is on the GPU
 move_array(arr, ::System{D, G, T, false}) where {D, G, T} = arr
-move_array(arr::AT, ::System{D, G, T, true }) where {AT <: Union{CuArray,
-                                                                 ROCArray},
-                                                     D, G, T} = AT(arr)
+function move_array(arr, sys::System{D, G, T, true }) where {D, G, T}
+    find_array_type(sys.coords)(arr)
+end
 
 AtomsBase.species_type(s::Union{System, ReplicaSystem}) = eltype(s.atoms)
 
diff --git a/test/runtests.jl b/test/runtests.jl
index 0604d7e3..04341618 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -58,8 +58,7 @@ if run_rocm_tests
     AMDGPU.default_device_id!(parse(Int, DEVICE)+1)
     @info "The GPU tests will be run on device " * string(parse(Int, DEVICE) + 1)
 else
-    @warn "The ROCM tests will not be run as a ROCM-enabled device is not availa
-ble"
+    @warn "The ROCM tests will not be run as a ROCM-enabled device is not available"
 end
 
 AMDGPU.allowscalar(false)
@@ -90,8 +89,8 @@ if GROUP == "All"
         undefined_exports=false,
     )
 
-    #include("basic.jl")
-    #include("interactions.jl")
+    include("basic.jl")
+    include("interactions.jl")
     include("minimization.jl")
     include("simulation.jl")
     include("agent.jl")

From 9df920cc1bcdb50668b956bf444ec0dda84ee375 Mon Sep 17 00:00:00 2001
From: James Schloss <jrs.schloss@gmail.com>
Date: Thu, 15 Sep 2022 12:00:55 +0200
Subject: [PATCH 6/7] attempting to prevent CI from running on my draft PR.
 This can be reverted later

---
 .github/workflows/CI.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 2bb038a3..3d086fe8 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -14,6 +14,7 @@ jobs:
   test:
     name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
     runs-on: ${{ matrix.os }}
+    if: ${{ github.event_name == 'push' || !github.event.pull_request.draft }}
     strategy:
       fail-fast: false
       matrix:
@@ -49,6 +50,7 @@ jobs:
   docs:
     name: Documentation
     runs-on: ubuntu-latest
+    if: ${{ github.event_name == 'push' || !github.event.pull_request.draft }}
     steps:
       - uses: actions/checkout@v2
       - uses: julia-actions/setup-julia@v1

From ebc4f764d165bbe45ec899510e525cd331932f6a Mon Sep 17 00:00:00 2001
From: James Schloss <jrs.schloss@gmail.com>
Date: Thu, 15 Sep 2022 13:46:54 +0200
Subject: [PATCH 7/7] one more typo...

---
 src/neighbors.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/neighbors.jl b/src/neighbors.jl
index 3e1e99f2..3233bd8c 100644
--- a/src/neighbors.jl
+++ b/src/neighbors.jl
@@ -114,7 +114,7 @@ function DistanceVecNeighborFinder(;
         is = CuArray(hcat([collect(1:n_atoms) for i in 1:n_atoms]...))
         js = CuArray(permutedims(is, (2, 1)))
         m14 = CuArray(matrix_14)
-    elsif isa(nb_matrix, ROCArray)
+    elseif isa(nb_matrix, ROCArray)
         is = ROCArray(hcat([collect(1:n_atoms) for i in 1:n_atoms]...))
         js = ROCArray(permutedims(is, (2, 1)))
         m14 = ROCArray(matrix_14)