diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index dd448e9e50..5c4c8f1688 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -23,6 +23,7 @@ jobs: - run: | . .travis/ci-system-setup.sh echo "$HOME/.cargo/bin" >> "$GITHUB_PATH" + echo "/usr/local/cuda/bin" >> "$GITHUB_PATH" env: RUNNER_ENVIRONMENT: ${{ runner.environment }} @@ -45,6 +46,7 @@ jobs: - run: | . .travis/ci-system-setup.sh echo "$HOME/.cargo/bin" >> "$GITHUB_PATH" + echo "/usr/local/cuda/bin" >> "$GITHUB_PATH" env: RUNNER_ENVIRONMENT: ${{ runner.environment }} diff --git a/.travis/ci-system-setup.sh b/.travis/ci-system-setup.sh index 38f7fc8254..daf0dda485 100755 --- a/.travis/ci-system-setup.sh +++ b/.travis/ci-system-setup.sh @@ -39,6 +39,12 @@ then touch /tmp/ci-setup-done fi +if [ -e /usr/local/cuda ] +then + PATH=$PATH:/usr/local/cuda/bin + nvcc --version +fi + S3=https://s3.amazonaws.com/tract-ci-builds/tests if [ "$GITHUB_WORKFLOW" = "Metal tests" -o "$GITHUB_WORKFLOW" = "CUDA tests" ] diff --git a/Cargo.toml b/Cargo.toml index 610404fc58..518662e62a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -138,7 +138,7 @@ clap = { version = "~3.1", features = [ "cargo" ] } colorous = "1.0.5" core_affinity = "0.8.0" criterion = "0.6" -cudarc = { version = "0.16.4", features = ["dynamic-loading", "cuda-12060", "f16"] } +cudarc = { version = "0.17", features = ["dynamic-loading", "cuda-version-from-build-system", "f16"] } derive-new = "0.5.9" dinghy-test = "0.6" downcast-rs = "1.2.0" diff --git a/cuda/src/kernels/mod.rs b/cuda/src/kernels/mod.rs index 2def43e0bf..b690affd0d 100644 --- a/cuda/src/kernels/mod.rs +++ b/cuda/src/kernels/mod.rs @@ -9,6 +9,8 @@ mod unary; mod utils; use crate::ops::GgmlQuantQ81Fact; +use std::mem::transmute; + use crate::tensor::{CudaBuffer, CudaTensor}; use anyhow::{bail, ensure}; pub use binary::BinOps; @@ -159,7 +161,9 @@ pub fn get_sliced_cuda_view_mut( len: usize, ) -> TractResult> { ensure!(offset + len <= t.len() * t.datum_type().size_of()); - let mut buffer = t.device_buffer().downcast_ref::().unwrap(); + let buffer: &CudaBuffer = t.device_buffer().downcast_ref::().unwrap(); let offset = t.buffer_offset::() + offset; - Ok(buffer.as_view_mut().slice_mut(offset..(offset + len))) + let ptr: *const CudaBuffer = buffer; + let mut_buffer: &mut CudaBuffer = unsafe { (ptr as *mut CudaBuffer).as_mut().unwrap() }; + Ok(mut_buffer.as_view_mut().slice_mut(offset..(offset + len))) } diff --git a/cuda/src/tensor.rs b/cuda/src/tensor.rs index 77421ef15a..53770ec9f7 100644 --- a/cuda/src/tensor.rs +++ b/cuda/src/tensor.rs @@ -1,4 +1,4 @@ -use std::ops::Deref; +use std::ops::{Deref, DerefMut}; use cudarc::driver::{CudaSlice, DevicePtr}; use tract_core::internal::tract_smallvec::ToSmallVec; @@ -30,6 +30,12 @@ impl Deref for CudaBuffer { } } +impl DerefMut for CudaBuffer { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.inner + } +} + #[derive(Clone)] pub struct CudaTensor { buffer: Arc,