c++ - Out of bound address when directly reading from array -

i developing cuda application has routines allocation , deallocation of arrays in shared memory.

in application (that, sorry, cannot make available), have class encapsulate chunk of memory array. class has count method counts number of elements matches value.

so, imagine (which actual part of whole class)

template <class type> struct array {     // ...      type &operator[](int i) { return data_[i]; }     type operator[](int i) const { return data_[i]; }      size_t count(const type &val) const {         size_t c = 0;         (size_t = 0; < len_; ++i)             if (data_[i] == val)                 ++c;         return c;     }      void print(const char *fmt, const char *sep, const char *end) const {         (size_t = 0; < len_ - 1; ++i) {             printf(fmt, data_[i]);             printf(sep);         }         printf(fmt, _data[len_ - 1]);         printf(end);     } private:     type *data_;     size_t len_; };

assumed memory accessing correctly allocated (shared memory allocated @ runtime, passing dimension kernel), big enough contain data , data_ points aligned (wrt type) region inside shared memory. checked multiple times, , these assumptions shall valid (but feel free ask more checkings).

now, while testing code found strange:

when explicitly assigning values using operator[], , reading them using operator[] const, no issues arise.
when reading data using print, no issues arises.
when calling count(), program crashes , address addr out of bounds reported cuda-memcheck, caused invalid __global__ read of size x (x = sizeof(type)). addr inside shared memory buffer, should valid.
if, inside count, replace data_[i] (*this)[i], program runs fine , no crash occurs.

now, have absolutely no idea happen, , have no idea check see happening behind scenes... why reading directly crashes? why using operator[] not? , why reading (directly?) inside print not crashes?

i know question hard , sorry provide little information code... feel free ask detail, try answer as can. idea or suggestion welcome, because days trying solve , far get.

i using 2 different gpus test code, 1 capability 2.1 , 1 3.5 (the latter 1 giving me detailed information crash, while first 1 not). cuda 5.0

edit: have located minimal example error happens. curiously, error appears when compiling sm_20 , sm_35, not on sm_30. gpu using has cap 3.5

/* compile , run with:   nvcc -g -g bug.cu -o bug -arch=sm_20 # bug!   nvcc -g -g bug.cu -o bug -arch=sm_30 # no bug :|   nvcc -g -g bug.cu -o bug -arch=sm_35 # bug!   cuda-memcheck bug  here's output (skipping initial rows) ctor 0x3fffc10 w/o alloc, data 0x10000c8 calling non const [] calling non const [] fill [] ok fill raw ok kernel launch failed error:         unspecified launch failure ========= invalid __global__ write of size 8 =========     @ 0x00000188 in /home/bio/are/algocuda/bug.cu:26:array<double>::fill(double const &) =========     thread (0,0,0) in block (0,0,0) =========     address 0x010000c8 out of bounds =========     device frame:/home/bio/are/algocuda/bug.cu:49:kernel_bug(unsigned long) (kernel_bug(unsigned long) : 0x8c0) =========     saved host backtrace driver entry point @ kernel launch time =========     host frame:/usr/lib/libcuda.so (culaunchkernel + 0x3dc) [0xc9edc] =========     host frame:/opt/cuda-5.0/lib64/libcudart.so.5.0 [0x13324] =========     host frame:/opt/cuda-5.0/lib64/libcudart.so.5.0 (cudalaunch + 0x182) [0x3ac62] =========     host frame:bug [0xbb8] =========     host frame:bug [0xaa7] =========     host frame:bug [0xac4] =========     host frame:bug [0xa07] =========     host frame:/lib/libc.so.6 (__libc_start_main + 0xfd) [0x1ec4d] =========     host frame:bug [0x8c9] ========= ========= program hit error 4 on cuda api call cudadevicesynchronize  =========     saved host backtrace driver entry point @ error =========     host frame:/usr/lib/libcuda.so [0x26a180] =========     host frame:/opt/cuda-5.0/lib64/libcudart.so.5.0 (cudadevicesynchronize + 0x1dd) [0x441fd] =========     host frame:bug [0xa0c] =========     host frame:/lib/libc.so.6 (__libc_start_main + 0xfd) [0x1ec4d] =========     host frame:bug [0x8c9] ========= ========= error summary: 2 errors   (cuda-gdb) set cuda memcheck on (cuda-gdb) run starting program: /home/bio/are/algocuda/bug  [thread debugging using libthread_db enabled] [new thread 0x7ffff5c25700 (lwp 23793)] [context create of context 0x625870 on device 0] [launch of cuda kernel 0 (kernel_bug<<<(1,1,1),(1,1,1)>>>) on device 0] memcheck detected illegal access address (@global)0x10000c8  program received signal cuda_exception_1, lane illegal address. [switching focus cuda kernel 0, grid 1, block (0,0,0), thread (0,0,0), device 0, sm 12, warp 0, lane 0] 0x0000000000881928 in array<double>::fill (this=0x3fffc10, v=0x3fffc08) @ bug.cu:26 26                              data[i] = v; */  #include <stdio.h>  extern __shared__ char totalsharedmemory[];  template <class type> struct array {     // create array using specific buffer     __device__ __host__ array(size_t len, type *buffer):         len(len),         data(buffer) {         printf("ctor %p w/o alloc, data %p\n", this, data);     }     __device__ __host__ type operator[](int i) const {         printf("calling const []\n");         return data[i];     }     __device__ __host__ type &operator[](int i) {         printf("calling non const []\n");         return data[i];     }     __device__ __host__ void fill(const type &v) {         (size_t = 0; < len; ++i) data[i] = v;     }     size_t len;     type *data; };  __global__ void kernel_bug(size_t bytesperblock) {     // test writing show filling memory     // not produce error     (size_t = 0; < bytesperblock; ++i) {         totalsharedmemory[i] = % ('z' - 'a' + 1) + 'a';         printf("[%p] %c\n", totalsharedmemory + i, totalsharedmemory[i]);     }      // 200 / 8 = 25 should aligned     array<double> x(2, (double *)(totalsharedmemory + 200));     x[0] = 2;     x[1] = 4;     printf("fill [] ok\n");     x.data[0] = 1;     x.data[1] = 0;     printf("fill raw ok\n");     x.fill(0); // crash here     printf("fill method ok\n"); }  int main(int argc, char **argv) {     // total memory required     size_t bytesperblock = 686; // big enough 85 doubles     kernel_bug<<<1, 1, bytesperblock>>>(bytesperblock);     cudaerror_t err = cudadevicesynchronize();     if (err != cudasuccess) {         fprintf(stderr, "kernel launch failed error:\n\t%s\n", cudageterrorstring(err));         return 1;     }     return 0; }

edit: tested cuda 4.2, problem persists.

i able reproduce issue following:

rhel 5.5, driver 304.54, cuda 5.0, quadro 5000 gpu.

i not able reproduce issue following:

rhel 5.5, driver 319.72, cuda 5.5, quadro 5000 gpu.

please update cuda install cuda 5.5, , driver 319.72 or newer.

Search This Blog

Kiastu

c++ - Out of bound address when directly reading from array -

Comments

Post a Comment

Popular posts from this blog

android - getbluetoothservice() called with no bluetoothmanagercallback -

javascript - Image onload event not firing in firefox -

sql - ASP.NET SqlDataSource, like on SelectCommand -