# OpenCL - Different contexts are used for different platforms - synchronization between contexts can only be done by clFinish - kernels can be partitioned and executed across different devices which is called heterogeneous parallel computing. - clCreateSubDevices - CUDA and OpenCL correspondence #CUDA #OpenCL GPU device multiprocessor compute unit scalar core processing element thread work-item warp wavefront thread-block work-group grid NDRange global memory global memory shared memory local memory local memory private memory __global__ function __kernel function __device__ function no qualification needed __constant__ variable __constant variable __device__ variable __global variable __shared__ variable __local variable ## barrier ` void barrier (cl_mem_fence_flags flags)` All work-items in a work-group executing the kernel on a processor must execute this function before any are allowed to continue execution beyond the barrier. This function must be encountered by all work-items in a work-group executing the kernel. If barrier is inside a conditional statement, then all work-items must enter the conditional if any work-item enters the conditional statement and executes the barrier. If barrier is inside a loop, all work-items must execute the barrier for each iteration of the loop before any are allowed to continue execution beyond the barrier. The barrier function also queues a memory fence (reads and writes) to ensure correct ordering of memory operations to local or global memory. The flags argument specifies the memory address space and can be set to a combination of the following literal values. CLK_LOCAL_MEM_FENCE - The barrier function will either flush any variables stored in local memory or queue a memory fence to ensure correct ordering of memory operations to local memory. CLK_GLOBAL_MEM_FENCE - The barrier function will queue a memory fence to ensure correct ordering of memory operations to global memory. This can be useful when work-items, for example, write to buffer or image objects and then want to read the updated data. ## mem_fence `void mem_fence (cl_mem_fence_flags flags)` Orders loads and stores of a work-item executing a kernel. This means that loads and stores preceding the mem_fence will be committed to memory before any loads and stores following the mem_fence. The flags argument specifies the memory address space and can be set to a combination of the following literal values: CLK_LOCAL_MEM_FENCE, CLK_GLOBAL_MEM_FENCE. ## wavefront(warp) vs workgroup the threads in wavefront are executed in lock step(depends on driver) and multiple wavefronts typically 4 are time multiplexed into a workgroup thats executed on single compute unit that has shared memory for all the wavefronts of a workgroup. ## functions `T add_sat (T x, T y)` add and saturate, adds x and y and if overflowed then returns the maximum value. ``` cl_int clEnqueueReadBuffer ( cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read, size_t offset, size_t size, void* ptr, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event ) ``` ## macOS `clang++ clDemo.cpp -framework OpenCL -std=c++14 \ -DCL_HPP_TARGET_OPENCL_VERSION=120 \ -DCL_HPP_MINIMUM_OPENCL_VERSION=120 \ -o clDemo` ## AMD GPU - 64 threads per SIMD - 4 SIMD per CU ## Linux ### ROCm `bear -- clang -v -ggdb -O0 -L/opt/rocm/opencl/lib -lOpenCL \ -I/opt/rocm/opencl/include CasRSA_CL.c -o CasRSA_CL` #### rocprof #outputs rocprofquery.csv rocprof -i rocprofquery.txt --timestamp on ./bn_cl -TC: test case* -TCC: TeslaComputeCluster* -PMC: Performance Counters -SQTT: SQ Thread Trace -HSA AQL: Heterogeneous System Architecture Architected Queuing Language #### hsa-amd-aqlprofile - ICD: Installable Client Driver. To register ICD `echo full/path/to/libOpenCLDriverStub.so > /etc/OpenCL/vendors/test.icd` - `/opt/rocm/lib/libamdocl64.so` is generated by `clr` (https://github.com/ROCm-Developer-Tools/clr) - depends on rocr-runtime #### rocm version cat src/rocm-core/build/version cat /opt/rocm/.info/version cat /opt/rocm/include/rocm_version.h ### VirCL ``` QEMU(VKvhost, CLvhost) | VKApp, OCLApp VenusRenderer, CLRenderer | VKLoader, CLLoader VKDriver, CLDriver, MiniGbm | miniGbm<-Venus(VK FE driver)-OCLFrontEnd Libdrm/libdrm_amdgpu | libdrm/kms Dom0Kernel(amdgpu) | DomUKernel(Virt-IO Transport(virt-gpu, virt-compute)) Hypervisor HW(dGPU/APU) ``` # CUDA ## Device Memory types and scopes #Variables declaration #Memory #Scope #Lifetime Automatic variables other than arrays Register Thread Kernel Automatic array variables Local Thread Kernel __device__ __shared__ int SharedVar Shared Block Kernel __device__ int GlobalVar Global Grid Application __device__ __constant__ int ConstVar Constant Grid Application Global memory is slower than shared memory Registers accessible by one thread is the fastest ## Function declaration #Keywords and Functions #ExecutedOn #CallableFrom __device__ float DevFunc device device __global__ void KernelFunc device host __host__ float HostFunc host host ## Julia function ```CUDA struct cuComplex { float r,i; } __device__ float magnitude(struct cuComplex a){ return ((a.r * a.r) + (a.i * a.i)); } __device__ void add(struct cuComplex a, struct cuComplex b, sturct cuComplex* res) { res->r = a.r + b.r; res->i = a.i + b.i; } __device__ void mul(struct cuComplex a, struct cuComplex b, struct cuComplex* res){ res->r=(a.r*b.r)-(a.i*b.i); res->i=(a.r*b.i)+(a.i*b.r); } __device__ int julia(int x, int y) { const float scale=1.5; float jx=scale*(float)(DIM/2-x)/(DIM/2), jy=scale*(float)(DIM/2-y)/(DIM/2); struct cuComplex c, a, r1, r2; c.r=-0.8;c.i=0.154; a.r=jx; a.i=jy; int i=0; for (i=0; i<200; ++i) { mul(a,a,&r1); add(r1, c, &r2); if (magnitude(r2) > 1000) return 0; a.r=r2.r; a.i=r2.i; } return 1; } __global__ void kernel(unsigned char* ptr) { int x = blockIdx.x; int y = blockIdx.y; int offset = x+y*gridDim.x; int juliaValue = julia(x,y); ptr[offset*4+0]=255*juliaValue; ptr[offset*4+1]=0; ptr[offset*4+2]=0; ptr[offset*4+3]=255; } int main(void) { CPUBitmap bitmap(DIM, DIM); unsigned char* dev_bitmap: cudaMalloc((void**)&dev_bitmap, bitmap.image_size()); dim3 grid(DIM,DIM); kernel<<>>(dev_bitmap); cudaMemcpy(bitmap.get_ptr(),dev_bitmap,bitmap.image_size(), cudaMemcpyDeviceToHost); bitmap.display_and_exit(); cudaFree(dev_bitmap); } ``` ```bash // g: debug symbols, lineinfo: adds kernelSource-MachineInstructions mappings nvcc -g -lineinfo julia.cu -o julia gdb --args ./julia // to compile as shared library nvcc -g -lineinfo --ptxas-options=-v --compiler-options '-fPIC' --shared\ cnn.cu -lcublas -lcudnn -o libcnn.so ``` ## profiling cuda kernel ```CUDA int main(void) { cudaEvent_t startEvent, stopEvent; float ms; int blockSize=1024, n=nMB*1024*1024/size(float); cudaMalloc(&d_a, n*sizeof(float)); for (int i=0; i<=32; i++) { cudaMemset(d_a, 0.0, n*sizeof(float)); cudaEventRecord(startEvent); offset_access<>(d_a, i); cudaEventRecord(stopEvent); cudaEventSynchronize(stopEvent); cudaEventElapsedTime(&ms, startEvent, stopEvent); printf("%d, %fn", i, 2*nMB/ms); } } ``` ### nvprof - `nvprof ./llm` gives time taken by varioush activities - `nvprof --devices 0 --metrics gst_throughput,gld_throughput ./llm` is not supported by devices with compute capabilities > 7 ### profiling with nvvp - If nvvp throws java exception then ``` sudo apt install openjdk-8-jre update-java-alternatives -l sudo update-java-alternatives -s java-1.8.0-openjdk-amd64 ``` ### profiling with nsightcompute - - kernel can be of multidimensional kernel <<>> blockIdx.x,blockIdx.y,blockIdx.z threadIdx.x, threadIdx.y, threadIdx.z are the variables through which index of the thread is determined. - Idx doesnt mean id in x dimesion; there is no Idy/Idz. blockNum=blockIdx.z*(gridDim.x*gridDim.y)+blockIdx.y*gridDim.x+blockIdx.x threadNum=threadIdx.z*(blockDim.x*blockDim.y)+threadIdx.y*blockDim.x+ threadIdx.x globalThreadId=blockNum*(blockDim.x*blockDim.y*blockDim.z)+threadNum and i=blockidx.x*blockDim.x+threadIdx.x j=blockidx.y*blockDim.y+threadIdx.y k=blockIdx.z*blockDim.z+threadIdx.z - to sync threads that have different execution times use __syncthreads(); Global memory is slower than shared memory Registers accessible by one thread is the fastest #if statements - at if condition, the threads diverge; its called divergence - for nested if blocks the warp shceduler pushes a mask indiacting active threads on to a stack. Major revision number: 7 Minor revision number: 5 Name: NVIDIA GeForce RTX 2060 TotalGolbalMemory: 2147155968 TotalSharedMem per block: 49152 warpSize: 32 Maximum memory pitch: 2147483647 maxThreadsPerBlock: 1024 max dimension 0 of block: 1024 max dimension 1 of block: 1024 max dimension 2 of block: 64 max dimension 0 of grid: 2147483647 max dimension 1 of grid: 65535 max dimension 2 of grid: 65535 Clock rate: 1845000 Total constant memory: 65536 Texture alignment: 512 Concurrent copy and execution: Yes Number of multiprocessors: 30 # Optimizations ## Memory access coalescing - coalesce global memory access by stride typically of aligned 32 words (equal to warp size) - reducing number of global memory transactions by warps is one of the keys for optimizing execution time - use shared memory - coalesce shared memory banks access to resolve bank conflicts - resolve bank conflicts by memory padding - coalesce global memory partitions to resolve partition camping - betterway to resolve partition camping than memory padding is diagonal block rendering. - blockIdx_y = blockIdx.x - blockIdx_x = (blockIdx.x+blockIdx.y)%gridDim.x; __global__ void offset_access(float* a) { int tid = blockDim.x * blockIdx.x + threadIdx.x; a[tid]=a[tid]+1; // coalesced access a[tid+1]=a[tid+1]+1; //uncoalesced access; requires 2 global memory reads //and 2 global memory writes a[tid*3]=a[tid*3]+1; //uncoalesced access; requires 3 global memory reads //and 3 global memory writes } # naiveRow = without any optimization load by row store by col #nvidia visual profiler nvvp # reduction kernel # Asnynchronous API cudaStream_t stream; cudaError_t cudaStreamCreate(cudaStream_t* stream); cudaError_t cudaStreamDestroy(cudaStream_t stream); cudaError_t cudaStreamSynchronize(cudaStream_t stream); cudaError_t cudaStreamQuery(cudaStream_t stream); cudaEvent_t stop; cudaEventCreate(&stop)); cudaError_t cudaMallochost(void**, size); cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream = 0); // it assigns page_locked memory that cannot be swapped to discs kernel_name<<>>(argument list); cudaEventRecord(stop); while(cudaEventQuery(stop) == cudaErrorNotReady) { counter++; } cudaEventDestroy(stop); cudaFreeHost(h_a); cudaFree(d_a); cudaDeviceReset(); # uses of asynchronous api - overlapped host computation and device computation - overlapped host computation and host-device data transfer - overlapped host-device data transfer and device computation - concurrent device computation ## errors `LLVM ERROR: failed to annotate CFG` - CFG: Call/Control Flow Graph