# OpenCL
- Different contexts are used for different platforms
- synchronization between contexts can only be done by clFinish
- kernels can be partitioned and executed across different devices which is
  called heterogeneous parallel computing.
- clCreateSubDevices
- CUDA and OpenCL correspondence
  #CUDA            	         #OpenCL
  GPU             	         device
  multiprocessor  	         compute unit
  scalar core     	         processing element
  thread          	         work-item
  warp                       wavefront
  thread-block    	         work-group
  grid            	         NDRange
  global memory   	         global memory
  shared memory   	         local memory
  local memory    	         private memory
  __global__ function	     __kernel function
  __device__ function	     no qualification needed
  __constant__ variable	     __constant variable
  __device__ variable	     __global variable
  __shared__ variable	     __local variable

## barrier
` void barrier (cl_mem_fence_flags flags)`
All work-items in a work-group executing the kernel on a processor must execute this function before any are allowed to continue execution beyond the barrier. This function must be encountered by all work-items in a work-group executing the kernel.

If barrier is inside a conditional statement, then all work-items must enter the conditional if any work-item enters the conditional statement and executes the barrier.

If barrier is inside a loop, all work-items must execute the barrier for each iteration of the loop before any are allowed to continue execution beyond the barrier.

The barrier function also queues a memory fence (reads and writes) to ensure correct ordering of memory operations to local or global memory.

The flags argument specifies the memory address space and can be set to a combination of the following literal values.

CLK_LOCAL_MEM_FENCE - The barrier function will either flush any variables stored in local memory or queue a memory fence to ensure correct ordering of memory operations to local memory.

CLK_GLOBAL_MEM_FENCE - The barrier function will queue a memory fence to ensure correct ordering of memory operations to global memory. This can be useful when work-items, for example, write to buffer or image objects and then want to read the updated data.

## mem_fence
`void mem_fence (cl_mem_fence_flags flags)`
Orders loads and stores of a work-item executing a kernel. This means that loads and stores preceding the mem_fence will be committed to memory before any loads and stores following the mem_fence.

The flags argument specifies the memory address space and can be set to a combination of the following literal values: CLK_LOCAL_MEM_FENCE, CLK_GLOBAL_MEM_FENCE.

## wavefront(warp) vs workgroup
the threads in wavefront are executed in lock step(depends on driver) and
multiple wavefronts typically 4 are time multiplexed into a workgroup thats executed on single compute unit that has shared memory for all the wavefronts of a workgroup.

## functions
`T add_sat (T x, T y)` add and saturate, adds x and y and if overflowed then
returns the maximum value.
```
cl_int clEnqueueReadBuffer (
   cl_command_queue command_queue, cl_mem buffer, 
   cl_bool blocking_read, size_t offset, size_t size, void* ptr,
   cl_uint num_events_in_wait_list, const cl_event* event_wait_list,
   cl_event* event
)
```
## macOS
`clang++ clDemo.cpp -framework OpenCL -std=c++14 \
         -DCL_HPP_TARGET_OPENCL_VERSION=120 \
         -DCL_HPP_MINIMUM_OPENCL_VERSION=120 \
         -o clDemo`

## AMD GPU
- 64 threads per SIMD
- 4 SIMD per CU

## Linux
### ROCm
`bear -- clang -v -ggdb -O0 -L/opt/rocm/opencl/lib -lOpenCL \
               -I/opt/rocm/opencl/include CasRSA_CL.c -o CasRSA_CL`
#### rocprof
#outputs rocprofquery.csv
rocprof -i rocprofquery.txt --timestamp on ./bn_cl
-TC: test case*
-TCC: TeslaComputeCluster*
-PMC: Performance Counters
-SQTT: SQ Thread Trace
-HSA AQL: Heterogeneous System Architecture Architected Queuing Language

#### hsa-amd-aqlprofile
- ICD: Installable Client Driver. To register ICD
     `echo full/path/to/libOpenCLDriverStub.so > /etc/OpenCL/vendors/test.icd`
 - `/opt/rocm/lib/libamdocl64.so` is generated by `clr`
   (https://github.com/ROCm-Developer-Tools/clr)
- depends on rocr-runtime

#### rocm version
cat src/rocm-core/build/version
cat /opt/rocm/.info/version
cat /opt/rocm/include/rocm_version.h

### VirCL
```
QEMU(VKvhost, CLvhost) | VKApp, OCLApp
VenusRenderer, CLRenderer | VKLoader, CLLoader
VKDriver, CLDriver, MiniGbm | miniGbm<-Venus(VK FE driver)-OCLFrontEnd
Libdrm/libdrm_amdgpu        | libdrm/kms
Dom0Kernel(amdgpu)     | DomUKernel(Virt-IO Transport(virt-gpu, virt-compute))
Hypervisor
HW(dGPU/APU)
```

# CUDA
## Device Memory types and scopes
#Variables declaration			         #Memory   #Scope  #Lifetime
Automatic variables other than arrays	 Register  Thread  Kernel
Automatic array variables      		     Local	   Thread  Kernel
__device__ __shared__ int SharedVar	     Shared	   Block   Kernel
__device__ int GlobalVar  		         Global	   Grid	   Application
__device__ __constant__ int ConstVar	 Constant  Grid	   Application

Global memory is slower than shared memory
Registers accessible by one thread is the fastest

## Function declaration
#Keywords and Functions     #ExecutedOn  #CallableFrom
__device__ float DevFunc    device       device
__global__ void KernelFunc  device       host
__host__ float HostFunc     host         host

## Julia function
```CUDA
struct cuComplex {
   float r,i;
}
__device__ float magnitude(struct cuComplex a){
   return ((a.r * a.r) + (a.i * a.i));
}
__device__ void add(struct cuComplex a, struct cuComplex b,
                    sturct cuComplex* res) {
   res->r = a.r + b.r; res->i = a.i + b.i;
}
__device__ void mul(struct cuComplex a, struct cuComplex b,
                    struct cuComplex* res){
   res->r=(a.r*b.r)-(a.i*b.i); res->i=(a.r*b.i)+(a.i*b.r);
}
__device__ int julia(int x, int y) {
   const float scale=1.5;
   float jx=scale*(float)(DIM/2-x)/(DIM/2), jy=scale*(float)(DIM/2-y)/(DIM/2);
   struct cuComplex c, a, r1, r2;
   c.r=-0.8;c.i=0.154;
   a.r=jx; a.i=jy;
   int i=0;
   for (i=0; i<200; ++i) {
      mul(a,a,&r1); add(r1, c, &r2);
      if (magnitude(r2) > 1000)
         return 0;
      a.r=r2.r; a.i=r2.i;
   }
   return 1;
}
__global__ void kernel(unsigned char* ptr) {
   int x = blockIdx.x;
   int y = blockIdx.y;
   int offset = x+y*gridDim.x;
   int juliaValue = julia(x,y);
   ptr[offset*4+0]=255*juliaValue;
   ptr[offset*4+1]=0;
   ptr[offset*4+2]=0;
   ptr[offset*4+3]=255;
}
int main(void) {
   CPUBitmap bitmap(DIM, DIM);
   unsigned char* dev_bitmap:
   cudaMalloc((void**)&dev_bitmap, bitmap.image_size());
   dim3 grid(DIM,DIM);
   kernel<<<grid,1>>>(dev_bitmap);
   cudaMemcpy(bitmap.get_ptr(),dev_bitmap,bitmap.image_size(),
              cudaMemcpyDeviceToHost);
   bitmap.display_and_exit();
   cudaFree(dev_bitmap);
}
```
```bash
// g: debug symbols, lineinfo: adds kernelSource-MachineInstructions mappings
nvcc -g -lineinfo julia.cu -o julia
gdb --args ./julia
// to compile as shared library
nvcc -g -lineinfo --ptxas-options=-v --compiler-options '-fPIC' --shared\
cnn.cu -lcublas -lcudnn -o libcnn.so
```

## profiling cuda kernel
```CUDA
int main(void) {
   cudaEvent_t startEvent, stopEvent;
   float ms; int blockSize=1024, n=nMB*1024*1024/size(float);
   cudaMalloc(&d_a, n*sizeof(float));
   for (int i=0; i<=32; i++) {
      cudaMemset(d_a, 0.0, n*sizeof(float));
      cudaEventRecord(startEvent);
      offset_access<<n/blockSize, blockSize>>(d_a, i);
      cudaEventRecord(stopEvent);
      cudaEventSynchronize(stopEvent);
      cudaEventElapsedTime(&ms, startEvent, stopEvent);
      printf("%d, %fn", i, 2*nMB/ms);
   }
}
```
### nvprof
- `nvprof ./llm` gives time taken by varioush activities
- `nvprof --devices 0 --metrics gst_throughput,gld_throughput ./llm` is not
  supported by devices with compute capabilities > 7
### profiling with nvvp
- If nvvp throws java exception then
  ```
  sudo apt install openjdk-8-jre
  update-java-alternatives -l
  sudo update-java-alternatives -s java-1.8.0-openjdk-amd64
  ```
### profiling with nsightcompute
- 
- kernel can be of multidimensional
kernel <<<dim3(tx,ty,tz), dim3(bx,by,bz)>>>
blockIdx.x,blockIdx.y,blockIdx.z
threadIdx.x, threadIdx.y, threadIdx.z
are the variables through which index of the thread is determined.
- Idx doesnt mean id in x dimesion; there is no Idy/Idz.
blockNum=blockIdx.z*(gridDim.x*gridDim.y)+blockIdx.y*gridDim.x+blockIdx.x
threadNum=threadIdx.z*(blockDim.x*blockDim.y)+threadIdx.y*blockDim.x+
          threadIdx.x
globalThreadId=blockNum*(blockDim.x*blockDim.y*blockDim.z)+threadNum
and
i=blockidx.x*blockDim.x+threadIdx.x
j=blockidx.y*blockDim.y+threadIdx.y
k=blockIdx.z*blockDim.z+threadIdx.z

- to sync threads that have different execution times use
__syncthreads();

Global memory is slower than shared memory
Registers accessible by one thread is the fastest
#if statements
- at if condition, the threads diverge; its called divergence
- for nested if blocks the warp shceduler pushes a mask indiacting active
threads on to a stack.

Major revision number: 7
Minor revision number: 5
Name: NVIDIA GeForce RTX 2060
TotalGolbalMemory: 2147155968
TotalSharedMem per block: 49152
warpSize: 32
Maximum memory pitch: 2147483647
maxThreadsPerBlock: 1024
max dimension 0 of block: 1024
max dimension 1 of block: 1024
max dimension 2 of block: 64
max dimension 0 of grid: 2147483647
max dimension 1 of grid: 65535
max dimension 2 of grid: 65535
Clock rate: 1845000
Total constant memory: 65536
Texture alignment: 512
Concurrent copy and execution: Yes
Number of multiprocessors: 30

# Optimizations
## Memory access coalescing
  - coalesce global memory access by stride typically of aligned  32 words
  (equal to warp size)
  - reducing number of global memory transactions by warps is one of the keys
    for optimizing execution time
  - use shared memory
    - coalesce shared memory banks access to resolve bank conflicts
    - resolve bank conflicts by memory padding
  - coalesce global memory partitions to resolve partition camping
    - betterway to resolve partition camping than memory padding is diagonal
      block rendering.
      - blockIdx_y = blockIdx.x
      - blockIdx_x = (blockIdx.x+blockIdx.y)%gridDim.x;
__global__ void offset_access(float* a) {
   int tid = blockDim.x * blockIdx.x + threadIdx.x;
   a[tid]=a[tid]+1; // coalesced access
   a[tid+1]=a[tid+1]+1; //uncoalesced access; requires 2 global memory reads
   			//and 2 global memory writes
   a[tid*3]=a[tid*3]+1; //uncoalesced access; requires 3 global memory reads
   			//and 3 global memory writes
}

# naiveRow = without any optimization
load by row
store by col

#nvidia visual profiler
nvvp

# reduction kernel

# Asnynchronous API
cudaStream_t stream;
cudaError_t cudaStreamCreate(cudaStream_t* stream);
cudaError_t cudaStreamDestroy(cudaStream_t stream);
cudaError_t cudaStreamSynchronize(cudaStream_t stream);
cudaError_t cudaStreamQuery(cudaStream_t stream);
 cudaEvent_t stop;
cudaEventCreate(&stop));
cudaError_t cudaMallochost(void**, size);
cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count,
   cudaMemcpyKind kind, cudaStream_t stream = 0);
// it assigns page_locked memory that cannot be swapped to discs
kernel_name<<<grid, block, sharedMemSize, stream>>>(argument list);
cudaEventRecord(stop);
while(cudaEventQuery(stop) == cudaErrorNotReady) {
   counter++;
}
cudaEventDestroy(stop);
cudaFreeHost(h_a);
cudaFree(d_a);
cudaDeviceReset();

# uses of asynchronous api
- overlapped host computation and device computation
- overlapped host computation and host-device data transfer
- overlapped host-device data transfer and device computation
- concurrent device computation

## errors
`LLVM ERROR: failed to annotate CFG`
- CFG: Call/Control Flow Graph