#include <iostream>
#include <cstdlib>
#include <stdio.h>

using namespace std;

// Kernel: reports every position in x[0..n) whose value equals y.
//
// Launch layout: 1D grid of 1D blocks, any size. Each thread starts at its
// global thread index and advances by the total number of launched threads
// (grid-stride loop), so the result does not depend on the launch shape.
//
// Parameters:
//   n - number of elements readable through x; nothing is read when n <= 0
//   x - pointer dereferenced on the device; must be valid for n ints
//   y - value to search for
//
// Output: one device-side printf line per matching index. The order of lines
// produced by different threads is unspecified.
__global__ void find(int n, int *x, int y)
{
  // Index arithmetic is done in 64 bits: with 32-bit ints, `i += stride`
  // could overflow (undefined behaviour) once i gets close to INT_MAX, and
  // blockIdx.x * blockDim.x could wrap on very large grids.
  const long long count  = n;
  const long long stride = (long long)blockDim.x * gridDim.x;
  const long long first  = (long long)blockIdx.x * blockDim.x + threadIdx.x;

  for (long long i = first; i < count; i += stride){
      if (x[i] == y){
        // i < n here, so narrowing back to int for "%d" is lossless.
        printf("Value at index:%d \n", (int)i);
      }
  }
}


// Terminates the program with a diagnostic when a CUDA runtime call did not
// return cudaSuccess. `what` names the operation for the error message.
static void checkCuda(cudaError_t status, const char *what)
{
  if (status != cudaSuccess){
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Fills a host array with 0..N-1, copies it to the device and launches `find`
// to look for the value 1000000. Returns 0 on success; exits with
// EXIT_FAILURE if any CUDA call fails.
int main(void){

  const int N = 1<<20;
  const size_t bytes = N*sizeof(int);
  int *d_a = NULL;
  int *a = new int[N];

  // Element i holds the value i, so a match is reported at index == value.
  for (int i=0; i<N; i++){
    a[i] = i;
  }

  checkCuda(cudaMalloc((void **) &d_a, bytes), "cudaMalloc");

  checkCuda(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy (host to device)");

  find<<<4096,256>>>(N, d_a, 1000000);
  // A kernel launch returns no status itself: configuration errors are
  // reported through cudaGetLastError().
  checkCuda(cudaGetLastError(), "find kernel launch");
  // Wait for the kernel to finish. This surfaces faults raised while it ran
  // and flushes the device-side printf buffer, which is not flushed
  // automatically at program exit.
  checkCuda(cudaDeviceSynchronize(), "find kernel execution");

  checkCuda(cudaFree(d_a), "cudaFree");
  delete [] a;

  return 0;
}