#include <iostream>
#include <cstdlib>
#include <stdio.h>

using namespace std;

// Kernel: linear search for the value `y` in the array `x` of `n` ints.
//
// Every index i in [0, n) with x[i] == y is reported via device-side printf
// (one line per match; the order of lines across threads is unspecified).
//
// Launch layout: 1D grid of 1D blocks, any size. The loop below is a
// grid-stride loop, so the threads of the whole grid share the work and each
// element is examined exactly once regardless of the launch configuration.
// No shared memory is used.
//
// Parameters:
//   n - number of elements in x; n <= 0 makes the kernel a no-op
//   x - device-accessible pointer to at least n ints (only read)
//   y - value to search for
__global__ void find(int n, int *x, int y)
{
  // Index arithmetic is done in 64 bits so that blockIdx.x * blockDim.x and
  // the running index cannot wrap around for large grids.
  const long long stride = (long long)gridDim.x * blockDim.x;
  const long long first = (long long)blockIdx.x * blockDim.x + threadIdx.x;

  for (long long i = first; i < n; i += stride){
      if (x[i] == y){
        // i < n here, so it always fits in an int for the %d conversion.
        printf("Value at index:%d \n", (int)i);
      }
  }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char *what)
{
  if (status == cudaSuccess) {
    return true;
  }
  fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
  return false;
}

// Fills a managed array with x[i] = i for 2^20 elements, then launches `find`
// on the GPU to search it for the value 1000000.
// Returns EXIT_SUCCESS, or EXIT_FAILURE if any CUDA call reported an error.
int main(void)
{
  const int N = 1<<20; // 2^20 elements

  // Managed (unified) memory: written by the host below, read by the kernel.
  int *x = NULL;
  if (!cudaSucceeded(cudaMallocManaged(&x, N*sizeof(int)), "cudaMallocManaged")) {
    return EXIT_FAILURE;
  }

  // initialize x array on the host
  for (int i = 0; i < N; i++) {
    x[i] = i;
  }

  // Run kernel on 1M elements on the GPU (one block of 256 threads)
  find<<<1,256>>>(N,x,1000000);

  // A kernel launch returns no status itself; configuration errors are
  // reported through cudaGetLastError().
  bool ok = cudaSucceeded(cudaGetLastError(), "find kernel launch");

  // Explicitly wait for the kernel to finish: this surfaces execution errors
  // and flushes the device-side printf output before the buffer is released.
  ok = cudaSucceeded(cudaDeviceSynchronize(), "find kernel execution") && ok;

  // Free memory
  ok = cudaSucceeded(cudaFree(x), "cudaFree") && ok;

  return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}
