
#include <stdio.h>
#include <string.h>
#include "text.h"

#define NB_ASCII_CHAR 128

/*
 * Counts how often each 7-bit ASCII code occurs in str[0 .. size) and adds
 * the counts to histo.
 *
 * Every block first accumulates into its own shared-memory histogram and
 * then merges it into histo, so the global atomics are limited to at most
 * one per bin per block.
 *
 * Parameters:
 *   str   - device pointer to the input bytes.
 *   size  - number of bytes readable through str.
 *   histo - device pointer to at least 128 int bins. Counts are ADDED to the
 *           existing values, so the caller must zero the bins beforehand.
 *
 * Launch layout: only the x dimension of the grid and of the block is used;
 * any 1D grid / block size is valid because both loops stride over the data.
 * Bytes outside 0..127 have no bin and are ignored.
 */
__global__
void histo_kernel( char *str, long size, int *histo ) {
    // One bin per 7-bit ASCII code. Must match the number of bins the host
    // allocates for histo.
    const int kNumBins = 128;

    __shared__ int block_histo[kNumBins];

    // Shared memory starts uninitialised: clear the block-private bins.
    for (int bin = threadIdx.x; bin < kNumBins; bin += blockDim.x) {
        block_histo[bin] = 0;
    }
    __syncthreads();

    // Index in the same width as size so large inputs cannot overflow it.
    long i = threadIdx.x + (long)blockIdx.x * blockDim.x;
    const long stride = (long)blockDim.x * gridDim.x;

    while (i < size) {
        // Read the byte as unsigned so a char with the high bit set never
        // turns into a negative index.
        const int c = (unsigned char)str[i];
        if (c < kNumBins) {
            atomicAdd( &block_histo[c], 1 );
        }
        i += stride;
    }

    // All of this block's increments must land before the merge reads them.
    __syncthreads();

    for (int bin = threadIdx.x; bin < kNumBins; bin += blockDim.x) {
        const int count = block_histo[bin];
        if (count != 0) {
            atomicAdd( &histo[bin], count );
        }
    }
}

/*
 * Evaluates a CUDA runtime call from inside main(). On failure it prints the
 * source location and the runtime's error string to stderr and makes main()
 * return 1. Only usable in a function that returns int.
 */
#define HISTO_CUDA_CHECK(call)                                               \
    do {                                                                     \
        cudaError_t histo_err_ = (call);                                     \
        if (histo_err_ != cudaSuccess) {                                     \
            fprintf( stderr, "CUDA error at %s:%d: %s\n", __FILE__,          \
                     __LINE__, cudaGetErrorString( histo_err_ ) );           \
            return 1;                                                        \
        }                                                                    \
    } while (0)

/*
 * Builds a histogram of the characters of h_str on the GPU, prints the kernel
 * time and the histogram sum, then re-counts on the CPU and reports every bin
 * where the two counts differ.
 *
 * Returns 0 when the run completes (even if mismatches were reported) and 1
 * when a CUDA runtime call fails.
 */
int main( void ) {
    const size_t len = strlen(h_str);
    printf("len:%zu\n", len);
    const size_t size = len * sizeof(char);

    // allocate memory on the GPU for the file's data
    char *d_str;
    int *dev_histo;
    // Request at least one byte so d_str is a real allocation even when the
    // text is empty.
    HISTO_CUDA_CHECK( cudaMalloc( (void**)&d_str, size ? size : 1 ) );
    HISTO_CUDA_CHECK( cudaMemcpy( d_str, h_str, size, cudaMemcpyHostToDevice ) );

    HISTO_CUDA_CHECK( cudaMalloc( (void**)&dev_histo, NB_ASCII_CHAR * sizeof( int ) ) );
    // The kernel adds to the bins, so they have to start at zero.
    HISTO_CUDA_CHECK( cudaMemset( dev_histo, 0, NB_ASCII_CHAR * sizeof( int ) ) );

    // kernel launch - 2x the number of mps gave best timing
    cudaDeviceProp  prop;
    HISTO_CUDA_CHECK( cudaGetDeviceProperties( &prop, 0 ) );
    int blocks = prop.multiProcessorCount;

    // capture the start time
    // starting the timer here so that we include the cost of
    // all of the operations on the GPU.
    cudaEvent_t     start, stop, start_cpu, stop_cpu;
    HISTO_CUDA_CHECK( cudaEventCreate( &start ) );
    HISTO_CUDA_CHECK( cudaEventCreate( &stop ) );
    HISTO_CUDA_CHECK( cudaEventCreate( &start_cpu ) );
    HISTO_CUDA_CHECK( cudaEventCreate( &stop_cpu ) );
    HISTO_CUDA_CHECK( cudaEventRecord( start, 0 ) );

    histo_kernel<<<blocks*2,256>>>( d_str, (long)size, dev_histo );
    // A launch does not return a status itself; a bad configuration only
    // shows up through cudaGetLastError().
    HISTO_CUDA_CHECK( cudaGetLastError() );

    // get stop time, and display the timing results
    HISTO_CUDA_CHECK( cudaEventRecord( stop, 0 ) );
    HISTO_CUDA_CHECK( cudaEventSynchronize( stop ) );
    float   elapsedTime;
    HISTO_CUDA_CHECK( cudaEventElapsedTime( &elapsedTime, start, stop ) );
    printf( "Time to compute:  %f ms\n", elapsedTime );

    int    histo[NB_ASCII_CHAR];
    HISTO_CUDA_CHECK( cudaMemcpy( histo, dev_histo, NB_ASCII_CHAR * sizeof( int ), cudaMemcpyDeviceToHost ) );

    long histoCount = 0;
    for (int i = 0; i < NB_ASCII_CHAR; i++) {
        histoCount += histo[i];
    }
    printf( "Histogram Sum:  %ld\n", histoCount );

    HISTO_CUDA_CHECK( cudaEventRecord( start_cpu, 0 ) );
    // verify that we have the same counts via CPU: subtract the CPU count
    // from the GPU count so that every bin ends at zero when they agree
    for (size_t i = 0; i < len; i++) {
        // Index with the unsigned byte value and stay inside the bins; a
        // plain char may be negative or larger than the last bin.
        const unsigned char c = (unsigned char)h_str[i];
        if (c < NB_ASCII_CHAR) {
            histo[c]--;
        }
    }
    HISTO_CUDA_CHECK( cudaEventRecord( stop_cpu, 0 ) );
    HISTO_CUDA_CHECK( cudaEventSynchronize( stop_cpu ) );
    HISTO_CUDA_CHECK( cudaEventElapsedTime( &elapsedTime, start_cpu, stop_cpu ) );
    printf( "Time to compute cpu:  %f ms\n", elapsedTime );

    for (int i = 0; i < NB_ASCII_CHAR; i++) {
        if (histo[i] != 0){
            printf( "Failure at %d!  Off by %d\n", i, histo[i] );
        }
    }

    // Release all four events, not just the GPU timing pair.
    HISTO_CUDA_CHECK( cudaEventDestroy( start ) );
    HISTO_CUDA_CHECK( cudaEventDestroy( stop ) );
    HISTO_CUDA_CHECK( cudaEventDestroy( start_cpu ) );
    HISTO_CUDA_CHECK( cudaEventDestroy( stop_cpu ) );
    HISTO_CUDA_CHECK( cudaFree( dev_histo ) );
    HISTO_CUDA_CHECK( cudaFree( d_str ) );
    return 0;
}

#undef HISTO_CUDA_CHECK
