//======================================================== file = bloom1.c ===== //= Program to experiment with Bloom filters for string search match = //============================================================================== //= Notes: = //= 1) Program computes 32-bit CRCs, partitions into 4 8-bit values, and = //= then sets 4 bits in a 256 bit (16 byte) Bloom filter array. The = //= program flow is: = //= - Reads in a first string list to "program" the Bloom filter = //= - Reads in a second string list to do tests in Bloom filter = //= - Outputs the strings from the second list that matched = //= 2) For CRC32 uses the standard "Charles Michael Heard" code from = //= http://cell-relay.indiana.edu/cell-relay/publications/software = //= /CRC which was adapted from the algorithm described by Avarm = //= Perez, "Byte-wise CRC Calculations," IEEE Micro 3, 40 (1983). = //=----------------------------------------------------------------------------= //= Example input (in1.txt and in2.txt used in below example execution): = //= = //= in1.txt: = //= apple pie = //= big box = //= cats and dogs = //= ditto = //= easy does it = //= fun and games = //= good enough = //= = //= in2.txt: = //= apple pie = //= bigger box = //= cats and dogs and mice = //= ditto = //= easy does it for now = //= fun and games and what else = //= good enough = //=----------------------------------------------------------------------------= //= Example execution: = //= = //= ----------------------------------------- bloom1.c ----- = //= - Program to experiment with Bloom filters - = //= -------------------------------------------------------- = //= File name of input list to add to filter ===========> in1.txt = //= File name of input list to check for match =========> in2.txt = //= -------------------------------------------------------- = //= Bloom filter is... = //= 0 1 2 3 4 5 6 7 8 910111213141516171819202122232425262728293031 = //= 00800090240000401801200000100400080C2210030300004000080000082200 = //= -------------------------------------------------------- = //= Matching strings are... = //= apple pie = //= ditto = //= good enough = //= -------------------------------------------------------- = //=----------------------------------------------------------------------------= //= Build: bcc32 bloom1.c = //=----------------------------------------------------------------------------= //= Execute: bloom1 = //=----------------------------------------------------------------------------= //= Author: Ken Christensen = //= University of South Florida = //= WWW: http://www.csee.usf.edu/~christen = //= Email: christen@cse.usf.edu = //=----------------------------------------------------------------------------= //= History: KJC (05/08/06) - Genesis = //============================================================================== //----- Include files ---------------------------------------------------------- #include // Needed for printf() and feof() #include // Needed for exit() #include // Needed for strlen() //----- Type defines ----------------------------------------------------------- typedef unsigned char byte; // Byte is a char typedef unsigned short int word16; // 16-bit word is a short int typedef unsigned int word32; // 32-bit word is an int //----- Constant defines ------------------------------------------------------- #define FALSE 0 // Boolean false #define TRUE 1 // Boolean true #define POLYNOMIAL 0x04C11DB7L // Standard CRC-32 polynomial //----- Global variables ------------------------------------------------------- static word32 CrcTable[256]; // Table of 8-bit CRC32 remainders byte BFilter[32]; // Bloom filter array of 256 bits //----- Function prototypes ---------------------------------------------------- void gen_crc_table(void); word32 update_crc(word32 crc_accum, byte *data_blk_ptr, word32 data_blk_size); void get_int32_bytes(word32 int32, byte *b1, byte *b2, byte *b3, byte *b4); void mapBloom(byte b0, byte b1, byte b2, byte b3); int testBloom(byte b0, byte b1, byte b2, byte b3); //============================================================================== //= Main program = //============================================================================== void main(void) { FILE *fp1; // File pointer to input file #1 FILE *fp2; // File pointer to input file #2 char inFile1[256]; // File name for input file #1 char inFile2[256]; // File name for input file #2 char inString[1024]; // Input string word32 crc32; // CRC32 value byte b0, b1, b2, b3; // CRC32 bytes 0, 1, 2, and 3 int retCode; // Return code (TRUE or FALSE) word32 i; // Loop counter // Output banner printf("----------------------------------------- bloom1.c ----- \n"); printf("- Program to experiment with Bloom filters - \n"); printf("-------------------------------------------------------- \n"); // Clear the Bloom filter for (i=0; i<32; i++) BFilter[i] = 0x00; // Initialize the CRC32 table gen_crc_table(); // Prompt for (and open) filename of list of strings to enter into filter printf("File name of input list to add to filter ===========> "); scanf("%s", inFile1); fp1 = fopen(inFile1, "r"); if (fp1 == NULL) { printf("*** ERROR in opening input file #1 (%s) *** \n", inFile1); exit(1); } // Prompt for (and open) filename of list of strings to check for filter match printf("File name of input list to check for match =========> "); scanf("%s", inFile2); fp2 = fopen(inFile2, "r"); if (fp2 == NULL) { printf("*** ERROR in opening input file #2 (%s) *** \n", inFile2); exit(1); } // Read input file #1 and map each string to Bloom filter while(TRUE) { fgets(inString, 1024, fp1); if (feof(fp1)) break; crc32 = update_crc(0xFFFFFFFF, inString, strlen(inString)); get_int32_bytes(crc32, &b0, &b1, &b2, &b3); mapBloom(b0, b1, b2, b3); } fclose(fp1); // Output the Bloom filter printf("-------------------------------------------------------- \n"); printf("Bloom filter is... \n"); for (i=0; i<32; i++) printf("%2d", i); printf("\n"); for (i=0; i<32; i++) printf("%02X", BFilter[i]); printf("\n"); // Output results header printf("-------------------------------------------------------- \n"); printf("Matching strings are... \n"); // Read input file #2 and test (and output) if match to Bloom filter while(TRUE) { fgets(inString, 1024, fp2); if (feof(fp2)) break; crc32 = update_crc(0xFFFFFFFF, inString, strlen(inString)); get_int32_bytes(crc32, &b0, &b1, &b2, &b3); retCode = testBloom(b0, b1, b2, b3); if (retCode == TRUE) printf("%s", inString); } fclose(fp2); // Output closing trailer printf("-------------------------------------------------------- \n"); } //------------------------------------------------------------------------------ //- Function to initialize CRC32 table - //------------------------------------------------------------------------------ void gen_crc_table(void) { register word32 crc_accum; // CRC32 accumulator register word16 i, j; // Loop counters // Initialize the CRC32 8-bit look-up table for (i=0; i<256; i++) { crc_accum = ((word32) i << 24); for (j=0; j<8; j++) { if (crc_accum & 0x80000000L) crc_accum = (crc_accum << 1) ^ POLYNOMIAL; else crc_accum = (crc_accum << 1); } CrcTable[i] = crc_accum; } } //------------------------------------------------------------------------------ //- Function to generate CRC32 - //------------------------------------------------------------------------------ word32 update_crc(word32 crc_accum, byte *data_blk_ptr, word32 data_blk_size) { register word32 i, j; // Loop counters and index values // Compute CRC32 for data block for (j=0; j> 24) ^ *data_blk_ptr++) & 0xFF; crc_accum = (crc_accum << 8) ^ CrcTable[i]; } crc_accum = ~crc_accum; return crc_accum; } //------------------------------------------------------------------------------ //- Function to extract the four bytes from a 32-bit integer - //------------------------------------------------------------------------------ void get_int32_bytes(word32 int32, byte *b0, byte *b1, byte *b2, byte *b3) { word32 tempWord32; // Temporary Word32 // Extract the 4 bytes of the crc32 tempWord32 = (int32 & 0x000000FF); tempWord32 = tempWord32 >> 0; *b3 = (byte) tempWord32; tempWord32 = (int32 & 0x0000FF00); tempWord32 = tempWord32 >> 8; *b2 = (byte) tempWord32; tempWord32 = (int32 & 0x00FF0000); tempWord32 = tempWord32 >> 16; *b1 = (byte) tempWord32; tempWord32 = (int32 & 0xFF000000); tempWord32 = tempWord32 >> 24; *b0 = (byte) tempWord32; } //------------------------------------------------------------------------------ //- Function to map four 8-bit values into the 256-bit Bloom filter - //------------------------------------------------------------------------------ void mapBloom(byte b0, byte b1, byte b2, byte b3) { byte b; // Byte variable int byteNum; // Byte number int bitNum; // Bit number int i; // Loop counter // Set 4 bits in BFilter[] for (i=0; i<4; i++) { if (i == 0) b = b0; if (i == 1) b = b1; if (i == 2) b = b2; if (i == 3) b = b3; byteNum = b / 8; bitNum = b % 8; if (bitNum == 0) BFilter[byteNum] = BFilter[byteNum] | 0x80; if (bitNum == 1) BFilter[byteNum] = BFilter[byteNum] | 0x40; if (bitNum == 2) BFilter[byteNum] = BFilter[byteNum] | 0x20; if (bitNum == 3) BFilter[byteNum] = BFilter[byteNum] | 0x10; if (bitNum == 4) BFilter[byteNum] = BFilter[byteNum] | 0x08; if (bitNum == 5) BFilter[byteNum] = BFilter[byteNum] | 0x04; if (bitNum == 6) BFilter[byteNum] = BFilter[byteNum] | 0x02; if (bitNum == 7) BFilter[byteNum] = BFilter[byteNum] | 0x01; } } //------------------------------------------------------------------------------ //- Function to test for a Bloom filter match - //------------------------------------------------------------------------------ int testBloom(byte b0, byte b1, byte b2, byte b3) { byte b; // Byte variable int byteNum; // Byte number int bitNum; // Bit number int retCode; // Return code int i; // Loop counter // Initialize retcode to "found" retCode = TRUE; // Check if all 4 bits in BFilter[] are set for (i=0; i<4; i++) { if (i == 0) b = b0; if (i == 1) b = b1; if (i == 2) b = b2; if (i == 3) b = b3; byteNum = b / 8; bitNum = b % 8; if ((bitNum == 0) && !(BFilter[byteNum] & 0x80)) retCode = FALSE; if ((bitNum == 1) && !(BFilter[byteNum] & 0x40)) retCode = FALSE; if ((bitNum == 2) && !(BFilter[byteNum] & 0x20)) retCode = FALSE; if ((bitNum == 3) && !(BFilter[byteNum] & 0x10)) retCode = FALSE; if ((bitNum == 4) && !(BFilter[byteNum] & 0x08)) retCode = FALSE; if ((bitNum == 5) && !(BFilter[byteNum] & 0x04)) retCode = FALSE; if ((bitNum == 6) && !(BFilter[byteNum] & 0x02)) retCode = FALSE; if ((bitNum == 7) && !(BFilter[byteNum] & 0x01)) retCode = FALSE; if (retCode == FALSE) break; } // Return the return code return retCode; }