Mailing List Archive

Re: clamav-devel Digest, Vol 111, Issue 12 AC algorithm optimization
Sorry for late reply.
My implementation as follow:
I modified the original algorithm, when cur state is root, using the 2-gram hash table to accelerate filtering.
In following implementation, building dat table dependent on the Trie structure that had been builded.

References:
http://ieeexplore.ieee.org/xpl/login.jsp?tp=&arnumber=31365&url=http%3A%2F%2Fieeexplore.ieee.org%2Fxpls%2Fabs_all.jsp%3Farnumber%3D31365

---------------------------------------
#include <stdio.h>

#include <stdlib.h>

#include "common.h"

#include "my_dat_ac.h"



#define T_SIZE 655360

#define printf(fmt,arg...)







struct input_list {

unsigned char node;

struct bfs_list *next;

};



enum {

T_MUL_ST = 1, // multi-state

T_SIN_ST, // sigle-state

T_TER_ST, // terminal-state

T_SEP_ST // separate-state

};



struct _state_node {

struct my_ac_patt * list;

uint32_t fail;

};



struct _state_build {

struct input_list *valid_input_bfs;

struct input_list *valid_input_bfs_last;

uint16_t type;

uint8_t from_char;

struct _state_node state;

};



struct _state_build build_new[T_SIZE] = {};

static uint32_t new[T_SIZE] = {};



static uint32_t base[T_SIZE] = {};

static uint32_t check[T_SIZE] = {};

static struct _state_node check_match[T_SIZE] = {};



static uint32_t old[T_SIZE] = {};



static uint32_t dat_hash[65536] = {};



static int my_input_enqueue(struct input_list **bfs, struct input_list **last, unsigned char n)

{

struct input_list *new;



new = (struct input_list *) malloc(sizeof(struct input_list));

if(!new) {

cli_errmsg("bfs_enqueue: Can't allocate memory for input_list\n");

return CL_EMEM;

}

new->next = NULL;

new->node = n;



if(*last) {

(*last)->next = new;

*last = new;

} else {

*bfs = *last = new;

}



return CL_SUCCESS;

}



static int my_input_dequeue(struct input_list **bfs, struct input_list **last)

{

struct input_list *lpt;

int pt;



if(!(lpt = *bfs)) {

return -1;

} else {

*bfs = (*bfs)->next;

pt = lpt->node;

if(lpt == *last)

*last = NULL;

free(lpt);

return pt;

}

}



static int my_input_queue_foreach(struct input_list **bfs)

{

struct input_list *lpt;

int pt;



if(!(lpt = *bfs)) {

return -1;

} else {

*bfs = (*bfs)->next;

pt = lpt->node;

return pt;

}

}



extern struct bitmap_state *get_bit_popcnt_next_state(struct bitmap_state *pcur_state, unsigned char a);

static int max_check_index = 0;

int my_dat_ac_build(struct my_matcher *root)

{

int count1 =0;

struct bfs_list *multi_bfs = NULL, *multi_bfs_last = NULL;

struct bfs_list *sep_bfs = NULL, *sep_bfs_last = NULL;

int res;

uint8_t a;

struct input_list *tmp;

//init value------s : = 1 ; new(s) : = 1 ; queue := { s } ;

int i;

for(i=0;i<T_SIZE;i++)

base[i] = 2;

struct bitmap_state *bitmap_st = root->ac_root;

struct bitmap_state *bitmap_st1;



uint32_t s = bitmap_st->state_id , s1;

new[s] = bitmap_st->state_id;

old[bitmap_st->state_id] = s;

//base[new[s]] = bitmap_st->state_id;

my_bfs_enqueue(&multi_bfs, &multi_bfs_last, (void*)bitmap_st);

/* Routine for multistates */

while((bitmap_st = my_bfs_dequeue(&multi_bfs, &multi_bfs_last)) != (void*)queue_NULL) {

s = (uint32_t)bitmap_st->state_id;

printf("-----M-----%03d(%03d)----------\n", s, new[s]);

/* Let s be the next state in queue */

abcd:

tmp = build_new[s].valid_input_bfs;

while((res = my_input_queue_foreach(&tmp)) != -1) {

a = (uint8_t)res;

if(check[base[new[s]] + a] != 0){

base[new[s]] = base[new[s]] + 1;

goto abcd;

}

}



while((res = my_input_dequeue(&build_new[s].valid_input_bfs, &build_new[s].valid_input_bfs_last)) != -1) {

/*for each a in list[s]*/

a = (uint8_t)res;

// while(check[base[new[s]] + a] != 0){

// base[new[s]] = base[new[s]] + 1;

// }

check[base[new[s]] + a] = new[s];

bitmap_st1 = get_bit_popcnt_next_state(bitmap_st, a);

s1 = bitmap_st1->state_id;

new[s1] = base[new[s]] + a;

old[base[new[s]] + a] = s1;

if(new[s1] > max_check_index){

max_check_index = new[s1];

}

printf("o_stat %03d -%02x- new_stat %03d;;;;%d\n", s1, a, new[s1], ++count1);



//build hash

if(bitmap_st1->level == 2){

uint8_t hash_str[2];

hash_str[0] = build_new[s].from_char;

hash_str[1] = build_new[s1].from_char;

uint16_t hash_val = *(uint16_t*)hash_str;

dat_hash[hash_val] = new[s1];

}



// check_match[new[s1]].fail = new[build_new[s1].state.fail];

// check_match[new[s1]].list = build_new[s1].state.list;



if(build_new[s1].type != T_SEP_ST){

my_bfs_enqueue(&multi_bfs, &multi_bfs_last, (void*)bitmap_st1);

}

else {

my_bfs_enqueue(&sep_bfs, &sep_bfs_last, (void*)bitmap_st1);

}

}

}



/* Routine for single state */

while((bitmap_st = my_bfs_dequeue(&sep_bfs, &sep_bfs_last)) != queue_NULL) {

s = (uint32_t)bitmap_st->state_id;

printf("-----S-----%03d(%03d)----------\n", s, new[s]);

while(build_new[s].type != T_TER_ST){

a = (uint8_t)my_input_dequeue(&build_new[s].valid_input_bfs, &build_new[s].valid_input_bfs_last);

while(check[base[new[s]] + a] != 0){

base[new[s]] = base[new[s]] + 1;

}

check[base[new[s]] + a] = new[s];

bitmap_st1 = get_bit_popcnt_next_state(bitmap_st, a);

s1 = bitmap_st1->state_id;

new[s1] = base[new[s]] + a;

old[base[new[s]] + a] = s1;

if(new[s1] > max_check_index){

max_check_index = new[s1];

}



//build hash

if(bitmap_st1->level == 2){

uint8_t hash_str[2];

hash_str[0] = build_new[s].from_char;

hash_str[1] = build_new[s1].from_char;

uint16_t hash_val = *(uint16_t*)hash_str;

dat_hash[hash_val] = new[s1];

}



printf("o_stat %03d -%02x- new_stat %03d;;;;%d\n", s1, a, new[s1], ++count1);

// check_match[new[s1]].fail = new[build_new[s1].state.fail];

// check_match[new[s1]].list = build_new[s1].state.list;

s = s1;

bitmap_st = bitmap_st1;

}

}

printf("max_check_index %d\n", max_check_index);

for(i = 0;i<=max_check_index;i++)

{

check_match[i].fail = new[build_new[old[i]].state.fail];

check_match[i].list = build_new[old[i]].state.list;

}



return 0;

}

static unsigned int fail_count[32] = {};

uint32_t my_dat_ac_fail_proc(unsigned char a, uint32_t s)

{

#if 0

// return 1;

// uint32_t t;

// s = check_match[s].fail;

// t = a + base[s];

// if(check[t] != s)

// return 1;

// else

// return t;



static int tdfsdfasdf = 1;

tdfsdfasdf++;

return 1;

#else

uint32_t t;

int i = 0;

do{

fail_count[i++]++;

if(s == 1) return s;

s = check_match[s].fail;

t = a + base[s];

}while(check[t] != s);

return t;

#endif

}



static uint32_t st_hash_fail_count = 0;

static uint32_t st_hash_ref_count = 0;

uint32_t my_dat_ac_fail_proc_hash(uint32_t s, uint8_t *buffer, uint32_t *i, uint32_t length)

{

uint32_t t;

int k = 0;

uint32_t j;

do{

fail_count[k++]++;

if(s == 1){//root

loop:

j = *i;

//hash search

for(;j<length-1;j++)

{

st_hash_ref_count++;

if(dat_hash[*(uint16_t*)(buffer+j)]){

*i = j+1;

return dat_hash[*(uint16_t*)(buffer+j)];

}

st_hash_fail_count++;

}

*i = j+1;

return 0;

}

s = check_match[s].fail;

if(s == 1)

goto loop;

t = buffer[*i] + base[s];

}while(check[t] != s);

return t;

}



int my_dat_ac_scanbuff(const unsigned char *buffer, uint32_t length,const struct my_matcher *root)

{

int i;

// for(i = 1;i< root->ac_nodes+1;i++)

// {

// //printf("old s %03d 's type is %d\n", i, build_new[i].type);

// printf("old s %03d new_stat is %d\n", i, new[i]);

// }

// for(i = 0;i< T_SIZE;i++)

// {

// if(check_match[i].fail == 0)

// printf("stat %05d fail null[%d]\n", old[i], i);

// }

// fprintf(stdout, "trans num: %d\n", my_ac_trans_sum(root));

//return 0;

memset((void*)fail_count, 0x00, sizeof(fail_count));

uint32_t s = 1, t;

printf("my_dat_ac_scanbuff\n", i, build_new[i].type);

int count = 0;

for (i = 0; i < length; i++) {

t = buffer[i] + base[s];

if(check[t] != s){

//fail

//t = my_dat_ac_fail_proc(buffer[i], s);

t = my_dat_ac_fail_proc_hash(s, buffer, &i, length);

//continue;

}

// if(s == 1)

// st_hash_ref_count++;

printf("%05d --%02x--> %05d\n", old[s], buffer[i], old[t]);

s=t;

if(check_match[s].list){

//final

//fprintf(stdout, "hit:sigid %d:pos %d:\n", check_match[s].list->sigid, i);

count++;

}

}

// fprintf(stdout, "hit count %d, st_hash_fail_count %u st_hash_ref_count %u\n", count, st_hash_fail_count, st_hash_ref_count);

// for(i = 0;i<8;i++)

// fprintf(stdout, "%u,", fail_count[i]);

// fprintf(stdout, "\n");

return s;

}



int my_dat_update_input(uint32_t s, unsigned char a, uint32_t t)

{

printf("id:%03d -- %02x -- id:%03d\n", s, a, t);

my_input_enqueue(&build_new[s].valid_input_bfs, &build_new[s].valid_input_bfs_last, a);

build_new[t].from_char = a;

return 0;

}



int my_dat_update_patlist(uint32_t s, struct my_ac_patt *list)

{

build_new[s].state.list = list;

return 0;

}



int my_dat_update_fail(uint32_t s,uint32_t fail)

{

build_new[s].state.fail = fail;

return 0;

}



int my_dat_update_terminal(uint32_t s)

{

build_new[s].type = T_TER_ST;

return 0;

}



int my_dat_update_type(struct bitmap_state **st_array, uint16_t array_len)

{

int i = array_len-1;



if(build_new[st_array[i]->state_id].type !=0)

return 0;



int flag = 0;

for(; i>=0;i--)

{

if(st_array[i]->sumary[_bitmap_group] > 1 || flag)

{

build_new[st_array[i]->state_id].type = T_MUL_ST;

flag = 1;

}

else

build_new[st_array[i]->state_id].type = T_SEP_ST;

}

return 0;

}





_______________________________________________
http://lurker.clamav.net/list/clamav-devel.html
Please submit your patches to our Bugzilla: http://bugs.clamav.net