Loading Hash/src/config.cpp +659 −505 File changed.Preview size limit exceeded, changes collapsed. Show changes Hash/src/user.cpp +267 −236 Original line number Diff line number Diff line Loading @@ -9,27 +9,22 @@ long int max_dimension = -1; // used for restriction on dimensionality of vector data (used in i.e. print and distance functions). void restrict_dimension(long int _max_dim) { max_dimension = _max_dim; } //******************************************************** //******************************************************** // vector data read, print, distance functions, structures //******************************************************** //******************************************************** T_DISTANCE get_max_distance_float_vector() { return MAXIMUM_DISTANCE; } T_DISTANCE dist_func_L1_float_vector(void *p1, long int p1_len, void *p2, long int p2_len, T_DISTANCE threshold) { // manhattan Loading @@ -46,7 +41,8 @@ T_DISTANCE dist_func_L1_float_vector(void *p1, long int p1_len, void *p2, long i d1 = (float *)p1; d2 = (float *)p2; for(long int i = 0; i < dim; i++) { for (long int i = 0; i < dim; i++) { if (d1[i] < d2[i]) ret += (d2[i] - d1[i]); else Loading @@ -56,9 +52,6 @@ T_DISTANCE dist_func_L1_float_vector(void *p1, long int p1_len, void *p2, long i return ret; } T_DISTANCE dist_func_L2_float_vector(void *p1, long int p1_len, void *p2, long int p2_len, T_DISTANCE threshold) { float *d1, *d2; Loading @@ -75,7 +68,8 @@ T_DISTANCE dist_func_L2_float_vector(void *p1, long int p1_len, void *p2, long i d1 = (float *)p1; d2 = (float *)p2; for(long int i = 0; i < dim; i++) { for (long int i = 0; i < dim; i++) { diff = d2[i] - d1[i]; ret += diff * diff; } Loading @@ -85,9 +79,6 @@ T_DISTANCE dist_func_L2_float_vector(void *p1, long int p1_len, void *p2, long i return ret; } int read_obj_func_float_vector(FILE *f_in, long int dimensions, P_VOID &data, long int &data_len) { // this functions reads object's data from file f_in Loading @@ -103,14 +94,16 @@ int read_obj_func_float_vector(FILE *f_in, long int dimensions, P_VOID &data, lo data = malloc(data_len); d = (float *)data; while ( (!feof(f_in)) && (i < dimensions) ) { while ((!feof(f_in)) && (i < dimensions)) { if (fscanf(f_in, "%f", &tmp) == EOF) break; d[i] = tmp; ++i; } if (i < dimensions) { if (i < dimensions) { if (data) free(data); data = NULL; Loading @@ -120,9 +113,6 @@ int read_obj_func_float_vector(FILE *f_in, long int dimensions, P_VOID &data, lo return 0; } void print_obj_func_float_vector(void *data, long int data_len, FILE *f_out, unsigned int delim, const char *txt_pre, long int obj_id, int is_colored, const char *txt_post) { // data, data_len - data of object Loading @@ -133,7 +123,8 @@ void print_obj_func_float_vector(void *data, long int data_len, FILE *f_out, uns float *d; long int dim; if (txt_pre) { if (txt_pre) { const char *c; c = strchr(txt_pre, '%'); if (c == NULL || c[1] != 'c') Loading @@ -147,7 +138,8 @@ void print_obj_func_float_vector(void *data, long int data_len, FILE *f_out, uns if (max_dimension != -1 && dim > max_dimension) dim = max_dimension; if (dim) { if (dim) { fprintf(f_out, "%f", d[0]); for (long int i = 1; i < dim; i++) fprintf(f_out, "%c%f", delim, d[i]); Loading @@ -157,33 +149,29 @@ void print_obj_func_float_vector(void *data, long int data_len, FILE *f_out, uns fputs(txt_post, f_out); } //******************************************************** //******************************************************** // URL read, print, distance functions, structures //******************************************************** //******************************************************** struct t_url { struct t_url { long int next_off; char url[1]; }; struct t_url_data { struct t_url_data { long int cnt; t_url urls[1]; }; T_DISTANCE get_max_distance_urls() { return (T_DISTANCE)1; } T_DISTANCE dist_func_url_sets(void *p1, long int p1_len, void *p2, long int p2_len, T_DISTANCE threshold) { // #define UNSORTED_VERSION Loading @@ -198,7 +186,8 @@ T_DISTANCE dist_func_url_sets(void *p1, long int p1_len, void *p2, long int p2_l cnt_p1 = ((t_url_data *)p1)->cnt; cnt_p2 = ((t_url_data *)p2)->cnt; if (cnt_p1 < cnt_p2) { if (cnt_p1 < cnt_p2) { void *tmp = p1; long int ltmp = p1_len; p1 = p2; Loading @@ -210,26 +199,30 @@ T_DISTANCE dist_func_url_sets(void *p1, long int p1_len, void *p2, long int p2_l cnt_p2 = ltmp; } // evaluate size of intersect of p1 and p2 off1 = (long int *)(((t_url_data *)p1)->urls); for(i = 0; i < cnt_p1; i++) { for (i = 0; i < cnt_p1; i++) { url1 = (char *)off1 + sizeof(long int); off2 = (long int *)(((t_url_data *)p1)->urls); for (j = 0; j < i; j++) { for (j = 0; j < i; j++) { url2 = (char *)off2 + sizeof(long int); if (strcmp(url1, url2) == 0) break; off2 = (long int *)((char *)p1 + *off2); } if (j == i) { // the same url was not found in p1 object if (j == i) { // the same url was not found in p1 object off2 = (long int *)(((t_url_data *)p2)->urls); for (j = 0; j < cnt_p2; j++) { for (j = 0; j < cnt_p2; j++) { url2 = (char *)off2 + sizeof(long int); if (strcmp(url1, url2) == 0) { if (strcmp(url1, url2) == 0) { ++cnt_inter; break; } Loading @@ -243,11 +236,13 @@ T_DISTANCE dist_func_url_sets(void *p1, long int p1_len, void *p2, long int p2_l // evaluate size of union of p1 and p2 off1 = (long int *)(((t_url_data *)p1)->urls); for(i = 0; i < cnt_p1; i++) { for (i = 0; i < cnt_p1; i++) { url1 = (char *)off1 + sizeof(long int); off2 = (long int *)(((t_url_data *)p1)->urls); for (j = 0; j < i; j++) { for (j = 0; j < i; j++) { url2 = (char *)off2 + sizeof(long int); if (strcmp(url1, url2) == 0) break; Loading @@ -261,23 +256,27 @@ T_DISTANCE dist_func_url_sets(void *p1, long int p1_len, void *p2, long int p2_l off1 = (long int *)(((t_url_data *)p2)->urls); for(i = 0; i < cnt_p2; i++) { for (i = 0; i < cnt_p2; i++) { url1 = (char *)off1 + sizeof(long int); // search in p2 off2 = (long int *)(((t_url_data *)p2)->urls); for (j = 0; j < i; j++) { for (j = 0; j < i; j++) { url2 = (char *)off2 + sizeof(long int); if (strcmp(url1, url2) == 0) break; off2 = (long int *)((char *)p2 + *off2); } if (j == i) { if (j == i) { // search in p1 off2 = (long int *)(((t_url_data *)p1)->urls); for (j = 0; j < cnt_p1; j++) { for (j = 0; j < cnt_p1; j++) { url2 = (char *)off2 + sizeof(long int); if (strcmp(url1, url2) == 0) break; Loading Loading @@ -315,20 +314,24 @@ T_DISTANCE dist_func_url_sets(void *p1, long int p1_len, void *p2, long int p2_l url2 = (char *)off2 + sizeof(long int); i = 0; j = 0; while (i < cnt_p1 && j < cnt_p2) { while (i < cnt_p1 && j < cnt_p2) { cmp = strcmp(url1, url2); if (cmp == 0) { // url1 and url2 are the same if (cmp == 0) { // url1 and url2 are the same ++cnt_inter; off1 = (long int *)((char *)p1 + *off1); url1 = (char *)off1 + sizeof(long int); ++i; } else if (cmp < 0) { // url1 is less than url2 else if (cmp < 0) { // url1 is less than url2 off1 = (long int *)((char *)p1 + *off1); url1 = (char *)off1 + sizeof(long int); ++i; } else { // cmp > 0 // url2 is less than url1 else { // cmp > 0 // url2 is less than url1 off2 = (long int *)((char *)p2 + *off2); url2 = (char *)off2 + sizeof(long int); ++j; Loading @@ -345,8 +348,6 @@ T_DISTANCE dist_func_url_sets(void *p1, long int p1_len, void *p2, long int p2_l #endif } void print_obj_func_urls(void *data, long int data_len, FILE *f_out, unsigned int delim, const char *txt_pre, long int obj_id, int is_colored, const char *txt_post) { Loading @@ -359,7 +360,8 @@ void print_obj_func_urls(void *data, long int data_len, FILE *f_out, unsigned in char *url; long int *off; if (txt_pre) { if (txt_pre) { const char *c; c = strchr(txt_pre, '%'); if (c == NULL || c[1] != 'c') Loading @@ -368,13 +370,13 @@ void print_obj_func_urls(void *data, long int data_len, FILE *f_out, unsigned in fprintf(f_out, txt_pre, (is_colored) ? '*' : ' ', obj_id); } cnt = ((t_url_data *)data)->cnt; fprintf(f_out, "%ld", cnt); off = (long int *)(((t_url_data *)data)->urls); for(long int i = 0; i < cnt; i++) { for (long int i = 0; i < cnt; i++) { url = (char *)off + sizeof(long int); fputc(delim, f_out); fputs(url, f_out); Loading @@ -385,8 +387,6 @@ void print_obj_func_urls(void *data, long int data_len, FILE *f_out, unsigned in fputs(txt_post, f_out); } int read_obj_func_urls(FILE *f_in, long int dimensions, P_VOID &data, long int &data_len) { // this functions reads object's data from file f_in Loading @@ -400,13 +400,15 @@ int read_obj_func_urls(FILE *f_in, long int dimensions, P_VOID &data, long int & char *url; long int *off; if (feof(f_in)) { if (feof(f_in)) { data = NULL; data_len = 0L; return 1; } if (fscanf(f_in, "%ld", &cnt) == EOF) { if (fscanf(f_in, "%ld", &cnt) == EOF) { data = NULL; data_len = 0L; return 1; Loading @@ -416,18 +418,18 @@ int read_obj_func_urls(FILE *f_in, long int dimensions, P_VOID &data, long int & allocated = 64 * 1024L; data = malloc(allocated); data_len = sizeof(long int); ((t_url_data *)data)->cnt = cnt; // save count of urls url = (char *)&(((t_url_data *)data)->urls); for (long int i = 0; i < cnt; i++) { // read individual urls for (long int i = 0; i < cnt; i++) { // read individual urls off = (long int *)url; url += sizeof(long int); data_len += sizeof(long int); while ((c = fgetc(f_in)) != ' ' && c != EOF && c != '\r' && c != '\n') { while ((c = fgetc(f_in)) != ' ' && c != EOF && c != '\r' && c != '\n') { url[0] = c; ++url; ++data_len; Loading @@ -436,7 +438,8 @@ int read_obj_func_urls(FILE *f_in, long int dimensions, P_VOID &data, long int & ++url; ++data_len; *off = url - (char *)data; // set offset if (allocated - data_len < 1024) { // increase allocated memory if (allocated - data_len < 1024) { // increase allocated memory long int l_off, l_url; l_off = (char *)off - (char *)data; Loading @@ -452,23 +455,17 @@ int read_obj_func_urls(FILE *f_in, long int dimensions, P_VOID &data, long int & return 0; } //******************************************************** //******************************************************** // TEXT read, print, distance functions, structures //******************************************************** //******************************************************** T_DISTANCE get_max_distance_text() { return MAXIMUM_DISTANCE; } T_DISTANCE dist_func_text(void *p1, long int p1_len, void *p2, long int p2_len, T_DISTANCE threshold) { int *d; // pointer to vector Loading @@ -487,9 +484,11 @@ T_DISTANCE dist_func_text(void *p1, long int p1_len, void *p2, long int p2_len, --p2_len; // aproximate distance if (threshold != UNUSED_DISTANCE) { if (threshold != UNUSED_DISTANCE) { dist = (T_DISTANCE)fabs(p1_len - p2_len); if (threshold < dist) { // dist is the minimum bound, the edit distance cannot be lower if (threshold < dist) { // dist is the minimum bound, the edit distance cannot be lower #ifdef HASH_DEBUG fprintf(stderr, "%8.2f < %8.2f\n", threshold, dist); #endif Loading @@ -501,10 +500,12 @@ T_DISTANCE dist_func_text(void *p1, long int p1_len, void *p2, long int p2_len, // Step 1 if (p1_len <= 0) { if (p1_len <= 0) { return p2_len; } if (p2_len <= 0) { if (p2_len <= 0) { return p1_len; } d = (int *)malloc((p2_len + 1) * sizeof(int)); Loading @@ -517,7 +518,8 @@ T_DISTANCE dist_func_text(void *p1, long int p1_len, void *p2, long int p2_len, // Step 3 s_i = (char *)p1; for (i = 1; i <= p1_len; i++) { for (i = 1; i <= p1_len; i++) { d[0] = i; diag_prev = i - 1; Loading @@ -525,7 +527,8 @@ T_DISTANCE dist_func_text(void *p1, long int p1_len, void *p2, long int p2_len, // Step 4 t_j = (char *)p2; for (j = 1; j <= p2_len; j++) { for (j = 1; j <= p2_len; j++) { // Step 5 Loading Loading @@ -559,8 +562,6 @@ T_DISTANCE dist_func_text(void *p1, long int p1_len, void *p2, long int p2_len, return dist; } T_DISTANCE dist_func_text_length(void *p1, long int p1_len, void *p2, long int p2_len, T_DISTANCE threshold) { if (p1_len > p2_len) Loading @@ -569,8 +570,6 @@ T_DISTANCE dist_func_text_length(void *p1, long int p1_len, void *p2, long int p return p2_len - p1_len; } void print_obj_func_text(void *data, long int data_len, FILE *f_out, unsigned int delim, const char *txt_pre, long int obj_id, int is_colored, const char *txt_post) { // data, data_len - data of object Loading @@ -578,7 +577,8 @@ void print_obj_func_text(void *data, long int data_len, FILE *f_out, unsigned in // txt - additional comment, it's printed out before object's data, this text is used as formating argument to printf function // obj_id - this parameter is used as a parameter to printf function together with txt parameter (%ld can be used in txt for priting out) // is_colored - this parameter indicates whether the object is colored (1) or not (0) (for details see self-similarity join algo) if (txt_pre) { if (txt_pre) { const char *c; c = strchr(txt_pre, '%'); if (c == NULL || c[1] != 'c') Loading @@ -587,15 +587,12 @@ void print_obj_func_text(void *data, long int data_len, FILE *f_out, unsigned in fprintf(f_out, txt_pre, (is_colored) ? '*' : ' ', obj_id); } fputs((char *)data, f_out); if (txt_post) fputs(txt_post, f_out); } int read_obj_func_text(FILE *f_in, long int dimensions, P_VOID &data, long int &data_len) { // this functions reads object's data from file f_in Loading @@ -607,7 +604,8 @@ int read_obj_func_text(FILE *f_in, long int dimensions, P_VOID &data, long int & char c; char *txt; if (feof(f_in)) { if (feof(f_in)) { data = NULL; data_len = 0L; return 1; Loading @@ -618,16 +616,19 @@ int read_obj_func_text(FILE *f_in, long int dimensions, P_VOID &data, long int & data_len = 0; txt = (char *)data; while ((c = fgetc(f_in)) != EOF && c != '\x0a' && c != '\x0d') { while ((c = fgetc(f_in)) != EOF && c != '\x0a' && c != '\x0d') { txt[data_len] = c; ++data_len; if (allocated - data_len < 1) { // increase allocated memory if (allocated - data_len < 1) { // increase allocated memory allocated += 16 * 1024; data = realloc(data, allocated); txt = (char *)data; } } if (data_len == 0 && c == EOF) { if (data_len == 0 && c == EOF) { free(data); return 1; } Loading @@ -635,7 +636,8 @@ int read_obj_func_text(FILE *f_in, long int dimensions, P_VOID &data, long int & txt[data_len] = '\0'; ++data_len; if (c == '\x0d') { // read LF following CR if (c == '\x0d') { // read LF following CR c = fgetc(f_in); if (c != '\x0a') ungetc(c, f_in); Loading @@ -653,11 +655,13 @@ int read_obj_func_text(FILE *f_in, long int dimensions, P_VOID &data, long int & static int weightInsDelDNA = 3; void setWeightInsDelDNA(int w) { void setWeightInsDelDNA(int w) { weightInsDelDNA = w; } int getWeightInsDelDNA() { int getWeightInsDelDNA() { return weightInsDelDNA; } Loading Loading @@ -685,41 +689,64 @@ static int changeMatrixDNA[23][23] = { {3, 5, 4, 5, 6, 5, 5, 5, 6, 1, 3, 5, 2, 5, 4, 3, 4, 9, 5, 0, 3, 2, 2}, {3, 3, 3, 3, 6, 3, 3, 3, 3, 3, 3, 3, 3, 6, 3, 3, 3, 6, 4, 3, 0, 2, 2}, {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2}, {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0 } }; {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0}}; inline int getCharMatrixIndexDNA(char chr) { switch (chr) { case 'A': return 0; case 'R': return 1; case 'N': return 2; case 'D': return 3; case 'C': return 4; case 'Q': return 5; case 'E': return 6; case 'G': return 7; case 'H': return 8; case 'I': return 9; case 'L': return 10; case 'K': return 11; case 'M': return 12; case 'F': return 13; case 'P': return 14; case 'S': return 15; case 'T': return 16; case 'W': return 17; case 'Y': return 18; case 'V': return 19; case 'X': return 20; case 'Z': return 21; case 'B': return 22; inline int getCharMatrixIndexDNA(char chr) { switch (chr) { case 'A': return 0; case 'R': return 1; case 'N': return 2; case 'D': return 3; case 'C': return 4; case 'Q': return 5; case 'E': return 6; case 'G': return 7; case 'H': return 8; case 'I': return 9; case 'L': return 10; case 'K': return 11; case 'M': return 12; case 'F': return 13; case 'P': return 14; case 'S': return 15; case 'T': return 16; case 'W': return 17; case 'Y': return 18; case 'V': return 19; case 'X': return 20; case 'Z': return 21; case 'B': return 22; } printf("UNKNOWN LETTER '%c' IN THE DNA SEQUENCE!!! Aborting program.\n", chr); exit(-1); // return -1; } T_DISTANCE dist_func_dna(void *p1, long int p1_len, void *p2, long int p2_len, T_DISTANCE threshold) { int *d; // pointer to vector Loading Loading @@ -753,10 +780,12 @@ T_DISTANCE dist_func_dna(void *p1, long int p1_len, void *p2, long int p2_len, T // Step 1 if (p1_len <= 0) { if (p1_len <= 0) { return p2_len * weightInsDelDNA; } if (p2_len <= 0) { if (p2_len <= 0) { return p1_len * weightInsDelDNA; } d = (int *)malloc((p2_len + 1) * sizeof(int)); Loading @@ -769,7 +798,8 @@ T_DISTANCE dist_func_dna(void *p1, long int p1_len, void *p2, long int p2_len, T // Step 3 s_i = (char *)p1; for (i = 1; i <= p1_len; i++) { for (i = 1; i <= p1_len; i++) { d[0] = i * weightInsDelDNA; // Deleting i letters from the first string ??? diag_prev = (i - 1) * weightInsDelDNA; Loading @@ -777,7 +807,8 @@ T_DISTANCE dist_func_dna(void *p1, long int p1_len, void *p2, long int p2_len, T // Step 4 t_j = (char *)p2; for (j = 1; j <= p2_len; j++) { for (j = 1; j <= p2_len; j++) { // Step 5 Loading Loading
Hash/src/config.cpp +659 −505 File changed.Preview size limit exceeded, changes collapsed. Show changes
Hash/src/user.cpp +267 −236 Original line number Diff line number Diff line Loading @@ -9,27 +9,22 @@ long int max_dimension = -1; // used for restriction on dimensionality of vector data (used in i.e. print and distance functions). void restrict_dimension(long int _max_dim) { max_dimension = _max_dim; } //******************************************************** //******************************************************** // vector data read, print, distance functions, structures //******************************************************** //******************************************************** T_DISTANCE get_max_distance_float_vector() { return MAXIMUM_DISTANCE; } T_DISTANCE dist_func_L1_float_vector(void *p1, long int p1_len, void *p2, long int p2_len, T_DISTANCE threshold) { // manhattan Loading @@ -46,7 +41,8 @@ T_DISTANCE dist_func_L1_float_vector(void *p1, long int p1_len, void *p2, long i d1 = (float *)p1; d2 = (float *)p2; for(long int i = 0; i < dim; i++) { for (long int i = 0; i < dim; i++) { if (d1[i] < d2[i]) ret += (d2[i] - d1[i]); else Loading @@ -56,9 +52,6 @@ T_DISTANCE dist_func_L1_float_vector(void *p1, long int p1_len, void *p2, long i return ret; } T_DISTANCE dist_func_L2_float_vector(void *p1, long int p1_len, void *p2, long int p2_len, T_DISTANCE threshold) { float *d1, *d2; Loading @@ -75,7 +68,8 @@ T_DISTANCE dist_func_L2_float_vector(void *p1, long int p1_len, void *p2, long i d1 = (float *)p1; d2 = (float *)p2; for(long int i = 0; i < dim; i++) { for (long int i = 0; i < dim; i++) { diff = d2[i] - d1[i]; ret += diff * diff; } Loading @@ -85,9 +79,6 @@ T_DISTANCE dist_func_L2_float_vector(void *p1, long int p1_len, void *p2, long i return ret; } int read_obj_func_float_vector(FILE *f_in, long int dimensions, P_VOID &data, long int &data_len) { // this functions reads object's data from file f_in Loading @@ -103,14 +94,16 @@ int read_obj_func_float_vector(FILE *f_in, long int dimensions, P_VOID &data, lo data = malloc(data_len); d = (float *)data; while ( (!feof(f_in)) && (i < dimensions) ) { while ((!feof(f_in)) && (i < dimensions)) { if (fscanf(f_in, "%f", &tmp) == EOF) break; d[i] = tmp; ++i; } if (i < dimensions) { if (i < dimensions) { if (data) free(data); data = NULL; Loading @@ -120,9 +113,6 @@ int read_obj_func_float_vector(FILE *f_in, long int dimensions, P_VOID &data, lo return 0; } void print_obj_func_float_vector(void *data, long int data_len, FILE *f_out, unsigned int delim, const char *txt_pre, long int obj_id, int is_colored, const char *txt_post) { // data, data_len - data of object Loading @@ -133,7 +123,8 @@ void print_obj_func_float_vector(void *data, long int data_len, FILE *f_out, uns float *d; long int dim; if (txt_pre) { if (txt_pre) { const char *c; c = strchr(txt_pre, '%'); if (c == NULL || c[1] != 'c') Loading @@ -147,7 +138,8 @@ void print_obj_func_float_vector(void *data, long int data_len, FILE *f_out, uns if (max_dimension != -1 && dim > max_dimension) dim = max_dimension; if (dim) { if (dim) { fprintf(f_out, "%f", d[0]); for (long int i = 1; i < dim; i++) fprintf(f_out, "%c%f", delim, d[i]); Loading @@ -157,33 +149,29 @@ void print_obj_func_float_vector(void *data, long int data_len, FILE *f_out, uns fputs(txt_post, f_out); } //******************************************************** //******************************************************** // URL read, print, distance functions, structures //******************************************************** //******************************************************** struct t_url { struct t_url { long int next_off; char url[1]; }; struct t_url_data { struct t_url_data { long int cnt; t_url urls[1]; }; T_DISTANCE get_max_distance_urls() { return (T_DISTANCE)1; } T_DISTANCE dist_func_url_sets(void *p1, long int p1_len, void *p2, long int p2_len, T_DISTANCE threshold) { // #define UNSORTED_VERSION Loading @@ -198,7 +186,8 @@ T_DISTANCE dist_func_url_sets(void *p1, long int p1_len, void *p2, long int p2_l cnt_p1 = ((t_url_data *)p1)->cnt; cnt_p2 = ((t_url_data *)p2)->cnt; if (cnt_p1 < cnt_p2) { if (cnt_p1 < cnt_p2) { void *tmp = p1; long int ltmp = p1_len; p1 = p2; Loading @@ -210,26 +199,30 @@ T_DISTANCE dist_func_url_sets(void *p1, long int p1_len, void *p2, long int p2_l cnt_p2 = ltmp; } // evaluate size of intersect of p1 and p2 off1 = (long int *)(((t_url_data *)p1)->urls); for(i = 0; i < cnt_p1; i++) { for (i = 0; i < cnt_p1; i++) { url1 = (char *)off1 + sizeof(long int); off2 = (long int *)(((t_url_data *)p1)->urls); for (j = 0; j < i; j++) { for (j = 0; j < i; j++) { url2 = (char *)off2 + sizeof(long int); if (strcmp(url1, url2) == 0) break; off2 = (long int *)((char *)p1 + *off2); } if (j == i) { // the same url was not found in p1 object if (j == i) { // the same url was not found in p1 object off2 = (long int *)(((t_url_data *)p2)->urls); for (j = 0; j < cnt_p2; j++) { for (j = 0; j < cnt_p2; j++) { url2 = (char *)off2 + sizeof(long int); if (strcmp(url1, url2) == 0) { if (strcmp(url1, url2) == 0) { ++cnt_inter; break; } Loading @@ -243,11 +236,13 @@ T_DISTANCE dist_func_url_sets(void *p1, long int p1_len, void *p2, long int p2_l // evaluate size of union of p1 and p2 off1 = (long int *)(((t_url_data *)p1)->urls); for(i = 0; i < cnt_p1; i++) { for (i = 0; i < cnt_p1; i++) { url1 = (char *)off1 + sizeof(long int); off2 = (long int *)(((t_url_data *)p1)->urls); for (j = 0; j < i; j++) { for (j = 0; j < i; j++) { url2 = (char *)off2 + sizeof(long int); if (strcmp(url1, url2) == 0) break; Loading @@ -261,23 +256,27 @@ T_DISTANCE dist_func_url_sets(void *p1, long int p1_len, void *p2, long int p2_l off1 = (long int *)(((t_url_data *)p2)->urls); for(i = 0; i < cnt_p2; i++) { for (i = 0; i < cnt_p2; i++) { url1 = (char *)off1 + sizeof(long int); // search in p2 off2 = (long int *)(((t_url_data *)p2)->urls); for (j = 0; j < i; j++) { for (j = 0; j < i; j++) { url2 = (char *)off2 + sizeof(long int); if (strcmp(url1, url2) == 0) break; off2 = (long int *)((char *)p2 + *off2); } if (j == i) { if (j == i) { // search in p1 off2 = (long int *)(((t_url_data *)p1)->urls); for (j = 0; j < cnt_p1; j++) { for (j = 0; j < cnt_p1; j++) { url2 = (char *)off2 + sizeof(long int); if (strcmp(url1, url2) == 0) break; Loading Loading @@ -315,20 +314,24 @@ T_DISTANCE dist_func_url_sets(void *p1, long int p1_len, void *p2, long int p2_l url2 = (char *)off2 + sizeof(long int); i = 0; j = 0; while (i < cnt_p1 && j < cnt_p2) { while (i < cnt_p1 && j < cnt_p2) { cmp = strcmp(url1, url2); if (cmp == 0) { // url1 and url2 are the same if (cmp == 0) { // url1 and url2 are the same ++cnt_inter; off1 = (long int *)((char *)p1 + *off1); url1 = (char *)off1 + sizeof(long int); ++i; } else if (cmp < 0) { // url1 is less than url2 else if (cmp < 0) { // url1 is less than url2 off1 = (long int *)((char *)p1 + *off1); url1 = (char *)off1 + sizeof(long int); ++i; } else { // cmp > 0 // url2 is less than url1 else { // cmp > 0 // url2 is less than url1 off2 = (long int *)((char *)p2 + *off2); url2 = (char *)off2 + sizeof(long int); ++j; Loading @@ -345,8 +348,6 @@ T_DISTANCE dist_func_url_sets(void *p1, long int p1_len, void *p2, long int p2_l #endif } void print_obj_func_urls(void *data, long int data_len, FILE *f_out, unsigned int delim, const char *txt_pre, long int obj_id, int is_colored, const char *txt_post) { Loading @@ -359,7 +360,8 @@ void print_obj_func_urls(void *data, long int data_len, FILE *f_out, unsigned in char *url; long int *off; if (txt_pre) { if (txt_pre) { const char *c; c = strchr(txt_pre, '%'); if (c == NULL || c[1] != 'c') Loading @@ -368,13 +370,13 @@ void print_obj_func_urls(void *data, long int data_len, FILE *f_out, unsigned in fprintf(f_out, txt_pre, (is_colored) ? '*' : ' ', obj_id); } cnt = ((t_url_data *)data)->cnt; fprintf(f_out, "%ld", cnt); off = (long int *)(((t_url_data *)data)->urls); for(long int i = 0; i < cnt; i++) { for (long int i = 0; i < cnt; i++) { url = (char *)off + sizeof(long int); fputc(delim, f_out); fputs(url, f_out); Loading @@ -385,8 +387,6 @@ void print_obj_func_urls(void *data, long int data_len, FILE *f_out, unsigned in fputs(txt_post, f_out); } int read_obj_func_urls(FILE *f_in, long int dimensions, P_VOID &data, long int &data_len) { // this functions reads object's data from file f_in Loading @@ -400,13 +400,15 @@ int read_obj_func_urls(FILE *f_in, long int dimensions, P_VOID &data, long int & char *url; long int *off; if (feof(f_in)) { if (feof(f_in)) { data = NULL; data_len = 0L; return 1; } if (fscanf(f_in, "%ld", &cnt) == EOF) { if (fscanf(f_in, "%ld", &cnt) == EOF) { data = NULL; data_len = 0L; return 1; Loading @@ -416,18 +418,18 @@ int read_obj_func_urls(FILE *f_in, long int dimensions, P_VOID &data, long int & allocated = 64 * 1024L; data = malloc(allocated); data_len = sizeof(long int); ((t_url_data *)data)->cnt = cnt; // save count of urls url = (char *)&(((t_url_data *)data)->urls); for (long int i = 0; i < cnt; i++) { // read individual urls for (long int i = 0; i < cnt; i++) { // read individual urls off = (long int *)url; url += sizeof(long int); data_len += sizeof(long int); while ((c = fgetc(f_in)) != ' ' && c != EOF && c != '\r' && c != '\n') { while ((c = fgetc(f_in)) != ' ' && c != EOF && c != '\r' && c != '\n') { url[0] = c; ++url; ++data_len; Loading @@ -436,7 +438,8 @@ int read_obj_func_urls(FILE *f_in, long int dimensions, P_VOID &data, long int & ++url; ++data_len; *off = url - (char *)data; // set offset if (allocated - data_len < 1024) { // increase allocated memory if (allocated - data_len < 1024) { // increase allocated memory long int l_off, l_url; l_off = (char *)off - (char *)data; Loading @@ -452,23 +455,17 @@ int read_obj_func_urls(FILE *f_in, long int dimensions, P_VOID &data, long int & return 0; } //******************************************************** //******************************************************** // TEXT read, print, distance functions, structures //******************************************************** //******************************************************** T_DISTANCE get_max_distance_text() { return MAXIMUM_DISTANCE; } T_DISTANCE dist_func_text(void *p1, long int p1_len, void *p2, long int p2_len, T_DISTANCE threshold) { int *d; // pointer to vector Loading @@ -487,9 +484,11 @@ T_DISTANCE dist_func_text(void *p1, long int p1_len, void *p2, long int p2_len, --p2_len; // aproximate distance if (threshold != UNUSED_DISTANCE) { if (threshold != UNUSED_DISTANCE) { dist = (T_DISTANCE)fabs(p1_len - p2_len); if (threshold < dist) { // dist is the minimum bound, the edit distance cannot be lower if (threshold < dist) { // dist is the minimum bound, the edit distance cannot be lower #ifdef HASH_DEBUG fprintf(stderr, "%8.2f < %8.2f\n", threshold, dist); #endif Loading @@ -501,10 +500,12 @@ T_DISTANCE dist_func_text(void *p1, long int p1_len, void *p2, long int p2_len, // Step 1 if (p1_len <= 0) { if (p1_len <= 0) { return p2_len; } if (p2_len <= 0) { if (p2_len <= 0) { return p1_len; } d = (int *)malloc((p2_len + 1) * sizeof(int)); Loading @@ -517,7 +518,8 @@ T_DISTANCE dist_func_text(void *p1, long int p1_len, void *p2, long int p2_len, // Step 3 s_i = (char *)p1; for (i = 1; i <= p1_len; i++) { for (i = 1; i <= p1_len; i++) { d[0] = i; diag_prev = i - 1; Loading @@ -525,7 +527,8 @@ T_DISTANCE dist_func_text(void *p1, long int p1_len, void *p2, long int p2_len, // Step 4 t_j = (char *)p2; for (j = 1; j <= p2_len; j++) { for (j = 1; j <= p2_len; j++) { // Step 5 Loading Loading @@ -559,8 +562,6 @@ T_DISTANCE dist_func_text(void *p1, long int p1_len, void *p2, long int p2_len, return dist; } T_DISTANCE dist_func_text_length(void *p1, long int p1_len, void *p2, long int p2_len, T_DISTANCE threshold) { if (p1_len > p2_len) Loading @@ -569,8 +570,6 @@ T_DISTANCE dist_func_text_length(void *p1, long int p1_len, void *p2, long int p return p2_len - p1_len; } void print_obj_func_text(void *data, long int data_len, FILE *f_out, unsigned int delim, const char *txt_pre, long int obj_id, int is_colored, const char *txt_post) { // data, data_len - data of object Loading @@ -578,7 +577,8 @@ void print_obj_func_text(void *data, long int data_len, FILE *f_out, unsigned in // txt - additional comment, it's printed out before object's data, this text is used as formating argument to printf function // obj_id - this parameter is used as a parameter to printf function together with txt parameter (%ld can be used in txt for priting out) // is_colored - this parameter indicates whether the object is colored (1) or not (0) (for details see self-similarity join algo) if (txt_pre) { if (txt_pre) { const char *c; c = strchr(txt_pre, '%'); if (c == NULL || c[1] != 'c') Loading @@ -587,15 +587,12 @@ void print_obj_func_text(void *data, long int data_len, FILE *f_out, unsigned in fprintf(f_out, txt_pre, (is_colored) ? '*' : ' ', obj_id); } fputs((char *)data, f_out); if (txt_post) fputs(txt_post, f_out); } int read_obj_func_text(FILE *f_in, long int dimensions, P_VOID &data, long int &data_len) { // this functions reads object's data from file f_in Loading @@ -607,7 +604,8 @@ int read_obj_func_text(FILE *f_in, long int dimensions, P_VOID &data, long int & char c; char *txt; if (feof(f_in)) { if (feof(f_in)) { data = NULL; data_len = 0L; return 1; Loading @@ -618,16 +616,19 @@ int read_obj_func_text(FILE *f_in, long int dimensions, P_VOID &data, long int & data_len = 0; txt = (char *)data; while ((c = fgetc(f_in)) != EOF && c != '\x0a' && c != '\x0d') { while ((c = fgetc(f_in)) != EOF && c != '\x0a' && c != '\x0d') { txt[data_len] = c; ++data_len; if (allocated - data_len < 1) { // increase allocated memory if (allocated - data_len < 1) { // increase allocated memory allocated += 16 * 1024; data = realloc(data, allocated); txt = (char *)data; } } if (data_len == 0 && c == EOF) { if (data_len == 0 && c == EOF) { free(data); return 1; } Loading @@ -635,7 +636,8 @@ int read_obj_func_text(FILE *f_in, long int dimensions, P_VOID &data, long int & txt[data_len] = '\0'; ++data_len; if (c == '\x0d') { // read LF following CR if (c == '\x0d') { // read LF following CR c = fgetc(f_in); if (c != '\x0a') ungetc(c, f_in); Loading @@ -653,11 +655,13 @@ int read_obj_func_text(FILE *f_in, long int dimensions, P_VOID &data, long int & static int weightInsDelDNA = 3; void setWeightInsDelDNA(int w) { void setWeightInsDelDNA(int w) { weightInsDelDNA = w; } int getWeightInsDelDNA() { int getWeightInsDelDNA() { return weightInsDelDNA; } Loading Loading @@ -685,41 +689,64 @@ static int changeMatrixDNA[23][23] = { {3, 5, 4, 5, 6, 5, 5, 5, 6, 1, 3, 5, 2, 5, 4, 3, 4, 9, 5, 0, 3, 2, 2}, {3, 3, 3, 3, 6, 3, 3, 3, 3, 3, 3, 3, 3, 6, 3, 3, 3, 6, 4, 3, 0, 2, 2}, {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2}, {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0 } }; {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0}}; inline int getCharMatrixIndexDNA(char chr) { switch (chr) { case 'A': return 0; case 'R': return 1; case 'N': return 2; case 'D': return 3; case 'C': return 4; case 'Q': return 5; case 'E': return 6; case 'G': return 7; case 'H': return 8; case 'I': return 9; case 'L': return 10; case 'K': return 11; case 'M': return 12; case 'F': return 13; case 'P': return 14; case 'S': return 15; case 'T': return 16; case 'W': return 17; case 'Y': return 18; case 'V': return 19; case 'X': return 20; case 'Z': return 21; case 'B': return 22; inline int getCharMatrixIndexDNA(char chr) { switch (chr) { case 'A': return 0; case 'R': return 1; case 'N': return 2; case 'D': return 3; case 'C': return 4; case 'Q': return 5; case 'E': return 6; case 'G': return 7; case 'H': return 8; case 'I': return 9; case 'L': return 10; case 'K': return 11; case 'M': return 12; case 'F': return 13; case 'P': return 14; case 'S': return 15; case 'T': return 16; case 'W': return 17; case 'Y': return 18; case 'V': return 19; case 'X': return 20; case 'Z': return 21; case 'B': return 22; } printf("UNKNOWN LETTER '%c' IN THE DNA SEQUENCE!!! Aborting program.\n", chr); exit(-1); // return -1; } T_DISTANCE dist_func_dna(void *p1, long int p1_len, void *p2, long int p2_len, T_DISTANCE threshold) { int *d; // pointer to vector Loading Loading @@ -753,10 +780,12 @@ T_DISTANCE dist_func_dna(void *p1, long int p1_len, void *p2, long int p2_len, T // Step 1 if (p1_len <= 0) { if (p1_len <= 0) { return p2_len * weightInsDelDNA; } if (p2_len <= 0) { if (p2_len <= 0) { return p1_len * weightInsDelDNA; } d = (int *)malloc((p2_len + 1) * sizeof(int)); Loading @@ -769,7 +798,8 @@ T_DISTANCE dist_func_dna(void *p1, long int p1_len, void *p2, long int p2_len, T // Step 3 s_i = (char *)p1; for (i = 1; i <= p1_len; i++) { for (i = 1; i <= p1_len; i++) { d[0] = i * weightInsDelDNA; // Deleting i letters from the first string ??? diag_prev = (i - 1) * weightInsDelDNA; Loading @@ -777,7 +807,8 @@ T_DISTANCE dist_func_dna(void *p1, long int p1_len, void *p2, long int p2_len, T // Step 4 t_j = (char *)p2; for (j = 1; j <= p2_len; j++) { for (j = 1; j <= p2_len; j++) { // Step 5 Loading