Commit ca4b5b23 authored by Boris Lukačovič
Browse files

EDIT: config for output redirect

parent 7cff4fc6
Loading
Loading
Loading
Loading
+659 −505

File changed.

Preview size limit exceeded, changes collapsed.

+267 −236
Original line number Diff line number Diff line
@@ -9,27 +9,22 @@

// Upper bound on how many vector components are processed (e.g. by the print
// and distance functions). The value -1 means "no restriction".
long int max_dimension = -1L;

// Sets the global dimensionality cap.
//   _max_dim - new value stored in max_dimension; pass -1 to lift the
//              restriction again.
void restrict_dimension(long int _max_dim)
{
  max_dimension = _max_dim;
}


//********************************************************
//********************************************************
// vector data read, print, distance functions, structures
//********************************************************
//********************************************************


// Returns the largest distance value reported for float-vector objects,
// i.e. the project-wide MAXIMUM_DISTANCE constant.
T_DISTANCE get_max_distance_float_vector()
{
  const T_DISTANCE upper_bound = MAXIMUM_DISTANCE;
  return upper_bound;
}



T_DISTANCE dist_func_L1_float_vector(void *p1, long int p1_len, void *p2, long int p2_len, T_DISTANCE threshold)
{
  // manhattan
@@ -46,7 +41,8 @@ T_DISTANCE dist_func_L1_float_vector(void *p1, long int p1_len, void *p2, long i
  d1 = (float *)p1;
  d2 = (float *)p2;

  for(long int i = 0; i < dim; i++) {
  for (long int i = 0; i < dim; i++)
  {
    if (d1[i] < d2[i])
      ret += (d2[i] - d1[i]);
    else
@@ -56,9 +52,6 @@ T_DISTANCE dist_func_L1_float_vector(void *p1, long int p1_len, void *p2, long i
  return ret;
}




T_DISTANCE dist_func_L2_float_vector(void *p1, long int p1_len, void *p2, long int p2_len, T_DISTANCE threshold)
{
  float *d1, *d2;
@@ -75,7 +68,8 @@ T_DISTANCE dist_func_L2_float_vector(void *p1, long int p1_len, void *p2, long i
  d1 = (float *)p1;
  d2 = (float *)p2;

  for(long int i = 0; i < dim; i++) {
  for (long int i = 0; i < dim; i++)
  {
    diff = d2[i] - d1[i];
    ret += diff * diff;
  }
@@ -85,9 +79,6 @@ T_DISTANCE dist_func_L2_float_vector(void *p1, long int p1_len, void *p2, long i
  return ret;
}




int read_obj_func_float_vector(FILE *f_in, long int dimensions, P_VOID &data, long int &data_len)
{
  // this function reads the object's data from file f_in
@@ -103,14 +94,16 @@ int read_obj_func_float_vector(FILE *f_in, long int dimensions, P_VOID &data, lo
  data = malloc(data_len);
  d = (float *)data;

  while ( (!feof(f_in)) && (i < dimensions) ) {
  while ((!feof(f_in)) && (i < dimensions))
  {
    if (fscanf(f_in, "%f", &tmp) == EOF)
      break;
    d[i] = tmp;
    ++i;
  }

  if (i < dimensions) {
  if (i < dimensions)
  {
    if (data)
      free(data);
    data = NULL;
@@ -120,9 +113,6 @@ int read_obj_func_float_vector(FILE *f_in, long int dimensions, P_VOID &data, lo
  return 0;
}




void print_obj_func_float_vector(void *data, long int data_len, FILE *f_out, unsigned int delim, const char *txt_pre, long int obj_id, int is_colored, const char *txt_post)
{
  // data, data_len - data of object
@@ -133,7 +123,8 @@ void print_obj_func_float_vector(void *data, long int data_len, FILE *f_out, uns
  float *d;
  long int dim;

  if (txt_pre) {
  if (txt_pre)
  {
    const char *c;
    c = strchr(txt_pre, '%');
    if (c == NULL || c[1] != 'c')
@@ -147,7 +138,8 @@ void print_obj_func_float_vector(void *data, long int data_len, FILE *f_out, uns
  if (max_dimension != -1 && dim > max_dimension)
    dim = max_dimension;

  if (dim) {
  if (dim)
  {
    fprintf(f_out, "%f", d[0]);
    for (long int i = 1; i < dim; i++)
      fprintf(f_out, "%c%f", delim, d[i]);
@@ -157,33 +149,29 @@ void print_obj_func_float_vector(void *data, long int data_len, FILE *f_out, uns
    fputs(txt_post, f_out);
}



//********************************************************
//********************************************************
// URL read, print, distance functions, structures
//********************************************************
//********************************************************

// One URL record inside a packed URL-set object.
// The records are variable-length and laid out back to back, so the [1]
// array is only a placeholder for the first character of the string; the
// readers/printers step from record to record via next_off instead of
// indexing an array.
struct t_url
{
  long int next_off; // byte offset, measured from the start of the whole object, of the next record
  char url[1];       // first byte of the NUL-terminated URL text
};

// Header of a packed URL-set object: the URL count followed directly by the
// first t_url record. The code walking this buffer relies on `urls` starting
// sizeof(long int) bytes into the object, so the member order must not change.
struct t_url_data
{
  long int cnt;         // number of URL records stored in the object
  struct t_url urls[1]; // first record; further records follow in the same buffer
};



// Returns the largest distance value reported for URL-set objects, which is 1.
T_DISTANCE get_max_distance_urls()
{
  const T_DISTANCE upper_bound = (T_DISTANCE)1;
  return upper_bound;
}



T_DISTANCE dist_func_url_sets(void *p1, long int p1_len, void *p2, long int p2_len, T_DISTANCE threshold)
{
// #define UNSORTED_VERSION
@@ -198,7 +186,8 @@ T_DISTANCE dist_func_url_sets(void *p1, long int p1_len, void *p2, long int p2_l

  cnt_p1 = ((t_url_data *)p1)->cnt;
  cnt_p2 = ((t_url_data *)p2)->cnt;
  if (cnt_p1 < cnt_p2) {
  if (cnt_p1 < cnt_p2)
  {
    void *tmp = p1;
    long int ltmp = p1_len;
    p1 = p2;
@@ -210,26 +199,30 @@ T_DISTANCE dist_func_url_sets(void *p1, long int p1_len, void *p2, long int p2_l
    cnt_p2 = ltmp;
  }


  // evaluate size of intersect of p1 and p2
  off1 = (long int *)(((t_url_data *)p1)->urls);

  for(i = 0; i < cnt_p1; i++) {
  for (i = 0; i < cnt_p1; i++)
  {
    url1 = (char *)off1 + sizeof(long int);

    off2 = (long int *)(((t_url_data *)p1)->urls);
    for (j = 0; j < i; j++) {
    for (j = 0; j < i; j++)
    {
      url2 = (char *)off2 + sizeof(long int);
      if (strcmp(url1, url2) == 0)
        break;
      off2 = (long int *)((char *)p1 + *off2);
    }

    if (j == i) {         // the same url was not found in p1 object
    if (j == i)
    { // the same url was not found in p1 object
      off2 = (long int *)(((t_url_data *)p2)->urls);
      for (j = 0; j < cnt_p2; j++) {
      for (j = 0; j < cnt_p2; j++)
      {
        url2 = (char *)off2 + sizeof(long int);
        if (strcmp(url1, url2) == 0) {
        if (strcmp(url1, url2) == 0)
        {
          ++cnt_inter;
          break;
        }
@@ -243,11 +236,13 @@ T_DISTANCE dist_func_url_sets(void *p1, long int p1_len, void *p2, long int p2_l
  // evaluate size of union of p1 and p2
  off1 = (long int *)(((t_url_data *)p1)->urls);

  for(i = 0; i < cnt_p1; i++) {
  for (i = 0; i < cnt_p1; i++)
  {
    url1 = (char *)off1 + sizeof(long int);
    off2 = (long int *)(((t_url_data *)p1)->urls);

    for (j = 0; j < i; j++) {
    for (j = 0; j < i; j++)
    {
      url2 = (char *)off2 + sizeof(long int);
      if (strcmp(url1, url2) == 0)
        break;
@@ -261,23 +256,27 @@ T_DISTANCE dist_func_url_sets(void *p1, long int p1_len, void *p2, long int p2_l

  off1 = (long int *)(((t_url_data *)p2)->urls);

  for(i = 0; i < cnt_p2; i++) {
  for (i = 0; i < cnt_p2; i++)
  {
    url1 = (char *)off1 + sizeof(long int);

    // search in p2
    off2 = (long int *)(((t_url_data *)p2)->urls);

    for (j = 0; j < i; j++) {
    for (j = 0; j < i; j++)
    {
      url2 = (char *)off2 + sizeof(long int);
      if (strcmp(url1, url2) == 0)
        break;
      off2 = (long int *)((char *)p2 + *off2);
    }
    if (j == i) {
    if (j == i)
    {
      // search in p1
      off2 = (long int *)(((t_url_data *)p1)->urls);

      for (j = 0; j < cnt_p1; j++) {
      for (j = 0; j < cnt_p1; j++)
      {
        url2 = (char *)off2 + sizeof(long int);
        if (strcmp(url1, url2) == 0)
          break;
@@ -315,20 +314,24 @@ T_DISTANCE dist_func_url_sets(void *p1, long int p1_len, void *p2, long int p2_l
  url2 = (char *)off2 + sizeof(long int);
  i = 0;
  j = 0;
  while (i < cnt_p1 && j < cnt_p2) {
  while (i < cnt_p1 && j < cnt_p2)
  {
    cmp = strcmp(url1, url2);
  	if (cmp == 0) {				// url1 and url2 are the same
    if (cmp == 0)
    { // url1 and url2 are the same
      ++cnt_inter;
      off1 = (long int *)((char *)p1 + *off1);
      url1 = (char *)off1 + sizeof(long int);
      ++i;
    }
  	else if (cmp < 0) {		// url1 is less than url2
    else if (cmp < 0)
    { // url1 is less than url2
      off1 = (long int *)((char *)p1 + *off1);
      url1 = (char *)off1 + sizeof(long int);
      ++i;
    }
  	else { // cmp > 0			// url2 is less than url1
    else
    { // cmp > 0			// url2 is less than url1
      off2 = (long int *)((char *)p2 + *off2);
      url2 = (char *)off2 + sizeof(long int);
      ++j;
@@ -345,8 +348,6 @@ T_DISTANCE dist_func_url_sets(void *p1, long int p1_len, void *p2, long int p2_l
#endif
}



void print_obj_func_urls(void *data, long int data_len, FILE *f_out, unsigned int delim,
                         const char *txt_pre, long int obj_id, int is_colored, const char *txt_post)
{
@@ -359,7 +360,8 @@ void print_obj_func_urls(void *data, long int data_len, FILE *f_out, unsigned in
  char *url;
  long int *off;

  if (txt_pre) {
  if (txt_pre)
  {
    const char *c;
    c = strchr(txt_pre, '%');
    if (c == NULL || c[1] != 'c')
@@ -368,13 +370,13 @@ void print_obj_func_urls(void *data, long int data_len, FILE *f_out, unsigned in
      fprintf(f_out, txt_pre, (is_colored) ? '*' : ' ', obj_id);
  }


  cnt = ((t_url_data *)data)->cnt;
  fprintf(f_out, "%ld", cnt);

  off = (long int *)(((t_url_data *)data)->urls);

  for(long int i = 0; i < cnt; i++) {
  for (long int i = 0; i < cnt; i++)
  {
    url = (char *)off + sizeof(long int);
    fputc(delim, f_out);
    fputs(url, f_out);
@@ -385,8 +387,6 @@ void print_obj_func_urls(void *data, long int data_len, FILE *f_out, unsigned in
    fputs(txt_post, f_out);
}



int read_obj_func_urls(FILE *f_in, long int dimensions, P_VOID &data, long int &data_len)
{
  // this function reads the object's data from file f_in
@@ -400,13 +400,15 @@ int read_obj_func_urls(FILE *f_in, long int dimensions, P_VOID &data, long int &
  char *url;
  long int *off;

  if (feof(f_in)) {
  if (feof(f_in))
  {
    data = NULL;
    data_len = 0L;
    return 1;
  }

  if (fscanf(f_in, "%ld", &cnt) == EOF) {
  if (fscanf(f_in, "%ld", &cnt) == EOF)
  {
    data = NULL;
    data_len = 0L;
    return 1;
@@ -416,18 +418,18 @@ int read_obj_func_urls(FILE *f_in, long int dimensions, P_VOID &data, long int &
  allocated = 64 * 1024L;
  data = malloc(allocated);



  data_len = sizeof(long int);

  ((t_url_data *)data)->cnt = cnt; // save count of urls
  url = (char *)&(((t_url_data *)data)->urls);

  for (long int i = 0; i < cnt; i++) {    // read individual urls
  for (long int i = 0; i < cnt; i++)
  { // read individual urls
    off = (long int *)url;
    url += sizeof(long int);
    data_len += sizeof(long int);
    while ((c = fgetc(f_in)) != ' ' && c != EOF && c != '\r' && c != '\n') {
    while ((c = fgetc(f_in)) != ' ' && c != EOF && c != '\r' && c != '\n')
    {
      url[0] = c;
      ++url;
      ++data_len;
@@ -436,7 +438,8 @@ int read_obj_func_urls(FILE *f_in, long int dimensions, P_VOID &data, long int &
    ++url;
    ++data_len;
    *off = url - (char *)data; // set offset
    if (allocated - data_len < 1024) {      // increase allocated memory
    if (allocated - data_len < 1024)
    { // increase allocated memory
      long int l_off, l_url;

      l_off = (char *)off - (char *)data;
@@ -452,23 +455,17 @@ int read_obj_func_urls(FILE *f_in, long int dimensions, P_VOID &data, long int &
  return 0;
}



//********************************************************
//********************************************************
// TEXT read, print, distance functions, structures
//********************************************************
//********************************************************



// Returns the largest distance value reported for text objects,
// i.e. the project-wide MAXIMUM_DISTANCE constant.
T_DISTANCE get_max_distance_text()
{
  const T_DISTANCE upper_bound = MAXIMUM_DISTANCE;
  return upper_bound;
}



T_DISTANCE dist_func_text(void *p1, long int p1_len, void *p2, long int p2_len, T_DISTANCE threshold)
{
  int *d;          // pointer to vector
@@ -487,9 +484,11 @@ T_DISTANCE dist_func_text(void *p1, long int p1_len, void *p2, long int p2_len,
  --p2_len;

  // approximate distance
  if (threshold != UNUSED_DISTANCE) {
  if (threshold != UNUSED_DISTANCE)
  {
    dist = (T_DISTANCE)fabs(p1_len - p2_len);
    if (threshold < dist) { // dist is the minimum bound, the edit distance cannot be lower
    if (threshold < dist)
    { // dist is the minimum bound, the edit distance cannot be lower
#ifdef HASH_DEBUG
      fprintf(stderr, "%8.2f < %8.2f\n", threshold, dist);
#endif
@@ -501,10 +500,12 @@ T_DISTANCE dist_func_text(void *p1, long int p1_len, void *p2, long int p2_len,

  // Step 1

  if (p1_len <= 0) {
  if (p1_len <= 0)
  {
    return p2_len;
  }
  if (p2_len <= 0) {
  if (p2_len <= 0)
  {
    return p1_len;
  }
  d = (int *)malloc((p2_len + 1) * sizeof(int));
@@ -517,7 +518,8 @@ T_DISTANCE dist_func_text(void *p1, long int p1_len, void *p2, long int p2_len,
  // Step 3
  s_i = (char *)p1;

  for (i = 1; i <= p1_len; i++) {
  for (i = 1; i <= p1_len; i++)
  {

    d[0] = i;
    diag_prev = i - 1;
@@ -525,7 +527,8 @@ T_DISTANCE dist_func_text(void *p1, long int p1_len, void *p2, long int p2_len,
    // Step 4
    t_j = (char *)p2;

    for (j = 1; j <= p2_len; j++) {
    for (j = 1; j <= p2_len; j++)
    {

      // Step 5

@@ -559,8 +562,6 @@ T_DISTANCE dist_func_text(void *p1, long int p1_len, void *p2, long int p2_len,
  return dist;
}



T_DISTANCE dist_func_text_length(void *p1, long int p1_len, void *p2, long int p2_len, T_DISTANCE threshold)
{
  if (p1_len > p2_len)
@@ -569,8 +570,6 @@ T_DISTANCE dist_func_text_length(void *p1, long int p1_len, void *p2, long int p
    return p2_len - p1_len;
}



void print_obj_func_text(void *data, long int data_len, FILE *f_out, unsigned int delim, const char *txt_pre, long int obj_id, int is_colored, const char *txt_post)
{
  // data, data_len - data of object
@@ -578,7 +577,8 @@ void print_obj_func_text(void *data, long int data_len, FILE *f_out, unsigned in
  // txt - additional comment, it's printed out before object's data, this text is used as formatting argument to printf function
  // obj_id - this parameter is used as a parameter to printf function together with txt parameter (%ld can be used in txt for printing out)
  // is_colored - this parameter indicates whether the object is colored (1) or not (0) (for details see self-similarity join algo)
  if (txt_pre) {
  if (txt_pre)
  {
    const char *c;
    c = strchr(txt_pre, '%');
    if (c == NULL || c[1] != 'c')
@@ -587,15 +587,12 @@ void print_obj_func_text(void *data, long int data_len, FILE *f_out, unsigned in
      fprintf(f_out, txt_pre, (is_colored) ? '*' : ' ', obj_id);
  }


  fputs((char *)data, f_out);

  if (txt_post)
    fputs(txt_post, f_out);
}



int read_obj_func_text(FILE *f_in, long int dimensions, P_VOID &data, long int &data_len)
{
  // this function reads the object's data from file f_in
@@ -607,7 +604,8 @@ int read_obj_func_text(FILE *f_in, long int dimensions, P_VOID &data, long int &
  char c;
  char *txt;

  if (feof(f_in)) {
  if (feof(f_in))
  {
    data = NULL;
    data_len = 0L;
    return 1;
@@ -618,16 +616,19 @@ int read_obj_func_text(FILE *f_in, long int dimensions, P_VOID &data, long int &
  data_len = 0;
  txt = (char *)data;

  while ((c = fgetc(f_in)) != EOF && c != '\x0a' && c != '\x0d') {
  while ((c = fgetc(f_in)) != EOF && c != '\x0a' && c != '\x0d')
  {
    txt[data_len] = c;
    ++data_len;
    if (allocated - data_len < 1) {      // increase allocated memory
    if (allocated - data_len < 1)
    { // increase allocated memory
      allocated += 16 * 1024;
      data = realloc(data, allocated);
      txt = (char *)data;
    }
  }
  if (data_len == 0 && c == EOF) {
  if (data_len == 0 && c == EOF)
  {
    free(data);
    return 1;
  }
@@ -635,7 +636,8 @@ int read_obj_func_text(FILE *f_in, long int dimensions, P_VOID &data, long int &
  txt[data_len] = '\0';
  ++data_len;

  if (c == '\x0d') {		// read LF following CR
  if (c == '\x0d')
  { // read LF following CR
    c = fgetc(f_in);
    if (c != '\x0a')
      ungetc(c, f_in);
@@ -653,11 +655,13 @@ int read_obj_func_text(FILE *f_in, long int dimensions, P_VOID &data, long int &

// Cost charged for inserting or deleting one letter in the DNA edit distance
// (dist_func_dna multiplies sequence lengths by this weight). Defaults to 3.
static int weightInsDelDNA = 3;

// Replaces the insert/delete weight.
//   w - new weight; stored as-is, no range check is performed.
void setWeightInsDelDNA(int w)
{
  weightInsDelDNA = w;
}

// Returns the insert/delete weight currently in effect.
int getWeightInsDelDNA()
{
  return weightInsDelDNA;
}

@@ -685,41 +689,64 @@ static int changeMatrixDNA[23][23] = {
    {3, 5, 4, 5, 6, 5, 5, 5, 6, 1, 3, 5, 2, 5, 4, 3, 4, 9, 5, 0, 3, 2, 2},
    {3, 3, 3, 3, 6, 3, 3, 3, 3, 3, 3, 3, 3, 6, 3, 3, 3, 6, 4, 3, 0, 2, 2},
    {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2},
	{2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0 }
};
    {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0}};

// Maps an upper-case sequence letter to its index in the range 0..22.
// NOTE(review): presumably the result is used as a row/column index into the
// 23x23 changeMatrixDNA table - confirm against dist_func_dna.
//   chr - letter to translate; only the 23 letters listed below are accepted
//         (matching is case-sensitive).
// Returns the index of the letter. For any other character the function
// prints a diagnostic to stdout and terminates the process with exit(-1),
// so it never returns to the caller in that case.
inline int getCharMatrixIndexDNA(char chr)
{
  switch (chr)
  {
  case 'A': return 0;
  case 'R': return 1;
  case 'N': return 2;
  case 'D': return 3;
  case 'C': return 4;
  case 'Q': return 5;
  case 'E': return 6;
  case 'G': return 7;
  case 'H': return 8;
  case 'I': return 9;
  case 'L': return 10;
  case 'K': return 11;
  case 'M': return 12;
  case 'F': return 13;
  case 'P': return 14;
  case 'S': return 15;
  case 'T': return 16;
  case 'W': return 17;
  case 'Y': return 18;
  case 'V': return 19;
  case 'X': return 20;
  case 'Z': return 21;
  case 'B': return 22;
  default:
    break; // unknown letter: handled by the abort path below
  }

  printf("UNKNOWN LETTER '%c' IN THE DNA SEQUENCE!!! Aborting program.\n", chr);
  exit(-1);
}


T_DISTANCE dist_func_dna(void *p1, long int p1_len, void *p2, long int p2_len, T_DISTANCE threshold)
{
  int *d;          // pointer to vector
@@ -753,10 +780,12 @@ T_DISTANCE dist_func_dna(void *p1, long int p1_len, void *p2, long int p2_len, T

  // Step 1

  if (p1_len <= 0) {
  if (p1_len <= 0)
  {
    return p2_len * weightInsDelDNA;
  }
  if (p2_len <= 0) {
  if (p2_len <= 0)
  {
    return p1_len * weightInsDelDNA;
  }
  d = (int *)malloc((p2_len + 1) * sizeof(int));
@@ -769,7 +798,8 @@ T_DISTANCE dist_func_dna(void *p1, long int p1_len, void *p2, long int p2_len, T
  // Step 3
  s_i = (char *)p1;

  for (i = 1; i <= p1_len; i++) {
  for (i = 1; i <= p1_len; i++)
  {

    d[0] = i * weightInsDelDNA; // Deleting i letters from the first string ???
    diag_prev = (i - 1) * weightInsDelDNA;
@@ -777,7 +807,8 @@ T_DISTANCE dist_func_dna(void *p1, long int p1_len, void *p2, long int p2_len, T
    // Step 4
    t_j = (char *)p2;

    for (j = 1; j <= p2_len; j++) {
    for (j = 1; j <= p2_len; j++)
    {

      // Step 5