/************************************************************************************* * * DBCOMM.C * * Finds subsequences of one database in the other * * The format of the sequences must be: "NAME SEQUENCE........" on 1 line * No empty lines, please. Only one word for the name. * * Note: Sequences shorter than 3 can't be found, since the index word-size is 3. * Exactly duplicated entries are only reported once in self-comparisons, * so that only the first is deleted. * * Algorithm: Build 3-letter word index table of query and step query-lengths in * subject. If last word not in index, take another step. If it is, test inclusion. * * Copyright (C) Erik Sonnhammer, 930818 * **************************************************************************************/ #include #include #define MAXLEN 400000 #define NAMESIZE 50 typedef struct _DBSEQ { char *seq; int len; char name[NAMESIZE+1]; } DBSEQ; # define NA 23 /* 24 letters in alphabet ABCDEFGKI KLMN PQRST VWXYZ \* */ int a2b[] /* ASCII-to-binary translation table */ = { NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA, NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA, NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA, NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA, NA, 0, 1, 2, 3, 4, 5, 6, 7, 8,NA, 9,10,11,12,NA, 13,14,15,16,17,NA,18,19,20,21,22,NA,NA,NA,NA,NA, NA, 0, 1, 2, 3, 4, 5, 6, 7, 8,NA, 9,10,11,12,NA, 13,14,15,16,17,NA,18,19,20,21,22,NA,NA,NA,NA,NA, NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA, NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA, NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA, NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA, NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA, NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA, NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA, NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA }; void memerr() { fprintf(stderr,"Out of memory"); exit(0); } void check_id(void); void strUPcpy(char *dest, char *src) { char *d, *s; for (d = dest, s = src; *s; d++, s++) *d = toupper(*s); *d = 0; } /* Global Variables *******************/ char query[MAXLEN], line[MAXLEN], qname[NAMESIZE], *seqstart, *dbpos, *dbend; DBSEQ **db; FILE *file, *sfile; int qcount, i, j, k, found, qlen, findself = 0, verbose = 0, selfcomp = 0, nomatch = 0, inclusion = 1; void main(int argc, char *argv[]) { unsigned char idx[24][24][24]; register int firstjump, bigjump, snum; int optc; extern int optind; extern char *optarg; char *optstring="svni"; char *usage = "\n\ Usage: dbcomm \n\ \n\ Note: The format of both databases must be \"name sequence\".\n\ \n\ -v Verbose mode (show sequences)\n\ -s Allow query to find itself\n\ -n Report non-matching queries\n\ -i Don't report inclusions, only entire matches\n\n"; while ((optc = getopt(argc, argv, optstring)) != -1) switch (optc) { case 's': findself = 1; break; case 'v': verbose = 1; break; case 'n': nomatch = 1; break; case 'i': inclusion = 0; break; default: fprintf(stderr, "%s", usage); exit(1); } if (argc - optind < 2) { fprintf(stderr, "%s", usage); exit(1); } if ((file = fopen( argv[optind], "r" )) == NULL) { fprintf(stderr, "Cannot open file %s\n", argv[optind]); exit(1); } if ((sfile = fopen( argv[optind+1], "r" )) == NULL) { fprintf(stderr, "Cannot open file %s\n", argv[optind+1]); exit(1); } if (!strcmp(argv[optind], argv[optind+1])) { selfcomp = 1; fprintf(stderr, "Comparing database to itself...\n"); } snum = 0; while ( !feof(sfile) ) { if (!fgets(line, MAXLEN, sfile)) break; snum++; } fprintf(stderr, "Reading %d Subjects into memory...\n", snum); if (!(db = (DBSEQ **)malloc(snum*sizeof(DBSEQ *)))) memerr(); i = 0; rewind(sfile); while ( !feof(sfile) ) { if (!fgets(line, MAXLEN, sfile)) break; if (!(seqstart = (char *)strchr(line, ' '))) { fprintf(stderr, "Bad format or Too long sequence"); exit(0); } if ( !( db[i] = (DBSEQ *)malloc(sizeof(DBSEQ)))) memerr(); db[i]->len = strlen(seqstart)-2; seqstart[db[i]->len+1] = 0; /* get rid of newline */ if ( !( db[i]->seq = (char *)malloc(db[i]->len+1) ) ) memerr(); strUPcpy(db[i]->seq, seqstart+1); *seqstart = 0; strUPcpy(db[i]->name, line); i++; } /* fprintf(stderr, "Searching..."); */ /* Zero index */ for (i = 0; i<24; i++) for (j = 0; j<24; j++) for (k = 0; k<24; k++) idx[i][j][k] = 0; qcount = 0; while ( !feof(file) ) { if (qcount && !(qcount % 100)) fprintf (stderr, "Done %d...\n", qcount); if (!fgets(line, MAXLEN, file)) break; if (!(seqstart = (char *)strchr(line, ' '))) { fprintf(stderr, "Bad format"); exit(0); } qcount++; strUPcpy(query, seqstart+1); qlen = strlen(query) -1; query[qlen] = 0; /* get rid of newline */ *seqstart = 0; strUPcpy(qname, line); /* printf("Searching %s ...\n", qname); */ if (qlen < 3) { printf("Sequence smaller than 3: %s\n", qname); } else { /* fprintf(stderr, "%s\n", qname);*/ /* Preprocess query to a 3-letter word index */ for (i = 0; i <= qlen-3; i++) idx[a2b[query[i]]][a2b[query[i+1]]][a2b[query[i+2]]] = 1; /* Search all Subjects */ found = 0; firstjump = qlen - 3; bigjump = qlen - 2; for(i = 0; i < snum && !found; i++) { if (qlen <= db[i]->len) { dbpos = db[i]->seq + firstjump; dbend = db[i]->seq + db[i]->len - 1; while (dbpos + 2 <= dbend && !found) { if (idx[a2b[*dbpos]][a2b[*(dbpos+1)]][a2b[*(dbpos+2)]]) check_id(); dbpos = dbpos + bigjump; } /* Check sticky end */ if (!found && idx[a2b[*(dbend-2)]][a2b[*(dbend-1)]][a2b[*dbend]]) check_id(); } } if (!found && nomatch) { printf("Query: %s not found in subject db\n", qname); } /* Reset index */ for (i = 0; i <= qlen-3; i++) idx[a2b[query[i]]][a2b[query[i+1]]][a2b[query[i+2]]] = 0; } } } void check_id(void) { if (strstr(db[i]->seq, query)) { if (findself || strcmp(qname, db[i]->name)) { if (qlen == db[i]->len) { /* If symmetrical, report only first one */ if (!selfcomp || strcmp(db[i]->name, qname) >= 0) { found = 1; if (!nomatch) printf("Subject: %-10s identical Query: %-10s\n", db[i]->name, qname); } } else /* Inclusion */ { if (!nomatch && inclusion) { found = 1; printf("Subject: %-10s includes Query: %-10s\n", db[i]->name, qname); if (verbose) { printf("%s %s\nIncluded in\n%s %s\n\n", qname, query, db[i]->name, strstr(db[i]->seq, query)); } } } } } }