#This program takes a nucleotide sequence from a text file and slides a #window along the sequence, looking for motifs within the sliding #window. It then generates the reverse complement of the nucleotide #sequences and looks for the reversed motif in a sliding window of the #same size. # For the purposes of rapidly finding clusters of TF binding motifs, #this program is largely complete. Further improvements will involve #outputting directly to a spreadsheet to enable plotting results and #calculating statistical significance of motif clusters. #NOTE: There is a bug in here somewhere!, albeit not a very serious #one. Occasionally, my output will say that two patterns were found, #but the position for only one of those patterns will be reported. Why #this is happening isn’t immediately obvious – so if readers of this #program can spot the bug and suggest corrections, I’ll be much #obliged. Note that in 24 runs of 12M nucleotides, I saw <20 #occurrences of this bug, so it’s not a big problem… #!usr/bin/perl –w use strict; my $i; my $Teststring = (put any nucleic acid text file here); open (SEQUENCE, $Teststring) or die “Can’t open the DNA sequence file.\n”; my(@lines) = ; close (SEQUENCE); my $big_line; my $current_line; foreach $current_line (@lines) { chomp ($current_line); $big_line = $big_line.$current_line; } #The above part of the program opens up the text file containing the #sequence, turns it into an array of lines, and concatenates the lines #into one large string. print “\nPlease enter the size of the sliding window: “; my $winsize = ; chomp ($winsize); print “\nWhat pattern do you want to find? Type it here: “; my $pattern = ; chomp ($pattern); my $stored_pattern = $pattern; #The above part of the program permits users to enter a size of the #sliding window, and the pattern = motif they wish to find. my @locations = (); my @revlocations = (); #These are two arrays that get filled with positions of motifs as those #positions are found. Once a motif is no longer in the scope of the #window, it’s removed from these arrays – see below for more details. for ($i=0; i<(length($big_line)-($winsize-1)); $i++) { my $smallstring = substr($big_line, $i, $winsize); my $counter = $i+1; my $occur = 0; my $found_new = 0; $pattern =~ tr/a-z/A-Z/; $pattern =~ s/N/[acgtACGT]/g; $pattern =~ s/B/[cgtCGT]/g; $pattern =~ s/D/[agtAGT]/g; $pattern =~ s/H/[actACT]/g; $pattern =~ s/V/[acgACG]/g; $pattern =~ s/R/[agAG]/g; $pattern =~ s/Y/[ctCT]/g; $pattern =~ s/K/[gtGT]/g; $pattern =~ s/M/[acAC]/g; $pattern =~ s/S/[gcGC]/g; $pattern =~ s/W/[atAT]/g; #This part of the program permits the use of IUB codes for mixed bases #in the motif entered by the user. while ($smallstring =~ /$pattern/g) { my @motifs = (); my $length = length($stored_pattern); $occur += 1; my $currentPos = pos($smallstring) + $counter - $length; #This next part of the program is the brainchild of Patrick Killion. #Here, we check every new position at which the pattern is found to see #if it’s in @locations. If it’s not, we add it to @locations. If it #is, we don’t. This way, we avoid outputs of hundreds of windows in #which the found positions are the same. If we discover them and #output them once, that’s all we need to see, and all we need to output #to a spreadsheet. (P.S.: Patrick Rocks!) my $safetoadd = 1 #This is a way to initialize T/F #statement foreach my $singlelocation (@locations) { if ($singlelocation == $currentPos) { $safetoadd = 0 } if ($safetoadd == 1) { push (@locations, $currentPos); $found_new = 1; } } $counter++; foreach my $singlelocation(@locations) { if ($singlelocation < $counter) { shift(@locations); } } if($found_new == 1 && $occur > 1) #Setting $occur > 1 ensures that we report only windows in which two or #more patterns are found. { print “$counter,”; print “$occur,”; print join (‘,’,@locations); print “\n”; $found_new = 0; } } my $RevTeststring = $big_line; $RevTeststring =~ tr/ACGT/TGCA/; #Here, the reverse complement of the nucleotide string is generated. my $revpattern = reverse $stored_pattern; my $j; for ($j=0; $j<(length($RevTeststring)-($winsize-1)); $j++) { my $revsmallstring = substr($RevTeststring, $j, $winsize); my $counter = $j + 1; my $occur = 0; my $found_new = 0; $revpattern =~ tr/a-z/A-Z/; $revpattern =~ s/N/[acgtACGT]/g; $revpattern =~ s/B/[cgtCGT]/g; $revpattern =~ s/D/[agtAGT]/g; $revpattern =~ s/H/[actACT]/g; $revpattern =~ s/V/[acgACG]/g; $revpattern =~ s/R/[agAG]/g; $revpattern =~ s/Y/[ctCT]/g; $revpattern =~ s/K/[gtGT]/g; $revpattern =~ s/M/[acAC]/g; $revpattern =~ s/S/[gcGC]/g; $revpattern =~ s/W/[atAT]/g; while ($revsmallstring =~ /$revpattern/g) { my @motifs = (); my $length = length($stored_pattern); $occur += 1; my $currentPos = pos($revsmallstring) + $counter - $length; my $safetoadd = 1; foreach my$singlelocation(@revlocations) { if($singlelocation == $currentPos) { $safetoadd = 0; } } if ($safetoadd == 1) { push (@revlocations, $currentPos); $found_new = 1; } } $counter++; foreach my $singlelocation (@revlocations) { if($singlelocation < $counter) { shift(@revlocations); } if($found_new == 1 and $revoccur > 1) { print “$counter,”; print “$revoccur,”; print join(‘,’,@revlocations); print “\n”; $found_new = 0; } }