# ARGV[0] = filename of sequences to analyze
# ARGV[1] = window size
# ARGV[2] = output CSV file
require 'rubygems'
require 'benchmark'
require 'fileutils'
require 'inline'
# using inline, commenting this out
#class String
# def hamming_distance(s)
# sum = 0
# for i in 0...length
# sum = sum + 1 if slice(i) != s.slice(i)
# end
# return sum
# end
#end
class String
inline do |builder|
if test ?d, "/opt/local" then
builder.add_compile_flags "-I/opt/local/include"
builder.add_link_flags "-L/opt/local/lib"
end
builder.prefix <<-"END"
int rb_str_hamming_distance(char *x, char *y) {
int sum = 0;
while (*x != '\\0') {
if (*x != *y)
sum++;
++x; ++y;
}
return sum;
}
END
builder.prefix <<-"END"
VALUE rb_str_sub_sequence(VALUE str, long offset, long window_size) {
return rb_str_substr(str, offset, offset + window_size);
}
END
builder.prefix <<-"END"
int rb_str_hamming_inject(VALUE sequences, VALUE sequence, long index, long offset, long window_size) {
VALUE sub_sequence, sub_sub_sequence;
char *cstr_sequence = StringValueCStr(sequence);
int sum = 0, i = index;
for (i; i < RARRAY(sequences)->len; i++) {
sub_sequence = RARRAY(sequences)->ptr[i];
sub_sub_sequence = rb_str_sub_sequence(RARRAY(sequences)->ptr[i], offset, window_size);
sum += rb_str_hamming_distance(cstr_sequence, StringValueCStr(sub_sub_sequence));
}
return sum;
}
END
builder.c <<-"END"
int hamming_distance(char *s) {
return rb_str_hamming_distance(StringValueCStr(self), s);
}
END
builder.c <<-"END"
int hamming_inject(VALUE sequences, long index, long offset, long window_size) {
return rb_str_hamming_inject(sequences, rb_str_sub_sequence(self, offset, window_size), index, offset, window_size);
}
END
end
end
class Array
inline do |builder|
if test ?d, "/opt/local" then
builder.add_compile_flags "-I/opt/local/include"
builder.add_link_flags "-L/opt/local/lib"
end
builder.c <<-"END"
int hamming_sum(long offset, long window_size) {
VALUE sequence;
int sum = 0, i = 0;
for (i; i < RARRAY(self)->len; i++) {
sequence = rb_str_sub_sequence(RARRAY(self)->ptr[i], offset, window_size);
sum += rb_str_hamming_inject(self, sequence, i, offset, window_size);
}
return sum;
}
END
end
end
module DNA
class Cache
attr_reader :filename
def initialize(filename = 'data.cache')
@filename = filename + (File.extname(filename) != 'cache' ? '.cache' : '')
create
end
def handle
@handle || open
end
def exists?
File.file?(filename)
end
def clear
delete
end
def data
begin
@data = Marshal.load(open('rb').read)
rescue
@data = 0
ensure
close
end
@data
end
def save(object)
handle.write(Marshal.dump(object))
close
end
protected
def create
FileUtils.touch(filename)
end
def open(flag = 'wb')
create unless exists?
@handle = File.open(filename, flag)
end
def close
if @handle
@handle.close
@handle = nil
end
end
def delete
close
FileUtils.rm(filename)
end
end
end
Benchmark.benchmark(" "*7 + Benchmark::CAPTION, 7, Benchmark::FMTSTR, "total:", "avg:") do |x|
results = []
10.times do
results << x.report {
sequences = IO.readlines(ARGV[0])
window_size = ARGV[1].to_i
num_of_seqs = sequences.length
genome_len = sequences[0].length
cache = DNA::Cache.new(ARGV[2])
File.open(ARGV[2], cache.exists? ? 'a' : 'w') do |out|
out.puts "window center,mean pairwise hamming distance" unless cache.exists?
for offset in cache.data...(genome_len - window_size + 1)
sum = sequences.hamming_sum(offset, window_size)
# commenting out the ruby version, using inline instead
#sum = 0
#for i in 0...(num_of_seqs - 1)
# temp = sequences[i][offset, offset + window_size]
# for j in (i + 1)...(num_of_seqs)
# sum = sum + temp.hamming_distance(sequences[j][offset, offset + window_size])
# end
#end
out.puts "#{offset + window_size / 2},#{sum.quo(num_of_seqs * (num_of_seqs - 1) / 2).to_f}"
cache.save((offset + window_size / 2) - 1)
# next two lines are optional, but might help if ruby locks up
out.fsync # forces a write to the disk
out.flush # flushes the ruby buffer
end
end
cache.clear
}
end
reports = results.inject { |total,report| total + report }
[reports, (reports) / results.length]
end