Report abuse

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# ARGV[0] = filename of sequences to analyze
# ARGV[1] = window size
# ARGV[2] = output CSV file

require 'rubygems'
require 'benchmark'
require 'fileutils'
require 'inline'

# using inline, commenting this out
#class String
#  def hamming_distance(s)
#    sum = 0
#    for i in 0...length
#      sum = sum + 1 if slice(i) != s.slice(i)
#    end
#    return sum
#  end
#end

class String
  inline do |builder|
    if test ?d, "/opt/local" then
      builder.add_compile_flags "-I/opt/local/include"
      builder.add_link_flags "-L/opt/local/lib"
    end
    
    builder.prefix <<-"END"
      int rb_str_hamming_distance(char *x, char *y) {
        int sum = 0;
        while (*x != '\\0') {
          if (*x != *y)
            sum++;
          ++x; ++y;
        }
        return sum;
      }
    END
    
    builder.prefix <<-"END"
      VALUE rb_str_sub_sequence(VALUE str, long offset, long window_size) {
        return rb_str_substr(str, offset, offset + window_size);
      }
    END
    
    builder.prefix <<-"END"
      int rb_str_hamming_inject(VALUE sequences, VALUE sequence, long index, long offset, long window_size) {
        VALUE sub_sequence, sub_sub_sequence;
        char *cstr_sequence = StringValueCStr(sequence);
        int sum = 0, i = index;
        for (i; i < RARRAY(sequences)->len; i++) {
          sub_sequence = RARRAY(sequences)->ptr[i];
          sub_sub_sequence = rb_str_sub_sequence(RARRAY(sequences)->ptr[i], offset, window_size);
          sum += rb_str_hamming_distance(cstr_sequence, StringValueCStr(sub_sub_sequence));
        }
        return sum;
      }
    END
    
    builder.c <<-"END"
      int hamming_distance(char *s) {
        return rb_str_hamming_distance(StringValueCStr(self), s);
      }
    END
    
    builder.c <<-"END"
      int hamming_inject(VALUE sequences, long index, long offset, long window_size) {
        return rb_str_hamming_inject(sequences, rb_str_sub_sequence(self, offset, window_size), index, offset, window_size);
      }
    END
  end
end

class Array
  inline do |builder|
    if test ?d, "/opt/local" then
      builder.add_compile_flags "-I/opt/local/include"
      builder.add_link_flags "-L/opt/local/lib"
    end
    
    builder.c <<-"END"
      int hamming_sum(long offset, long window_size) {
        VALUE sequence;
        int sum = 0, i = 0;
        for (i; i < RARRAY(self)->len; i++) {
          sequence = rb_str_sub_sequence(RARRAY(self)->ptr[i], offset, window_size);
          sum += rb_str_hamming_inject(self, sequence, i, offset, window_size);
        }
        return sum;
      }
    END
  end
end

module DNA
  class Cache
    attr_reader :filename
  
    def initialize(filename = 'data.cache')
      @filename = filename + (File.extname(filename) != 'cache' ? '.cache' : '')
      create
    end
    
    def handle
      @handle || open
    end
    
    def exists?
      File.file?(filename)
    end
    
    def clear
      delete
    end
    
    def data
      begin
        @data = Marshal.load(open('rb').read)
      rescue
        @data = 0
      ensure
        close
      end
      @data
    end
    
    def save(object)
      handle.write(Marshal.dump(object))
      close
    end
    
  protected
    def create
      FileUtils.touch(filename)
    end
  
    def open(flag = 'wb')
      create unless exists?
      @handle = File.open(filename, flag)
    end
    
    def close
      if @handle
        @handle.close
        @handle = nil
      end
    end
    
    def delete
      close
      FileUtils.rm(filename)
    end
  end
end

Benchmark.benchmark(" "*7 + Benchmark::CAPTION, 7, Benchmark::FMTSTR, "total:", "avg:") do |x|
  results = []
  10.times do
    results << x.report {
      sequences = IO.readlines(ARGV[0])
      window_size = ARGV[1].to_i
      num_of_seqs = sequences.length
      genome_len = sequences[0].length

      cache = DNA::Cache.new(ARGV[2])
      File.open(ARGV[2], cache.exists? ? 'a' : 'w') do |out|
        out.puts "window center,mean pairwise hamming distance" unless cache.exists?
        for offset in cache.data...(genome_len - window_size + 1)
          # the next line does all of the for loops in the inline code above
          sum = sequences.hamming_sum(offset, window_size)
          # commenting out the ruby version, using inline instead
          #sum = 0
          #for i in 0...(num_of_seqs - 1)
          #  temp = sequences[i][offset, offset + window_size]
          #  for j in (i + 1)...(num_of_seqs)
          #    # the next line uses inline for hamming_distance
          #    sum = sum + temp.hamming_distance(sequences[j][offset, offset + window_size])
          #  end
          #end
          out.puts "#{offset + window_size / 2},#{sum.quo(num_of_seqs * (num_of_seqs - 1) / 2).to_f}"
          cache.save((offset + window_size / 2) - 1)
          # next two lines are optional, but might help if ruby locks up
          out.fsync # forces a write to the disk
          out.flush # flushes the ruby buffer
        end
      end
      cache.clear
    }
  end
  reports = results.inject { |total,report| total + report }
  [reports, (reports) / results.length]
end