Report abuse

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# Normalises the HTML held in +cleaned_document+ (Word-exported markup,
# judging by the method name) and writes the result back to the same attribute.
#
# Steps, in order:
# 1. bare <br> tags become XHTML-style <br />;
# 2. every run of whitespace (CR, LF, tab, space, ...) collapses to one space;
# 3. a newline is inserted before each "<p " so that a later import can split
#    the text into paragraphs line by line;
# 4. literal em/en dash characters become their HTML entities.
#
# Returns the cleaned string, or nil (leaving the attribute untouched) when
# there is no document to clean.
#
# NOTE(review): assumes cleaned_document is a valid UTF-8 String - confirm for
# fixtures saved by Word in a legacy code page.
def word_file_cleaner
  document = cleaned_document
  # Nothing loaded yet: leave the attribute alone rather than fail on nil.
  return if document.nil?

  # Plain String#gsub is used throughout; String#chars returns an Array on
  # Ruby >= 1.9 and therefore has no #gsub.
  document = document.gsub(/<br>/, '<br />')

  # One pass is equivalent to replacing each whitespace character with a
  # space and then squeezing repeated spaces.
  document = document.gsub(/\s+/, ' ')

  # Only "<p " (a paragraph tag that carries attributes) is matched, so tags
  # such as <pre> are never split; a bare "<p>" is deliberately left as is.
  document = document.gsub(/<p /, "\n<p ")

  # Replace only the dash characters themselves. Matching two arbitrary
  # non-word characters here would rewrite ordinary markup such as "</" or
  # '">' into "&mdash;".
  document = document.gsub("\u2014", '&mdash;').gsub("\u2013", '&ndash;')

  self.cleaned_document = document
end


######  Test:

# Loads the "Test SOS.htm" fixture into a new document, runs
# word_file_cleaner on it and checks that carriage returns, CR/LF pairs and
# literal em dashes (U+2014) are gone afterwards.
#
# +create+ and +deny+ are helpers defined outside this method; +deny+ is
# presumably the negative counterpart of +assert+ - verify against the test
# helper.
def test_should_ensure_cleaned_document_is_clean_of_unwanted_characters
  fixture_path = File.join(File.expand_path(RAILS_ROOT), 'test', 'fixtures', 'Test SOS.htm')
  vms_document = create
  # File.read opens, reads and closes the file in one call, so no file
  # handle is left open.
  vms_document.cleaned_document = File.read(fixture_path)

  # Preconditions: the fixture must actually contain what the cleaner removes,
  # otherwise the checks further down would pass vacuously.
  assert vms_document.cleaned_document =~ /\r\n\r\n/, "Test file should have double line feeds"
  # Match the em dash itself; a generic \w\w pattern would succeed on any text.
  assert vms_document.cleaned_document =~ /\u2014/, "Test file should have an unicode U+2014 mdash"

  vms_document.word_file_cleaner
  vms_document.save!

  # Sanity check, make sure we are working on a valid file
  assert vms_document.cleaned_document =~ /schemas-microsoft-com/, "There is no text in the cleaned document section"
  deny vms_document.cleaned_document =~ /\r\n/, "Text should not have any double line feeds"
  deny vms_document.cleaned_document =~ /\r/, "Text should not have any carriage returns"
  # Again look for the literal em dash; \W\W would also match ordinary markup
  # such as "</" and make this check fail on any HTML.
  deny vms_document.cleaned_document =~ /\u2014/, "Text should not have an unicode mdash"
end