# Normalises the raw Word-exported HTML held in +cleaned_document+ and writes
# the result back to the same attribute.
#
# Steps, in order:
# 1. rewrite every <br> as the self-closing <br />;
# 2. delete all carriage returns and line feeds;
# 3. collapse every remaining run of whitespace to one space;
# 4. start a new line before each "<p " so the text can later be split into
#    paragraphs on "\n";
# 5. replace an em dash that has a non-word character on each side with the
#    bare em dash.
#
# Returns the cleaned string, or nil (leaving the attribute untouched) when
# there is no document to clean.
def word_file_cleaner
  raw = cleaned_document
  return nil if raw.nil?

  # Plain String#gsub is used rather than the `chars` proxy: the /u patterns
  # work on the string directly and the stored value stays a String.
  document = raw.to_s.gsub(/<br>/u, '<br />')

  # Line breaks are removed outright, not turned into spaces.
  document = document.gsub(/[\r\n]+/u, '')

  # Collapse whitespace runs to a single space. Deleting whitespace entirely
  # would fuse tag names with their attributes ("<p class" -> "<pclass") and
  # leave nothing for the "<p " pattern below to match.
  document = document.gsub(/\s+/u, ' ')

  # One paragraph per line.
  document = document.gsub(/<p /u, "\n<p ")

  # Drops the non-word characters (typically spaces) on either side of an
  # em dash.
  document = document.gsub(/(\W—\W)/u, '—')

  self.cleaned_document = document
end

###### Test:
# Loads the Word-exported HTML fixture, checks that it contains the characters
# the cleaner is meant to deal with, runs word_file_cleaner, and verifies that
# the saved document no longer contains them.
def test_should_ensure_cleaned_document_is_clean_of_unwanted_characters
  # File.read opens, reads and closes the fixture in one call, so no file
  # handle is left open.
  fixture_html = File.read("#{File.expand_path(RAILS_ROOT)}/test/fixtures/Test SOS.htm")

  vms_document = create
  vms_document.cleaned_document = fixture_html

  # Preconditions: the fixture must actually contain what is being cleaned.
  assert_match(/\r\n\r\n/m, vms_document.cleaned_document,
               "Test file should have double line feeds")
  # NOTE(review): this precondition looks for an em dash between word
  # characters, while the final check looks for one between non-word
  # characters - confirm against the fixture which form is intended.
  assert_match(/(\w—\w)/u, vms_document.cleaned_document,
               "Test file should have an unicode U+2014 mdash")

  vms_document.word_file_cleaner
  vms_document.save!

  # Sanity check, make sure we are working on a valid file
  assert_match(/schemas\-microsoft\-com/u, vms_document.cleaned_document,
               "There is no text in the cleaned document section")

  assert_no_match(/\r\n/u, vms_document.cleaned_document,
                  "Text should not have any double line feeds")
  assert_no_match(/\r/u, vms_document.cleaned_document,
                  "Text should not have any carriage returns")
  assert_no_match(/(\W—\W)/u, vms_document.cleaned_document,
                  "Text should not have an unicode mdash")
end