Skip to content

"ArgumentError: comparison of Float with NaN failed" if trying to search a corpus with an item that lacks common words #179

@tra38

Description

@tra38
require 'classifier-reborn'

# Three short documents whose stemmed word lists share no terms with each other.
strings = [
  "This is filler text that I invented.This is also a paragraph that could be used",
  "This post is amazing. Please take a look",
  "For all sports fan, you must watch this video. Hey you have to check this out."
]

# Build the LSI index and add every document to it.
lsi = ClassifierReborn::LSI.new
strings.each do |text|
  lsi.add_item(text)
end

p lsi
#<ClassifierReborn::LSI:0x007fdcfc9af868 @auto_rebuild=true, @word_list=#<ClassifierReborn::WordList:0x007fdcfe80afd8 @location_table={:filler=>0, :text=>1, :inventedthi=>2, :paragraph=>3, :could=>4, :us=>5, :post=>6, :amaz=>7, :pleas=>8, :take=>9, :look=>10, :for=>11, :sport=>12, :fan=>13, :must=>14, :watch=>15, :video=>16, :hei=>17, :check=>18, :out=>19}>, @items={"This is filler text that I invented.This is also a paragraph that could be used"=>#<ClassifierReborn::ContentNode:0x007fdcfd05ea88 @categories=[], @word_hash={:filler=>1, :text=>1, :inventedthi=>1, :paragraph=>1, :could=>1, :us=>1}, @lsi_norm=GSL::Vector
# [ 4.082e-01 4.082e-01 4.082e-01 4.082e-01 4.082e-01 4.082e-01 0.000e+00 ... ], @lsi_vector=GSL::Vector
# [ 3.869e-01 3.869e-01 3.869e-01 3.869e-01 3.869e-01 3.869e-01 0.000e+00 ... ], @raw_norm=GSL::Vector
# [ 4.082e-01 4.082e-01 4.082e-01 4.082e-01 4.082e-01 4.082e-01 0.000e+00 ... ], @raw_vector=GSL::Vector
# [ 3.869e-01 3.869e-01 3.869e-01 3.869e-01 3.869e-01 3.869e-01 0.000e+00 ... ]>, "This post is amazing. Please take a look"=>#<ClassifierReborn::ContentNode:0x007fdcfd05c918 @categories=[], @word_hash={:post=>1, :amaz=>1, :pleas=>1, :take=>1, :look=>1}, @lsi_norm=GSL::Vector
# [ 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 4.472e-01 ... ], @lsi_vector=GSL::Vector
# [ 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 4.307e-01 ... ], @raw_norm=GSL::Vector
# [ 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 4.472e-01 ... ], @raw_vector=GSL::Vector
# [ 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 4.307e-01 ... ]>, "For all sports fan, you must watch this video. Hey you have to check this out."=>#<ClassifierReborn::ContentNode:0x007fdcfe80b050 @categories=[], @word_hash={:for=>1, :sport=>1, :fan=>1, :must=>1, :watch=>1, :video=>1, :hei=>1, :check=>1, :out=>1}, @lsi_norm=GSL::Vector
# [   nan   nan   nan   nan   nan   nan   nan ... ], @lsi_vector=GSL::Vector
# [ 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 ... ], @raw_norm=GSL::Vector
# [ 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 ... ], @raw_vector=GSL::Vector
# [ 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 ... ]>}, @version=3, @built_at_version=3, @language="en", @cache_node_vectors=nil>

# Because the last content node's @lsi_norm is a GSL::Vector consisting of NaNs,
# the following search raises an exception.

p lsi.search('filler')
#ArgumentError: comparison of Float with NaN failed
  # from /Users/tariqali/.rbenv/versions/2.4.1/lib/ruby/gems/2.4.0/gems/classifier-reborn-2.2.0/lib/classifier-reborn/lsi.rb:225:in `sort_by'
  # from /Users/tariqali/.rbenv/versions/2.4.1/lib/ruby/gems/2.4.0/gems/classifier-reborn-2.2.0/lib/classifier-reborn/lsi.rb:225:in `content_node_norms'
  # from /Users/tariqali/.rbenv/versions/2.4.1/lib/ruby/gems/2.4.0/gems/classifier-reborn-2.2.0/lib/classifier-reborn/lsi.rb:211:in `proximity_norms_for_content'
  # from /Users/tariqali/.rbenv/versions/2.4.1/lib/ruby/gems/2.4.0/gems/classifier-reborn-2.2.0/lib/classifier-reborn/lsi.rb:237:in `search'
  # from (irb):13
  # from /Users/tariqali/.rbenv/versions/2.4.1/bin/irb:11:in `<main>'

# But if I add a new item that shares a word ("sports") with the third string,
# the index is rebuilt (@version and @built_at_version go from 3 to 4)...

lsi.add_item "I love sports"

p lsi

# #<ClassifierReborn::LSI:0x007fdcfc9af868 @auto_rebuild=true, @word_list=#<ClassifierReborn::WordList:0x007fdcfd08fe80 @location_table={:filler=>0, :text=>1, :inventedthi=>2, :paragraph=>3, :could=>4, :us=>5, :post=>6, :amaz=>7, :pleas=>8, :take=>9, :look=>10, :for=>11, :sport=>12, :fan=>13, :must=>14, :watch=>15, :video=>16, :hei=>17, :check=>18, :out=>19, :love=>20}>, @items={"This is filler text that I invented.This is also a paragraph that could be used"=>#<ClassifierReborn::ContentNode:0x007fdcfd05ea88 @categories=[], @word_hash={:filler=>1, :text=>1, :inventedthi=>1, :paragraph=>1, :could=>1, :us=>1}, @lsi_norm=GSL::Vector
# [ 4.082e-01 4.082e-01 4.082e-01 4.082e-01 4.082e-01 4.082e-01 0.000e+00 ... ], @lsi_vector=GSL::Vector
# [ 3.869e-01 3.869e-01 3.869e-01 3.869e-01 3.869e-01 3.869e-01 0.000e+00 ... ], @raw_norm=GSL::Vector
# [ 4.082e-01 4.082e-01 4.082e-01 4.082e-01 4.082e-01 4.082e-01 0.000e+00 ... ], @raw_vector=GSL::Vector
# [ 3.869e-01 3.869e-01 3.869e-01 3.869e-01 3.869e-01 3.869e-01 0.000e+00 ... ]>, "This post is amazing. Please take a look"=>#<ClassifierReborn::ContentNode:0x007fdcfd05c918 @categories=[], @word_hash={:post=>1, :amaz=>1, :pleas=>1, :take=>1, :look=>1}, @lsi_norm=GSL::Vector
# [ 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 4.472e-01 ... ], @lsi_vector=GSL::Vector
# [ 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 4.307e-01 ... ], @raw_norm=GSL::Vector
# [ 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 4.472e-01 ... ], @raw_vector=GSL::Vector
# [ 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 4.307e-01 ... ]>, "For all sports fan, you must watch this video. Hey you have to check this out."=>#<ClassifierReborn::ContentNode:0x007fdcfe80b050 @categories=[], @word_hash={:for=>1, :sport=>1, :fan=>1, :must=>1, :watch=>1, :video=>1, :hei=>1, :check=>1, :out=>1}, @lsi_norm=GSL::Vector
# [ 1.303e-17 3.778e-18 -2.815e-17 3.778e-18 3.778e-18 3.778e-18 0.000e+00 ... ], @lsi_vector=GSL::Vector
# [ 4.828e-18 1.400e-18 -1.043e-17 1.400e-18 1.400e-18 1.400e-18 0.000e+00 ... ], @raw_norm=GSL::Vector
# [ 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 ... ], @raw_vector=GSL::Vector
# [ 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 ... ]>, "I love sports"=>#<ClassifierReborn::ContentNode:0x007fdcfd08ff20 @categories=[], @word_hash={:love=>1, :sport=>1}, @lsi_norm=GSL::Vector
# [ 1.303e-17 3.778e-18 -2.815e-17 3.778e-18 3.778e-18 3.778e-18 0.000e+00 ... ], @lsi_vector=GSL::Vector
# [ 1.818e-17 5.272e-18 -3.927e-17 5.272e-18 5.272e-18 5.272e-18 0.000e+00 ... ], @raw_norm=GSL::Vector
# [ 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 ... ], @raw_vector=GSL::Vector
# [ 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 ... ]>}, @version=4, @built_at_version=4, @language="en", @cache_node_vectors=nil>

# Now, no content node has NaNs, so I'm able to do a successful search.

p lsi.search('filler')
# ["This is filler text that I invented.This is also a paragraph that could be used", "I love sports", "For all sports fan, you must watch this video. Hey you have to check this out."]

I've encountered this error in tra38/ZombieWriter#8. What would be the best way of handling this issue? My first instinct is to simply add all my strings and then scan through the entire LSI again to remove any content nodes whose vectors contain NaNs, but that seems exceedingly inelegant... there has to be a better solution, right?

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions