-- Difficult: write a QuickCheck property that checks whether the observed false
-- positive rate is close to the requested false positive rate.


-- I think testing/proving statistical properties can never be done using finite
-- data. Thus, this test will only be a good effort for adequate testing. This
-- also implies that instead of trying to make every test run pass, the overall
-- test result will be determined by comparing number of passes and number of
-- discards.

-- Requesting false positive rate happens via the easyList function, so this
-- exercise is about testing that function.

-- The "is close" in the assignment is unclear. The authors probably meant to
-- check that the false positive rate is lower or close-to-equal than the
-- requested rate. For the "close" I add 10% margin to the requested false
-- positive rate when comparing the actual rate to it.


-- After running a few experiments with QuickCheck, I found that the initial
-- lists passed to the property function were, for example '[(),(),(),()]' or
-- '[0,0,0,0]' when I restricted the type to [Int].
--
-- I want to test the worst cases. It means, increasing the chance of getting a
-- false positive and see whether it is still less or equally frequent than
-- requested. The more values with different hashes are inserted to the bloom
-- filter (it means, the more bits are set), the higher chance of a false
-- positive. Thus, lists filled with the same value are not suitable.
--
-- To make the test simpler and easier to control, I will restrict the input
-- list type to [Int].

-- The test for worst cases will be as follows:
--
--    1) Generate a random number L from range [maxLen..minLen]. The range is
--       limited so the lists are not too big and the testing not too slow.
--
--    2) Create a list XS of length L of random Int values. They don't have to
--       be unique. I don't assume properties of the hash function used, so it's
--       not possible to tell that all-unique values would be the worst case
--       (set as many bits in the filter as possible).
--
--    3) Create a list YS of length ysRatio*L of random Int values different
--       from XS. For the worst case it is important that all values in YS are
--       different from values in XS for getting as many false positives as
--       possible. It doesn't matter whether YS values are unique among
--       themselves. YS list should be as long as possible. If it's short, for
--       example, just 2 values, and both of them are false positives, the
--       actual false positive rate would be 2/2 = 1.0. The longer the list is,
--       the higher chance of getting closer to true false positive rate. I
--       restricted the length of YS to be a multiple of the length of XS. It
--       should be long enough, but not too long for the testing to be too slow.
--
--    4) Generate a random requested false positive rate.
--
--    5) Create a bloom filter via easyList function with values XS. If it
--       fails, discard the test case.
--
--    6) Get an actual false positive rate. For each item in YS check if it is
--       in the bloom filter. If it is, increment false positive count. Divide
--       the count by length of YS.
--
--    7) If the actual false positive rate is lower or close-to-equal to the
--       requested false positive rate, the test case passes. If not, it is
--       discarded.


{-- From examples/examples/ch26/examples/BloomCheck.hs and modified --}
import Test.QuickCheck
import qualified BloomFilter.Easy as B
import qualified Data.Map.Lazy as Map

-- Instead of failing the testing, use 'discard' so it can continue and report
-- the overall results
handyCheck :: Testable a => Int -> a -> IO ()
-- 'withDiscardRatio 1' means that the max. number of discards for the testing
-- to still pass will be 1*number of requested successful runs. The number of
-- all test runs might be different every time because the testing continues
-- until either the max. success limit or the max. discard limit is
-- reached. This means that testing passes when number of discards is < than
-- number of successful runs, and fails otherwise.
handyCheck limit p = quickCheck $ withDiscardRatio 1 $ withMaxSuccess limit p

falsePositive :: Gen Double
falsePositive = choose (epsilon, 1 - epsilon)
  where epsilon = 1e-6

(=~>) :: Either a b -> (b -> Bool) -> Bool
k =~> f = either (const discard) f k
{-- End of code from examples --}


randomInt :: Gen Int
randomInt = chooseInt (minBound :: Int, maxBound :: Int)

randomList :: Int -> Gen [Int]
randomList len = vectorOf len randomInt

randomListWithout :: Int -> [Int] -> Gen [Int]
randomListWithout len without = go len withoutMap []
  -- Use Map for faster lookup of excluded Ints
  where withoutMap = Map.fromList $ zip without (repeat ())
        go 0 _ res = return res
        go len wo res = do
          i <- randomInt
          case Map.lookup i wo of
            Nothing ->
              -- Good value. Not excluded by the 'without' list. Add it to the
              -- result.
              go (len-1) wo (i:res)
            Just _ ->
              -- Excluded value. Try again.
              go len wo res

prop_false_positive_rate =
  let
      -- These exact values are set based on experimental test runs for the
      -- testing not to be too slow
      minLen = 1
      maxLen = 50
      ysRatio = 500
  in
    -- Choose number of items in the bloom filter
    forAll (choose (minLen, maxLen) :: Gen Int) $ \len -> do
      -- Generate values to insert to the filter
      forAll (randomList len) $ \xs-> do
        -- Generate a longer list with values that won't be in the filter
        forAll (randomListWithout (ysRatio * len) xs) $ \ys-> do
          forAll falsePositive $ \requestedErrRate ->
            B.easyList requestedErrRate xs =~> \filt ->
              -- Add 10% margin to the requested rate
              if (errRate filt ys) <= (requestedErrRate * 1.1)
              then True
              else discard

errRate :: B.Bloom Int -> [Int] -> Double
errRate filt ys = (fromIntegral fp) / fromIntegral (length ys)
  where fp = countIsElem filt ys

countIsElem :: B.Bloom Int -> [Int] -> Int
countIsElem filt ys = length $ filter id $ map (`B.elem` filt) ys


-- stack ghci --ghci-options "-L$(pwd) -llookup3"

-- ghci> :l 26_a_2.hs
-- [1 of 7] Compiling BloomFilter.Hash ( BloomFilter/Hash.hs, interpreted )
--
--   ... omitted compiler warnings
--
-- [2 of 7] Compiling BloomFilter.Internal ( BloomFilter/Internal.hs, interpreted )
-- [3 of 7] Compiling BloomFilter.Mutable ( BloomFilter/Mutable.hs, interpreted )
-- [4 of 7] Compiling BloomFilter      ( BloomFilter.hs, interpreted )
-- [5 of 7] Compiling BloomFilter.Easy ( BloomFilter/Easy.hs, interpreted )
-- [6 of 7] Compiling Main             ( 26_a_2.hs, interpreted )
-- Ok, six modules loaded.

-- ghci> handyCheck 100 prop_false_positive_rate
-- +++ OK, passed 100 tests; 20 discarded.

-- ghci> handyCheck 600 prop_false_positive_rate
-- +++ OK, passed 600 tests; 179 discarded.

-- ghci> handyCheck 2000 prop_false_positive_rate
-- +++ OK, passed 2000 tests; 561 discarded.