{-# LANGUAGE BangPatterns       #-}
{-# LANGUAGE CPP                #-}
{-# LANGUAGE NumericUnderscores #-}

module Main ( main ) where

import           Control.Exception (evaluate)
import           Control.Monad
import           Control.Monad.ST
import           Control.Monad.ST.Unsafe
import           Data.Bits ((.&.))
import           Data.BloomFilter.Blocked (Bloom, BloomSize)
import qualified Data.BloomFilter.Blocked as Bloom
import           Data.Time
import           Data.Vector (Vector)
import qualified Data.Vector as V
import qualified Data.Vector.Primitive as VP
import           Data.WideWord.Word256 (Word256)
import           GHC.Stats
import           Numeric
import           System.IO
import           System.Mem (performMajorGC)
import           System.Random
import           Text.Printf (printf)

import           Database.LSMTree.Extras.Orphans ()
import           Database.LSMTree.Internal.Assertions (fromIntegralChecked)
import qualified Database.LSMTree.Internal.BloomFilter as Bloom
import           Database.LSMTree.Internal.Serialise (SerialisedKey,
                     serialiseKey)

main :: IO ()
main = do
  hSetBuffering stdout NoBuffering
  benchmarks

-- Benchmark parameters you can tweak. The defaults are rather small, so the
-- runtime is short and do not show the effects of data sizes no longer fitting
-- into the CPU cache.

-- | The number of entries in the filter in the smallest LSM runs is
-- @2^benchmarkSizeBase@
benchmarkSizeBase :: SizeBase
benchmarkSizeBase = 16

-- | The number of lookups to do. This has to be smaller than the total size of
-- all the filters (otherwise we will not get true positive probes, which is
-- part of the point of this benchmark).
benchmarkNumLookups :: Integer
benchmarkNumLookups = 25_000_000

-- | The number of lookups to do in a single batch.
benchmarkBatchSize :: Int
benchmarkBatchSize = 256

benchmarkNumBitsPerEntry :: RequestedBitsPerEntry
benchmarkNumBitsPerEntry = 10

benchmarks :: IO ()
benchmarks = do
#ifdef NO_IGNORE_ASSERTS
    putStrLn "WARNING: Benchmarking in debug mode."
    putStrLn "         To benchmark in release mode, pass:"
    putStrLn "         --project-file=cabal.project.release"
#endif

    enabled <- getRTSStatsEnabled
    unless enabled $ fail "Need RTS +T statistics enabled"
    let filterSizes = lsmStyleBloomFilters benchmarkSizeBase
                                           benchmarkNumBitsPerEntry
    putStrLn "Bloom filter stats:"
    putStrLn "(numEntries, sizeFactor, BloomSize { sizeBits, sizeHashes })"
    mapM_ print filterSizes
    putStrLn $ "total number of entries:\t " ++ show (totalNumEntries filterSizes)
    putStrLn $ "total filter size in bytes:\t " ++ show (totalNumBytes filterSizes)
    putStrLn $ "total number of key lookups:\t " ++ show benchmarkNumLookups

    unless (totalNumEntriesSanityCheck benchmarkSizeBase filterSizes) $
      fail "totalNumEntriesSanityCheck failed"
    unless (totalNumEntries filterSizes >= benchmarkNumLookups) $
      fail "number of key lookups is more than number of entries"

    putStrLn "Generating bloom filters..."
    let rng0 = mkStdGen 42
    vbs <- elemManyEnv filterSizes rng0
    putStrLn " finished."
    putStrLn ""

    hashcost <-
      benchmark "makeHashes"
                "(This baseline is the cost of computing and hashing the keys)"
                (benchInBatches benchmarkBatchSize rng0
                   (benchMakeHashes vbs))
                (fromIntegralChecked benchmarkNumLookups)
                (0, 0)
                289

    _ <-
      benchmark "elemHashes"
                "(this is the simple one-by-one lookup, less the cost of computing and hashing the keys)"
                (benchInBatches benchmarkBatchSize rng0
                  (benchElemHashes vbs))
                (fromIntegralChecked benchmarkNumLookups)
                hashcost
                0

    _ <-
      benchmark "bloomQueries"
                "(this is the batch lookup, less the cost of computing and hashing the keys)"
                (benchInBatches benchmarkBatchSize rng0
                  (\ks -> Bloom.bloomQueries benchSalt vbs ks `seq` ()))
                (fromIntegralChecked benchmarkNumLookups)
                hashcost
                0

    pure ()

type Alloc = Int

benchmark :: String
          -> String
          -> (Int -> ())
          -> Int
          -> (NominalDiffTime, Alloc)
          -> Int
          -> IO (NominalDiffTime, Alloc)
benchmark name description action n (subtractTime, subtractAlloc) expectedAlloc = do
    putStrLn $ "Benchmarking " ++ name ++ " ... "
    putStrLn description
    performMajorGC
    allocBefore <- allocated_bytes <$> getRTSStats
    timeBefore  <- getCurrentTime
    evaluate (action n)
    timeAfter   <- getCurrentTime
    performMajorGC
    allocAfter  <- allocated_bytes <$> getRTSStats
    putStrLn "Finished."
    let allocTotal :: Alloc
        timeTotal  = timeAfter `diffUTCTime` timeBefore
        allocTotal = fromIntegral allocAfter - fromIntegral allocBefore
        timeNet    = timeTotal - subtractTime
        allocNet   = allocTotal - subtractAlloc

        timePerKey, allocPerKey :: Double
        timePerKey     = realToFrac timeNet / fromIntegral n
        allocPerKey    = fromIntegral allocNet / fromIntegral n
    let printStat :: String -> Double -> String -> IO ()
        printStat label v unit =
          putStrLn $ label ++ showGFloat (Just 2) v (' ':unit)
    printStat "Time total:        " (realToFrac timeTotal) "seconds"
    printStat "Alloc total:       " (fromIntegral allocTotal) "bytes"
    printStat "Time net:          " (realToFrac timeNet) "seconds"
    printStat "Alloc net:         " (fromIntegral allocNet) "bytes"
    printStat "Time net per key:  " timePerKey "seconds"
    printStat "Alloc net per key: " allocPerKey "bytes"

    unless (truncate allocPerKey == expectedAlloc) $ do
        printf "WARNING: expecting %d, got %d bytes allocated per key\n"
            expectedAlloc
            (truncate allocPerKey :: Int)

    putStrLn ""
    pure (timeNet, allocNet)

-- | (numEntries, sizeFactor, (BloomSize numBits numHashFuncs))
type BloomFilterSizeInfo = (Integer, Integer, BloomSize)
type SizeBase     = Int
type RequestedBitsPerEntry = Double

-- | Calculate the sizes of a realistic LSM style set of Bloom filters, one
-- for each LSM run. This uses base 4, with 4 disk levels, using tiering
-- for internal levels and leveling for the final (biggest) level.
--
-- Due to the incremental merging, each level actually has (in the worst case)
-- 2x the number of runs, hence 8 per level for tiering levels.
--
lsmStyleBloomFilters :: SizeBase -> RequestedBitsPerEntry -> [BloomFilterSizeInfo]
lsmStyleBloomFilters l1 requestedBitsPerEntry =
    [ (numEntries, sizeFactor, bsize)
    | (numEntries, sizeFactor)
        <- replicate 8 (2^(l1+0), 1)   -- 8 runs at level 1 (tiering)
        ++ replicate 8 (2^(l1+2), 4)   -- 8 runs at level 2 (tiering)
        ++ replicate 8 (2^(l1+4),16)   -- 8 runs at level 3 (tiering)
        ++            [(2^(l1+8),256)] -- 1 run  at level 4 (leveling)
    , let bsize = Bloom.sizeForBits requestedBitsPerEntry (fromIntegral numEntries)
    ]

totalNumEntries, totalNumBytes :: [BloomFilterSizeInfo] -> Integer
totalNumEntries filterSizes =
    sum [ numEntries | (numEntries, _, _) <- filterSizes ]

totalNumBytes filterSizes =
    sum [ toInteger (Bloom.sizeBits bsize)
        | (_,_,bsize) <- filterSizes ]
      `div` 8

totalNumEntriesSanityCheck :: SizeBase -> [BloomFilterSizeInfo] -> Bool
totalNumEntriesSanityCheck l1 filterSizes =
    totalNumEntries filterSizes
    ==
    sum [ 2^l1 * sizeFactor | (_, sizeFactor, _) <- filterSizes ]

benchSalt :: Bloom.Salt
benchSalt = 4

-- | Input environment for benchmarking 'Bloom.elemMany'.
--
-- The idea here is to have a collection of bloom filters corresponding to
-- the sizes used in a largeish LSM. In particular, the sizes are in increasing
-- powers of 4, and the largest ones should be bigger than the CPU L3 cache.
-- Furthermore, the keys in the filters are non-overlapping, and lookups will
-- be true positives in only one filter. Thus most lookups will be true
-- negatives.
--
-- The goal is to benchmark the benefits of optimisations for the LSM situation:
--
-- * where the same key is being looked up in many filters,
-- * with a hit in only one filter expected, and
-- * where the total size of the filters is too large to fully fit in cache
--   (though the smaller ones may fit in the caches).
--
elemManyEnv :: [BloomFilterSizeInfo]
            -> StdGen
            -> IO (Vector (Bloom SerialisedKey))
elemManyEnv filterSizes rng0 =
  stToIO $ do
    -- create the filters
    mbs <- sequence
      [ Bloom.new bsize benchSalt
      | (_, _, bsize) <- filterSizes
      ]
    -- add elements
    foldM_
      (\rng (i, mb) -> do
         -- progress
         when (i .&. 0xFFFF == 0) (unsafeIOToST $ putStr ".")
         -- insert n elements into filter b
         let k :: Word256
             (!k, !rng') = uniform rng
         Bloom.insert mb (serialiseKey k)
         pure rng'
      )
      rng0
      (zip [0 .. totalNumEntries filterSizes - 1]
           (cycle [ mb'
                  | (mb, (_, sizeFactor, _)) <- zip mbs filterSizes
                  , mb' <- replicate (fromIntegralChecked sizeFactor) mb ]))
    V.fromList <$> mapM Bloom.unsafeFreeze mbs

type BatchBench = V.Vector SerialisedKey -> ()

{-# NOINLINE benchInBatches #-}
benchInBatches :: Int -> StdGen -> BatchBench -> Int -> ()
benchInBatches !b !rng0 !action =
    go rng0
  where
    go !rng !n
      | n <= 0    = ()
      | otherwise =
        let (!rng'', !rng') = splitGen rng
            ks  :: VP.Vector Word256
            !ks  = VP.unfoldrExactN b uniform rng'
            ks' :: V.Vector SerialisedKey
            !ks' = V.map serialiseKey (V.convert ks)
        in action ks' `seq` go rng'' (n-b)

-- | This gives us a combined cost of calculating the series of keys and their
-- hashes (when used with 'benchInBatches').
benchMakeHashes :: Vector (Bloom SerialisedKey) -> BatchBench
benchMakeHashes !_bs !ks =
    let khs :: VP.Vector (Bloom.Hashes SerialisedKey)
        !khs = V.convert (V.map (Bloom.hashesWithSalt benchSalt) ks)
     in khs `seq` ()

-- | This gives us a combined cost of calculating the series of keys, their
-- hashes, and then using 'Bloom.elemHashes' with each filter  (when used
-- with 'benchInBatches').
benchElemHashes :: Vector (Bloom SerialisedKey) -> BatchBench
benchElemHashes !bs !ks =
    let khs :: VP.Vector (Bloom.Hashes SerialisedKey)
        !khs = V.convert (V.map (Bloom.hashesWithSalt benchSalt) ks)
     in V.foldl'
          (\_ b -> VP.foldl'
                     (\_ kh -> Bloom.elemHashes b kh `seq` ())
                     () khs)
          () bs