{-# LANGUAGE MagicHash                 #-}
{-# LANGUAGE UnboxedTuples             #-}
{-# LANGUAGE ExistentialQuantification #-}
{-# LANGUAGE ScopedTypeVariables       #-}
{-# LANGUAGE CPP                       #-}
{-# LANGUAGE ForeignFunctionInterface  #-}

#include "MachDeps.h"

-----------------------------------------------------------------------------
-- |
-- Module      : Data.Array.Vector.Arr.BUArr
-- Copyright   : (c) [2001..2002] Manuel M T Chakravarty & Gabriele Keller
--               (c) [2006..2007] Manuel M T Chakravarty & Roman Leshchinskiy
-- License     : see libraries/ndp/LICENSE
-- 
-- Maintainer  : Roman Leshchinskiy <rl@cse.unsw.edu.au>
-- Stability   : internal
-- Portability : non-portable (unboxed values and GHC libraries)
--
-- Description ---------------------------------------------------------------
--
-- This module define our own infrastructure for unboxed arrays, but recycle
-- some of the existing abstractions for boxed arrays.  It's more important to
-- have precise control over the implementation of unboxed arrays, because
-- they are more performance critical.  All arrays defined here are
-- `Int'-indexed without H98 `Ix' support.
--
-- So far, we only support Char, Int, Float, and Double in unboxed arrays
-- (adding more is merely a matter of tedious typing).
--
-- Todo ----------------------------------------------------------------------
--
-- * For some not understood reason, `checkCritical' prevents the write
--   operations to be inlined.  Instead, a specialised version of them is
--   called.  Interestingly, this doesn't seem to affect runtime negatively
--   (as opposed to still checking, but inlining everything).  Nevertheless,
--   bounds checks cost performance.  (Checking only the writes in SMVM costs
--   about a factor of two for the fully fused version and about 50% for the
--   partially fused version.)
--
--   We could check only check some of the writes (eg, in permutations) as we
--   know for others that they can never be out of bounds (provided this
--   library is correct).
--
-- * There is no proper block copy support yet.  It would be helpful for
--   extracting and copying.  But do we need extracting if we have slicing?
--   (Slicing instead of extracting may introduce space leaks..)
--
-- * If during freezing it becomes clear that the array is much smaller than
--   originally allocated, it might be worthwhile to copy the data into a new,
--   smaller array.


module Data.Array.Vector.Prim.BUArr (
  -- * Unboxed primitive arrays (both immutable and mutable)
  BUArr(..), MBUArr,

  -- * Class of elements of such arrays
  UAE(..),

  -- * Operations on mutable arrays
  lengthMBU, newMBU, extractMBU, copyMBU,
  unsafeFreezeMBU, unsafeFreezeAllMBU,

  -- * Fast copying of mutable arrays
  memcpyMBU, memcpyOffMBU, memmoveOffMBU,

  -- * Basic operations
  lengthBU, emptyBU, replicateBU, sliceBU, extractBU,

  -- * Streaming
  streamBU, unstreamBU,

  -- * Higher-order operations
  mapBU, foldlBU, foldBU, scanlBU, scanBU,

  -- * Arithmetic operations
  sumBU,

  -- * Conversions to\/from lists
  toBU, fromBU,

  -- * Re-exporting some of GHC's internals that higher-level modules need
--  Char#, Int#, Float#, Double#, Char(..), Int(..), Float(..), Double(..), ST,
--  runST
) where

-- GHC-internal definitions
import GHC.Prim (
  Char#, Int#, Float#, Double#, Word#,

  ByteArray#, MutableByteArray#, RealWorld,
  newByteArray#, unsafeFreezeArray#, unsafeThawArray#, unsafeCoerce#,

  (+#), (*#), and#, or#, xor#, neWord#, word2Int#, int2Word#,
  uncheckedIShiftRA#, uncheckedShiftL#,

  indexWideCharArray#, readWideCharArray#, writeWideCharArray#,
  indexIntArray#, readIntArray#, writeIntArray#,
  indexWordArray#, readWordArray#, writeWordArray#,

  indexWord8Array#, readWord8Array#, writeWord8Array#,
  indexWord16Array#, readWord16Array#, writeWord16Array#,
  indexWord32Array#, readWord32Array#, writeWord32Array#,
  indexWord64Array#, readWord64Array#, writeWord64Array#,

  indexInt8Array#, readInt8Array#, writeInt8Array#,
  indexInt16Array#, readInt16Array#, writeInt16Array#,
  indexInt32Array#, readInt32Array#, writeInt32Array#,
  indexInt64Array#, readInt64Array#, writeInt64Array#,

  indexFloatArray#, readFloatArray#, writeFloatArray#,
  indexDoubleArray#, readDoubleArray#, writeDoubleArray#)

import GHC.Base (
  Char(..), Int(..))
import GHC.Float (
  Float(..), Double(..))
import GHC.Word (
  Word(..), Word8(..), Word16(..), Word32(..), Word64(..))
import GHC.Int (
  Int8(..), Int16(..), Int32(..), Int64(..))
import GHC.ST 
import GHC.IO

import System.IO
import Foreign
import Foreign.C   (CSize)

import GHC.Handle
import GHC.IOBase
import GHC.Ptr

import Foreign.C.Types

-- NDP library
import Data.Array.Vector.Prim.Hyperstrict
import Data.Array.Vector.Prim.Debug
import Data.Array.Vector.Stream

infixl 9 `indexBU`, `readMBU`

here s = "Arr.BUArr." ++ s

-- |Unboxed arrays
-- ---------------

-- Unboxed arrays of primitive element types arrays constructed from an
-- explicit length and a byte array in both an immutable and a mutable variant
--
data BUArr    e =
        BUArr  {-# UNPACK #-}!Int
               {-# UNPACK #-}!Int
               ByteArray#
data MBUArr s e =
        MBUArr {-# UNPACK #-}!Int
               (MutableByteArray# s)

-- instance HS e => HS (BUArr e)
-- instance HS e => HS (MBUArr s e)

-- |Number of elements of an immutable unboxed array
--
lengthBU :: BUArr e -> Int
lengthBU (BUArr _ n _) = n

-- |Number of elements of a mutable unboxed array
--
lengthMBU :: MBUArr s e -> Int
lengthMBU (MBUArr n _) = n

-- |The basic operations on unboxed arrays are overloaded
--
class UAE e where
  sizeBU   :: Int -> e -> Int           -- size of an array with n elements
  indexBU  :: BUArr e    -> Int      -> e
  readMBU  :: MBUArr s e -> Int      -> ST s e
  writeMBU :: MBUArr s e -> Int -> e -> ST s ()

-- |Empty array
--
emptyBU :: UAE e => BUArr e
emptyBU = runST (do
            a <- newMBU 0
            unsafeFreezeMBU a 0
          )

-- |Produces an array that consists of a subrange of the original one without
-- copying any elements.
--
sliceBU :: BUArr e -> Int -> Int -> BUArr e
sliceBU (BUArr start len arr) newStart newLen =
  let start' = start + newStart
  in
  BUArr start' ((len - newStart) `min` newLen) arr

-- |Allocate an uninitialised unboxed array
--
newMBU :: forall s e. UAE e => Int -> ST s (MBUArr s e)
{-# INLINE newMBU #-}
newMBU n = ST $ \s1# ->
  case sizeBU n (undefined::e) of {I# len#          ->
  case newByteArray# len# s1#   of {(# s2#, marr# #) ->
  (# s2#, MBUArr n marr# #) }}

-- |Turn a mutable into an immutable array WITHOUT copying its contents, which
-- implies that the mutable array must not be mutated anymore after this
-- operation has been executed.
--
-- * The explicit size parameter supports partially filled arrays (and must be
--   less than or equal the size used when allocating the mutable array)
--
unsafeFreezeMBU :: MBUArr s e -> Int -> ST s (BUArr e)
{-# INLINE unsafeFreezeMBU #-}
unsafeFreezeMBU (MBUArr m mba#) n = 
  checkLen (here "unsafeFreezeMBU") m n $ ST $ \s# ->
  (# s#, BUArr 0 n (unsafeCoerce# mba#) #)

-- |Turn a mutable into an immutable array WITHOUT copying its contents, which
-- implies that the mutable array must not be mutated anymore after this
-- operation has been executed.
--
-- * In contrast to 'unsafeFreezeMBU', this operation always freezes the
-- entire array.
-- 
unsafeFreezeAllMBU :: MBUArr s e -> ST s (BUArr e)
{-# INLINE unsafeFreezeAllMBU #-}
unsafeFreezeAllMBU (MBUArr m mba#) = 
  ST $ \s# -> (# s#, BUArr 0 m (unsafeCoerce# mba#) #)

foreign import ccall unsafe "string.h memcpy"
  memcpy :: MutableByteArray# s -> MutableByteArray# s -> CInt -> IO (Ptr a)

foreign import ccall unsafe "memcpy_extra.h memcpy_offset"
  memcpyOffset :: MutableByteArray# s -> MutableByteArray# s -> CInt -> CInt -> CInt -> IO ()

foreign import ccall unsafe "memcpy_extra.h memmove_offset"
  memmoveOffset :: MutableByteArray# s -> MutableByteArray# s -> CInt -> CInt -> CInt -> IO ()

-- | Copy values from one mutable array to another.
memcpyMBU :: forall s e. UAE e => MBUArr s e -> MBUArr s e -> Int -> ST s ()
memcpyMBU (MBUArr _ src) (MBUArr _ dst) l
  = ST (\s -> case memcpy dst src (fromIntegral len) of
                IO m -> case unsafeCoerce# m s of
                          (# s, _ #) -> (# s, () #))
 where
 len = sizeBU l (undefined :: e)
{-# inline memcpyMBU #-}

memcpyOffMBU :: forall s e. UAE e => MBUArr s e -> MBUArr s e -> Int -> Int -> Int -> ST s ()
memcpyOffMBU (MBUArr _ src) (MBUArr _ dst) s d l
  = ST (\s -> case memcpyOffset dst src (fromIntegral dOff) (fromIntegral sOff) (fromIntegral len) of
         IO m -> unsafeCoerce# m s)
 where
 sOff = sizeBU s (undefined :: e)
 dOff = sizeBU d (undefined :: e)
 len  = sizeBU l (undefined :: e)
{-# INLINE memcpyOffMBU #-}

memmoveOffMBU :: forall s e. UAE e => MBUArr s e -> MBUArr s e -> Int -> Int -> Int -> ST s ()
memmoveOffMBU (MBUArr _ src) (MBUArr _ dst) s d l
  = ST (\s -> case memmoveOffset dst src (fromIntegral dOff) (fromIntegral sOff) (fromIntegral len) of
         IO m -> unsafeCoerce# m s)
 where
 sOff = sizeBU s (undefined :: e)
 dOff = sizeBU d (undefined :: e)
 len  = sizeBU l (undefined :: e)
{-# INLINE memmoveOffMBU #-}


-- |Instances of unboxed arrays
-- -

-- This is useful to define loops that act as generators cheaply (see the
-- ``Functional Array Fusion'' paper)
--
instance UAE () where
  sizeBU _ _ = 0

  {-# INLINE indexBU #-}
  indexBU (BUArr _ _ _) (I# _) = ()

  {-# INLINE readMBU #-}
  readMBU (MBUArr _ _) (I# _) = ST $ \s# ->
    (# s#, () #)

  {-# INLINE writeMBU #-}
  writeMBU (MBUArr _ _) (I# _) () = ST $ \s# ->
    (# s#, () #)

{-
instance UAE Bool where
  sizeBU (I# n#) _ = I# n#

  {-# INLINE indexBU #-}
  indexBU (BUArr (I# s#) n ba#) i@(I# i#) =
    check (here "indexBU[Bool]") n i $
      (indexWord8Array# ba# (s# +# i#) `neWord#` int2Word# 0#)

  {-# INLINE readMBU #-}
  readMBU (MBUArr n mba#) i@(I# i#) =
    check (here "readMBU[Bool]") n i $
    ST $ \s# ->
    case readWord8Array# mba# i# s#   of {(# s2#, r# #) ->
    (# s2#, r# `neWord#` int2Word# 0# #)}

  {-# INLINE writeMBU #-}
  writeMBU (MBUArr n mba#) i@(I# i#) e# = 
    checkCritical (here "writeMBU[Bool]") n i $
    ST $ \s# ->
    case writeWord8Array# mba# i# b# s# of {s2# ->
    (# s2#, () #)}
    where
      b# = int2Word# (if e# then 1# else 0#)
-}

instance UAE Bool where
  sizeBU (I# n#) _ = I# (bOOL_SCALE n#)

  {-# INLINE indexBU #-}
  indexBU (BUArr (I# s#) n ba#) i@(I# i#) =
    check (here "indexBU[Bool]") n i $
      (indexWordArray# ba# (bOOL_INDEX (s# +# i#)) `and#` bOOL_BIT (s# +# i#))
      `neWord#` int2Word# 0#

  {-# INLINE readMBU #-}
  readMBU (MBUArr n mba#) i@(I# i#) =
    check (here "readMBU[Bool]") n i $ ST $ \s# ->
    case readWordArray# mba# (bOOL_INDEX i#) s#   of {(# s2#, r# #) ->
    (# s2#, (r# `and#` bOOL_BIT i#) `neWord#` int2Word# 0# #)}

  {-# INLINE writeMBU #-}
  writeMBU (MBUArr n mba#) i@(I# i#) e# =
    checkCritical (here "writeMBU[Bool]") n i $ ST $ \s# ->
    case bOOL_INDEX i#                            of {j#            ->
    case readWordArray# mba# j# s#                of {(# s2#, v# #) ->
    case if e# then v# `or#`  bOOL_BIT     i#
               else v# `and#` bOOL_NOT_BIT i#     of {v'#           ->
    case writeWordArray# mba# j# v'# s2#          of {s3#           ->
    (# s3#, () #)}}}}

instance UAE Char where
  sizeBU (I# n#) _ = I# (cHAR_SCALE n#)

  {-# INLINE indexBU #-}
  indexBU (BUArr (I# s#) n ba#) i@(I# i#) =
    check (here "indexBU[Char]") n i $
    case indexWideCharArray# ba# (s# +# i#)         of {r# ->
    (C# r#)}

  {-# INLINE readMBU #-}
  readMBU (MBUArr n mba#) i@(I# i#) =
    check (here "readMBU[Char]") n i $
    ST $ \s# ->
    case readWideCharArray# mba# i# s#      of {(# s2#, r# #) ->
    (# s2#, C# r# #)}

  {-# INLINE writeMBU #-}
  writeMBU (MBUArr n mba#) i@(I# i#) (C# e#) = 
    checkCritical (here "writeMBU[Char]") n i $
    ST $ \s# ->
    case writeWideCharArray# mba# i# e# s#  of {s2#   ->
    (# s2#, () #)}

instance UAE Int where
  sizeBU (I# n#) _ = I# (iINT_SCALE n#)

  {-# INLINE indexBU #-}
  indexBU (BUArr (I# s#) n ba#) i@(I# i#) =
    check (here "indexBU[Int]") n i $
    case indexIntArray# ba# (s# +# i#)         of {r# ->
    (I# r#)}

  {-# INLINE readMBU #-}
  readMBU (MBUArr n mba#) i@(I# i#) =
    check (here "readMBU[Int]") n i $
    ST $ \s# ->
    case readIntArray# mba# i# s#      of {(# s2#, r# #) ->
    (# s2#, I# r# #)}

  {-# INLINE writeMBU #-}
  writeMBU (MBUArr n mba#) i@(I# i#) (I# e#) = 
    checkCritical (here "writeMBU[Int]") n i $
    ST $ \s# ->
    case writeIntArray# mba# i# e# s#  of {s2#   ->
    (# s2#, () #)}

instance UAE Word where
  sizeBU (I# n#) _ = I# (wORD_SCALE n#)

  {-# INLINE indexBU #-}
  indexBU (BUArr (I# s#) n ba#) i@(I# i#) =
    check (here "indexBU[Word]") n i $
    case indexWordArray# ba# (s# +# i#)         of {r# ->
    (W# r#)}

  {-# INLINE readMBU #-}
  readMBU (MBUArr n mba#) i@(I# i#) =
    check (here "readMBU[Word]") n i $
    ST $ \s# ->
    case readWordArray# mba# i# s#      of {(# s2#, r# #) ->
    (# s2#, W# r# #)}

  {-# INLINE writeMBU #-}
  writeMBU (MBUArr n mba#) i@(I# i#) (W# e#) =
    checkCritical (here "writeMBU[Word]") n i $
    ST $ \s# ->
    case writeWordArray# mba# i# e# s#  of {s2#   ->
    (# s2#, () #)}

instance UAE Float where
  sizeBU (I# n#) _ = I# (fLOAT_SCALE n#)

  {-# INLINE indexBU #-}
  indexBU (BUArr (I# s#) n ba#) i@(I# i#) =
    check (here "indexBU[Float]") n i $
    case indexFloatArray# ba# (s# +# i#)         of {r# ->
    (F# r#)}

  {-# INLINE readMBU #-}
  readMBU (MBUArr n mba#) i@(I# i#) =
    check (here "readMBU[Float]") n i $
    ST $ \s# ->
    case readFloatArray# mba# i# s#      of {(# s2#, r# #) ->
    (# s2#, F# r# #)}

  {-# INLINE writeMBU #-}
  writeMBU (MBUArr n mba#) i@(I# i#) (F# e#) =
    checkCritical (here "writeMBU[Float]") n i $
    ST $ \s# ->
    case writeFloatArray# mba# i# e# s#  of {s2#   ->
    (# s2#, () #)}

instance UAE Double where
  sizeBU (I# n#) _ = I# (dOUBLE_SCALE n#)

  {-# INLINE indexBU #-}
  indexBU (BUArr (I# s#) n ba#) i@(I# i#) =
    check (here "indexBU[Double]") n i $
    case indexDoubleArray# ba# (s# +# i#)         of {r# ->
    (D# r#)}

  {-# INLINE readMBU #-}
  readMBU (MBUArr n mba#) i@(I# i#) =
    check (here "readMBU[Double]") n i $
    ST $ \s# ->
    case readDoubleArray# mba# i# s#      of {(# s2#, r# #) ->
    (# s2#, D# r# #)}

  {-# INLINE writeMBU #-}
  writeMBU (MBUArr n mba#) i@(I# i#) (D# e#) =
    checkCritical (here "writeMBU[Double]") n i $
    ST $ \s# ->
    case writeDoubleArray# mba# i# e# s#  of {s2#   ->
    (# s2#, () #)}

instance UAE Word8 where
  sizeBU (I# n#) _ = I# (wORD8_SCALE n#)

  {-# INLINE indexBU #-}
  indexBU (BUArr (I# s#) n ba#) i@(I# i#) =
    check (here "indexBU[Word8]") n i $
    case indexWord8Array# ba# (s# +# i#)         of {r# ->
    (W8# r#)}

  {-# INLINE readMBU #-}
  readMBU (MBUArr n mba#) i@(I# i#) =
    check (here "readMBU[Word8]") n i $
    ST $ \s# ->
    case readWord8Array# mba# i# s#      of {(# s2#, r# #) ->
    (# s2#, W8# r# #)}

  {-# INLINE writeMBU #-}
  writeMBU (MBUArr n mba#) i@(I# i#) (W8# e#) =
    checkCritical (here "writeMBU[Word8]") n i $
    ST $ \s# ->
    case writeWord8Array# mba# i# e# s#  of {s2#   ->
    (# s2#, () #)}

instance UAE Word16 where
  sizeBU (I# n#) _ = I# (wORD16_SCALE n#)

  {-# INLINE indexBU #-}
  indexBU (BUArr (I# s#) n ba#) i@(I# i#) =
    check (here "indexBU[Word16]") n i $
    case indexWord16Array# ba# (s# +# i#)         of {r# ->
    (W16# r#)}

  {-# INLINE readMBU #-}
  readMBU (MBUArr n mba#) i@(I# i#) =
    check (here "readMBU[Word16]") n i $
    ST $ \s# ->
    case readWord16Array# mba# i# s#      of {(# s2#, r# #) ->
    (# s2#, W16# r# #)}

  {-# INLINE writeMBU #-}
  writeMBU (MBUArr n mba#) i@(I# i#) (W16# e#) =
    checkCritical (here "writeMBU[Word16]") n i $
    ST $ \s# ->
    case writeWord16Array# mba# i# e# s#  of {s2#   ->
    (# s2#, () #)}

instance UAE Word32 where
  sizeBU (I# n#) _ = I# (wORD32_SCALE n#)

  {-# INLINE indexBU #-}
  indexBU (BUArr (I# s#) n ba#) i@(I# i#) =
    check (here "indexBU[Word32]") n i $
    case indexWord32Array# ba# (s# +# i#)         of {r# ->
    (W32# r#)}

  {-# INLINE readMBU #-}
  readMBU (MBUArr n mba#) i@(I# i#) =
    check (here "readMBU[Word32]") n i $
    ST $ \s# ->
    case readWord32Array# mba# i# s#      of {(# s2#, r# #) ->
    (# s2#, W32# r# #)}

  {-# INLINE writeMBU #-}
  writeMBU (MBUArr n mba#) i@(I# i#) (W32# e#) =
    checkCritical (here "writeMBU[Word32]") n i $
    ST $ \s# ->
    case writeWord32Array# mba# i# e# s#  of {s2#   ->
    (# s2#, () #)}

instance UAE Word64 where
  sizeBU (I# n#) _ = I# (wORD64_SCALE n#)

  {-# INLINE indexBU #-}
  indexBU (BUArr (I# s#) n ba#) i@(I# i#) =
    check (here "indexBU[Word64]") n i $
    case indexWord64Array# ba# (s# +# i#)         of {r# ->
    (W64# r#)}

  {-# INLINE readMBU #-}
  readMBU (MBUArr n mba#) i@(I# i#) =
    check (here "readMBU[Word64]") n i $
    ST $ \s# ->
    case readWord64Array# mba# i# s#      of {(# s2#, r# #) ->
    (# s2#, W64# r# #)}

  {-# INLINE writeMBU #-}
  writeMBU (MBUArr n mba#) i@(I# i#) (W64# e#) =
    checkCritical (here "writeMBU[Word64]") n i $
    ST $ \s# ->
    case writeWord64Array# mba# i# e# s#  of {s2#   ->
    (# s2#, () #)}

instance UAE Int8 where
  sizeBU (I# n#) _ = I# (iNT8_SCALE n#)

  {-# INLINE indexBU #-}
  indexBU (BUArr (I# s#) n ba#) i@(I# i#) =
    check (here "indexBU[Int8]") n i $
    case indexInt8Array# ba# (s# +# i#)         of {r# ->
    (I8# r#)}

  {-# INLINE readMBU #-}
  readMBU (MBUArr n mba#) i@(I# i#) =
    check (here "readMBU[Int8]") n i $
    ST $ \s# ->
    case readInt8Array# mba# i# s#      of {(# s2#, r# #) ->
    (# s2#, I8# r# #)}

  {-# INLINE writeMBU #-}
  writeMBU (MBUArr n mba#) i@(I# i#) (I8# e#) =
    checkCritical (here "writeMBU[Int8]") n i $
    ST $ \s# ->
    case writeInt8Array# mba# i# e# s#  of {s2#   ->
    (# s2#, () #)}

instance UAE Int16 where
  sizeBU (I# n#) _ = I# (iNT16_SCALE n#)

  {-# INLINE indexBU #-}
  indexBU (BUArr (I# s#) n ba#) i@(I# i#) =
    check (here "indexBU[Int16]") n i $
    case indexInt16Array# ba# (s# +# i#)         of {r# ->
    (I16# r#)}

  {-# INLINE readMBU #-}
  readMBU (MBUArr n mba#) i@(I# i#) =
    check (here "readMBU[Int16]") n i $
    ST $ \s# ->
    case readInt16Array# mba# i# s#      of {(# s2#, r# #) ->
    (# s2#, I16# r# #)}

  {-# INLINE writeMBU #-}
  writeMBU (MBUArr n mba#) i@(I# i#) (I16# e#) =
    checkCritical (here "writeMBU[Int16]") n i $
    ST $ \s# ->
    case writeInt16Array# mba# i# e# s#  of {s2#   ->
    (# s2#, () #)}

instance UAE Int32 where
  sizeBU (I# n#) _ = I# (iNT32_SCALE n#)

  {-# INLINE indexBU #-}
  indexBU (BUArr (I# s#) n ba#) i@(I# i#) =
    check (here "indexBU[Int32]") n i $
    case indexInt32Array# ba# (s# +# i#)         of {r# ->
    (I32# r#)}

  {-# INLINE readMBU #-}
  readMBU (MBUArr n mba#) i@(I# i#) =
    check (here "readMBU[Int32]") n i $
    ST $ \s# ->
    case readInt32Array# mba# i# s#      of {(# s2#, r# #) ->
    (# s2#, I32# r# #)}

  {-# INLINE writeMBU #-}
  writeMBU (MBUArr n mba#) i@(I# i#) (I32# e#) =
    checkCritical (here "writeMBU[Int32]") n i $
    ST $ \s# ->
    case writeInt32Array# mba# i# e# s#  of {s2#   ->
    (# s2#, () #)}

instance UAE Int64 where
  sizeBU (I# n#) _ = I# (iNT64_SCALE n#)

  {-# INLINE indexBU #-}
  indexBU (BUArr (I# s#) n ba#) i@(I# i#) =
    check (here "indexBU[Int64]") n i $
    case indexInt64Array# ba# (s# +# i#)         of {r# ->
    (I64# r#)}

  {-# INLINE readMBU #-}
  readMBU (MBUArr n mba#) i@(I# i#) =
    check (here "readMBU[Int64]") n i $
    ST $ \s# ->
    case readInt64Array# mba# i# s#      of {(# s2#, r# #) ->
    (# s2#, I64# r# #)}

  {-# INLINE writeMBU #-}
  writeMBU (MBUArr n mba#) i@(I# i#) (I64# e#) =
    checkCritical (here "writeMBU[Int64]") n i $
    ST $ \s# ->
    case writeInt64Array# mba# i# e# s#  of {s2#   ->
    (# s2#, () #)}

------------------------------------------------------------------------

-- |Stream of unboxed arrays
-- -------------------------

-- | Generate a stream from an array, from left to right
--
streamBU :: UAE e => BUArr e -> Stream e
{-# INLINE [1] streamBU #-}
streamBU arr = Stream next 0 (lengthBU arr)
  where
    n = lengthBU arr
    --
    next i | i == n    = Done
           | otherwise = Yield (arr `indexBU` i) (i+1)

-- | Construct an array from a stream, filling it from left to right
--
unstreamBU :: UAE e => Stream e -> BUArr e
{-# INLINE [1] unstreamBU #-}
unstreamBU (Stream next s n) =
  runST (do
    marr <- newMBU n
    n'   <- fill0 marr
    unsafeFreezeMBU marr n'
  )
  where
    fill0 marr = fill s 0
      where
        fill s i = i `seq`
                   case next s of
                     Done       -> return i
                     Skip s'    -> fill s' i
                     Yield x s' -> do
                                     writeMBU marr i x
                                     fill s' (i+1)

-- Fusion rules for unboxed arrays

{-# RULES  -- -} (for font-locking)

"streamBU/unstreamBU" forall s.
  streamBU (unstreamBU s) = s

 #-}


-- |Combinators for unboxed arrays
-- -

-- |Replicate combinator for unboxed arrays
--
replicateBU :: UAE e => Int -> e -> BUArr e
{-# INLINE replicateBU #-}
replicateBU n = unstreamBU . replicateS n


-- |Extract a slice from an array (given by its start index and length)
--
extractBU :: UAE e => BUArr e -> Int -> Int -> BUArr e
{-# INLINE extractBU #-}
extractBU arr i n = 
  runST (do
    ma <- newMBU n
    copy0 ma
    unsafeFreezeMBU ma n
  )
  where
    fence = n `min` (lengthBU arr - i)
    copy0 ma = copy 0
      where
        copy off | off == fence = return ()
                 | otherwise    = do
                                    writeMBU ma off (arr `indexBU` (i + off))
                                    copy (off + 1)
-- NB: If we had a bounded version of loopBU, we could express extractBU in
--     terms of that loop combinator.  The problem is that this makes fusion
--     more awkward; in particular, when the second loopBU in a
--     "loopBU/loopBU" situation has restricted bounds.  On the other hand
--     sometimes fusing the extraction of a slice with the following
--     computation on that slice is very useful.
-- FIXME: If we leave it as it, we should at least use a block copy operation.
--        (What we really want is to represent extractBU as a loop when we can
--        fuse it with a following loop on the computed slice and, otherwise,
--        when there is no opportunity for fusion, we want to use a block copy
--        routine.)
-- FIXME: The above comments no longer apply as we've switched to stream-based
--        fusion. Moreover, slicing gives us bounded iteration for free.

-- |Map a function over an unboxed array
--
mapBU :: (UAE a, UAE b) => (a -> b) -> BUArr a -> BUArr b
{-# INLINE mapBU #-}
mapBU f = unstreamBU . mapS f . streamBU

-- |Reduce an unboxed array
--
foldlBU :: UAE b => (a -> b -> a) -> a -> BUArr b -> a
{-# INLINE foldlBU #-}
foldlBU f z = foldS f z . streamBU

-- |Reduce an unboxed array using an *associative* combining operator
--
foldBU :: UAE a => (a -> a -> a) -> a -> BUArr a -> a
{-# INLINE foldBU #-}
foldBU = foldlBU

-- |Summation of an unboxed array
--
sumBU :: (UAE a, Num a) => BUArr a -> a
{-# INLINE sumBU #-}
sumBU = foldBU (+) 0

-- |Prefix reduction of an unboxed array
--
scanlBU :: (UAE a, UAE b) => (a -> b -> a) -> a -> BUArr b -> BUArr a
{-# INLINE scanBU #-}
scanlBU f z = unstreamBU . scanS f z . streamBU

-- |Prefix reduction of an unboxed array using an *associative* combining
-- operator
--
scanBU :: UAE a => (a -> a -> a) -> a -> BUArr a -> BUArr a
scanBU = scanlBU

-- |Extract a slice from a mutable array (the slice is immutable)
--
extractMBU :: UAE e => MBUArr s e -> Int -> Int -> ST s (BUArr e)
{-# INLINE extractMBU #-}
extractMBU arr i n = do
                       arr' <- unsafeFreezeMBU arr (i + n)
                       return $ extractBU arr' i n

-- |Copy a the contents of an immutable array into a mutable array from the
-- specified position on
--
copyMBU :: UAE e => MBUArr s e -> Int -> BUArr e -> ST s ()
{-# SPECIALIZE 
      copyMBU :: MBUArr s Int -> Int -> BUArr Int -> ST s () #-}
copyMBU marr i arr = ins i 0
  where
    n = lengthBU arr
    --
    ins i j | j == n    = return ()
            | otherwise = do
                            writeMBU marr i (arr `indexBU` j)
                            ins (i + 1) (j + 1)

-- Eq instance
--
instance (Eq e, UAE e) => Eq (BUArr e) where
  arr == brr = n == lengthBU brr && eq 0
    where
      n = lengthBU arr
      eq i | i == n    = True
           | otherwise = (arr `indexBU` i) == (brr `indexBU` i)
                         && eq (i+1)

-- Show instance
--
instance (Show e, UAE e) => Show (BUArr e) where
  showsPrec _ a =   showString "toBU "
                  . showList [a `indexBU` i | i <- [0..lengthBU a - 1]]

------------------------------------------------------------------------

-- Auxilliary functions
-- --------------------

-- |Convert a list to an array
--
toBU :: UAE e => [e] -> BUArr e
toBU = unstreamBU . toStream

-- |Convert an array to a list
--
fromBU :: UAE e => BUArr e -> [e]
fromBU a = map (a `indexBU`) [0 .. lengthBU a - 1]

------------------------------------------------------------------------
-- To and from ByteStrings

{-
toBS :: forall e . UAE e => BUArr e -> ByteString
toBS arr@(BUArr off len addr#) = unsafePerformIO  $ do
        p <- newForeignPtr_ (Ptr (unsafeCoerce# addr#))
        return $ PS p off_bytes len_bytes
    where
        len_bytes = sizeBU len (undefined :: e)
        off_bytes = sizeBU off (undefined :: e)
-}

-----------------------------------------------------------------------------
-- Translation between elements and bytes
-- Duplicated here from Data.Array.Base to avoid build dependency

cHAR_SCALE, wORD_SCALE, iINT_SCALE, dOUBLE_SCALE, fLOAT_SCALE :: Int# -> Int#
cHAR_SCALE   n# = scale# *# n# where I# scale# = SIZEOF_HSCHAR
iINT_SCALE   n# = scale# *# n# where I# scale# = SIZEOF_HSINT
wORD_SCALE   n# = scale# *# n# where I# scale# = SIZEOF_HSWORD
dOUBLE_SCALE n# = scale# *# n# where I# scale# = SIZEOF_HSDOUBLE
fLOAT_SCALE  n# = scale# *# n# where I# scale# = SIZEOF_HSFLOAT

wORD8_SCALE, wORD16_SCALE, wORD32_SCALE, wORD64_SCALE :: Int# -> Int#
wORD8_SCALE  n# = scale# *# n# where I# scale# = SIZEOF_WORD8
wORD16_SCALE n# = scale# *# n# where I# scale# = SIZEOF_WORD16
wORD32_SCALE n# = scale# *# n# where I# scale# = SIZEOF_WORD32
wORD64_SCALE n# = scale# *# n# where I# scale# = SIZEOF_WORD64

iNT8_SCALE, iNT16_SCALE, iNT32_SCALE, iNT64_SCALE :: Int# -> Int#
iNT8_SCALE  n# = scale# *# n# where I# scale# = SIZEOF_INT8
iNT16_SCALE n# = scale# *# n# where I# scale# = SIZEOF_INT16
iNT32_SCALE n# = scale# *# n# where I# scale# = SIZEOF_INT32
iNT64_SCALE n# = scale# *# n# where I# scale# = SIZEOF_INT64

bOOL_SCALE, bOOL_WORD_SCALE :: Int# -> Int#
bOOL_SCALE n# = (n# +# last#) `uncheckedIShiftRA#` 3#
  where I# last# = SIZEOF_HSWORD * 8 - 1
bOOL_WORD_SCALE n# = bOOL_INDEX (n# +# last#)
  where I# last# = SIZEOF_HSWORD * 8 - 1

bOOL_INDEX :: Int# -> Int#
#if SIZEOF_HSWORD == 4
bOOL_INDEX i# = i# `uncheckedIShiftRA#` 5#
#elif SIZEOF_HSWORD == 8
bOOL_INDEX i# = i# `uncheckedIShiftRA#` 6#
#endif

bOOL_BIT, bOOL_NOT_BIT :: Int# -> Word#
bOOL_BIT     n# = int2Word# 1# `uncheckedShiftL#` (word2Int# (int2Word# n# `and#` mask#))
  where W# mask# = SIZEOF_HSWORD * 8 - 1
bOOL_NOT_BIT n# = bOOL_BIT n# `xor#` mb# where W# mb# = maxBound