historical/toontown-classic.git/panda/include/dr_flac.h

// Public domain. See "unlicense" statement at the end of this file.
//NB: modified by rdb to use 16-bit instead of 32-bit samples.

// ABOUT
//
// This is a simple library for decoding FLAC files.
//
//
//
// USAGE
//
// This is a single-file library. To use it, do something like the following in one .c file.
//   #define DR_FLAC_IMPLEMENTATION
//   #include "dr_flac.h"
//
// You can then #include this file in other parts of the program as you would with any other header file. To decode audio data,
// do something like the following:
//
//     drflac* pFlac = drflac_open_file("MySong.flac");
//     if (pFlac == NULL) {
//         ... Failed to open FLAC file ...
//     }
//
//     int16_t* pSamples = malloc(pFlac->totalSampleCount * sizeof(int16_t));
//     uint64_t numberOfSamplesActuallyRead = drflac_read_s16(pFlac, pFlac->totalSampleCount, pSamples);
//
//     ... pSamples now contains the decoded samples as interleaved signed 16-bit PCM ...
//
// The drflac object represents the decoder. It is a transparent type so all the information you need, such as the number of
// channels and the bits per sample, should be directly accessible - just make sure you don't change their values.
//
// You do not need to decode the entire stream in one go - you just specify how many samples you'd like at any given time and
// the decoder will give you as many samples as it can, up to the amount requested. Later on when you need the next batch of
// samples, just call it again. Example:
//
//     while (drflac_read_s16(pFlac, chunkSize, pChunkSamples) > 0) {
//         do_something();
//     }
//
// You can seek to a specific sample with drflac_seek_to_sample(). The given sample is based on interleaving. So for example,
// if you were to seek to the sample at index 0 in a stereo stream, you'll be seeking to the first sample of the left channel.
// The sample at index 1 will be the first sample of the right channel. The sample at index 2 will be the second sample of the
// left channel, etc.
//
//
//
// OPTIONS
// #define these options before including this file.
//
// #define DR_FLAC_NO_STDIO
//   Disable drflac_open_file().
//
// #define DR_FLAC_NO_WIN32_IO
//   Don't use the Win32 API internally for drflac_open_file(). Setting this will force stdio FILE APIs instead. This is
//   mainly for testing, but it's left here in case somebody might find use for it. dr_flac will use the Win32 API by
//   default. Ignored when DR_FLAC_NO_STDIO is #defined.
//
// #define DR_FLAC_BUFFER_SIZE <number>
//   Defines the size of the internal buffer to store data from onRead(). This buffer is used to reduce the number of calls
//   back to the client for more data. Larger values means more memory, but better performance. My tests show diminishing
//   returns after about 4KB (which is the default). Consider reducing this if you have a very efficient implementation of
//   onRead(), or increase it if it's very inefficient.
//
//
//
// QUICK NOTES
//
// - Based on my own tests, the 32-bit build is about about 1.1x-1.25x slower than the reference implementation. The 64-bit
//   build is at about parity.
// - This should work fine with valid native FLAC files, but it won't work very well when the STREAMINFO block is unavailable
//   and when a stream starts in the middle of a frame. This is something I plan on addressing.
// - Audio data is retrieved as signed 16-bit PCM, regardless of the bits per sample the FLAC stream is encoded as.
// - This has not been tested on big-endian architectures.
// - Rice codes in unencoded binary form (see https://xiph.org/flac/format.html#rice_partition) has not been tested. If anybody
//   knows where I can find some test files for this, let me know.
// - Perverse and erroneous files have not been tested. Again, if you know where I can get some test files let me know.
// - dr_flac is not thread-safe, but it's APIs can be called from any thread so long as you do your own synchronization.
// - dr_flac does not currently do any CRC checks.
// - Ogg encapsulation is not supported, but I want to add it at some point.
//
//
//
// TODO
// - Implement a proper test suite.
// - Add support for initializing the decoder without a STREAMINFO block. Build a synthethic test to get support working at at least
//   a basic level.
// - Add support for retrieving metadata blocks so applications can retrieve the album art or whatnot.
// - Add support for Ogg encapsulation.

#ifndef dr_flac_h
#define dr_flac_h

#include <stddef.h>
#include <stdint.h>
//#include <stdbool.h>

// As data is read from the client it is placed into an internal buffer for fast access. This controls the
// size of that buffer. Larger values means more speed, but also more memory. In my testing there is diminishing
// returns after about 4KB, but you can fiddle with this to suit your own needs. Must be a multiple of 8.
#ifndef DR_FLAC_BUFFER_SIZE
#define DR_FLAC_BUFFER_SIZE   4096
#endif

// Check if we can enable 64-bit optimizations.
#if defined(_WIN64)
#define DRFLAC_64BIT
#endif

#if defined(__GNUC__)
#if defined(__x86_64__) || defined(__ppc64__)
#define DRFLAC_64BIT
#endif
#endif

#ifdef DRFLAC_64BIT
typedef uint64_t drflac_cache_t;
#else
typedef uint32_t drflac_cache_t;
#endif


// Callback for when data is read. Return value is the number of bytes actually read.
typedef size_t (* drflac_read_proc)(void* userData, void* bufferOut, size_t bytesToRead);

// Callback for when data needs to be seeked. Offset is always relative to the current position. Return value is false on failure, true success.
typedef bool (* drflac_seek_proc)(void* userData, int offset);


typedef struct
{
    // The absolute position of the first byte of the data of the block. This is just past the block's header.
    long long pos;

    // The size in bytes of the block's data.
    unsigned int sizeInBytes;

} drflac_block;

typedef struct
{
    // The type of the subframe: SUBFRAME_CONSTANT, SUBFRAME_VERBATIM, SUBFRAME_FIXED or SUBFRAME_LPC.
    unsigned char subframeType;

    // The number of wasted bits per sample as specified by the sub-frame header.
    unsigned char wastedBitsPerSample;

    // The order to use for the prediction stage for SUBFRAME_FIXED and SUBFRAME_LPC.
    unsigned char lpcOrder;

    // The number of bits per sample for this subframe. This is not always equal to the current frame's bit per sample because
    // an extra bit is required for side channels when interchannel decorrelation is being used.
    int bitsPerSample;

    // A pointer to the buffer containing the decoded samples in the subframe. This pointer is an offset from drflac::pHeap, or
    // NULL if the heap is not being used. Note that it's a signed 32-bit integer for each value.
    int32_t* pDecodedSamples;

} drflac_subframe;

typedef struct
{
    // If the stream uses variable block sizes, this will be set to the index of the first sample. If fixed block sizes are used, this will
    // always be set to 0.
    unsigned long long sampleNumber;

    // If the stream uses fixed block sizes, this will be set to the frame number. If variable block sizes are used, this will always be 0.
    unsigned int frameNumber;

    // The sample rate of this frame.
    unsigned int sampleRate;

    // The number of samples in each sub-frame within this frame.
    unsigned short blockSize;

    // The channel assignment of this frame. This is not always set to the channel count. If interchannel decorrelation is being used this
    // will be set to DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE, DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE or DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE.
    unsigned char channelAssignment;

    // The number of bits per sample within this frame.
    unsigned char bitsPerSample;

    // The frame's CRC. This is set, but unused at the moment.
    unsigned char crc8;

    // The number of samples left to be read in this frame. This is initially set to the block size multiplied by the channel count. As samples
    // are read, this will be decremented. When it reaches 0, the decoder will see this frame as fully consumed and load the next frame.
    unsigned int samplesRemaining;

    // The list of sub-frames within the frame. There is one sub-frame for each channel, and there's a maximum of 8 channels.
    drflac_subframe subframes[8];

} drflac_frame;

typedef struct
{
    // The function to call when more data needs to be read. This is set by drflac_open().
    drflac_read_proc onRead;

    // The function to call when the current read position needs to be moved.
    drflac_seek_proc onSeek;

    // The user data to pass around to onRead and onSeek.
    void* pUserData;


    // The sample rate. Will be set to something like 44100.
    unsigned int sampleRate;

    // The number of channels. This will be set to 1 for monaural streams, 2 for stereo, etc. Maximum 8. This is set based on the
    // value specified in the STREAMINFO block.
    unsigned char channels;

    // The bits per sample. Will be set to somthing like 16, 24, etc.
    unsigned char bitsPerSample;

    // The maximum block size, in samples. This number represents the number of samples in each channel (not combined).
    unsigned short maxBlockSize;

    // The total number of samples making up the stream. This includes every channel. For example, if the stream has 2 channels,
    // with each channel having a total of 4096, this value will be set to 2*4096 = 8192.
    uint64_t totalSampleCount;


    // The location and size of the APPLICATION block.
    drflac_block applicationBlock;

    // The location and size of the SEEKTABLE block.
    drflac_block seektableBlock;

    // The location and size of the VORBIS_COMMENT block.
    drflac_block vorbisCommentBlock;

    // The location and size of the CUESHEET block.
    drflac_block cuesheetBlock;

    // The location and size of the PICTURE block.
    drflac_block pictureBlock;


    // Information about the frame the decoder is currently sitting on.
    drflac_frame currentFrame;

    // The position of the first frame in the stream. This is only ever used for seeking.
    unsigned long long firstFramePos;


    // The current byte position in the client's data stream.
    uint64_t currentBytePos;

    // The index of the next valid cache line in the "L2" cache.
    size_t nextL2Line;

    // The number of bits that have been consumed by the cache. This is used to determine how many valid bits are remaining.
    size_t consumedBits;

    // Unused L2 lines. This will always be 0 until the end of the stream is hit. Used for correctly calculating the current byte
    // position of the read pointer in the stream.
    size_t unusedL2Lines;

    // The cached data which was most recently read from the client. When data is read from the client, it is placed within this
    // variable. As data is read, it's bit-shifted such that the next valid bit is sitting on the most significant bit.
    drflac_cache_t cache;
    drflac_cache_t cacheL2[DR_FLAC_BUFFER_SIZE/sizeof(drflac_cache_t)];


    // A pointer to the decoded sample data. This is an offset of pExtraData.
    int32_t* pDecodedSamples;

    // Variable length extra data. We attach this to the end of the object so we avoid unnecessary mallocs.
    char pExtraData[1];

} drflac;


// Opens a FLAC decoder.
//
// This is the lowest level function for opening a FLAC stream. You can also use drflac_open_file() and drflac_open_memory()
// to open the stream from a file or from a block of memory respectively.
//
// At the moment the STREAMINFO block must be present for this to succeed.
//
// The onRead and onSeek callbacks are used to read and seek data provided by the client.
static drflac* drflac_open(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData);

// Closes the given FLAC decoder.
static void drflac_close(drflac* pFlac);

// Reads sample data from the given FLAC decoder, output as interleaved signed 16-bit PCM.
//
// Returns the number of samples actually read.
static uint64_t drflac_read_s16(drflac* pFlac, uint64_t samplesToRead, int16_t* pBufferOut);

// Seeks to the sample at the given index.
static bool drflac_seek_to_sample(drflac* pFlac, uint64_t sampleIndex);


#ifndef DR_FLAC_NO_STDIO
// Opens a flac decoder from the file at the given path.
static drflac* drflac_open_file(const char* pFile);
#endif

// Helper for opening a file from a pre-allocated memory buffer.
//
// This does not create a copy of the data. It is up to the application to ensure the buffer remains valid for
// the lifetime of the decoder.
static drflac* drflac_open_memory(const void* data, size_t dataSize);

#endif  //dr_flac_h


///////////////////////////////////////////////////////////////////////////////
//
// IMPLEMENTATION
//
///////////////////////////////////////////////////////////////////////////////
#ifdef DR_FLAC_IMPLEMENTATION
#include <stdlib.h>
#include <string.h>
#include <assert.h>

#ifdef _MSC_VER
#include <intrin.h>     // For _byteswap_ulong and _byteswap_uint64
#endif

#ifdef __linux__
#ifndef _BSD_SOURCE
#define _BSD_SOURCE
#endif
#include <endian.h>
#endif

#define DRFLAC_INLINE ALWAYS_INLINE

#define DRFLAC_BLOCK_TYPE_STREAMINFO                    0
#define DRFLAC_BLOCK_TYPE_PADDING                       1
#define DRFLAC_BLOCK_TYPE_APPLICATION                   2
#define DRFLAC_BLOCK_TYPE_SEEKTABLE                     3
#define DRFLAC_BLOCK_TYPE_VORBIS_COMMENT                4
#define DRFLAC_BLOCK_TYPE_CUESHEET                      5
#define DRFLAC_BLOCK_TYPE_PICTURE                       6
#define DRFLAC_BLOCK_TYPE_INVALID                       127

#define DRFLAC_SUBFRAME_CONSTANT                        0
#define DRFLAC_SUBFRAME_VERBATIM                        1
#define DRFLAC_SUBFRAME_FIXED                           8
#define DRFLAC_SUBFRAME_LPC                             32
#define DRFLAC_SUBFRAME_RESERVED                        255

#define DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE  0
#define DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2 1

#define DRFLAC_CHANNEL_ASSIGNMENT_INDEPENDENT           0
#define DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE             8
#define DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE            9
#define DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE              10

typedef struct
{
    uint64_t firstSample;
    uint64_t frameOffset;   // The offset from the first byte of the header of the first frame.
    uint16_t sampleCount;
} drflac_seekpoint;

#ifndef DR_FLAC_NO_STDIO
#if defined(DR_FLAC_NO_WIN32_IO) || !defined(_WIN32)
#include <stdio.h>

static size_t drflac__on_read_stdio(void* pUserData, void* bufferOut, size_t bytesToRead)
{
    return fread(bufferOut, 1, bytesToRead, (FILE*)pUserData);
}

static bool drflac__on_seek_stdio(void* pUserData, int offset)
{
    return fseek((FILE*)pUserData, offset, SEEK_CUR) == 0;
}

drflac* drflac_open_file(const char* filename)
{
    FILE* pFile;
#ifdef _MSC_VER
    if (fopen_s(&pFile, filename, "rb") != 0) {
        return NULL;
    }
#else
    pFile = fopen(filename, "rb");
    if (pFile == NULL) {
        return NULL;
    }
#endif

    return drflac_open(drflac__on_read_stdio, drflac__on_seek_stdio, pFile);
}
#else
#include <windows.h>

static size_t drflac__on_read_stdio(void* pUserData, void* bufferOut, size_t bytesToRead)
{
    assert(bytesToRead < 0xFFFFFFFF);   // dr_flac will never request huge amounts of data at a time. This is a safe assertion.

    DWORD bytesRead;
    ReadFile((HANDLE)pUserData, bufferOut, (DWORD)bytesToRead, &bytesRead, NULL);

    return (size_t)bytesRead;
}

static bool drflac__on_seek_stdio(void* pUserData, int offset)
{
    return SetFilePointer((HANDLE)pUserData, offset, NULL, FILE_CURRENT) != INVALID_SET_FILE_POINTER;
}

static drflac* drflac_open_file(const char* filename)
{
    HANDLE hFile = CreateFileA(filename, FILE_GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
    if (hFile == INVALID_HANDLE_VALUE) {
        return NULL;
    }

    return drflac_open(drflac__on_read_stdio, drflac__on_seek_stdio, (void*)hFile);
}
#endif
#endif  //DR_FLAC_NO_STDIO


typedef struct
{
    /// A pointer to the beginning of the data. We use a char as the type here for easy offsetting.
    const unsigned char* data;

    /// The size of the data.
    size_t dataSize;

    /// The position we're currently sitting at.
    size_t currentReadPos;

} drflac_memory;

static size_t drflac__on_read_memory(void* pUserData, void* bufferOut, size_t bytesToRead)
{
    drflac_memory* memory = (drflac_memory*)pUserData;
    assert(memory != NULL);
    assert(memory->dataSize >= memory->currentReadPos);

    size_t bytesRemaining = memory->dataSize - memory->currentReadPos;
    if (bytesToRead > bytesRemaining) {
        bytesToRead = bytesRemaining;
    }

    if (bytesToRead > 0) {
        memcpy(bufferOut, memory->data + memory->currentReadPos, bytesToRead);
        memory->currentReadPos += bytesToRead;
    }

    return bytesToRead;
}

static bool drflac__on_seek_memory(void* pUserData, int offset)
{
    drflac_memory* memory = (drflac_memory*)pUserData;
    assert(memory != NULL);

    if (offset > 0) {
        if (memory->currentReadPos + offset > memory->dataSize) {
            offset = (int)(memory->dataSize - memory->currentReadPos);     // Trying to seek too far forward.
        }
    } else {
        if (memory->currentReadPos < (size_t)-offset) {
            offset = -(int)memory->currentReadPos;                  // Trying to seek too far backwards.
        }
    }

    // This will never underflow thanks to the clamps above.
    memory->currentReadPos += offset;

    return 1;
}

static drflac* drflac_open_memory(const void* data, size_t dataSize)
{
    drflac_memory* pUserData = (drflac_memory*)malloc(sizeof(*pUserData));
    if (pUserData == NULL) {
        return NULL;
    }

    pUserData->data = (const unsigned char*)data;
    pUserData->dataSize = dataSize;
    pUserData->currentReadPos = 0;
    return drflac_open(drflac__on_read_memory, drflac__on_seek_memory, pUserData);
}


//// Endian Management ////
static DRFLAC_INLINE bool drflac__is_little_endian()
{
    int n = 1;
    return (*(char*)&n) == 1;
}

static DRFLAC_INLINE uint32_t drflac__swap_endian_uint32(uint32_t n)
{
#ifdef _MSC_VER
    return _byteswap_ulong(n);
#elif defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
    return __builtin_bswap32(n);
#else
    return ((n & 0xFF000000) >> 24) |
           ((n & 0x00FF0000) >>  8) |
           ((n & 0x0000FF00) <<  8) |
           ((n & 0x000000FF) << 24);
#endif
}

static DRFLAC_INLINE uint64_t drflac__swap_endian_uint64(uint64_t n)
{
#ifdef _MSC_VER
    return _byteswap_uint64(n);
#elif defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
    return __builtin_bswap64(n);
#else
    return ((n & 0xFF00000000000000ULL) >> 56) |
           ((n & 0x00FF000000000000ULL) >> 40) |
           ((n & 0x0000FF0000000000ULL) >> 24) |
           ((n & 0x000000FF00000000ULL) >>  8) |
           ((n & 0x00000000FF000000ULL) <<  8) |
           ((n & 0x0000000000FF0000ULL) << 24) |
           ((n & 0x000000000000FF00ULL) << 40) |
           ((n & 0x00000000000000FFULL) << 56);
#endif
}


static DRFLAC_INLINE uint32_t drflac__be2host_32(uint32_t n)
{
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER == __ORDER_LITTLE_ENDIAN__)
    return drflac__swap_endian_uint32(n);
#elif defined(__linux__)
    return be32toh(n);
#else
    if (drflac__is_little_endian()) {
        return drflac__swap_endian_uint32(n);
    }

    return n;
#endif
}

static DRFLAC_INLINE uint64_t drflac__be2host_64(uint64_t n)
{
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER == __ORDER_LITTLE_ENDIAN__)
    return drflac__swap_endian_uint64(n);
#elif defined(__linux__)
    return be64toh(n);
#else
    if (drflac__is_little_endian()) {
        return drflac__swap_endian_uint64(n);
    }

    return n;
#endif
}

#ifdef DRFLAC_64BIT
#define drflac__be2host__cache_line drflac__be2host_64
#else
#define drflac__be2host__cache_line drflac__be2host_32
#endif


// BIT READING ATTEMPT #2
//
// This uses a 32- or 64-bit bit-shifted cache - as bits are read, the cache is shifted such that the first valid bit is sitting
// on the most significant bit. It uses the notion of an L1 and L2 cache (borrowed from CPU architecture), where the L1 cache
// is a 32- or 64-bit unsigned integer (depending on whether or not a 32- or 64-bit build is being compiled) and the L2 is an
// array of "cache lines", with each cache line being the same size as the L1. The L2 is a buffer of about 4KB and is where data
// from onRead() is read into.
#define DRFLAC_CACHE_L1_SIZE_BYTES                  (sizeof(pFlac->cache))
#define DRFLAC_CACHE_L1_SIZE_BITS                   (sizeof(pFlac->cache)*8)
#define DRFLAC_CACHE_L1_BITS_REMAINING              (DRFLAC_CACHE_L1_SIZE_BITS - (pFlac->consumedBits))
#ifdef DRFLAC_64BIT
#define DRFLAC_CACHE_L1_SELECTION_MASK(_bitCount)   (~(((uint64_t)-1LL) >> (_bitCount)))
#else
#define DRFLAC_CACHE_L1_SELECTION_MASK(_bitCount)   (~(((uint32_t)-1) >> (_bitCount)))
#endif
#define DRFLAC_CACHE_L1_SELECTION_SHIFT(_bitCount)  (DRFLAC_CACHE_L1_SIZE_BITS - (_bitCount))
#define DRFLAC_CACHE_L1_SELECT(_bitCount)           ((pFlac->cache) & DRFLAC_CACHE_L1_SELECTION_MASK(_bitCount))
#define DRFLAC_CACHE_L1_SELECT_AND_SHIFT(_bitCount) (DRFLAC_CACHE_L1_SELECT(_bitCount) >> DRFLAC_CACHE_L1_SELECTION_SHIFT(_bitCount))
#define DRFLAC_CACHE_L2_SIZE_BYTES                  (sizeof(pFlac->cacheL2))
#define DRFLAC_CACHE_L2_LINE_COUNT                  (DRFLAC_CACHE_L2_SIZE_BYTES / sizeof(pFlac->cacheL2[0]))
#define DRFLAC_CACHE_L2_LINES_REMAINING             (DRFLAC_CACHE_L2_LINE_COUNT - pFlac->nextL2Line)

static DRFLAC_INLINE bool drflac__reload_l1_cache_from_l2(drflac* pFlac)
{
    // Fast path. Try loading straight from L2.
    if (pFlac->nextL2Line < DRFLAC_CACHE_L2_LINE_COUNT) {
        pFlac->cache = pFlac->cacheL2[pFlac->nextL2Line++];
        return true;
    }

    // If we get here it means we've run out of data in the L2 cache. We'll need to fetch more from the client.
    size_t bytesRead = pFlac->onRead(pFlac->pUserData, pFlac->cacheL2, DRFLAC_CACHE_L2_SIZE_BYTES);
    pFlac->currentBytePos += bytesRead;

    pFlac->nextL2Line = 0;
    if (bytesRead == DRFLAC_CACHE_L2_SIZE_BYTES) {
        pFlac->cache = pFlac->cacheL2[pFlac->nextL2Line++];
        return true;
    }


    // If we get here it means we were unable to retrieve enough data to fill the entire L2 cache. It probably
    // means we've just reached the end of the file. We need to move the valid data down to the end of the buffer
    // and adjust the index of the next line accordingly. Also keep in mind that the L2 cache must be aligned to
    // the size of the L1 so we'll need to seek backwards by any misaligned bytes.
    size_t alignedL1LineCount = bytesRead / DRFLAC_CACHE_L1_SIZE_BYTES;
    if (alignedL1LineCount > 0)
    {
        size_t offset = DRFLAC_CACHE_L2_LINE_COUNT - alignedL1LineCount;
        for (size_t i = alignedL1LineCount; i > 0; --i) {
            pFlac->cacheL2[i-1 + offset] = pFlac->cacheL2[i-1];
        }

        pFlac->nextL2Line = offset;
        pFlac->unusedL2Lines = offset;

        // At this point there may be some leftover unaligned bytes. We need to seek backwards so we don't lose
        // those bytes.
        size_t unalignedBytes = bytesRead - (alignedL1LineCount * DRFLAC_CACHE_L1_SIZE_BYTES);
        if (unalignedBytes > 0) {
            pFlac->onSeek(pFlac->pUserData, -(int)unalignedBytes);
            pFlac->currentBytePos -= unalignedBytes;
        }

        pFlac->cache = pFlac->cacheL2[pFlac->nextL2Line++];
        return true;
    }
    else
    {
        // If we get into this branch it means we weren't able to load any L1-aligned data. We just need to seek
        // backwards by the leftover bytes and return false.
        if (bytesRead > 0) {
            pFlac->onSeek(pFlac->pUserData, -(int)bytesRead);
            pFlac->currentBytePos -= bytesRead;
        }

        pFlac->nextL2Line = DRFLAC_CACHE_L2_LINE_COUNT;
        return false;
    }
}

static bool drflac__reload_cache(drflac* pFlac)
{
    // Fast path. Try just moving the next value in the L2 cache to the L1 cache.
    if (drflac__reload_l1_cache_from_l2(pFlac)) {
        pFlac->cache = drflac__be2host__cache_line(pFlac->cache);
        pFlac->consumedBits = 0;
        return true;
    }

    // Slow path.

    // If we get here it means we have failed to load the L1 cache from the L2. Likely we've just reached the end of the stream and the last
    // few bytes did not meet the alignment requirements for the L2 cache. In this case we need to fall back to a slower path and read the
    // data straight from the client into the L1 cache. This should only really happen once per stream so efficiency is not important.
    size_t bytesRead = pFlac->onRead(pFlac->pUserData, &pFlac->cache, DRFLAC_CACHE_L1_SIZE_BYTES);
    if (bytesRead == 0) {
        return false;
    }

    pFlac->currentBytePos += bytesRead;

    assert(bytesRead < DRFLAC_CACHE_L1_SIZE_BYTES);
    pFlac->consumedBits = (DRFLAC_CACHE_L1_SIZE_BYTES - bytesRead) * 8;

    pFlac->cache = drflac__be2host__cache_line(pFlac->cache);
    pFlac->cache &= DRFLAC_CACHE_L1_SELECTION_MASK(DRFLAC_CACHE_L1_SIZE_BITS - pFlac->consumedBits);    // <-- Make sure the consumed bits are always set to zero. Other parts of the library depend on this property.
    return true;
}

static bool drflac__seek_bits(drflac* pFlac, size_t bitsToSeek)
{
    if (bitsToSeek <= DRFLAC_CACHE_L1_BITS_REMAINING) {
        pFlac->consumedBits += bitsToSeek;
        pFlac->cache <<= bitsToSeek;
        return true;
    } else {
        // It straddles the cached data. This function isn't called too frequently so I'm favouring simplicity here.
        bitsToSeek -= DRFLAC_CACHE_L1_BITS_REMAINING;
        pFlac->consumedBits += DRFLAC_CACHE_L1_BITS_REMAINING;
        pFlac->cache = 0;

        size_t wholeBytesRemaining = bitsToSeek/8;
        if (wholeBytesRemaining > 0)
        {
            // The next bytes to seek will be located in the L2 cache. The problem is that the L2 cache is not byte aligned,
            // but rather DRFLAC_CACHE_L1_SIZE_BYTES aligned (usually 4 or 8). If, for example, the number of bytes to seek is
            // 3, we'll need to handle it in a special way.
            size_t wholeCacheLinesRemaining = wholeBytesRemaining / DRFLAC_CACHE_L1_SIZE_BYTES;
            if (wholeCacheLinesRemaining < DRFLAC_CACHE_L2_LINES_REMAINING)
            {
                wholeBytesRemaining -= wholeCacheLinesRemaining * DRFLAC_CACHE_L1_SIZE_BYTES;
                bitsToSeek -= wholeCacheLinesRemaining * DRFLAC_CACHE_L1_SIZE_BITS;
                pFlac->nextL2Line += wholeCacheLinesRemaining;
            }
            else
            {
                wholeBytesRemaining -= DRFLAC_CACHE_L2_LINES_REMAINING * DRFLAC_CACHE_L1_SIZE_BYTES;
                bitsToSeek -= DRFLAC_CACHE_L2_LINES_REMAINING * DRFLAC_CACHE_L1_SIZE_BITS;
                pFlac->nextL2Line += DRFLAC_CACHE_L2_LINES_REMAINING;

                pFlac->onSeek(pFlac->pUserData, (int)wholeBytesRemaining);
                pFlac->currentBytePos += wholeBytesRemaining;
                bitsToSeek -= wholeBytesRemaining*8;
            }
        }


        if (bitsToSeek > 0) {
            if (!drflac__reload_cache(pFlac)) {
                return false;
            }

            return drflac__seek_bits(pFlac, bitsToSeek);
        }

        return true;
    }
}

static bool drflac__read_uint32(drflac* pFlac, unsigned int bitCount, uint32_t* pResultOut)
{
    assert(pFlac != NULL);
    assert(pResultOut != NULL);
    assert(bitCount > 0);
    assert(bitCount <= 32);

    if (pFlac->consumedBits == DRFLAC_CACHE_L1_SIZE_BITS) {
        if (!drflac__reload_cache(pFlac)) {
            return false;
        }
    }

    if (bitCount <= DRFLAC_CACHE_L1_BITS_REMAINING) {
        if (bitCount < DRFLAC_CACHE_L1_SIZE_BITS) {
            *pResultOut = DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bitCount);
            pFlac->consumedBits += bitCount;
            pFlac->cache <<= bitCount;
        } else {
            *pResultOut = (uint32_t)pFlac->cache;
            pFlac->consumedBits = DRFLAC_CACHE_L1_SIZE_BITS;
            pFlac->cache = 0;
        }
        return true;
    } else {
        // It straddles the cached data. It will never cover more than the next chunk. We just read the number in two parts and combine them.
        size_t bitCountHi = DRFLAC_CACHE_L1_BITS_REMAINING;
        size_t bitCountLo = bitCount - bitCountHi;
        uint32_t resultHi = DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bitCountHi);

        if (!drflac__reload_cache(pFlac)) {
            return false;
        }

        *pResultOut = (resultHi << bitCountLo) | DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bitCountLo);
        pFlac->consumedBits += bitCountLo;
        pFlac->cache <<= bitCountLo;
        return true;
    }
}

static bool drflac__read_int32(drflac* pFlac, unsigned int bitCount, int32_t* pResult)
{
    assert(pFlac != NULL);
    assert(pResult != NULL);
    assert(bitCount > 0);
    assert(bitCount <= 32);

    uint32_t result;
    if (!drflac__read_uint32(pFlac, bitCount, &result)) {
        return false;
    }

    if ((result & (1 << (bitCount - 1)))) {  // TODO: See if we can get rid of this branch.
        result |= (-1 << bitCount);
    }

    *pResult = (int32_t)result;
    return true;
}

static bool drflac__read_uint64(drflac* pFlac, unsigned int bitCount, uint64_t* pResultOut)
{
    assert(bitCount <= 64);
    assert(bitCount >  32);

    uint32_t resultHi;
    if (!drflac__read_uint32(pFlac, bitCount - 32, &resultHi)) {
        return false;
    }

    uint32_t resultLo;
    if (!drflac__read_uint32(pFlac, 32, &resultLo)) {
        return false;
    }

    *pResultOut = (((uint64_t)resultHi) << 32) | ((uint64_t)resultLo);
    return true;
}

static bool drflac__read_int64(drflac* pFlac, unsigned int bitCount, int64_t* pResultOut)
{
    assert(bitCount <= 64);

    uint64_t result;
    if (!drflac__read_uint64(pFlac, bitCount, &result)) {
        return false;
    }

    if ((result & (1ULL << (bitCount - 1)))) {  // TODO: See if we can get rid of this branch.
        result |= (-1LL << bitCount);
    }

    *pResultOut = (int64_t)result;
    return true;
}

static bool drflac__read_uint16(drflac* pFlac, unsigned int bitCount, uint16_t* pResult)
{
    assert(pFlac != NULL);
    assert(pResult != NULL);
    assert(bitCount > 0);
    assert(bitCount <= 16);

    uint32_t result;
    if (!drflac__read_uint32(pFlac, bitCount, &result)) {
        return false;
    }

    *pResult = (uint16_t)result;
    return true;
}

static bool drflac__read_int16(drflac* pFlac, unsigned int bitCount, int16_t* pResult)
{
    assert(pFlac != NULL);
    assert(pResult != NULL);
    assert(bitCount > 0);
    assert(bitCount <= 16);

    int32_t result;
    if (!drflac__read_int32(pFlac, bitCount, &result)) {
        return false;
    }

    *pResult = (int16_t)result;
    return true;
}

static bool drflac__read_uint8(drflac* pFlac, unsigned int bitCount, uint8_t* pResult)
{
    assert(pFlac != NULL);
    assert(pResult != NULL);
    assert(bitCount > 0);
    assert(bitCount <= 8);

    uint32_t result;
    if (!drflac__read_uint32(pFlac, bitCount, &result)) {
        return false;
    }

    *pResult = (uint8_t)result;
    return true;
}

static bool drflac__read_int8(drflac* pFlac, unsigned int bitCount, int8_t* pResult)
{
    assert(pFlac != NULL);
    assert(pResult != NULL);
    assert(bitCount > 0);
    assert(bitCount <= 8);

    int32_t result;
    if (!drflac__read_int32(pFlac, bitCount, &result)) {
        return false;
    }

    *pResult = (int8_t)result;
    return true;
}


static inline bool drflac__seek_past_next_set_bit(drflac* pFlac, unsigned int* pOffsetOut)
{
    unsigned int zeroCounter = 0;
    while (pFlac->cache == 0) {
        zeroCounter += (unsigned int)DRFLAC_CACHE_L1_BITS_REMAINING;
        if (!drflac__reload_cache(pFlac)) {
            return false;
        }
    }

    // At this point the cache should not be zero, in which case we know the first set bit should be somewhere in here. There is
    // no need for us to perform any cache reloading logic here which should make things much faster.
    assert(pFlac->cache != 0);

    unsigned int bitOffsetTable[] = {
        0,
        4,
        3, 3,
        2, 2, 2, 2,
        1, 1, 1, 1, 1, 1, 1, 1
    };

    unsigned int setBitOffsetPlus1 = bitOffsetTable[DRFLAC_CACHE_L1_SELECT_AND_SHIFT(4)];
    if (setBitOffsetPlus1 == 0) {
        if (pFlac->cache == 1) {
            setBitOffsetPlus1 = DRFLAC_CACHE_L1_SIZE_BITS;
        } else {
            setBitOffsetPlus1 = 5;
            for (;;)
            {
                if ((pFlac->cache & DRFLAC_CACHE_L1_SELECT(setBitOffsetPlus1))) {
                    break;
                }

                setBitOffsetPlus1 += 1;
            }
        }
    }

    pFlac->consumedBits += setBitOffsetPlus1;
    pFlac->cache <<= setBitOffsetPlus1;

    *pOffsetOut = zeroCounter + setBitOffsetPlus1 - 1;
    return true;
}


static bool drflac__seek_to_byte(drflac* pFlac, long long offsetFromStart)
{
    assert(pFlac != NULL);

    long long bytesToMove = offsetFromStart - pFlac->currentBytePos;
    if (bytesToMove == 0) {
        return 1;
    }

    if (bytesToMove > 0x7FFFFFFF) {
        while (bytesToMove > 0x7FFFFFFF) {
            if (!pFlac->onSeek(pFlac->pUserData, 0x7FFFFFFF)) {
                return 0;
            }

            pFlac->currentBytePos += 0x7FFFFFFF;
            bytesToMove -= 0x7FFFFFFF;
        }
    } else {
        while (bytesToMove < (int)0x80000000) {
            if (!pFlac->onSeek(pFlac->pUserData, (int)0x80000000)) {
                return 0;
            }

            pFlac->currentBytePos += (int)0x80000000;
            bytesToMove -= (int)0x80000000;
        }
    }

    assert(bytesToMove <= 0x7FFFFFFF && bytesToMove >= (int)0x80000000);

    bool result = pFlac->onSeek(pFlac->pUserData, (int)bytesToMove);    // <-- Safe cast as per the assert above.
    pFlac->currentBytePos += (int)bytesToMove;

    pFlac->consumedBits = DRFLAC_CACHE_L1_SIZE_BITS;
    pFlac->cache = 0;
    pFlac->nextL2Line = DRFLAC_CACHE_L2_LINE_COUNT; // <-- This clears the L2 cache.

    return result;
}

static long long drflac__tell(drflac* pFlac)
{
    assert(pFlac != NULL);

    size_t unreadBytesFromL1 = (DRFLAC_CACHE_L1_SIZE_BYTES - (pFlac->consumedBits/8));
    size_t unreadBytesFromL2 = (DRFLAC_CACHE_L2_SIZE_BYTES - ((pFlac->nextL2Line - pFlac->unusedL2Lines)*DRFLAC_CACHE_L1_SIZE_BYTES));

    return pFlac->currentBytePos - unreadBytesFromL1 - unreadBytesFromL2;
}


static bool drflac__read_utf8_coded_number(drflac* pFlac, unsigned long long* pNumberOut)
{
    assert(pFlac != NULL);
    assert(pNumberOut != NULL);

    // We should never need to read UTF-8 data while not being aligned to a byte boundary. Therefore we can grab the data
    // directly from the input stream rather than using drflac__read_uint8().
    assert((pFlac->consumedBits & 7) == 0);

    unsigned char utf8[7] = {0};
    if (!drflac__read_uint8(pFlac, 8, utf8)) {
        *pNumberOut = 0;
        return false;
    }

    if ((utf8[0] & 0x80) == 0) {
        *pNumberOut = utf8[0];
        return true;
    }

    int byteCount = 1;
    if ((utf8[0] & 0xE0) == 0xC0) {
        byteCount = 2;
    } else if ((utf8[0] & 0xF0) == 0xE0) {
        byteCount = 3;
    } else if ((utf8[0] & 0xF8) == 0xF0) {
        byteCount = 4;
    } else if ((utf8[0] & 0xFC) == 0xF8) {
        byteCount = 5;
    } else if ((utf8[0] & 0xFE) == 0xFC) {
        byteCount = 6;
    } else if ((utf8[0] & 0xFF) == 0xFE) {
        byteCount = 7;
    } else {
        *pNumberOut = 0;
        return false;     // Bad UTF-8 encoding.
    }

    // Read extra bytes.
    assert(byteCount > 1);

    unsigned long long result = ((long long)(utf8[0] & (0xFF >> (byteCount + 1))));
    for (int i = 1; i < byteCount; ++i) {
        if (!drflac__read_uint8(pFlac, 8, utf8 + i)) {
            *pNumberOut = 0;
            return false;
        }

        result = (result << 6) | (utf8[i] & 0x3F);
    }

    *pNumberOut = result;
    return true;
}


static DRFLAC_INLINE bool drflac__read_and_seek_rice(drflac* pFlac, unsigned char m)
{
    unsigned int unused;
    if (!drflac__seek_past_next_set_bit(pFlac, &unused)) {
        return false;
    }

    if (m > 0) {
        if (!drflac__seek_bits(pFlac, m)) {
            return false;
        }
    }

    return true;
}


// The next two functions are responsible for calculating the prediction.
//
// When the bits per sample is >16 we need to use 64-bit integer arithmetic because otherwise we'll run out of precision. It's
// safe to assume this will be slower on 32-bit platforms so we use a more optimal solution when the bits per sample is <=16.
//
//
// Optimization Experiment #1
//
// The first optimization experiment I'm trying here is a loop unroll for the most common LPC orders. I've done a little test
// and the results are as follows, in order of most common:
// 1)  order = 8  : 93.1M
// 2)  order = 7  : 36.6M
// 3)  order = 3  : 33.2M
// 4)  order = 6  : 20.9M
// 5)  order = 5  : 18.1M
// 6)  order = 4  : 15.8M
// 7)  order = 12 : 10.8M
// 8)  order = 2  :  9.8M
// 9)  order = 1  :  1.6M
// 10) order = 10 :  1.0M
// 11) order = 9  :  0.8M
// 12) order = 11 :  0.8M
//
// We'll experiment with unrolling the top 8 most common ones. We'll ignore the least common ones since there seems to be a
// large drop off there.
//
// Result: There's a tiny improvement in some cases, but it could just be within margin of error so unsure if it's worthwhile
// just yet.
static DRFLAC_INLINE int32_t drflac__calculate_prediction_32(unsigned int order, int shift, const short* coefficients, int32_t* pDecodedSamples)
{
    assert(order <= 32);

    // 32-bit version.

    // This method is slower on both 32- and 64-bit builds with VC++. Leaving this here for now just in case we need it later
    // for whatever reason.
#if 0
    int prediction;
    if (order == 8)
    {
        prediction  = coefficients[0] * pDecodedSamples[-1];
        prediction += coefficients[1] * pDecodedSamples[-2];
        prediction += coefficients[2] * pDecodedSamples[-3];
        prediction += coefficients[3] * pDecodedSamples[-4];
        prediction += coefficients[4] * pDecodedSamples[-5];
        prediction += coefficients[5] * pDecodedSamples[-6];
        prediction += coefficients[6] * pDecodedSamples[-7];
        prediction += coefficients[7] * pDecodedSamples[-8];
    }
    else if (order == 7)
    {
        prediction  = coefficients[0] * pDecodedSamples[-1];
        prediction += coefficients[1] * pDecodedSamples[-2];
        prediction += coefficients[2] * pDecodedSamples[-3];
        prediction += coefficients[3] * pDecodedSamples[-4];
        prediction += coefficients[4] * pDecodedSamples[-5];
        prediction += coefficients[5] * pDecodedSamples[-6];
        prediction += coefficients[6] * pDecodedSamples[-7];
    }
    else if (order == 3)
    {
        prediction  = coefficients[0] * pDecodedSamples[-1];
        prediction += coefficients[1] * pDecodedSamples[-2];
        prediction += coefficients[2] * pDecodedSamples[-3];
    }
    else if (order == 6)
    {
        prediction  = coefficients[0] * pDecodedSamples[-1];
        prediction += coefficients[1] * pDecodedSamples[-2];
        prediction += coefficients[2] * pDecodedSamples[-3];
        prediction += coefficients[3] * pDecodedSamples[-4];
        prediction += coefficients[4] * pDecodedSamples[-5];
        prediction += coefficients[5] * pDecodedSamples[-6];
    }
    else if (order == 5)
    {
        prediction  = coefficients[0] * pDecodedSamples[-1];
        prediction += coefficients[1] * pDecodedSamples[-2];
        prediction += coefficients[2] * pDecodedSamples[-3];
        prediction += coefficients[3] * pDecodedSamples[-4];
        prediction += coefficients[4] * pDecodedSamples[-5];
    }
    else if (order == 4)
    {
        prediction  = coefficients[0] * pDecodedSamples[-1];
        prediction += coefficients[1] * pDecodedSamples[-2];
        prediction += coefficients[2] * pDecodedSamples[-3];
        prediction += coefficients[3] * pDecodedSamples[-4];
    }
    else if (order == 12)
    {
        prediction  = coefficients[0]  * pDecodedSamples[-1];
        prediction += coefficients[1]  * pDecodedSamples[-2];
        prediction += coefficients[2]  * pDecodedSamples[-3];
        prediction += coefficients[3]  * pDecodedSamples[-4];
        prediction += coefficients[4]  * pDecodedSamples[-5];
        prediction += coefficients[5]  * pDecodedSamples[-6];
        prediction += coefficients[6]  * pDecodedSamples[-7];
        prediction += coefficients[7]  * pDecodedSamples[-8];
        prediction += coefficients[8]  * pDecodedSamples[-9];
        prediction += coefficients[9]  * pDecodedSamples[-10];
        prediction += coefficients[10] * pDecodedSamples[-11];
        prediction += coefficients[11] * pDecodedSamples[-12];
    }
    else if (order == 2)
    {
        prediction  = coefficients[0] * pDecodedSamples[-1];
        prediction += coefficients[1] * pDecodedSamples[-2];
    }
    else if (order == 1)
    {
        prediction = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
    }
    else if (order == 10)
    {
        prediction  = coefficients[0]  * pDecodedSamples[-1];
        prediction += coefficients[1]  * pDecodedSamples[-2];
        prediction += coefficients[2]  * pDecodedSamples[-3];
        prediction += coefficients[3]  * pDecodedSamples[-4];
        prediction += coefficients[4]  * pDecodedSamples[-5];
        prediction += coefficients[5]  * pDecodedSamples[-6];
        prediction += coefficients[6]  * pDecodedSamples[-7];
        prediction += coefficients[7]  * pDecodedSamples[-8];
        prediction += coefficients[8]  * pDecodedSamples[-9];
        prediction += coefficients[9]  * pDecodedSamples[-10];
    }
    else if (order == 9)
    {
        prediction  = coefficients[0]  * pDecodedSamples[-1];
        prediction += coefficients[1]  * pDecodedSamples[-2];
        prediction += coefficients[2]  * pDecodedSamples[-3];
        prediction += coefficients[3]  * pDecodedSamples[-4];
        prediction += coefficients[4]  * pDecodedSamples[-5];
        prediction += coefficients[5]  * pDecodedSamples[-6];
        prediction += coefficients[6]  * pDecodedSamples[-7];
        prediction += coefficients[7]  * pDecodedSamples[-8];
        prediction += coefficients[8]  * pDecodedSamples[-9];
    }
    else if (order == 11)
    {
        prediction  = coefficients[0]  * pDecodedSamples[-1];
        prediction += coefficients[1]  * pDecodedSamples[-2];
        prediction += coefficients[2]  * pDecodedSamples[-3];
        prediction += coefficients[3]  * pDecodedSamples[-4];
        prediction += coefficients[4]  * pDecodedSamples[-5];
        prediction += coefficients[5]  * pDecodedSamples[-6];
        prediction += coefficients[6]  * pDecodedSamples[-7];
        prediction += coefficients[7]  * pDecodedSamples[-8];
        prediction += coefficients[8]  * pDecodedSamples[-9];
        prediction += coefficients[9]  * pDecodedSamples[-10];
        prediction += coefficients[10] * pDecodedSamples[-11];
    }
    else
    {
        prediction = 0;
        for (int j = 0; j < (int)order; ++j) {
            prediction += coefficients[j] * pDecodedSamples[-j-1];
        }
    }
#endif

    // Experiment #2. See if we can use a switch and let the compiler optimize it to a jump table.
    // Result: VC++ definitely optimizes this to a single jmp as expected. I expect other compilers should do the same, but I've
    // not verified yet.
#if 1
    int prediction = 0;

    switch (order)
    {
    case 32: prediction += coefficients[31] * pDecodedSamples[-32];
    case 31: prediction += coefficients[30] * pDecodedSamples[-31];
    case 30: prediction += coefficients[29] * pDecodedSamples[-30];
    case 29: prediction += coefficients[28] * pDecodedSamples[-29];
    case 28: prediction += coefficients[27] * pDecodedSamples[-28];
    case 27: prediction += coefficients[26] * pDecodedSamples[-27];
    case 26: prediction += coefficients[25] * pDecodedSamples[-26];
    case 25: prediction += coefficients[24] * pDecodedSamples[-25];
    case 24: prediction += coefficients[23] * pDecodedSamples[-24];
    case 23: prediction += coefficients[22] * pDecodedSamples[-23];
    case 22: prediction += coefficients[21] * pDecodedSamples[-22];
    case 21: prediction += coefficients[20] * pDecodedSamples[-21];
    case 20: prediction += coefficients[19] * pDecodedSamples[-20];
    case 19: prediction += coefficients[18] * pDecodedSamples[-19];
    case 18: prediction += coefficients[17] * pDecodedSamples[-18];
    case 17: prediction += coefficients[16] * pDecodedSamples[-17];
    case 16: prediction += coefficients[15] * pDecodedSamples[-16];
    case 15: prediction += coefficients[14] * pDecodedSamples[-15];
    case 14: prediction += coefficients[13] * pDecodedSamples[-14];
    case 13: prediction += coefficients[12] * pDecodedSamples[-13];
    case 12: prediction += coefficients[11] * pDecodedSamples[-12];
    case 11: prediction += coefficients[10] * pDecodedSamples[-11];
    case 10: prediction += coefficients[ 9] * pDecodedSamples[-10];
    case  9: prediction += coefficients[ 8] * pDecodedSamples[- 9];
    case  8: prediction += coefficients[ 7] * pDecodedSamples[- 8];
    case  7: prediction += coefficients[ 6] * pDecodedSamples[- 7];
    case  6: prediction += coefficients[ 5] * pDecodedSamples[- 6];
    case  5: prediction += coefficients[ 4] * pDecodedSamples[- 5];
    case  4: prediction += coefficients[ 3] * pDecodedSamples[- 4];
    case  3: prediction += coefficients[ 2] * pDecodedSamples[- 3];
    case  2: prediction += coefficients[ 1] * pDecodedSamples[- 2];
    case  1: prediction += coefficients[ 0] * pDecodedSamples[- 1];
    }
#endif

    return (int32_t)(prediction >> shift);
}

static DRFLAC_INLINE int32_t drflac__calculate_prediction(unsigned int order, int shift, const short* coefficients, int32_t* pDecodedSamples)
{
    assert(order <= 32);

    // 64-bit version.

    // This method is faster on the 32-bit build when compiling with VC++. See note below.
#ifndef DRFLAC_64BIT
    long long prediction;
    if (order == 8)
    {
        prediction  = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
        prediction += (long long)coefficients[1] * (long long)pDecodedSamples[-2];
        prediction += (long long)coefficients[2] * (long long)pDecodedSamples[-3];
        prediction += (long long)coefficients[3] * (long long)pDecodedSamples[-4];
        prediction += (long long)coefficients[4] * (long long)pDecodedSamples[-5];
        prediction += (long long)coefficients[5] * (long long)pDecodedSamples[-6];
        prediction += (long long)coefficients[6] * (long long)pDecodedSamples[-7];
        prediction += (long long)coefficients[7] * (long long)pDecodedSamples[-8];
    }
    else if (order == 7)
    {
        prediction  = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
        prediction += (long long)coefficients[1] * (long long)pDecodedSamples[-2];
        prediction += (long long)coefficients[2] * (long long)pDecodedSamples[-3];
        prediction += (long long)coefficients[3] * (long long)pDecodedSamples[-4];
        prediction += (long long)coefficients[4] * (long long)pDecodedSamples[-5];
        prediction += (long long)coefficients[5] * (long long)pDecodedSamples[-6];
        prediction += (long long)coefficients[6] * (long long)pDecodedSamples[-7];
    }
    else if (order == 3)
    {
        prediction  = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
        prediction += (long long)coefficients[1] * (long long)pDecodedSamples[-2];
        prediction += (long long)coefficients[2] * (long long)pDecodedSamples[-3];
    }
    else if (order == 6)
    {
        prediction  = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
        prediction += (long long)coefficients[1] * (long long)pDecodedSamples[-2];
        prediction += (long long)coefficients[2] * (long long)pDecodedSamples[-3];
        prediction += (long long)coefficients[3] * (long long)pDecodedSamples[-4];
        prediction += (long long)coefficients[4] * (long long)pDecodedSamples[-5];
        prediction += (long long)coefficients[5] * (long long)pDecodedSamples[-6];
    }
    else if (order == 5)
    {
        prediction  = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
        prediction += (long long)coefficients[1] * (long long)pDecodedSamples[-2];
        prediction += (long long)coefficients[2] * (long long)pDecodedSamples[-3];
        prediction += (long long)coefficients[3] * (long long)pDecodedSamples[-4];
        prediction += (long long)coefficients[4] * (long long)pDecodedSamples[-5];
    }
    else if (order == 4)
    {
        prediction  = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
        prediction += (long long)coefficients[1] * (long long)pDecodedSamples[-2];
        prediction += (long long)coefficients[2] * (long long)pDecodedSamples[-3];
        prediction += (long long)coefficients[3] * (long long)pDecodedSamples[-4];
    }
    else if (order == 12)
    {
        prediction  = (long long)coefficients[0]  * (long long)pDecodedSamples[-1];
        prediction += (long long)coefficients[1]  * (long long)pDecodedSamples[-2];
        prediction += (long long)coefficients[2]  * (long long)pDecodedSamples[-3];
        prediction += (long long)coefficients[3]  * (long long)pDecodedSamples[-4];
        prediction += (long long)coefficients[4]  * (long long)pDecodedSamples[-5];
        prediction += (long long)coefficients[5]  * (long long)pDecodedSamples[-6];
        prediction += (long long)coefficients[6]  * (long long)pDecodedSamples[-7];
        prediction += (long long)coefficients[7]  * (long long)pDecodedSamples[-8];
        prediction += (long long)coefficients[8]  * (long long)pDecodedSamples[-9];
        prediction += (long long)coefficients[9]  * (long long)pDecodedSamples[-10];
        prediction += (long long)coefficients[10] * (long long)pDecodedSamples[-11];
        prediction += (long long)coefficients[11] * (long long)pDecodedSamples[-12];
    }
    else if (order == 2)
    {
        prediction  = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
        prediction += (long long)coefficients[1] * (long long)pDecodedSamples[-2];
    }
    else if (order == 1)
    {
        prediction = (long long)coefficients[0] * (long long)pDecodedSamples[-1];
    }
    else if (order == 10)
    {
        prediction  = (long long)coefficients[0]  * (long long)pDecodedSamples[-1];
        prediction += (long long)coefficients[1]  * (long long)pDecodedSamples[-2];
        prediction += (long long)coefficients[2]  * (long long)pDecodedSamples[-3];
        prediction += (long long)coefficients[3]  * (long long)pDecodedSamples[-4];
        prediction += (long long)coefficients[4]  * (long long)pDecodedSamples[-5];
        prediction += (long long)coefficients[5]  * (long long)pDecodedSamples[-6];
        prediction += (long long)coefficients[6]  * (long long)pDecodedSamples[-7];
        prediction += (long long)coefficients[7]  * (long long)pDecodedSamples[-8];
        prediction += (long long)coefficients[8]  * (long long)pDecodedSamples[-9];
        prediction += (long long)coefficients[9]  * (long long)pDecodedSamples[-10];
    }
    else if (order == 9)
    {
        prediction  = (long long)coefficients[0]  * (long long)pDecodedSamples[-1];
        prediction += (long long)coefficients[1]  * (long long)pDecodedSamples[-2];
        prediction += (long long)coefficients[2]  * (long long)pDecodedSamples[-3];
        prediction += (long long)coefficients[3]  * (long long)pDecodedSamples[-4];
        prediction += (long long)coefficients[4]  * (long long)pDecodedSamples[-5];
        prediction += (long long)coefficients[5]  * (long long)pDecodedSamples[-6];
        prediction += (long long)coefficients[6]  * (long long)pDecodedSamples[-7];
        prediction += (long long)coefficients[7]  * (long long)pDecodedSamples[-8];
        prediction += (long long)coefficients[8]  * (long long)pDecodedSamples[-9];
    }
    else if (order == 11)
    {
        prediction  = (long long)coefficients[0]  * (long long)pDecodedSamples[-1];
        prediction += (long long)coefficients[1]  * (long long)pDecodedSamples[-2];
        prediction += (long long)coefficients[2]  * (long long)pDecodedSamples[-3];
        prediction += (long long)coefficients[3]  * (long long)pDecodedSamples[-4];
        prediction += (long long)coefficients[4]  * (long long)pDecodedSamples[-5];
        prediction += (long long)coefficients[5]  * (long long)pDecodedSamples[-6];
        prediction += (long long)coefficients[6]  * (long long)pDecodedSamples[-7];
        prediction += (long long)coefficients[7]  * (long long)pDecodedSamples[-8];
        prediction += (long long)coefficients[8]  * (long long)pDecodedSamples[-9];
        prediction += (long long)coefficients[9]  * (long long)pDecodedSamples[-10];
        prediction += (long long)coefficients[10] * (long long)pDecodedSamples[-11];
    }
    else
    {
        prediction = 0;
        for (int j = 0; j < (int)order; ++j) {
            prediction += (long long)coefficients[j] * (long long)pDecodedSamples[-j-1];
        }
    }
#endif

    // Experiment #2. See if we can use a switch and let the compiler optimize it to a single jmp instruction.
    // Result: VC++ optimizes this to a single jmp on the 64-bit build, but for some reason the 32-bit version compiles to less efficient
    // code. Thus, we use this version on the 64-bit build and the uglier version above for the 32-bit build. If anyone has an idea on how
    // I can get VC++ to generate an efficient jump table for the 32-bit build let me know.
#ifdef DRFLAC_64BIT
    long long prediction = 0;

    switch (order)
    {
    case 32: prediction += (long long)coefficients[31] * (long long)pDecodedSamples[-32];
    case 31: prediction += (long long)coefficients[30] * (long long)pDecodedSamples[-31];
    case 30: prediction += (long long)coefficients[29] * (long long)pDecodedSamples[-30];
    case 29: prediction += (long long)coefficients[28] * (long long)pDecodedSamples[-29];
    case 28: prediction += (long long)coefficients[27] * (long long)pDecodedSamples[-28];
    case 27: prediction += (long long)coefficients[26] * (long long)pDecodedSamples[-27];
    case 26: prediction += (long long)coefficients[25] * (long long)pDecodedSamples[-26];
    case 25: prediction += (long long)coefficients[24] * (long long)pDecodedSamples[-25];
    case 24: prediction += (long long)coefficients[23] * (long long)pDecodedSamples[-24];
    case 23: prediction += (long long)coefficients[22] * (long long)pDecodedSamples[-23];
    case 22: prediction += (long long)coefficients[21] * (long long)pDecodedSamples[-22];
    case 21: prediction += (long long)coefficients[20] * (long long)pDecodedSamples[-21];
    case 20: prediction += (long long)coefficients[19] * (long long)pDecodedSamples[-20];
    case 19: prediction += (long long)coefficients[18] * (long long)pDecodedSamples[-19];
    case 18: prediction += (long long)coefficients[17] * (long long)pDecodedSamples[-18];
    case 17: prediction += (long long)coefficients[16] * (long long)pDecodedSamples[-17];
    case 16: prediction += (long long)coefficients[15] * (long long)pDecodedSamples[-16];
    case 15: prediction += (long long)coefficients[14] * (long long)pDecodedSamples[-15];
    case 14: prediction += (long long)coefficients[13] * (long long)pDecodedSamples[-14];
    case 13: prediction += (long long)coefficients[12] * (long long)pDecodedSamples[-13];
    case 12: prediction += (long long)coefficients[11] * (long long)pDecodedSamples[-12];
    case 11: prediction += (long long)coefficients[10] * (long long)pDecodedSamples[-11];
    case 10: prediction += (long long)coefficients[ 9] * (long long)pDecodedSamples[-10];
    case  9: prediction += (long long)coefficients[ 8] * (long long)pDecodedSamples[- 9];
    case  8: prediction += (long long)coefficients[ 7] * (long long)pDecodedSamples[- 8];
    case  7: prediction += (long long)coefficients[ 6] * (long long)pDecodedSamples[- 7];
    case  6: prediction += (long long)coefficients[ 5] * (long long)pDecodedSamples[- 6];
    case  5: prediction += (long long)coefficients[ 4] * (long long)pDecodedSamples[- 5];
    case  4: prediction += (long long)coefficients[ 3] * (long long)pDecodedSamples[- 4];
    case  3: prediction += (long long)coefficients[ 2] * (long long)pDecodedSamples[- 3];
    case  2: prediction += (long long)coefficients[ 1] * (long long)pDecodedSamples[- 2];
    case  1: prediction += (long long)coefficients[ 0] * (long long)pDecodedSamples[- 1];
    }
#endif

    return (int32_t)(prediction >> shift);
}


// Reads and decodes a string of residual values as Rice codes. The decoder should be sitting on the first bit of the Rice codes.
//
// This is the most frequently called function in the library. It does both the Rice decoding and the prediction in a single loop
// iteration.
static bool drflac__decode_samples_with_residual__rice(drflac* pFlac, unsigned int count, unsigned char riceParam, unsigned int order, int shift, const short* coefficients, int* pSamplesOut)
{
    assert(pFlac != NULL);
    assert(count > 0);
    assert(pSamplesOut != NULL);

    static unsigned int bitOffsetTable[] = {
        0,
        4,
        3, 3,
        2, 2, 2, 2,
        1, 1, 1, 1, 1, 1, 1, 1
    };

    drflac_cache_t riceParamMask = DRFLAC_CACHE_L1_SELECTION_MASK(riceParam);
    drflac_cache_t resultHiShift = DRFLAC_CACHE_L1_SIZE_BITS - riceParam;

    for (int i = 0; i < (int)count; ++i)
    {
        unsigned int zeroCounter = 0;
        while (pFlac->cache == 0) {
            zeroCounter += (unsigned int)DRFLAC_CACHE_L1_BITS_REMAINING;
            if (!drflac__reload_cache(pFlac)) {
                return false;
            }
        }

        // At this point the cache should not be zero, in which case we know the first set bit should be somewhere in here. There is
        // no need for us to perform any cache reloading logic here which should make things much faster.
        assert(pFlac->cache != 0);
        unsigned int decodedRice;

        unsigned int setBitOffsetPlus1 = bitOffsetTable[DRFLAC_CACHE_L1_SELECT_AND_SHIFT(4)];
        if (setBitOffsetPlus1 > 0) {
            decodedRice = (zeroCounter + (setBitOffsetPlus1-1)) << riceParam;
        } else {
            if (pFlac->cache == 1) {
                setBitOffsetPlus1 = DRFLAC_CACHE_L1_SIZE_BITS;
                decodedRice = (zeroCounter + (DRFLAC_CACHE_L1_SIZE_BITS-1)) << riceParam;
            } else {
                setBitOffsetPlus1 = 5;
                for (;;)
                {
                    if ((pFlac->cache & DRFLAC_CACHE_L1_SELECT(setBitOffsetPlus1))) {
                        decodedRice = (zeroCounter + (setBitOffsetPlus1-1)) << riceParam;
                        break;
                    }

                    setBitOffsetPlus1 += 1;
                }
            }
        }


        unsigned int bitsLo = 0;
        unsigned int riceLength = setBitOffsetPlus1 + riceParam;
        if (riceLength < DRFLAC_CACHE_L1_BITS_REMAINING)
        {
            bitsLo = (unsigned int)((pFlac->cache & (riceParamMask >> setBitOffsetPlus1)) >> (DRFLAC_CACHE_L1_SIZE_BITS - riceLength));

            pFlac->consumedBits += riceLength;
            pFlac->cache <<= riceLength;
        }
        else
        {
            pFlac->consumedBits += riceLength;
            pFlac->cache <<= setBitOffsetPlus1;

            // It straddles the cached data. It will never cover more than the next chunk. We just read the number in two parts and combine them.
            size_t bitCountLo = pFlac->consumedBits - DRFLAC_CACHE_L1_SIZE_BITS;
            drflac_cache_t resultHi = pFlac->cache & riceParamMask;    // <-- This mask is OK because all bits after the first bits are always zero.


            if (pFlac->nextL2Line < DRFLAC_CACHE_L2_LINE_COUNT) {
                pFlac->cache = drflac__be2host__cache_line(pFlac->cacheL2[pFlac->nextL2Line++]);
            } else {
                // Slow path. We need to fetch more data from the client.
                if (!drflac__reload_cache(pFlac)) {
                    return false;
                }
            }

            bitsLo = (unsigned int)((resultHi >> resultHiShift) | DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bitCountLo));
            pFlac->consumedBits = bitCountLo;
            pFlac->cache <<= bitCountLo;
        }


        decodedRice |= bitsLo;
        if ((decodedRice & 0x01)) {
            decodedRice = ~(decodedRice >> 1);
        } else {
            decodedRice = (decodedRice >> 1);
        }


        // In order to properly calculate the prediction when the bits per sample is >16 we need to do it using 64-bit arithmetic. We can assume this
        // is probably going to be slower on 32-bit systems so we'll do a more optimized 32-bit version when the bits per sample is low enough.
        if (pFlac->currentFrame.bitsPerSample > 16) {
            pSamplesOut[i] = ((int)decodedRice + drflac__calculate_prediction(order, shift, coefficients, pSamplesOut + i));
        } else {
            pSamplesOut[i] = ((int)decodedRice + drflac__calculate_prediction_32(order, shift, coefficients, pSamplesOut + i));
        }
    }

    return true;
}


// Reads and seeks past a string of residual values as Rice codes. The decoder should be sitting on the first bit of the Rice codes.
static bool drflac__read_and_seek_residual__rice(drflac* pFlac, unsigned int count, unsigned char riceParam)
{
    assert(pFlac != NULL);
    assert(count > 0);

    for (unsigned int i = 0; i < count; ++i) {
        if (!drflac__read_and_seek_rice(pFlac, riceParam)) {
            return false;
        }
    }

    return true;
}

static bool drflac__decode_samples_with_residual__unencoded(drflac* pFlac, unsigned int count, unsigned char unencodedBitsPerSample, unsigned int order, int shift, const short* coefficients, int* pSamplesOut)
{
    assert(pFlac != NULL);
    assert(count > 0);
    assert(unencodedBitsPerSample > 0 && unencodedBitsPerSample <= 32);
    assert(pSamplesOut != NULL);

    for (unsigned int i = 0; i < count; ++i)
    {
        if (!drflac__read_int32(pFlac, unencodedBitsPerSample, pSamplesOut + i)) {
            return false;
        }

        pSamplesOut[i] += drflac__calculate_prediction(order, shift, coefficients, pSamplesOut + i);
    }

    return true;
}


// Reads and decodes the residual for the sub-frame the decoder is currently sitting on. This function should be called
// when the decoder is sitting at the very start of the RESIDUAL block. The first <order> residuals will be ignored. The
// <blockSize> and <order> parameters are used to determine how many residual values need to be decoded.
static bool drflac__decode_samples_with_residual(drflac* pFlac, unsigned int blockSize, unsigned int order, int shift, const short* coefficients, int* pDecodedSamples)
{
    assert(pFlac != NULL);
    assert(blockSize != 0);
    assert(pDecodedSamples != NULL);       // <-- Should we allow NULL, in which case we just seek past the residual rather than do a full decode?

    unsigned char residualMethod;
    if (!drflac__read_uint8(pFlac, 2, &residualMethod)) {
        return false;
    }

    if (residualMethod != DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE && residualMethod != DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2) {
        return false;    // Unknown or unsupported residual coding method.
    }

    // Ignore the first <order> values.
    pDecodedSamples += order;


    unsigned char partitionOrder;
    if (!drflac__read_uint8(pFlac, 4, &partitionOrder)) {
        return false;
    }


    unsigned int samplesInPartition = (blockSize / (1 << partitionOrder)) - order;
    unsigned int partitionsRemaining = (1 << partitionOrder);
    for (;;)
    {
        unsigned char riceParam = 0;
        if (residualMethod == DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE) {
            if (!drflac__read_uint8(pFlac, 4, &riceParam)) {
                return false;
            }
            if (riceParam == 16) {
                riceParam = 0xFF;
            }
        } else if (residualMethod == DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2) {
            if (!drflac__read_uint8(pFlac, 5, &riceParam)) {
                return false;
            }
            if (riceParam == 32) {
                riceParam = 0xFF;
            }
        }

        if (riceParam != 0xFF) {
            if (!drflac__decode_samples_with_residual__rice(pFlac, samplesInPartition, riceParam, order, shift, coefficients, pDecodedSamples)) {
                return false;
            }
        } else {
            unsigned char unencodedBitsPerSample = 0;
            if (!drflac__read_uint8(pFlac, 5, &unencodedBitsPerSample)) {
                return false;
            }

            if (!drflac__decode_samples_with_residual__unencoded(pFlac, samplesInPartition, unencodedBitsPerSample, order, shift, coefficients, pDecodedSamples)) {
                return false;
            }
        }

        pDecodedSamples += samplesInPartition;


        if (partitionsRemaining == 1) {
            break;
        }

        partitionsRemaining -= 1;
        samplesInPartition = blockSize / (1 << partitionOrder);
    }

    return true;
}

// Reads and seeks past the residual for the sub-frame the decoder is currently sitting on. This function should be called
// when the decoder is sitting at the very start of the RESIDUAL block. The first <order> residuals will be set to 0. The
// <blockSize> and <order> parameters are used to determine how many residual values need to be decoded.
static bool drflac__read_and_seek_residual(drflac* pFlac, unsigned int blockSize, unsigned int order)
{
    assert(pFlac != NULL);
    assert(blockSize != 0);

    unsigned char residualMethod;
    if (!drflac__read_uint8(pFlac, 2, &residualMethod)) {
        return false;
    }

    if (residualMethod != DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE && residualMethod != DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2) {
        return false;    // Unknown or unsupported residual coding method.
    }

    unsigned char partitionOrder;
    if (!drflac__read_uint8(pFlac, 4, &partitionOrder)) {
        return false;
    }

    unsigned int samplesInPartition = (blockSize / (1 << partitionOrder)) - order;
    unsigned int partitionsRemaining = (1 << partitionOrder);
    for (;;)
    {
        unsigned char riceParam = 0;
        if (residualMethod == DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE) {
            if (!drflac__read_uint8(pFlac, 4, &riceParam)) {
                return false;
            }
            if (riceParam == 16) {
                riceParam = 0xFF;
            }
        } else if (residualMethod == DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2) {
            if (!drflac__read_uint8(pFlac, 5, &riceParam)) {
                return false;
            }
            if (riceParam == 32) {
                riceParam = 0xFF;
            }
        }

        if (riceParam != 0xFF) {
            if (!drflac__read_and_seek_residual__rice(pFlac, samplesInPartition, riceParam)) {
                return false;
            }
        } else {
            unsigned char unencodedBitsPerSample = 0;
            if (!drflac__read_uint8(pFlac, 5, &unencodedBitsPerSample)) {
                return false;
            }

            if (!drflac__seek_bits(pFlac, unencodedBitsPerSample * samplesInPartition)) {
                return false;
            }
        }


        if (partitionsRemaining == 1) {
            break;
        }

        partitionsRemaining -= 1;
        samplesInPartition = blockSize / (1 << partitionOrder);
    }

    return true;
}


static bool drflac__decode_samples__constant(drflac* pFlac, drflac_subframe* pSubframe)
{
    // Only a single sample needs to be decoded here.
    int sample;
    if (!drflac__read_int32(pFlac, pSubframe->bitsPerSample, &sample)) {
        return false;
    }

    // We don't really need to expand this, but it does simplify the process of reading samples. If this becomes a performance issue (unlikely)
    // we'll want to look at a more efficient way.
    for (unsigned int i = 0; i < pFlac->currentFrame.blockSize; ++i) {
        pSubframe->pDecodedSamples[i] = sample;
    }

    return true;
}

static bool drflac__decode_samples__verbatim(drflac* pFlac, drflac_subframe* pSubframe)
{
    for (unsigned int i = 0; i < pFlac->currentFrame.blockSize; ++i) {
        int sample;
        if (!drflac__read_int32(pFlac, pSubframe->bitsPerSample, &sample)) {
            return false;
        }

        pSubframe->pDecodedSamples[i] = sample;
    }

    return true;
}

static bool drflac__decode_samples__fixed(drflac* pFlac, drflac_subframe* pSubframe)
{
    short lpcCoefficientsTable[5][4] = {
        {0,  0, 0,  0},
        {1,  0, 0,  0},
        {2, -1, 0,  0},
        {3, -3, 1,  0},
        {4, -6, 4, -1}
    };

    // Warm up samples and coefficients.
    for (unsigned int i = 0; i < pSubframe->lpcOrder; ++i) {
        int sample;
        if (!drflac__read_int32(pFlac, pSubframe->bitsPerSample, &sample)) {
            return false;
        }

        pSubframe->pDecodedSamples[i] = sample;
    }


    if (!drflac__decode_samples_with_residual(pFlac, pFlac->currentFrame.blockSize, pSubframe->lpcOrder, 0, lpcCoefficientsTable[pSubframe->lpcOrder], pSubframe->pDecodedSamples)) {
        return false;
    }

    return true;
}

static bool drflac__decode_samples__lpc(drflac* pFlac, drflac_subframe* pSubframe)
{
    // Warm up samples.
    for (unsigned int i = 0; i < pSubframe->lpcOrder; ++i) {
        int sample;
        if (!drflac__read_int32(pFlac, pSubframe->bitsPerSample, &sample)) {
            return false;
        }

        pSubframe->pDecodedSamples[i] = sample;
    }

    unsigned char lpcPrecision;
    if (!drflac__read_uint8(pFlac, 4, &lpcPrecision)) {
        return false;
    }
    if (lpcPrecision == 15) {
        return false;    // Invalid.
    }
    lpcPrecision += 1;


    signed char lpcShift;
    if (!drflac__read_int8(pFlac, 5, &lpcShift)) {
        return false;
    }


    short coefficients[32];
    for (unsigned int i = 0; i < pSubframe->lpcOrder; ++i) {
        if (!drflac__read_int16(pFlac, lpcPrecision, coefficients + i)) {
            return false;
        }
    }

    if (!drflac__decode_samples_with_residual(pFlac, pFlac->currentFrame.blockSize, pSubframe->lpcOrder, lpcShift, coefficients, pSubframe->pDecodedSamples)) {
        return false;
    }

    return true;
}


static bool drflac__read_next_frame_header(drflac* pFlac)
{
    assert(pFlac != NULL);
    assert(pFlac->onRead != NULL);

    // At the moment the sync code is as a form of basic validation. The CRC is stored, but is unused at the moment. This
    // should probably be handled better in the future.

    const int sampleRateTable[12]       = {0, 88200, 176400, 192000, 8000, 16000, 22050, 24000, 32000, 44100, 48000, 96000};
    const uint8_t bitsPerSampleTable[8] = {0, 8, 12, (uint8_t)-1, 16, 20, 24, (uint8_t)-1};   // -1 = reserved.

    unsigned short syncCode = 0;
    if (!drflac__read_uint16(pFlac, 14, &syncCode)) {
        return false;
    }

    if (syncCode != 0x3FFE) {
        // TODO: Try and recover by attempting to seek to and read the next frame?
        return false;
    }

    unsigned char reserved;
    if (!drflac__read_uint8(pFlac, 1, &reserved)) {
        return false;
    }

    unsigned char blockingStrategy = 0;
    if (!drflac__read_uint8(pFlac, 1, &blockingStrategy)) {
        return false;
    }


    unsigned char blockSize = 0;
    if (!drflac__read_uint8(pFlac, 4, &blockSize)) {
        return false;
    }

    unsigned char sampleRate = 0;
    if (!drflac__read_uint8(pFlac, 4, &sampleRate)) {
        return false;
    }

    unsigned char channelAssignment = 0;
    if (!drflac__read_uint8(pFlac, 4, &channelAssignment)) {
        return false;
    }

    unsigned char bitsPerSample = 0;
    if (!drflac__read_uint8(pFlac, 3, &bitsPerSample)) {
        return false;
    }

    if (!drflac__read_uint8(pFlac, 1, &reserved)) {
        return false;
    }


    unsigned char isVariableBlockSize = blockingStrategy == 1;
    if (isVariableBlockSize) {
        pFlac->currentFrame.frameNumber = 0;
        if (!drflac__read_utf8_coded_number(pFlac, &pFlac->currentFrame.sampleNumber)) {
            return false;
        }
    } else {
        unsigned long long frameNumber = 0;
        if (!drflac__read_utf8_coded_number(pFlac, &frameNumber)) {
            return false;
        }
        pFlac->currentFrame.frameNumber  = (unsigned int)frameNumber;   // <-- Safe cast.
        pFlac->currentFrame.sampleNumber = 0;
    }


    if (blockSize == 1) {
        pFlac->currentFrame.blockSize = 192;
    } else if (blockSize >= 2 && blockSize <= 5) {
        pFlac->currentFrame.blockSize = 576 * (1 << (blockSize - 2));
    } else if (blockSize == 6) {
        if (!drflac__read_uint16(pFlac, 8, &pFlac->currentFrame.blockSize)) {
            return false;
        }
        pFlac->currentFrame.blockSize += 1;
    } else if (blockSize == 7) {
        if (!drflac__read_uint16(pFlac, 16, &pFlac->currentFrame.blockSize)) {
            return false;
        }
        pFlac->currentFrame.blockSize += 1;
    } else {
        pFlac->currentFrame.blockSize = 256 * (1 << (blockSize - 8));
    }


    if (sampleRate <= 11) {
        pFlac->currentFrame.sampleRate = sampleRateTable[sampleRate];
    } else if (sampleRate == 12) {
        if (!drflac__read_uint32(pFlac, 8, &pFlac->currentFrame.sampleRate)) {
            return false;
        }
        pFlac->currentFrame.sampleRate *= 1000;
    } else if (sampleRate == 13) {
        if (!drflac__read_uint32(pFlac, 16, &pFlac->currentFrame.sampleRate)) {
            return false;
        }
    } else if (sampleRate == 14) {
        if (!drflac__read_uint32(pFlac, 16, &pFlac->currentFrame.sampleRate)) {
            return false;
        }
        pFlac->currentFrame.sampleRate *= 10;
    } else {
        return false;  // Invalid.
    }


    pFlac->currentFrame.channelAssignment = channelAssignment;

    pFlac->currentFrame.bitsPerSample = bitsPerSampleTable[bitsPerSample];
    if (pFlac->currentFrame.bitsPerSample == 0) {
        pFlac->currentFrame.bitsPerSample = pFlac->bitsPerSample;
    }

    if (drflac__read_uint8(pFlac, 8, &pFlac->currentFrame.crc8) != 1) {
        return false;
    }

    memset(pFlac->currentFrame.subframes, 0, sizeof(pFlac->currentFrame.subframes));

    return true;
}

static bool drflac__read_subframe_header(drflac* pFlac, drflac_subframe* pSubframe)
{
    unsigned char header;
    if (!drflac__read_uint8(pFlac, 8, &header)) {
        return false;
    }

    // First bit should always be 0.
    if ((header & 0x80) != 0) {
        return false;
    }

    int type = (header & 0x7E) >> 1;
    if (type == 0) {
        pSubframe->subframeType = DRFLAC_SUBFRAME_CONSTANT;
    } else if (type == 1) {
        pSubframe->subframeType = DRFLAC_SUBFRAME_VERBATIM;
    } else {
        if ((type & 0x20) != 0) {
            pSubframe->subframeType = DRFLAC_SUBFRAME_LPC;
            pSubframe->lpcOrder = (type & 0x1F) + 1;
        } else if ((type & 0x08) != 0) {
            pSubframe->subframeType = DRFLAC_SUBFRAME_FIXED;
            pSubframe->lpcOrder = (type & 0x07);
            if (pSubframe->lpcOrder > 4) {
                pSubframe->subframeType = DRFLAC_SUBFRAME_RESERVED;
                pSubframe->lpcOrder = 0;
            }
        } else {
            pSubframe->subframeType = DRFLAC_SUBFRAME_RESERVED;
        }
    }

    if (pSubframe->subframeType == DRFLAC_SUBFRAME_RESERVED) {
        return false;
    }

    // Wasted bits per sample.
    pSubframe->wastedBitsPerSample = 0;
    if ((header & 0x01) == 1) {
        unsigned int wastedBitsPerSample;
        if (!drflac__seek_past_next_set_bit(pFlac, &wastedBitsPerSample)) {
            return false;
        }
        pSubframe->wastedBitsPerSample = (unsigned char)wastedBitsPerSample + 1;
    }

    return true;
}

static bool drflac__decode_subframe(drflac* pFlac, int subframeIndex)
{
    assert(pFlac != NULL);

    drflac_subframe* pSubframe = pFlac->currentFrame.subframes + subframeIndex;
    if (!drflac__read_subframe_header(pFlac, pSubframe)) {
        return false;
    }

    // Side channels require an extra bit per sample. Took a while to figure that one out...
    pSubframe->bitsPerSample = pFlac->currentFrame.bitsPerSample;
    if ((pFlac->currentFrame.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE || pFlac->currentFrame.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE) && subframeIndex == 1) {
        pSubframe->bitsPerSample += 1;
    } else if (pFlac->currentFrame.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE && subframeIndex == 0) {
        pSubframe->bitsPerSample += 1;
    }

    // Need to handle wasted bits per sample.
    pSubframe->bitsPerSample -= pSubframe->wastedBitsPerSample;
    pSubframe->pDecodedSamples = pFlac->pDecodedSamples + (pFlac->currentFrame.blockSize * subframeIndex);

    switch (pSubframe->subframeType)
    {
        case DRFLAC_SUBFRAME_CONSTANT:
        {
            drflac__decode_samples__constant(pFlac, pSubframe);
        } break;

        case DRFLAC_SUBFRAME_VERBATIM:
        {
            drflac__decode_samples__verbatim(pFlac, pSubframe);
        } break;

        case DRFLAC_SUBFRAME_FIXED:
        {
            drflac__decode_samples__fixed(pFlac, pSubframe);
        } break;

        case DRFLAC_SUBFRAME_LPC:
        {
            drflac__decode_samples__lpc(pFlac, pSubframe);
        } break;

        default: return false;
    }

    return true;
}

static bool drflac__seek_subframe(drflac* pFlac, int subframeIndex)
{
    assert(pFlac != NULL);

    drflac_subframe* pSubframe = pFlac->currentFrame.subframes + subframeIndex;
    if (!drflac__read_subframe_header(pFlac, pSubframe)) {
        return false;
    }

    // Side channels require an extra bit per sample. Took a while to figure that one out...
    pSubframe->bitsPerSample = pFlac->currentFrame.bitsPerSample;
    if ((pFlac->currentFrame.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE || pFlac->currentFrame.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE) && subframeIndex == 1) {
        pSubframe->bitsPerSample += 1;
    } else if (pFlac->currentFrame.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE && subframeIndex == 0) {
        pSubframe->bitsPerSample += 1;
    }

    // Need to handle wasted bits per sample.
    pSubframe->bitsPerSample -= pSubframe->wastedBitsPerSample;
    pSubframe->pDecodedSamples = pFlac->pDecodedSamples + (pFlac->currentFrame.blockSize * subframeIndex);

    switch (pSubframe->subframeType)
    {
        case DRFLAC_SUBFRAME_CONSTANT:
        {
            if (!drflac__seek_bits(pFlac, pSubframe->bitsPerSample)) {
                return false;
            }
        } break;

        case DRFLAC_SUBFRAME_VERBATIM:
        {
            unsigned int bitsToSeek = pFlac->currentFrame.blockSize * pSubframe->bitsPerSample;
            if (!drflac__seek_bits(pFlac, bitsToSeek)) {
                return false;
            }
        } break;

        case DRFLAC_SUBFRAME_FIXED:
        {
            unsigned int bitsToSeek = pSubframe->lpcOrder * pSubframe->bitsPerSample;
            if (!drflac__seek_bits(pFlac, bitsToSeek)) {
                return false;
            }

            if (!drflac__read_and_seek_residual(pFlac, pFlac->currentFrame.blockSize, pSubframe->lpcOrder)) {
                return false;
            }
        } break;

        case DRFLAC_SUBFRAME_LPC:
        {
            unsigned int bitsToSeek = pSubframe->lpcOrder * pSubframe->bitsPerSample;
            if (!drflac__seek_bits(pFlac, bitsToSeek)) {
                return false;
            }

            unsigned char lpcPrecision;
            if (!drflac__read_uint8(pFlac, 4, &lpcPrecision)) {
                return false;
            }
            if (lpcPrecision == 15) {
                return false;    // Invalid.
            }
            lpcPrecision += 1;


            bitsToSeek = (pSubframe->lpcOrder * lpcPrecision) + 5;    // +5 for shift.
            if (!drflac__seek_bits(pFlac, bitsToSeek)) {
                return false;
            }

            if (!drflac__read_and_seek_residual(pFlac, pFlac->currentFrame.blockSize, pSubframe->lpcOrder)) {
                return false;
            }
        } break;

        default: return false;
    }

    return true;
}


static DRFLAC_INLINE int drflac__get_channel_count_from_channel_assignment(int channelAssignment)
{
    assert(channelAssignment <= 10);

    int lookup[] = {1, 2, 3, 4, 5, 6, 7, 8, 2, 2, 2};
    return lookup[channelAssignment];
}

static bool drflac__decode_frame(drflac* pFlac)
{
    // This function should be called while the stream is sitting on the first byte after the frame header.

    int channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFrame.channelAssignment);
    for (int i = 0; i < channelCount; ++i)
    {
        if (!drflac__decode_subframe(pFlac, i)) {
            return false;
        }
    }

    // At the end of the frame sits the padding and CRC. We don't use these so we can just seek past.
    if (!drflac__seek_bits(pFlac, (DRFLAC_CACHE_L1_BITS_REMAINING & 7) + 16)) {
        return false;
    }


    pFlac->currentFrame.samplesRemaining = pFlac->currentFrame.blockSize * channelCount;

    return true;
}

static bool drflac__seek_frame(drflac* pFlac)
{
    int channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFrame.channelAssignment);
    for (int i = 0; i < channelCount; ++i)
    {
        if (!drflac__seek_subframe(pFlac, i)) {
            return false;
        }
    }

    // Padding and CRC.
    return drflac__seek_bits(pFlac, (DRFLAC_CACHE_L1_BITS_REMAINING & 7) + 16);
}

static bool drflac__read_and_decode_next_frame(drflac* pFlac)
{
    assert(pFlac != NULL);

    if (!drflac__read_next_frame_header(pFlac)) {
        return false;
    }

    return drflac__decode_frame(pFlac);
}

static unsigned int drflac__read_block_header(drflac* pFlac, unsigned int* pBlockSizeOut, bool* pIsLastBlockOut)    // Returns the block type.
{
    assert(pFlac != NULL);

    unsigned char isLastBlock = 1;
    unsigned char blockType = DRFLAC_BLOCK_TYPE_INVALID;
    unsigned int blockSize = 0;

    if (!drflac__read_uint8(pFlac, 1, &isLastBlock)) {
        goto done_reading_block_header;
    }

    if (!drflac__read_uint8(pFlac, 7, &blockType)) {
        goto done_reading_block_header;
    }

    if (!drflac__read_uint32(pFlac, 24, &blockSize)) {
        goto done_reading_block_header;
    }


done_reading_block_header:
    if (pBlockSizeOut) {
        *pBlockSizeOut = blockSize;
    }

    if (pIsLastBlockOut) {
        *pIsLastBlockOut = (isLastBlock != 0);
    }

    return blockType;
}


static void drflac__get_current_frame_sample_range(drflac* pFlac, uint64_t* pFirstSampleInFrameOut, uint64_t* pLastSampleInFrameOut)
{
    assert(pFlac != NULL);

    unsigned int channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFrame.channelAssignment);

    uint64_t firstSampleInFrame = pFlac->currentFrame.sampleNumber;
    if (firstSampleInFrame == 0) {
        firstSampleInFrame = pFlac->currentFrame.frameNumber * pFlac->maxBlockSize*channelCount;
    }

    uint64_t lastSampleInFrame = firstSampleInFrame + (pFlac->currentFrame.blockSize*channelCount);
    if (lastSampleInFrame > 0) {
        lastSampleInFrame -= 1; // Needs to be zero based.
    }


    if (pFirstSampleInFrameOut) {
        *pFirstSampleInFrameOut = firstSampleInFrame;
    }
    if (pLastSampleInFrameOut) {
        *pLastSampleInFrameOut = lastSampleInFrame;
    }
}

static bool drflac__seek_to_first_frame(drflac* pFlac)
{
    assert(pFlac != NULL);

    bool result = drflac__seek_to_byte(pFlac, (long long)pFlac->firstFramePos);
    pFlac->consumedBits = DRFLAC_CACHE_L1_SIZE_BITS;
    pFlac->cache = 0;

    memset(&pFlac->currentFrame, 0, sizeof(pFlac->currentFrame));


    return result;
}

static DRFLAC_INLINE bool drflac__seek_to_next_frame(drflac* pFlac)
{
    // This function should only ever be called while the decoder is sitting on the first byte past the FRAME_HEADER section.
    assert(pFlac != NULL);
    return drflac__seek_frame(pFlac);
}

static bool drflac__seek_to_frame_containing_sample(drflac* pFlac, uint64_t sampleIndex)
{
    assert(pFlac != NULL);

    if (!drflac__seek_to_first_frame(pFlac)) {
        return false;
    }

    uint64_t firstSampleInFrame = 0;
    uint64_t lastSampleInFrame = 0;
    for (;;)
    {
        // We need to read the frame's header in order to determine the range of samples it contains.
        if (!drflac__read_next_frame_header(pFlac)) {
            return false;
        }

        drflac__get_current_frame_sample_range(pFlac, &firstSampleInFrame, &lastSampleInFrame);
        if (sampleIndex >= firstSampleInFrame && sampleIndex <= lastSampleInFrame) {
            break;  // The sample is in this frame.
        }

        if (!drflac__seek_to_next_frame(pFlac)) {
            return false;
        }
    }

    // If we get here we should be right at the start of the frame containing the sample.
    return true;
}

static bool drflac__seek_to_sample__brute_force(drflac* pFlac, uint64_t sampleIndex)
{
    if (!drflac__seek_to_frame_containing_sample(pFlac, sampleIndex)) {
        return false;
    }

    // At this point we should be sitting on the first byte of the frame containing the sample. We need to decode every sample up to (but
    // not including) the sample we're seeking to.
    uint64_t firstSampleInFrame = 0;
    drflac__get_current_frame_sample_range(pFlac, &firstSampleInFrame, NULL);

    assert(firstSampleInFrame <= sampleIndex);
    size_t samplesToDecode = (size_t)(sampleIndex - firstSampleInFrame);    // <-- Safe cast because the maximum number of samples in a frame is 65535.
    if (samplesToDecode == 0) {
        return true;
    }

    // At this point we are just sitting on the byte after the frame header. We need to decode the frame before reading anything from it.
    if (!drflac__decode_frame(pFlac)) {
        return false;
    }

    return (drflac_read_s16(pFlac, samplesToDecode, NULL) != 0);
}

static bool drflac__seek_to_sample__seek_table(drflac* pFlac, uint64_t sampleIndex)
{
    assert(pFlac != NULL);

    if (pFlac->seektableBlock.pos == 0) {
        return false;
    }

    if (!drflac__seek_to_byte(pFlac, pFlac->seektableBlock.pos)) {
        return false;
    }

    // The number of seek points is derived from the size of the SEEKTABLE block.
    unsigned int seekpointCount = pFlac->seektableBlock.sizeInBytes / 18;   // 18 = the size of each seek point.
    if (seekpointCount == 0) {
        return false;   // Would this ever happen?
    }


    drflac_seekpoint closestSeekpoint = {0};

    unsigned int seekpointsRemaining = seekpointCount;
    while (seekpointsRemaining > 0)
    {
        drflac_seekpoint seekpoint;
        if (!drflac__read_uint64(pFlac, 64, &seekpoint.firstSample)) {
            break;
        }
        if (!drflac__read_uint64(pFlac, 64, &seekpoint.frameOffset)) {
            break;
        }
        if (!drflac__read_uint16(pFlac, 16, &seekpoint.sampleCount)) {
            break;
        }

        if (seekpoint.firstSample * pFlac->channels > sampleIndex) {
            break;
        }

        closestSeekpoint = seekpoint;
        seekpointsRemaining -= 1;
    }

    // At this point we should have found the seekpoint closest to our sample. We need to seek to it using basically the same
    // technique as we use with the brute force method.
    drflac__seek_to_byte(pFlac, pFlac->firstFramePos + closestSeekpoint.frameOffset);

    uint64_t firstSampleInFrame = 0;
    uint64_t lastSampleInFrame = 0;
    for (;;)
    {
        // We need to read the frame's header in order to determine the range of samples it contains.
        if (!drflac__read_next_frame_header(pFlac)) {
            return false;
        }

        drflac__get_current_frame_sample_range(pFlac, &firstSampleInFrame, &lastSampleInFrame);
        if (sampleIndex >= firstSampleInFrame && sampleIndex <= lastSampleInFrame) {
            break;  // The sample is in this frame.
        }

        if (!drflac__seek_to_next_frame(pFlac)) {
            return false;
        }
    }

    assert(firstSampleInFrame <= sampleIndex);

    // At this point we are just sitting on the byte after the frame header. We need to decode the frame before reading anything from it.
    if (!drflac__decode_frame(pFlac)) {
        return false;
    }

    size_t samplesToDecode = (size_t)(sampleIndex - firstSampleInFrame);    // <-- Safe cast because the maximum number of samples in a frame is 65535.
    return drflac_read_s16(pFlac, samplesToDecode, NULL) == samplesToDecode;
}


static drflac* drflac_open(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData)
{
    if (onRead == NULL || onSeek == NULL) {
        return NULL;
    }

    unsigned char id[4];
    if (onRead(pUserData, id, 4) != 4 || id[0] != 'f' || id[1] != 'L' || id[2] != 'a' || id[3] != 'C') {
        return NULL;    // Not a FLAC stream.
    }

    drflac tempFlac;
    memset(&tempFlac, 0, sizeof(tempFlac));
    tempFlac.onRead         = onRead;
    tempFlac.onSeek         = onSeek;
    tempFlac.pUserData      = pUserData;
    tempFlac.currentBytePos = 4;
    tempFlac.nextL2Line     = sizeof(tempFlac.cacheL2) / sizeof(tempFlac.cacheL2[0]); // <-- Initialize to this to force a client-side data retrieval right from the start.
    tempFlac.consumedBits   = sizeof(tempFlac.cache)*8;

    // The first metadata block should be the STREAMINFO block. We don't care about everything in here.
    unsigned int blockSize;
    bool isLastBlock;
    int blockType = drflac__read_block_header(&tempFlac, &blockSize, &isLastBlock);
    if (blockType != DRFLAC_BLOCK_TYPE_STREAMINFO && blockSize != 34) {
        return NULL;
    }

    if (!drflac__seek_bits(&tempFlac, 16)) {   // minBlockSize
        return NULL;
    }
    if (!drflac__read_uint16(&tempFlac, 16, &tempFlac.maxBlockSize)) {
        return NULL;
    }
    if (!drflac__seek_bits(&tempFlac, 48)) {   // minFrameSize + maxFrameSize
        return NULL;
    }
    if (!drflac__read_uint32(&tempFlac, 20, &tempFlac.sampleRate)) {
        return NULL;
    }
    if (!drflac__read_uint8(&tempFlac, 3, &tempFlac.channels)) {
        return NULL;
    }
    if (!drflac__read_uint8(&tempFlac, 5, &tempFlac.bitsPerSample)) {
        return NULL;
    }
    if (!drflac__read_uint64(&tempFlac, 36, &tempFlac.totalSampleCount)) {
        return NULL;
    }
    if (!drflac__seek_bits(&tempFlac, 128)) {  // MD5
        return NULL;
    }

    tempFlac.channels += 1;
    tempFlac.bitsPerSample += 1;
    tempFlac.totalSampleCount *= tempFlac.channels;

    while (!isLastBlock)
    {
        blockType = drflac__read_block_header(&tempFlac, &blockSize, &isLastBlock);

        switch (blockType)
        {
            case DRFLAC_BLOCK_TYPE_APPLICATION:
            {
                tempFlac.applicationBlock.pos = drflac__tell(&tempFlac);
                tempFlac.applicationBlock.sizeInBytes = blockSize;
            } break;

            case DRFLAC_BLOCK_TYPE_SEEKTABLE:
            {
                tempFlac.seektableBlock.pos = drflac__tell(&tempFlac);
                tempFlac.seektableBlock.sizeInBytes = blockSize;
            } break;

            case DRFLAC_BLOCK_TYPE_VORBIS_COMMENT:
            {
                tempFlac.vorbisCommentBlock.pos = drflac__tell(&tempFlac);
                tempFlac.vorbisCommentBlock.sizeInBytes = blockSize;
            } break;

            case DRFLAC_BLOCK_TYPE_CUESHEET:
            {
                tempFlac.cuesheetBlock.pos = drflac__tell(&tempFlac);
                tempFlac.cuesheetBlock.sizeInBytes = blockSize;
            } break;

            case DRFLAC_BLOCK_TYPE_PICTURE:
            {
                tempFlac.pictureBlock.pos = drflac__tell(&tempFlac);
                tempFlac.pictureBlock.sizeInBytes = blockSize;
            } break;


            // These blocks we either don't care about or aren't supporting.
            case DRFLAC_BLOCK_TYPE_PADDING:
            case DRFLAC_BLOCK_TYPE_INVALID:
            default: break;
        }

        if (!drflac__seek_bits(&tempFlac, blockSize*8)) {
            return NULL;
        }
    }


    // At this point we should be sitting right at the start of the very first frame.
    tempFlac.firstFramePos = drflac__tell(&tempFlac);

    drflac* pFlac = (drflac*)malloc(sizeof(*pFlac) - sizeof(pFlac->pExtraData) + (tempFlac.maxBlockSize * tempFlac.channels * sizeof(int32_t)));
    memcpy(pFlac, &tempFlac, sizeof(tempFlac) - sizeof(pFlac->pExtraData));
    pFlac->pDecodedSamples = (int32_t*)pFlac->pExtraData;

    return pFlac;
}

static void drflac_close(drflac* pFlac)
{
    if (pFlac == NULL) {
        return;
    }

#ifndef DR_FLAC_NO_STDIO
    // If we opened the file with drflac_open_file() we will want to close the file handle. We can know whether or not drflac_open_file()
    // was used by looking at the callbacks.
    if (pFlac->onRead == drflac__on_read_stdio) {
#if defined(DR_OPUS_NO_WIN32_IO) || !defined(_WIN32)
        fclose((FILE*)pFlac->pUserData);
#else
        CloseHandle((HANDLE)pFlac->pUserData);
#endif
    }
#endif

    // If we opened the file with drflac_open_memory() we will want to free() the user data.
    if (pFlac->onRead == drflac__on_read_memory) {
        free(pFlac->pUserData);
    }

    free(pFlac);
}

static uint64_t drflac__read_s16__misaligned(drflac* pFlac, uint64_t samplesToRead, int16_t* bufferOut)
{
    unsigned int channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFrame.channelAssignment);

    // We should never be calling this when the number of samples to read is >= the sample count.
    assert(samplesToRead < channelCount);
    assert(pFlac->currentFrame.samplesRemaining > 0 && samplesToRead <= pFlac->currentFrame.samplesRemaining);


    uint64_t samplesRead = 0;
    while (samplesToRead > 0)
    {
        uint64_t totalSamplesInFrame = pFlac->currentFrame.blockSize * channelCount;
        uint64_t samplesReadFromFrameSoFar = totalSamplesInFrame - pFlac->currentFrame.samplesRemaining;
        unsigned int channelIndex = samplesReadFromFrameSoFar % channelCount;

        unsigned long long nextSampleInFrame = samplesReadFromFrameSoFar / channelCount;

        int decodedSample = 0;
        switch (pFlac->currentFrame.channelAssignment)
        {
            case DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE:
            {
                if (channelIndex == 0) {
                    decodedSample = pFlac->currentFrame.subframes[channelIndex].pDecodedSamples[nextSampleInFrame];
                } else {
                    int side = pFlac->currentFrame.subframes[channelIndex + 0].pDecodedSamples[nextSampleInFrame];
                    int left = pFlac->currentFrame.subframes[channelIndex - 1].pDecodedSamples[nextSampleInFrame];
                    decodedSample = left - side;
                }

            } break;

            case DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE:
            {
                if (channelIndex == 0) {
                    int side  = pFlac->currentFrame.subframes[channelIndex + 0].pDecodedSamples[nextSampleInFrame];
                    int right = pFlac->currentFrame.subframes[channelIndex + 1].pDecodedSamples[nextSampleInFrame];
                    decodedSample = side + right;
                } else {
                    decodedSample = pFlac->currentFrame.subframes[channelIndex].pDecodedSamples[nextSampleInFrame];
                }

            } break;

            case DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE:
            {
                int mid;
                int side;
                if (channelIndex == 0) {
                    mid  = pFlac->currentFrame.subframes[channelIndex + 0].pDecodedSamples[nextSampleInFrame];
                    side = pFlac->currentFrame.subframes[channelIndex + 1].pDecodedSamples[nextSampleInFrame];

                    mid = (((unsigned int)mid) << 1) | (side & 0x01);
                    decodedSample = (mid + side) >> 1;
                } else {
                    mid  = pFlac->currentFrame.subframes[channelIndex - 1].pDecodedSamples[nextSampleInFrame];
                    side = pFlac->currentFrame.subframes[channelIndex + 0].pDecodedSamples[nextSampleInFrame];

                    mid = (((unsigned int)mid) << 1) | (side & 0x01);
                    decodedSample = (mid - side) >> 1;
                }

            } break;

            case DRFLAC_CHANNEL_ASSIGNMENT_INDEPENDENT:
            default:
            {
                decodedSample = pFlac->currentFrame.subframes[channelIndex].pDecodedSamples[nextSampleInFrame];
            } break;
        }

        int shift = (16 - pFlac->bitsPerSample) + pFlac->currentFrame.subframes[channelIndex].wastedBitsPerSample;
        if (shift >= 0) {
            decodedSample <<= shift;
        } else {
            decodedSample >>= -shift;
        }

        if (bufferOut) {
            *bufferOut++ = decodedSample;
        }

        samplesRead += 1;
        pFlac->currentFrame.samplesRemaining -= 1;
        samplesToRead -= 1;
    }

    return samplesRead;
}

static uint64_t drflac__seek_forward_by_samples(drflac* pFlac, uint64_t samplesToRead)
{
    uint64_t samplesRead = 0;
    while (samplesToRead > 0)
    {
        if (pFlac->currentFrame.samplesRemaining == 0)
        {
            if (!drflac__read_and_decode_next_frame(pFlac)) {
                break;  // Couldn't read the next frame, so just break from the loop and return.
            }
        }
        else
        {
            samplesRead += 1;
            pFlac->currentFrame.samplesRemaining -= 1;
            samplesToRead -= 1;
        }
    }

    return samplesRead;
}

static uint64_t drflac_read_s16(drflac* pFlac, uint64_t samplesToRead, int16_t* bufferOut)
{
    // Note that <bufferOut> is allowed to be null, in which case this will be treated as something like a seek.
    if (pFlac == NULL || samplesToRead == 0) {
        return 0;
    }

    if (bufferOut == NULL) {
        return drflac__seek_forward_by_samples(pFlac, samplesToRead);
    }


    uint64_t samplesRead = 0;
    while (samplesToRead > 0)
    {
        // If we've run out of samples in this frame, go to the next.
        if (pFlac->currentFrame.samplesRemaining == 0)
        {
            if (!drflac__read_and_decode_next_frame(pFlac)) {
                break;  // Couldn't read the next frame, so just break from the loop and return.
            }
        }
        else
        {
            // Here is where we grab the samples and interleave them.

            unsigned int channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFrame.channelAssignment);
            uint64_t totalSamplesInFrame = pFlac->currentFrame.blockSize * channelCount;
            uint64_t samplesReadFromFrameSoFar = totalSamplesInFrame - pFlac->currentFrame.samplesRemaining;

            int misalignedSampleCount = samplesReadFromFrameSoFar % channelCount;
            if (misalignedSampleCount > 0) {
                uint64_t misalignedSamplesRead = drflac__read_s16__misaligned(pFlac, misalignedSampleCount, bufferOut);
                samplesRead   += misalignedSamplesRead;
                samplesReadFromFrameSoFar += misalignedSamplesRead;
                bufferOut     += misalignedSamplesRead;
                samplesToRead -= misalignedSamplesRead;
            }


            uint64_t alignedSampleCountPerChannel = samplesToRead / channelCount;
            if (alignedSampleCountPerChannel > pFlac->currentFrame.samplesRemaining / channelCount) {
                alignedSampleCountPerChannel = pFlac->currentFrame.samplesRemaining / channelCount;
            }

            uint64_t firstAlignedSampleInFrame = samplesReadFromFrameSoFar / channelCount;
            int unusedBitsPerSample = 16 - pFlac->bitsPerSample;

            if (unusedBitsPerSample >= 0) {
                int lshift0 = unusedBitsPerSample + pFlac->currentFrame.subframes[0].wastedBitsPerSample;
                int lshift1 = unusedBitsPerSample + pFlac->currentFrame.subframes[1].wastedBitsPerSample;

                switch (pFlac->currentFrame.channelAssignment)
                {
                    case DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE:
                    {
                        const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
                        const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;

                        for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
                            int left  = pDecodedSamples0[i];
                            int side  = pDecodedSamples1[i];
                            int right = left - side;

                            bufferOut[i*2+0] = left  << lshift0;
                            bufferOut[i*2+1] = right << lshift1;
                        }
                    } break;

                    case DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE:
                    {
                        const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
                        const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;

                        for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
                            int side  = pDecodedSamples0[i];
                            int right = pDecodedSamples1[i];
                            int left  = right + side;

                            bufferOut[i*2+0] = left  << lshift0;
                            bufferOut[i*2+1] = right << lshift1;
                        }
                    } break;

                    case DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE:
                    {
                        const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
                        const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;

                        for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
                            int side = pDecodedSamples1[i];
                            int mid  = (((uint32_t)pDecodedSamples0[i]) << 1) | (side & 0x01);

                            bufferOut[i*2+0] = ((mid + side) >> 1) << lshift0;
                            bufferOut[i*2+1] = ((mid - side) >> 1) << lshift1;
                        }
                    } break;

                    case DRFLAC_CHANNEL_ASSIGNMENT_INDEPENDENT:
                    default:
                    {
                        if (pFlac->currentFrame.channelAssignment == 1) // 1 = Stereo
                        {
                            // Stereo optimized inner loop unroll.
                            const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
                            const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;

                            for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
                                bufferOut[i*2+0] = pDecodedSamples0[i] << lshift0;
                                bufferOut[i*2+1] = pDecodedSamples1[i] << lshift1;
                            }
                        }
                        else
                        {
                            // Generic interleaving.
                            for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
                                for (unsigned int j = 0; j < channelCount; ++j) {
                                    bufferOut[(i*channelCount)+j] = (pFlac->currentFrame.subframes[j].pDecodedSamples[firstAlignedSampleInFrame + i]) << (unusedBitsPerSample + pFlac->currentFrame.subframes[j].wastedBitsPerSample);
                                }
                            }
                        }
                    } break;
                }
            } else {
                int rshift0 = -unusedBitsPerSample + pFlac->currentFrame.subframes[0].wastedBitsPerSample;
                int rshift1 = -unusedBitsPerSample + pFlac->currentFrame.subframes[1].wastedBitsPerSample;

                switch (pFlac->currentFrame.channelAssignment)
                {
                    case DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE:
                    {
                        const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
                        const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;

                        for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
                            int left  = pDecodedSamples0[i];
                            int side  = pDecodedSamples1[i];
                            int right = left - side;

                            bufferOut[i*2+0] = left  >> rshift0;
                            bufferOut[i*2+1] = right >> rshift1;
                        }
                    } break;

                    case DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE:
                    {
                        const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
                        const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;

                        for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
                            int side  = pDecodedSamples0[i];
                            int right = pDecodedSamples1[i];
                            int left  = right + side;

                            bufferOut[i*2+0] = left  >> rshift0;
                            bufferOut[i*2+1] = right >> rshift1;
                        }
                    } break;

                    case DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE:
                    {
                        const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
                        const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;

                        for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
                            int side = pDecodedSamples1[i];
                            int mid  = (((uint32_t)pDecodedSamples0[i]) << 1) | (side & 0x01);

                            bufferOut[i*2+0] = ((mid + side) >> 1) >> rshift0;
                            bufferOut[i*2+1] = ((mid - side) >> 1) >> rshift1;
                        }
                    } break;

                    case DRFLAC_CHANNEL_ASSIGNMENT_INDEPENDENT:
                    default:
                    {
                        if (pFlac->currentFrame.channelAssignment == 1) // 1 = Stereo
                        {
                            // Stereo optimized inner loop unroll.
                            const int* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + firstAlignedSampleInFrame;
                            const int* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + firstAlignedSampleInFrame;

                            for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
                                bufferOut[i*2+0] = pDecodedSamples0[i] >> rshift0;
                                bufferOut[i*2+1] = pDecodedSamples1[i] >> rshift1;
                            }
                        }
                        else
                        {
                            // Generic interleaving.
                            for (uint64_t i = 0; i < alignedSampleCountPerChannel; ++i) {
                                for (unsigned int j = 0; j < channelCount; ++j) {
                                    bufferOut[(i*channelCount)+j] = (pFlac->currentFrame.subframes[j].pDecodedSamples[firstAlignedSampleInFrame + i]) >> (pFlac->currentFrame.subframes[j].wastedBitsPerSample - unusedBitsPerSample);
                                }
                            }
                        }
                    } break;
                }
            }

            uint64_t alignedSamplesRead = alignedSampleCountPerChannel * channelCount;
            samplesRead   += alignedSamplesRead;
            samplesReadFromFrameSoFar += alignedSamplesRead;
            bufferOut     += alignedSamplesRead;
            samplesToRead -= alignedSamplesRead;
            pFlac->currentFrame.samplesRemaining -= (unsigned int)alignedSamplesRead;


            // At this point we may still have some excess samples left to read.
            if (samplesToRead > 0 && pFlac->currentFrame.samplesRemaining > 0)
            {
                uint64_t excessSamplesRead = 0;
                if (samplesToRead < pFlac->currentFrame.samplesRemaining) {
                    excessSamplesRead = drflac__read_s16__misaligned(pFlac, samplesToRead, bufferOut);
                } else {
                    excessSamplesRead = drflac__read_s16__misaligned(pFlac, pFlac->currentFrame.samplesRemaining, bufferOut);
                }

                samplesRead   += excessSamplesRead;
                samplesReadFromFrameSoFar += excessSamplesRead;
                bufferOut     += excessSamplesRead;
                samplesToRead -= excessSamplesRead;
            }
        }
    }

    return samplesRead;
}

static bool drflac_seek_to_sample(drflac* pFlac, uint64_t sampleIndex)
{
    if (pFlac == NULL) {
        return false;
    }

    if (sampleIndex == 0) {
        return drflac__seek_to_first_frame(pFlac);
    }

    // Clamp the sample to the end.
    if (sampleIndex >= pFlac->totalSampleCount) {
        sampleIndex  = pFlac->totalSampleCount - 1;
    }


    // First try seeking via the seek table. If this fails, fall back to a brute force seek which is much slower.
    if (!drflac__seek_to_sample__seek_table(pFlac, sampleIndex)) {
        return drflac__seek_to_sample__brute_force(pFlac, sampleIndex);
    }

    return true;
}


#endif  //DR_FLAC_IMPLEMENTATION


/*
This is free and unencumbered software released into the public domain.

Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.

In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

For more information, please refer to <http://unlicense.org/>
*/