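The fast implementation uses only `#define` directives. The queue is accelerated in memory as long as the number of elements does not exceed the maximum value of the selected index type ($`2^{16}`$ for `unsigned short`). The buffer size must be a power of two, because element positions are computed as `index & (size - 1)`.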

/*
 * pkr_fast_queue(type, size) - anonymous struct type for a fixed-capacity queue.
 *
 *   type  element type stored in the queue
 *   size  number of slots in the buffer; pkr_fq_push and pkr_fq_front locate
 *         a slot with `index & (size - 1)`, which addresses every slot only
 *         when size is a power of two
 *
 * Members:
 *   buf   element storage
 *   tail  read counter, incremented by pkr_fq_pop
 *   head  write counter, incremented by pkr_fq_push
 *
 * The counters are never reduced modulo size when stored; they are masked
 * only at the moment a slot is accessed.
 *
 * Usage: pkr_fast_queue(int, 16) q = { {0}, 0, 0 };
 */
#define pkr_fast_queue(type, size) \
  struct {                         \
    type buf[(size)];              \
    unsigned tail;                 \
    unsigned head;                 \
  }

/*
 * pkr_fq_elem_count(queue) - number of elements currently stored.
 *
 * Evaluates to head - tail. For queues declared with pkr_fast_queue both
 * counters are unsigned int, so the difference remains correct after the
 * counters wrap around.
 *
 * The argument is parenthesised so that any lvalue expression, e.g. `*qp`,
 * may be passed. It is expanded twice and must not have side effects.
 */
#define pkr_fq_elem_count(queue) ((queue).head - (queue).tail)

/*
 * pkr_fq_size(queue) - capacity of the queue in elements (type size_t).
 *
 * Computed at compile time from the size of the `buf` array; `buf` must be
 * a real array member, not a pointer. The argument is parenthesised so that
 * expressions such as `*qp` may be passed; it only appears inside sizeof
 * and is therefore not evaluated.
 */
#define pkr_fq_size(queue) (sizeof((queue).buf) / sizeof((queue).buf[0]))

/*
 * pkr_fq_full(queue) - non-zero when the queue holds as many elements as
 * its buffer has slots, zero otherwise.
 *
 * The argument is forwarded inside an extra pair of parentheses so that an
 * expression such as `*qp` reaches the helper macros as a single operand.
 * It is expanded more than once and must not have side effects.
 */
#define pkr_fq_full(queue) \
  (pkr_fq_elem_count((queue)) == pkr_fq_size((queue)))

/*
 * pkr_fq_empty(queue) - non-zero when the read and write counters are
 * equal, i.e. the queue holds no elements; zero otherwise.
 *
 * The argument is parenthesised so that any lvalue expression, e.g. `*qp`,
 * may be passed. It is expanded twice and must not have side effects.
 */
#define pkr_fq_empty(queue) ((queue).tail == (queue).head)

/*
 * pkr_fq_free(queue) - number of unused slots: capacity minus the number
 * of stored elements.
 *
 * Uses pkr_fq_elem_count for the element count. The argument is forwarded
 * inside an extra pair of parentheses so that an expression such as `*qp`
 * reaches the helper macros as a single operand. It is expanded more than
 * once and must not have side effects.
 */
#define pkr_fq_free(queue) \
  (pkr_fq_size((queue)) - pkr_fq_elem_count((queue)))

/*
 * pkr_fq_push(queue, elem) - store elem at the write position and advance
 * the write counter by one.
 *
 *   queue  queue object (any lvalue expression, e.g. `q` or `*qp`)
 *   elem   value assigned into the buffer slot
 *
 * The slot is `head & (capacity - 1)`, so the capacity must be a power of
 * two. No fullness check is performed: pushing onto a full queue overwrites
 * the oldest slot. Callers that need protection should test pkr_fq_full
 * first.
 *
 * Wrapped in do { } while (0) so an invocation followed by `;` is exactly
 * one statement and is safe in an unbraced if/else. `queue` is expanded
 * several times and must not have side effects.
 */
#define pkr_fq_push(queue, elem) \
  do {\
    (queue).buf[(queue).head & (pkr_fq_size((queue)) - 1)] = (elem);\
    (queue).head++;\
  } while (0)

/*
 * pkr_fq_front(queue) - the element at the read position, as an lvalue.
 *
 * The slot is `tail & (capacity - 1)`, so the capacity must be a power of
 * two. No emptiness check is performed; on an empty queue this yields
 * whatever the slot currently contains.
 *
 * The argument is parenthesised so that any lvalue expression, e.g. `*qp`,
 * may be passed. It is expanded more than once and must not have side
 * effects.
 */
#define pkr_fq_front(queue) \
  ((queue).buf[(queue).tail & (pkr_fq_size((queue)) - 1)])

/*
 * pkr_fq_pop(queue) - discard the element at the read position by advancing
 * the read counter by one. The buffer contents are not modified.
 *
 * No emptiness check is performed; callers that need protection should test
 * pkr_fq_empty first.
 *
 * Wrapped in do { } while (0) so an invocation followed by `;` is exactly
 * one statement and is safe in an unbraced if/else. The argument is
 * parenthesised so that expressions such as `*qp` may be passed.
 */
#define pkr_fq_pop(queue) \
  do {\
    (queue).tail++;\
  } while (0)

/*
 * pkr_fq_flush(queue) - empty the queue by resetting both the read and the
 * write counter to zero. The buffer contents are left untouched.
 *
 * Wrapped in do { } while (0) so an invocation followed by `;` is exactly
 * one statement and is safe in an unbraced if/else. The argument is
 * parenthesised so that expressions such as `*qp` may be passed; it is
 * expanded twice and must not have side effects.
 */
#define pkr_fq_flush(queue) \
  do {\
    (queue).tail = 0;\
    (queue).head = 0;\
  } while (0)