FFmpeg: libavcodec/alacenc.c Source File

00001 /*
00002  * ALAC audio encoder
00003  * Copyright (c) 2008  Jaikrishnan Menon <realityman@gmx.net>
00004  *
00005  * This file is part of FFmpeg.
00006  *
00007  * FFmpeg is free software; you can redistribute it and/or
00008  * modify it under the terms of the GNU Lesser General Public
00009  * License as published by the Free Software Foundation; either
00010  * version 2.1 of the License, or (at your option) any later version.
00011  *
00012  * FFmpeg is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00015  * Lesser General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU Lesser General Public
00018  * License along with FFmpeg; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00020  */
00021 
00022 #include "avcodec.h"
00023 #include "put_bits.h"
00024 #include "dsputil.h"
00025 #include "internal.h"
00026 #include "lpc.h"
00027 #include "mathops.h"
00028 
/* Frame and stream layout */
#define DEFAULT_FRAME_SIZE        4096  /* samples per channel in a full frame; also sizes the sample buffers */
#define DEFAULT_SAMPLE_SIZE       16    /* bits per input sample */
#define MAX_CHANNELS              8     /* dimension of the per-channel arrays in the encoder context */
#define ALAC_EXTRADATA_SIZE       36    /* bytes of extradata produced at init */
#define ALAC_FRAME_HEADER_SIZE    55    /* presumably bits: 23 fixed + 32 for an explicit sample count */
#define ALAC_FRAME_FOOTER_SIZE    3     /* presumably bits: the 3-bit marker closing each frame */

/* Entropy coding and linear prediction */
#define ALAC_ESCAPE_CODE          0x1FF /* 9-bit prefix announcing a directly written value */
#define ALAC_MAX_LPC_ORDER        30    /* largest accepted prediction order */
#define DEFAULT_MAX_PRED_ORDER    6     /* used when the caller gives no maximum order */
#define DEFAULT_MIN_PRED_ORDER    4     /* used when the caller gives no minimum order */
#define ALAC_MAX_LPC_PRECISION    9     /* precision argument handed to ff_lpc_calc_coefs() */
#define ALAC_MAX_LPC_SHIFT        9     /* shift limit argument handed to ff_lpc_calc_coefs() */

/* Stereo decorrelation modes, as returned by estimate_stereo_mode() */
#define ALAC_CHMODE_LEFT_RIGHT    0
#define ALAC_CHMODE_LEFT_SIDE     1
#define ALAC_CHMODE_RIGHT_SIDE    2
#define ALAC_CHMODE_MID_SIDE      3
00047 
/**
 * Parameters of the adaptive Rice coder.
 */
typedef struct RiceContext {
    int history_mult;       /* multiplier applied in the entropy coder's history update */
    int initial_history;    /* history value every entropy-coding pass starts from */
    int k_modifier;         /* upper bound applied to the Rice parameter k */
    int rice_modifier;      /* written as a 3-bit field in each channel's header */
} RiceContext;
00054 
00055 typedef struct AlacLPCContext {
00056     int lpc_order;
00057     int lpc_coeff[ALAC_MAX_LPC_ORDER+1];
00058     int lpc_quant;
00059 } AlacLPCContext;
00060 
00061 typedef struct AlacEncodeContext {
00062     int frame_size;                     
00063     int verbatim;                       
00064     int compression_level;
00065     int min_prediction_order;
00066     int max_prediction_order;
00067     int max_coded_frame_size;
00068     int write_sample_size;
00069     int32_t sample_buf[MAX_CHANNELS][DEFAULT_FRAME_SIZE];
00070     int32_t predictor_buf[DEFAULT_FRAME_SIZE];
00071     int interlacing_shift;
00072     int interlacing_leftweight;
00073     PutBitContext pbctx;
00074     RiceContext rc;
00075     AlacLPCContext lpc[MAX_CHANNELS];
00076     LPCContext lpc_ctx;
00077     AVCodecContext *avctx;
00078 } AlacEncodeContext;
00079 
00080 
00081 static void init_sample_buffers(AlacEncodeContext *s,
00082                                 const int16_t *input_samples)
00083 {
00084     int ch, i;
00085 
00086     for (ch = 0; ch < s->avctx->channels; ch++) {
00087         const int16_t *sptr = input_samples + ch;
00088         for (i = 0; i < s->frame_size; i++) {
00089             s->sample_buf[ch][i] = *sptr;
00090             sptr += s->avctx->channels;
00091         }
00092     }
00093 }
00094 
00095 static void encode_scalar(AlacEncodeContext *s, int x,
00096                           int k, int write_sample_size)
00097 {
00098     int divisor, q, r;
00099 
00100     k = FFMIN(k, s->rc.k_modifier);
00101     divisor = (1<<k) - 1;
00102     q = x / divisor;
00103     r = x % divisor;
00104 
00105     if (q > 8) {
00106         // write escape code and sample value directly
00107         put_bits(&s->pbctx, 9, ALAC_ESCAPE_CODE);
00108         put_bits(&s->pbctx, write_sample_size, x);
00109     } else {
00110         if (q)
00111             put_bits(&s->pbctx, q, (1<<q) - 1);
00112         put_bits(&s->pbctx, 1, 0);
00113 
00114         if (k != 1) {
00115             if (r > 0)
00116                 put_bits(&s->pbctx, k, r+1);
00117             else
00118                 put_bits(&s->pbctx, k-1, 0);
00119         }
00120     }
00121 }
00122 
00123 static void write_frame_header(AlacEncodeContext *s)
00124 {
00125     int encode_fs = 0;
00126 
00127     if (s->frame_size < DEFAULT_FRAME_SIZE)
00128         encode_fs = 1;
00129 
00130     put_bits(&s->pbctx, 3,  s->avctx->channels-1);  // No. of channels -1
00131     put_bits(&s->pbctx, 16, 0);                     // Seems to be zero
00132     put_bits(&s->pbctx, 1,  encode_fs);             // Sample count is in the header
00133     put_bits(&s->pbctx, 2,  0);                     // FIXME: Wasted bytes field
00134     put_bits(&s->pbctx, 1,  s->verbatim);           // Audio block is verbatim
00135     if (encode_fs)
00136         put_bits32(&s->pbctx, s->frame_size);       // No. of samples in the frame
00137 }
00138 
00139 static void calc_predictor_params(AlacEncodeContext *s, int ch)
00140 {
00141     int32_t coefs[MAX_LPC_ORDER][MAX_LPC_ORDER];
00142     int shift[MAX_LPC_ORDER];
00143     int opt_order;
00144 
00145     if (s->compression_level == 1) {
00146         s->lpc[ch].lpc_order = 6;
00147         s->lpc[ch].lpc_quant = 6;
00148         s->lpc[ch].lpc_coeff[0] =  160;
00149         s->lpc[ch].lpc_coeff[1] = -190;
00150         s->lpc[ch].lpc_coeff[2] =  170;
00151         s->lpc[ch].lpc_coeff[3] = -130;
00152         s->lpc[ch].lpc_coeff[4] =   80;
00153         s->lpc[ch].lpc_coeff[5] =  -25;
00154     } else {
00155         opt_order = ff_lpc_calc_coefs(&s->lpc_ctx, s->sample_buf[ch],
00156                                       s->frame_size,
00157                                       s->min_prediction_order,
00158                                       s->max_prediction_order,
00159                                       ALAC_MAX_LPC_PRECISION, coefs, shift,
00160                                       FF_LPC_TYPE_LEVINSON, 0,
00161                                       ORDER_METHOD_EST, ALAC_MAX_LPC_SHIFT, 1);
00162 
00163         s->lpc[ch].lpc_order = opt_order;
00164         s->lpc[ch].lpc_quant = shift[opt_order-1];
00165         memcpy(s->lpc[ch].lpc_coeff, coefs[opt_order-1], opt_order*sizeof(int));
00166     }
00167 }
00168 
/**
 * Estimate which stereo decorrelation mode is cheapest to code.
 *
 * The absolute 2nd order residual is summed for the left, right, mid and
 * side signals. Each mode is scored by the two signals it would transmit.
 *
 * @param left_ch  left channel samples
 * @param right_ch right channel samples
 * @param n        number of samples per channel
 * @return mode index: 0 left/right, 1 left/side, 2 right/side, 3 mid/side;
 *         on a tie the lowest index wins
 */
static int estimate_stereo_mode(int32_t *left_ch, int32_t *right_ch, int n)
{
    uint64_t left_sum = 0, right_sum = 0, mid_sum = 0, side_sum = 0;
    uint64_t score[4];
    int i, mode, best = 0;

    /* the 2nd order residual needs two previous samples, so start at 2 */
    for (i = 2; i < n; i++) {
        int32_t l = left_ch[i]  - 2 * left_ch[i - 1]  + left_ch[i - 2];
        int32_t r = right_ch[i] - 2 * right_ch[i - 1] + right_ch[i - 2];
        int32_t mid  = (l + r) >> 1;
        int32_t side = l - r;

        mid_sum   += mid  >= 0 ? mid  : -mid;
        side_sum  += side >= 0 ? side : -side;
        left_sum  += l    >= 0 ? l    : -l;
        right_sum += r    >= 0 ? r    : -r;
    }

    score[0] = left_sum  + right_sum;   /* left/right */
    score[1] = left_sum  + side_sum;    /* left/side  */
    score[2] = right_sum + side_sum;    /* right/side */
    score[3] = mid_sum   + side_sum;    /* mid/side   */

    /* strict comparison keeps the earliest mode when scores are equal */
    for (mode = 1; mode < 4; mode++) {
        if (score[mode] < score[best])
            best = mode;
    }
    return best;
}
00201 
00202 static void alac_stereo_decorrelation(AlacEncodeContext *s)
00203 {
00204     int32_t *left = s->sample_buf[0], *right = s->sample_buf[1];
00205     int i, mode, n = s->frame_size;
00206     int32_t tmp;
00207 
00208     mode = estimate_stereo_mode(left, right, n);
00209 
00210     switch (mode) {
00211     case ALAC_CHMODE_LEFT_RIGHT:
00212         s->interlacing_leftweight = 0;
00213         s->interlacing_shift      = 0;
00214         break;
00215     case ALAC_CHMODE_LEFT_SIDE:
00216         for (i = 0; i < n; i++)
00217             right[i] = left[i] - right[i];
00218         s->interlacing_leftweight = 1;
00219         s->interlacing_shift      = 0;
00220         break;
00221     case ALAC_CHMODE_RIGHT_SIDE:
00222         for (i = 0; i < n; i++) {
00223             tmp = right[i];
00224             right[i] = left[i] - right[i];
00225             left[i]  = tmp + (right[i] >> 31);
00226         }
00227         s->interlacing_leftweight = 1;
00228         s->interlacing_shift      = 31;
00229         break;
00230     default:
00231         for (i = 0; i < n; i++) {
00232             tmp = left[i];
00233             left[i]  = (tmp + right[i]) >> 1;
00234             right[i] =  tmp - right[i];
00235         }
00236         s->interlacing_leftweight = 1;
00237         s->interlacing_shift      = 1;
00238         break;
00239     }
00240 }
00241 
00242 static void alac_linear_predictor(AlacEncodeContext *s, int ch)
00243 {
00244     int i;
00245     AlacLPCContext lpc = s->lpc[ch];
00246 
00247     if (lpc.lpc_order == 31) {
00248         s->predictor_buf[0] = s->sample_buf[ch][0];
00249 
00250         for (i = 1; i < s->frame_size; i++) {
00251             s->predictor_buf[i] = s->sample_buf[ch][i    ] -
00252                                   s->sample_buf[ch][i - 1];
00253         }
00254 
00255         return;
00256     }
00257 
00258     // generalised linear predictor
00259 
00260     if (lpc.lpc_order > 0) {
00261         int32_t *samples  = s->sample_buf[ch];
00262         int32_t *residual = s->predictor_buf;
00263 
00264         // generate warm-up samples
00265         residual[0] = samples[0];
00266         for (i = 1; i <= lpc.lpc_order; i++)
00267             residual[i] = samples[i] - samples[i-1];
00268 
00269         // perform lpc on remaining samples
00270         for (i = lpc.lpc_order + 1; i < s->frame_size; i++) {
00271             int sum = 1 << (lpc.lpc_quant - 1), res_val, j;
00272 
00273             for (j = 0; j < lpc.lpc_order; j++) {
00274                 sum += (samples[lpc.lpc_order-j] - samples[0]) *
00275                        lpc.lpc_coeff[j];
00276             }
00277 
00278             sum >>= lpc.lpc_quant;
00279             sum += samples[0];
00280             residual[i] = sign_extend(samples[lpc.lpc_order+1] - sum,
00281                                       s->write_sample_size);
00282             res_val = residual[i];
00283 
00284             if (res_val) {
00285                 int index = lpc.lpc_order - 1;
00286                 int neg = (res_val < 0);
00287 
00288                 while (index >= 0 && (neg ? (res_val < 0) : (res_val > 0))) {
00289                     int val  = samples[0] - samples[lpc.lpc_order - index];
00290                     int sign = (val ? FFSIGN(val) : 0);
00291 
00292                     if (neg)
00293                         sign *= -1;
00294 
00295                     lpc.lpc_coeff[index] -= sign;
00296                     val *= sign;
00297                     res_val -= (val >> lpc.lpc_quant) * (lpc.lpc_order - index);
00298                     index--;
00299                 }
00300             }
00301             samples++;
00302         }
00303     }
00304 }
00305 
00306 static void alac_entropy_coder(AlacEncodeContext *s)
00307 {
00308     unsigned int history = s->rc.initial_history;
00309     int sign_modifier = 0, i, k;
00310     int32_t *samples = s->predictor_buf;
00311 
00312     for (i = 0; i < s->frame_size;) {
00313         int x;
00314 
00315         k = av_log2((history >> 9) + 3);
00316 
00317         x  = -2 * (*samples) -1;
00318         x ^= x >> 31;
00319 
00320         samples++;
00321         i++;
00322 
00323         encode_scalar(s, x - sign_modifier, k, s->write_sample_size);
00324 
00325         history += x * s->rc.history_mult -
00326                    ((history * s->rc.history_mult) >> 9);
00327 
00328         sign_modifier = 0;
00329         if (x > 0xFFFF)
00330             history = 0xFFFF;
00331 
00332         if (history < 128 && i < s->frame_size) {
00333             unsigned int block_size = 0;
00334 
00335             k = 7 - av_log2(history) + ((history + 16) >> 6);
00336 
00337             while (*samples == 0 && i < s->frame_size) {
00338                 samples++;
00339                 i++;
00340                 block_size++;
00341             }
00342             encode_scalar(s, block_size, k, 16);
00343             sign_modifier = (block_size <= 0xFFFF);
00344             history = 0;
00345         }
00346 
00347     }
00348 }
00349 
00350 static int write_frame(AlacEncodeContext *s, AVPacket *avpkt,
00351                        const int16_t *samples)
00352 {
00353     int i, j;
00354     int prediction_type = 0;
00355     PutBitContext *pb = &s->pbctx;
00356 
00357     init_put_bits(pb, avpkt->data, avpkt->size);
00358 
00359     if (s->verbatim) {
00360         write_frame_header(s);
00361         for (i = 0; i < s->frame_size * s->avctx->channels; i++)
00362             put_sbits(pb, 16, *samples++);
00363     } else {
00364         init_sample_buffers(s, samples);
00365         write_frame_header(s);
00366 
00367         if (s->avctx->channels == 2)
00368             alac_stereo_decorrelation(s);
00369         put_bits(pb, 8, s->interlacing_shift);
00370         put_bits(pb, 8, s->interlacing_leftweight);
00371 
00372         for (i = 0; i < s->avctx->channels; i++) {
00373             calc_predictor_params(s, i);
00374 
00375             put_bits(pb, 4, prediction_type);
00376             put_bits(pb, 4, s->lpc[i].lpc_quant);
00377 
00378             put_bits(pb, 3, s->rc.rice_modifier);
00379             put_bits(pb, 5, s->lpc[i].lpc_order);
00380             // predictor coeff. table
00381             for (j = 0; j < s->lpc[i].lpc_order; j++)
00382                 put_sbits(pb, 16, s->lpc[i].lpc_coeff[j]);
00383         }
00384 
00385         // apply lpc and entropy coding to audio samples
00386 
00387         for (i = 0; i < s->avctx->channels; i++) {
00388             alac_linear_predictor(s, i);
00389 
00390             // TODO: determine when this will actually help. for now it's not used.
00391             if (prediction_type == 15) {
00392                 // 2nd pass 1st order filter
00393                 for (j = s->frame_size - 1; j > 0; j--)
00394                     s->predictor_buf[j] -= s->predictor_buf[j - 1];
00395             }
00396 
00397             alac_entropy_coder(s);
00398         }
00399     }
00400     put_bits(pb, 3, 7);
00401     flush_put_bits(pb);
00402     return put_bits_count(pb) >> 3;
00403 }
00404 
00405 static av_always_inline int get_max_frame_size(int frame_size, int ch, int bps)
00406 {
00407     int header_bits = 23 + 32 * (frame_size < DEFAULT_FRAME_SIZE);
00408     return FFALIGN(header_bits + bps * ch * frame_size + 3, 8) / 8;
00409 }
00410 
00411 static av_cold int alac_encode_close(AVCodecContext *avctx)
00412 {
00413     AlacEncodeContext *s = avctx->priv_data;
00414     ff_lpc_end(&s->lpc_ctx);
00415     av_freep(&avctx->extradata);
00416     avctx->extradata_size = 0;
00417     av_freep(&avctx->coded_frame);
00418     return 0;
00419 }
00420 
00421 static av_cold int alac_encode_init(AVCodecContext *avctx)
00422 {
00423     AlacEncodeContext *s = avctx->priv_data;
00424     int ret;
00425     uint8_t *alac_extradata;
00426 
00427     avctx->frame_size = s->frame_size = DEFAULT_FRAME_SIZE;
00428 
00429     if (avctx->sample_fmt != AV_SAMPLE_FMT_S16) {
00430         av_log(avctx, AV_LOG_ERROR, "only pcm_s16 input samples are supported\n");
00431         return -1;
00432     }
00433 
00434     /* TODO: Correctly implement multi-channel ALAC.
00435              It is similar to multi-channel AAC, in that it has a series of
00436              single-channel (SCE), channel-pair (CPE), and LFE elements. */
00437     if (avctx->channels > 2) {
00438         av_log(avctx, AV_LOG_ERROR, "only mono or stereo input is currently supported\n");
00439         return AVERROR_PATCHWELCOME;
00440     }
00441 
00442     // Set default compression level
00443     if (avctx->compression_level == FF_COMPRESSION_DEFAULT)
00444         s->compression_level = 2;
00445     else
00446         s->compression_level = av_clip(avctx->compression_level, 0, 2);
00447 
00448     // Initialize default Rice parameters
00449     s->rc.history_mult    = 40;
00450     s->rc.initial_history = 10;
00451     s->rc.k_modifier      = 14;
00452     s->rc.rice_modifier   = 4;
00453 
00454     s->max_coded_frame_size = get_max_frame_size(avctx->frame_size,
00455                                                  avctx->channels,
00456                                                  DEFAULT_SAMPLE_SIZE);
00457 
00458     // FIXME: consider wasted_bytes
00459     s->write_sample_size  = DEFAULT_SAMPLE_SIZE + avctx->channels - 1;
00460 
00461     avctx->extradata = av_mallocz(ALAC_EXTRADATA_SIZE + FF_INPUT_BUFFER_PADDING_SIZE);
00462     if (!avctx->extradata) {
00463         ret = AVERROR(ENOMEM);
00464         goto error;
00465     }
00466     avctx->extradata_size = ALAC_EXTRADATA_SIZE;
00467 
00468     alac_extradata = avctx->extradata;
00469     AV_WB32(alac_extradata,    ALAC_EXTRADATA_SIZE);
00470     AV_WB32(alac_extradata+4,  MKBETAG('a','l','a','c'));
00471     AV_WB32(alac_extradata+12, avctx->frame_size);
00472     AV_WB8 (alac_extradata+17, DEFAULT_SAMPLE_SIZE);
00473     AV_WB8 (alac_extradata+21, avctx->channels);
00474     AV_WB32(alac_extradata+24, s->max_coded_frame_size);
00475     AV_WB32(alac_extradata+28,
00476             avctx->sample_rate * avctx->channels * DEFAULT_SAMPLE_SIZE); // average bitrate
00477     AV_WB32(alac_extradata+32, avctx->sample_rate);
00478 
00479     // Set relevant extradata fields
00480     if (s->compression_level > 0) {
00481         AV_WB8(alac_extradata+18, s->rc.history_mult);
00482         AV_WB8(alac_extradata+19, s->rc.initial_history);
00483         AV_WB8(alac_extradata+20, s->rc.k_modifier);
00484     }
00485 
00486     s->min_prediction_order = DEFAULT_MIN_PRED_ORDER;
00487     if (avctx->min_prediction_order >= 0) {
00488         if (avctx->min_prediction_order < MIN_LPC_ORDER ||
00489            avctx->min_prediction_order > ALAC_MAX_LPC_ORDER) {
00490             av_log(avctx, AV_LOG_ERROR, "invalid min prediction order: %d\n",
00491                    avctx->min_prediction_order);
00492             ret = AVERROR(EINVAL);
00493             goto error;
00494         }
00495 
00496         s->min_prediction_order = avctx->min_prediction_order;
00497     }
00498 
00499     s->max_prediction_order = DEFAULT_MAX_PRED_ORDER;
00500     if (avctx->max_prediction_order >= 0) {
00501         if (avctx->max_prediction_order < MIN_LPC_ORDER ||
00502             avctx->max_prediction_order > ALAC_MAX_LPC_ORDER) {
00503             av_log(avctx, AV_LOG_ERROR, "invalid max prediction order: %d\n",
00504                    avctx->max_prediction_order);
00505             ret = AVERROR(EINVAL);
00506             goto error;
00507         }
00508 
00509         s->max_prediction_order = avctx->max_prediction_order;
00510     }
00511 
00512     if (s->max_prediction_order < s->min_prediction_order) {
00513         av_log(avctx, AV_LOG_ERROR,
00514                "invalid prediction orders: min=%d max=%d\n",
00515                s->min_prediction_order, s->max_prediction_order);
00516         ret = AVERROR(EINVAL);
00517         goto error;
00518     }
00519 
00520     avctx->coded_frame = avcodec_alloc_frame();
00521     if (!avctx->coded_frame) {
00522         ret = AVERROR(ENOMEM);
00523         goto error;
00524     }
00525 
00526     s->avctx = avctx;
00527 
00528     if ((ret = ff_lpc_init(&s->lpc_ctx, avctx->frame_size,
00529                            s->max_prediction_order,
00530                            FF_LPC_TYPE_LEVINSON)) < 0) {
00531         goto error;
00532     }
00533 
00534     return 0;
00535 error:
00536     alac_encode_close(avctx);
00537     return ret;
00538 }
00539 
00540 static int alac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
00541                              const AVFrame *frame, int *got_packet_ptr)
00542 {
00543     AlacEncodeContext *s = avctx->priv_data;
00544     int out_bytes, max_frame_size, ret;
00545     const int16_t *samples = (const int16_t *)frame->data[0];
00546 
00547     s->frame_size = frame->nb_samples;
00548 
00549     if (avctx->frame_size < DEFAULT_FRAME_SIZE)
00550         max_frame_size = get_max_frame_size(s->frame_size, avctx->channels,
00551                                             DEFAULT_SAMPLE_SIZE);
00552     else
00553         max_frame_size = s->max_coded_frame_size;
00554 
00555     if ((ret = ff_alloc_packet2(avctx, avpkt, 2 * max_frame_size)))
00556         return ret;
00557 
00558     /* use verbatim mode for compression_level 0 */
00559     s->verbatim = !s->compression_level;
00560 
00561     out_bytes = write_frame(s, avpkt, samples);
00562 
00563     if (out_bytes > max_frame_size) {
00564         /* frame too large. use verbatim mode */
00565         s->verbatim = 1;
00566         out_bytes = write_frame(s, avpkt, samples);
00567     }
00568 
00569     avpkt->size = out_bytes;
00570     *got_packet_ptr = 1;
00571     return 0;
00572 }
00573 
00574 AVCodec ff_alac_encoder = {
00575     .name           = "alac",
00576     .type           = AVMEDIA_TYPE_AUDIO,
00577     .id             = CODEC_ID_ALAC,
00578     .priv_data_size = sizeof(AlacEncodeContext),
00579     .init           = alac_encode_init,
00580     .encode2        = alac_encode_frame,
00581     .close          = alac_encode_close,
00582     .capabilities   = CODEC_CAP_SMALL_LAST_FRAME,
00583     .sample_fmts    = (const enum AVSampleFormat[]){ AV_SAMPLE_FMT_S16,
00584                                                      AV_SAMPLE_FMT_NONE },
00585     .long_name      = NULL_IF_CONFIG_SMALL("ALAC (Apple Lossless Audio Codec)"),
00586 };