Home Reference Source

src/utils/mp4-tools.ts

  1. import { sliceUint8 } from './typed-array';
  2. import { ElementaryStreamTypes } from '../loader/fragment';
  3.  
  /**
   * A view onto one MP4 box inside a larger buffer.
   *
   * `findBox` produces these with `start` pointing just past the 8-byte box
   * header and `end` pointing at the offset where the box finishes.
   */
  interface Mp4BoxData {
    /** Buffer that holds the box (not a copy of the payload). */
    data: Uint8Array;
    /** Offset into `data` of the first payload byte. */
    start: number;
    /** Offset into `data` at which the box ends. */
    end: number;
  }
  9.  
  // Largest value that fits in an unsigned 32-bit integer (0xffffffff).
  const UINT32_MAX = 2 ** 32 - 1;
  // Shared reference to Array.prototype.push, used with `apply` to append one
  // array onto another in place.
  const push = Array.prototype.push;
  12.  
  /**
   * Converts raw bytes to a string, mapping each byte to the UTF-16 code unit
   * with the same numeric value (e.g. the four bytes of a box type to 'moov').
   *
   * @param buffer bytes to convert
   * @returns a string containing exactly one character per input byte
   */
  export function bin2str(buffer: Uint8Array): string {
    // Passing a whole buffer to a single fromCharCode call turns every byte
    // into a function argument, which throws a RangeError once the buffer is
    // larger than the engine's argument limit. Convert in bounded chunks.
    const CHUNK_SIZE = 0x8000;
    let result = '';
    for (let i = 0; i < buffer.length; i += CHUNK_SIZE) {
      const chunk = buffer.subarray(i, i + CHUNK_SIZE);
      // apply() accepts any array-like at runtime; the cast only satisfies
      // the stricter `number[]` signature used by the type checker.
      result += String.fromCharCode.apply(null, chunk as unknown as number[]);
    }
    return result;
  }
  16.  
  /**
   * Reads a big-endian unsigned 16-bit integer.
   *
   * @param buffer raw bytes, or a box descriptor; for a descriptor `offset`
   * is taken relative to the descriptor's `start`
   * @param offset byte offset of the most significant byte
   * @returns the decoded value in the range 0..65535
   */
  export function readUint16(
    buffer: Uint8Array | Mp4BoxData,
    offset: number
  ): number {
    const bytes = 'data' in buffer ? buffer.data : buffer;
    const position = 'data' in buffer ? buffer.start + offset : offset;

    // Two bytes combined this way never set the sign bit of the 32-bit
    // intermediate, so no negative-value correction is required.
    return (bytes[position] << 8) | bytes[position + 1];
  }
  30.  
  /**
   * Reads a big-endian unsigned 32-bit integer.
   *
   * @param buffer raw bytes, or a box descriptor; for a descriptor `offset`
   * is taken relative to the descriptor's `start`
   * @param offset byte offset of the most significant byte
   * @returns the decoded value in the range 0..4294967295
   */
  export function readUint32(
    buffer: Uint8Array | Mp4BoxData,
    offset: number
  ): number {
    let bytes: Uint8Array;
    let position = offset;
    if ('data' in buffer) {
      bytes = buffer.data;
      position += buffer.start;
    } else {
      bytes = buffer;
    }

    const signed =
      (bytes[position] << 24) |
      (bytes[position + 1] << 16) |
      (bytes[position + 2] << 8) |
      bytes[position + 3];

    // Bitwise operators yield a signed 32-bit result; `>>> 0` reinterprets it
    // as unsigned so values with the top bit set come back positive.
    return signed >>> 0;
  }
  47.  
  /**
   * Writes `value` as a big-endian 32-bit integer, in place.
   *
   * @param buffer raw bytes, or a box descriptor; for a descriptor `offset`
   * is taken relative to the descriptor's `start`
   * @param offset byte offset at which the most significant byte is stored
   * @param value number to store; only its low 32 bits are written
   */
  export function writeUint32(
    buffer: Uint8Array | Mp4BoxData,
    offset: number,
    value: number
  ) {
    const target = 'data' in buffer ? buffer.data : buffer;
    let cursor = 'data' in buffer ? buffer.start + offset : offset;

    // Emit the four bytes from most to least significant.
    for (let shift = 24; shift >= 0; shift -= 8) {
      target[cursor++] = (value >> shift) & 0xff;
    }
  }
  62.  
  /**
   * Collects every box reachable through a path of box types.
   *
   * The search walks the boxes laid out between the start and end of `input`;
   * each box whose type equals the first path element is either reported
   * (when it is the last element) or searched recursively with the rest of
   * the path.
   *
   * @param input raw bytes, or a box descriptor whose payload is searched
   * @param path box types from outermost to innermost, e.g. ['moof', 'traf']
   * @returns descriptors of the matching boxes (payload start, box end);
   * empty when the path is empty or nothing matches
   */
  export function findBox(
    input: Uint8Array | Mp4BoxData,
    path: Array<string>
  ): Array<Mp4BoxData> {
    const found: Array<Mp4BoxData> = [];
    if (path.length === 0) {
      // nothing to look for
      return found;
    }

    const region: Mp4BoxData =
      'data' in input
        ? input
        : { data: input, start: 0, end: input.byteLength };
    const bytes = region.data;
    const limit = region.end;
    const [wanted, ...remainingPath] = path;

    let cursor = region.start;
    while (cursor < limit) {
      const boxSize = readUint32(bytes, cursor);
      const boxType = bin2str(bytes.subarray(cursor + 4, cursor + 8));
      // a size of 0 or 1 is treated as "runs to the end of the region"
      const boxEnd = boxSize > 1 ? cursor + boxSize : limit;

      if (boxType === wanted) {
        const payload: Mp4BoxData = {
          data: bytes,
          start: cursor + 8,
          end: boxEnd,
        };
        if (remainingPath.length === 0) {
          // last path element: this is a box the caller asked for
          found.push(payload);
        } else {
          // descend into the box and keep following the path
          push.apply(found, findBox(payload, remainingPath));
        }
      }
      cursor = boxEnd;
    }

    return found;
  }
  114.  
  /** Result of parsing a segment index (`sidx`) box. */
  interface SidxInfo {
    /** Earliest presentation time reported for the indexed material. */
    earliestPresentationTime: number;
    /** Ticks per second used by the subsegment durations. */
    timescale: number;
    /** Version of the `sidx` box. */
    version: number;
    /** Number of references declared by the box. */
    referencesCount: number;
    /** One entry per parsed reference. */
    references: any[];
    /** End offset of the `moov` box, or null when no `moov` box was found. */
    moovEndOffset: number | null;
  }
  123.  
  /**
   * Parses the first top-level `sidx` (segment index) box in the given bytes.
   *
   * Payload layout (offsets relative to the payload start):
   * version/flags (4), reference_ID (4), timescale (4),
   * earliest_presentation_time and first_offset (4 bytes each in version 0,
   * 8 bytes each otherwise), reserved (2), reference_count (2), followed by
   * 12 bytes per reference.
   *
   * @param initSegment bytes to search for `moov` and `sidx` boxes
   * @returns the parsed index, or null when there is no `sidx` box or the
   * box contains hierarchical references (references to other `sidx` boxes)
   */
  export function parseSegmentIndex(initSegment: Uint8Array): SidxInfo | null {
    // The end of the moov box lets callers chop off garbage at the end of
    // the current data.
    const moov = findBox(initSegment, ['moov'])[0];
    const moovEndOffset = moov ? moov.end : null;

    const sidx = findBox(initSegment, ['sidx'])[0];
    if (!sidx) {
      return null;
    }

    // 64-bit fields are combined into a JS number (exact up to 2^53 - 1).
    const readUint64 = (offset: number): number =>
      readUint32(sidx, offset) * Math.pow(2, 32) + readUint32(sidx, offset + 4);

    // The version is the first byte of the sidx payload, not of the buffer.
    const version = sidx.data[sidx.start];

    // Skip version/flags and the reference ID (not needed); the timescale
    // sits at the same offset in every version.
    let index = 8;
    const timescale = readUint32(sidx, index);
    index += 4;

    let earliestPresentationTime: number;
    let firstOffset: number;
    if (version === 0) {
      earliestPresentationTime = readUint32(sidx, index);
      firstOffset = readUint32(sidx, index + 4);
      index += 8;
    } else {
      earliestPresentationTime = readUint64(index);
      firstOffset = readUint64(index + 8);
      index += 16;
    }

    // skip reserved
    index += 2;

    const referencesCount = readUint16(sidx, index);
    index += 2;

    // Referenced bytes begin `first_offset` bytes after the end of the box.
    let startByte = sidx.end + firstOffset;

    const references: any[] = [];
    for (let i = 0; i < referencesCount; i++) {
      const referenceInfo = readUint32(sidx, index);
      // top bit: reference type, low 31 bits: referenced size
      const referenceType = referenceInfo >>> 31;
      const referenceSize = referenceInfo & 0x7fffffff;

      if (referenceType === 1) {
        // eslint-disable-next-line no-console
        console.warn('SIDX has hierarchical references (not supported)');
        return null;
      }

      const subsegmentDuration = readUint32(sidx, index + 4);

      references.push({
        referenceSize,
        subsegmentDuration, // unscaled
        info: {
          duration: subsegmentDuration / timescale,
          start: startByte,
          end: startByte + referenceSize - 1,
        },
      });

      startByte += referenceSize;

      // Each reference is 12 bytes: type/size, duration, and a final word
      // (1 bit |startsWithSap|, 3 bits |sapType|, 28 bits |sapDelta|) that
      // is skipped.
      index += 12;
    }

    return {
      earliestPresentationTime,
      timescale,
      version,
      referencesCount,
      references,
      moovEndOffset,
    };
  }
  212.  
  213. /**
  214. * Parses an MP4 initialization segment and extracts stream type and
  215. * timescale values for any declared tracks. Timescale values indicate the
  216. * number of clock ticks per second to assume for time-based values
  217. * elsewhere in the MP4.
  218. *
  219. * To determine the start time of an MP4, you need two pieces of
  220. * information: the timescale unit and the earliest base media decode
  221. * time. Multiple timescales can be specified within an MP4 but the
  222. * base media decode time is always expressed in the timescale from
  223. * the media header box for the track:
  224. * ```
  225. * moov > trak > mdia > mdhd.timescale
  226. * moov > trak > mdia > hdlr
  227. * ```
  228. * @param initSegment {Uint8Array} the bytes of the init segment
  229. * @return {InitData} an array indexed by track id that also carries `audio`
  230. * and `video` entries; it is empty when no supported track is found.
  231. */
  232.  
  /** Summary of a track, stored on InitData under its `audio`/`video` key. */
  type InitDataTrack = {
    timescale: number;
    id: number;
    codec: string;
  };

  /** Handler types that parseInitSegment records. */
  type HdlrType = ElementaryStreamTypes.AUDIO | ElementaryStreamTypes.VIDEO;

  /** Entry stored on InitData under a numeric track id. */
  type InitDataEntry = {
    timescale: number;
    type: HdlrType;
    /** Defaults read from the track's `trex` box, when one is present. */
    default?: {
      duration: number;
      flags: number;
    };
  };

  /**
   * Track information extracted from an init segment: a sparse array indexed
   * by track id, plus optional `audio` and `video` summaries.
   */
  export interface InitData extends Array<any> {
    [index: number]: InitDataEntry | undefined;
    audio?: InitDataTrack;
    video?: InitDataTrack;
  }
  255.  
  /**
   * Extracts track id, timescale, handler type and codec tag for the audio
   * and video tracks declared in an init segment, then attaches the default
   * sample duration and flags from any matching `trex` box.
   *
   * @param initSegment the bytes of the init segment
   * @returns an array indexed by track id that also carries `audio`/`video`
   * entries; tracks missing a tkhd, mdhd or hdlr box, or with another
   * handler type, are skipped
   */
  export function parseInitSegment(initSegment: Uint8Array): InitData {
    const result: InitData = [];

    findBox(initSegment, ['moov', 'trak']).forEach((trak) => {
      const tkhd = findBox(trak, ['tkhd'])[0];
      if (!tkhd) {
        return;
      }
      // the field offset depends on the box version (first payload byte)
      const trackId = readUint32(tkhd, tkhd.data[tkhd.start] === 0 ? 12 : 20);

      const mdhd = findBox(trak, ['mdia', 'mdhd'])[0];
      if (!mdhd) {
        return;
      }
      const timescale = readUint32(mdhd, mdhd.data[mdhd.start] === 0 ? 12 : 20);

      const hdlr = findBox(trak, ['mdia', 'hdlr'])[0];
      if (!hdlr) {
        return;
      }
      const handlerName = bin2str(
        hdlr.data.subarray(hdlr.start + 8, hdlr.start + 12)
      );
      const type: HdlrType = {
        soun: ElementaryStreamTypes.AUDIO,
        vide: ElementaryStreamTypes.VIDEO,
      }[handlerName];
      if (!type) {
        // neither a sound nor a video handler
        return;
      }

      // TODO: Parse codec details to be able to build MIME type.
      const stsd = findBox(trak, ['mdia', 'minf', 'stbl', 'stsd'])[0];
      let codec;
      if (stsd) {
        // four-character tag located 12 bytes into the stsd payload
        codec = bin2str(stsd.data.subarray(stsd.start + 12, stsd.start + 16));
      }

      result[trackId] = { timescale, type };
      result[type] = { timescale, id: trackId, codec };
    });

    findBox(initSegment, ['moov', 'mvex', 'trex']).forEach((trexBox) => {
      const track = result[readUint32(trexBox, 4)];
      if (!track) {
        return;
      }
      track.default = {
        duration: readUint32(trexBox, 12),
        flags: readUint32(trexBox, 20),
      };
    });

    return result;
  }
  312.  
  /**
   * Determine the base media decode start time, in seconds, for an MP4
   * fragment. If multiple fragments are specified, the earliest time is
   * returned.
   *
   * The base media decode time can be parsed from track fragment
   * metadata:
   * ```
   * moof > traf > tfdt.baseMediaDecodeTime
   * ```
   * It requires the timescale value from the mdhd to interpret.
   *
   * Track fragments without a tfdt box, or whose track id is not present in
   * `initData`, are ignored.
   *
   * @param initData track information indexed by track id
   * @param fmp4 the bytes of the mp4 fragment
   * @returns the earliest base media decode start time for the fragment, in
   * seconds, or 0 when none could be determined
   */
  export function getStartDTS(initData: InitData, fmp4: Uint8Array): number {
    let earliest: number | null = null;

    const trafs = findBox(fmp4, ['moof', 'traf']);
    for (let i = 0; i < trafs.length; i++) {
      const traf = trafs[i];
      // we need info from two children of each track fragment box
      const tfdt = findBox(traf, ['tfdt'])[0];
      if (!tfdt) {
        // no decode time in this fragment: skip it instead of dereferencing
        // an undefined box
        continue;
      }
      const version = tfdt.data[tfdt.start];

      const tfhds = findBox(traf, ['tfhd']);
      for (let j = 0; j < tfhds.length; j++) {
        // get the track id from the tfhd
        const track = initData[readUint32(tfhds[j], 4)];
        if (!track) {
          continue;
        }

        let baseTime = readUint32(tfdt, 4);
        if (version === 1) {
          // 64-bit value: the word just read is the upper half
          baseTime *= Math.pow(2, 32);
          baseTime += readUint32(tfdt, 8);
        }

        // assume a 90kHz clock if no timescale was specified
        const scale = track.timescale || 90e3;
        // convert base time to seconds
        const startTime = baseTime / scale;
        if (isFinite(startTime) && (earliest === null || startTime < earliest)) {
          earliest = startTime;
        }
      }
    }

    return earliest || 0;
  }
  373.  
  /*
    For Reference:
    aligned(8) class TrackFragmentHeaderBox
             extends FullBox('tfhd', 0, tf_flags){
       unsigned int(32)  track_ID;
       // all the following are optional fields
       unsigned int(64)  base_data_offset;
       unsigned int(32)  sample_description_index;
       unsigned int(32)  default_sample_duration;
       unsigned int(32)  default_sample_size;
       unsigned int(32)  default_sample_flags
    }
   */
  /**
   * Computes the duration, in seconds, of the media in a fragment by summing
   * the track runs of every track fragment whose track is known.
   *
   * @param data the bytes of the mp4 fragment
   * @param initData track information indexed by track id
   * @returns the video duration when it is non-zero, otherwise the audio
   * duration; when both are zero, the sum of the `sidx` subsegment
   * durations if a `sidx` box can be parsed
   */
  export function getDuration(data: Uint8Array, initData: InitData) {
    let videoDuration = 0;
    let audioDuration = 0;
    const trafs = findBox(data, ['moof', 'traf']);
    for (let i = 0; i < trafs.length; i++) {
      const traf = trafs[i];
      // There is only one tfhd & trun per traf
      // This is true for CMAF style content, and we should perhaps check the ftyp
      // and only look for a single trun then, but for ISOBMFF we should check
      // for multiple track runs.
      const tfhd = findBox(traf, ['tfhd'])[0];
      if (!tfhd) {
        // without a header the track cannot be identified
        continue;
      }
      // get the track id from the tfhd
      const track = initData[readUint32(tfhd, 4)];
      if (!track) {
        continue;
      }

      // Only the tfhd's own flags describe which optional fields it holds.
      // The trex default sample flags are a different bit field and must not
      // be mixed in.
      const tfhdFlags = readUint32(tfhd, 0);
      let sampleDuration: number | undefined = track.default?.duration;
      if (tfhdFlags & 0x000008) {
        // 0x000008 indicates the presence of the default_sample_duration
        // field. It follows the version/flags word and track_ID (8 bytes)
        // plus any optional fields that precede it.
        let durationOffset = 8;
        if (tfhdFlags & 0x000001) {
          // base_data_offset (64 bits)
          durationOffset += 8;
        }
        if (tfhdFlags & 0x000002) {
          // sample_description_index (32 bits)
          durationOffset += 4;
        }
        sampleDuration = readUint32(tfhd, durationOffset);
      }

      // assume a 90kHz clock if no timescale was specified
      const timescale = track.timescale || 90e3;
      const truns = findBox(traf, ['trun']);
      for (let j = 0; j < truns.length; j++) {
        // Use the default duration for every sample when there is one,
        // otherwise add up the per-sample durations stored in the run.
        const rawDuration = sampleDuration
          ? sampleDuration * readUint32(truns[j], 4)
          : computeRawDurationFromSamples(truns[j]);
        if (track.type === ElementaryStreamTypes.VIDEO) {
          videoDuration += rawDuration / timescale;
        } else if (track.type === ElementaryStreamTypes.AUDIO) {
          audioDuration += rawDuration / timescale;
        }
      }
    }
    if (videoDuration === 0 && audioDuration === 0) {
      // If duration samples are not available in the traf use sidx subsegment_duration
      const sidx = parseSegmentIndex(data);
      if (sidx?.references) {
        return sidx.references.reduce(
          // parenthesised so an unusable duration counts as 0 instead of
          // resetting the running total
          (dur, ref) => dur + (ref.info.duration || 0),
          0
        );
      }
    }
    if (videoDuration) {
      return videoDuration;
    }
    return audioDuration;
  }
  451.  
  /*
    For Reference:
    aligned(8) class TrackRunBox
             extends FullBox('trun', version, tr_flags) {
       unsigned int(32)  sample_count;
       // the following are optional fields
       signed int(32)    data_offset;
       unsigned int(32)  first_sample_flags;
       // all fields in the following array are optional
       {
          unsigned int(32)  sample_duration;
          unsigned int(32)  sample_size;
          unsigned int(32)  sample_flags
          if (version == 0)
             { unsigned int(32)  sample_composition_time_offset; }
          else
             { signed int(32)    sample_composition_time_offset; }
       }[ sample_count ]
    }
   */
  /**
   * Adds up the per-sample durations stored in a track run box.
   *
   * @param trun the track run box to read
   * @returns the total of all sample_duration fields, in track timescale
   * units; 0 when the run does not carry per-sample durations
   */
  export function computeRawDurationFromSamples(trun): number {
    const flags = readUint32(trun, 0);
    const sampleCount = readUint32(trun, 4);

    // sample-duration-present flag: without it there is nothing to add up
    if (!(flags & 0x000100)) {
      return 0;
    }

    // Flags are at offset 0 and the mandatory sample_count at offset 4, so
    // the optional fields start 8 bytes in. Every field is 4 bytes wide.
    let cursor = 8;
    // data-offset-present flag
    if (flags & 0x000001) {
      cursor += 4;
    }
    // first-sample-flags-present flag
    if (flags & 0x000004) {
      cursor += 4;
    }

    // Size of one sample record: the duration plus whichever of size, flags
    // and composition time offset are present.
    let stride = 4;
    if (flags & 0x000200) {
      stride += 4;
    }
    if (flags & 0x000400) {
      stride += 4;
    }
    if (flags & 0x000800) {
      stride += 4;
    }

    let total = 0;
    for (let n = 0; n < sampleCount; n++) {
      // the duration is the first field of each record
      total += readUint32(trun, cursor);
      cursor += stride;
    }
    return total;
  }
  510.  
  /**
   * Shifts the base media decode time of every known track fragment back by
   * `timeOffset` seconds, rewriting the tfdt boxes of `fmp4` in place. The
   * result is clamped at zero for both tfdt versions.
   *
   * @param initData track information indexed by track id
   * @param fmp4 the bytes of the mp4 fragment; modified in place
   * @param timeOffset number of seconds to subtract from each decode time
   */
  export function offsetStartDTS(
    initData: InitData,
    fmp4: Uint8Array,
    timeOffset: number
  ) {
    findBox(fmp4, ['moof', 'traf']).forEach(function (traf) {
      findBox(traf, ['tfhd']).forEach(function (tfhd) {
        // get the track id from the tfhd
        const id = readUint32(tfhd, 4);
        const track = initData[id];
        if (!track) {
          return;
        }
        // assume a 90kHz clock if no timescale was specified
        const timescale = track.timescale || 90e3;
        // offset expressed in track timescale units
        const ticks = timeOffset * timescale;
        // get the base media decode time from the tfdt
        findBox(traf, ['tfdt']).forEach(function (tfdt) {
          const version = tfdt.data[tfdt.start];
          let baseMediaDecodeTime = readUint32(tfdt, 4);
          if (version === 0) {
            // Clamp at zero: a negative value would otherwise wrap around
            // to a huge unsigned decode time when written.
            baseMediaDecodeTime = Math.max(baseMediaDecodeTime - ticks, 0);
            writeUint32(tfdt, 4, baseMediaDecodeTime);
          } else {
            // 64-bit value stored as two 32-bit words, upper word first
            baseMediaDecodeTime *= Math.pow(2, 32);
            baseMediaDecodeTime += readUint32(tfdt, 8);
            baseMediaDecodeTime = Math.max(baseMediaDecodeTime - ticks, 0);
            const upper = Math.floor(baseMediaDecodeTime / (UINT32_MAX + 1));
            const lower = Math.floor(baseMediaDecodeTime % (UINT32_MAX + 1));
            writeUint32(tfdt, 4, upper);
            writeUint32(tfdt, 8, lower);
          }
        });
      });
    });
  }
  546.  
  // TODO: Check if the last moof+mdat pair is part of the valid range
  /**
   * Splits fragment data at the start of its last top-level `moof` box.
   *
   * @param data the bytes to split
   * @returns `valid` holds everything before the last moof box and
   * `remainder` everything from it onwards; when fewer than two moof boxes
   * are found, `valid` is null and `remainder` is `data` itself
   */
  export function segmentValidRange(data: Uint8Array): SegmentedRange {
    // findBox always returns an array, so only its length needs checking.
    const moofs = findBox(data, ['moof']);
    if (moofs.length < 2) {
      return { valid: null, remainder: data };
    }

    // findBox reports the payload start, 8 bytes past the start of the box.
    const lastMoofStart = moofs[moofs.length - 1].start - 8;
    return {
      valid: sliceUint8(data, 0, lastMoofStart),
      remainder: sliceUint8(data, lastMoofStart),
    };
  }
  567.  
  /** Result of splitting fragment data with segmentValidRange. */
  export interface SegmentedRange {
    /** Bytes before the last moof box, or null when no split was made. */
    valid: Uint8Array | null;
    /** Bytes not included in `valid`, or null if unset. */
    remainder: Uint8Array | null;
  }
  572.  
  /**
   * Concatenates two byte arrays into a newly allocated one.
   *
   * @param data1 bytes placed first
   * @param data2 bytes placed after `data1`
   * @returns a new array containing `data1` followed by `data2`; the inputs
   * are left untouched
   */
  export function appendUint8Array(
    data1: Uint8Array,
    data2: Uint8Array
  ): Uint8Array {
    const merged = new Uint8Array(data1.length + data2.length);
    let writeAt = 0;
    for (const part of [data1, data2]) {
      merged.set(part, writeAt);
      writeAt += part.length;
    }
    return merged;
  }