rfc9628xml2.original.xml | rfc9628.xml | |||
---|---|---|---|---|
<?xml version="1.0" encoding="US-ASCII"?> | <?xml version="1.0" encoding="UTF-8"?> | |||
<!DOCTYPE rfc SYSTEM "rfc2629.dtd" [ | ||||
<!ENTITY rfc2119 SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referen | ||||
ce.RFC.2119.xml"> | ||||
<!ENTITY rfc3264 SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referen | ||||
ce.RFC.3264.xml"> | ||||
<!ENTITY rfc3550 SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referen | ||||
ce.RFC.3550.xml"> | ||||
<!ENTITY rfc3551 SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referen | ||||
ce.RFC.3551.xml"> | ||||
<!ENTITY rfc3711 SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referen | ||||
ce.RFC.3711.xml"> | ||||
<!ENTITY rfc3984 SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referen | ||||
ce.RFC.3984.xml"> | ||||
<!ENTITY rfc4855 SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referen | ||||
ce.RFC.4855.xml"> | ||||
<!ENTITY rfc4585 SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referen | ||||
ce.RFC.4585.xml"> | ||||
<!ENTITY rfc5104 SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referen | ||||
ce.RFC.5104.xml"> | ||||
<!ENTITY rfc5124 SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referen | ||||
ce.RFC.5124.xml"> | ||||
<!ENTITY rfc6386 SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referen | ||||
ce.RFC.6386.xml"> | ||||
<!ENTITY rfc6838 SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referen | ||||
ce.RFC.6838.xml"> | ||||
<!ENTITY rfc7201 SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referen | ||||
ce.RFC.7201.xml"> | ||||
<!ENTITY rfc7202 SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referen | ||||
ce.RFC.7202.xml"> | ||||
<!ENTITY rfc7667 SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referen | ||||
ce.RFC.7667.xml"> | ||||
<!ENTITY rfc8174 SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referen | ||||
ce.RFC.8174.xml"> | ||||
<!ENTITY rfc8866 SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referen | ||||
ce.RFC.8866.xml"> | ||||
<!ENTITY lrr SYSTEM "http://xml2rfc.tools.ietf.org/public/rfc/bibxml3/reference. | ||||
I-D.ietf-avtext-lrr.xml"> | ||||
<!DOCTYPE rfc [ | ||||
<!ENTITY nbsp " "> | ||||
<!ENTITY zwsp "​"> | ||||
<!ENTITY nbhy "‑"> | ||||
<!ENTITY wj "⁠"> | ||||
]> | ]> | |||
<rfc category="std" docName="draft-ietf-payload-vp9-16" ipr="trust200902"> | ||||
<?rfc symrefs="yes" ?> | ||||
<?rfc sortrefs="yes" ?> | <rfc xmlns:xi="http://www.w3.org/2001/XInclude" docName="draft-ietf-payload-vp9- | |||
16" number="9628" ipr="trust200902" obsoletes="" updates="" submissionType="IETF | ||||
<!-- alphabetize the references --> | " category="std" consensus="true" xml:lang="en" symRefs="true" sortRefs="true" t | |||
ocInclude="true" version="3"> | ||||
<?rfc comments="no"?> | ||||
<!-- show comments --> | ||||
<?rfc inline="yes" ?> | ||||
<!-- comments are inline --> | ||||
<?rfc toc="yes" ?> | ||||
<!-- generate table of contents --> | ||||
<front> | <front> | |||
<title abbrev="RTP Payload Format for VP9">RTP Payload Format for VP9 | <title abbrev="RTP Payload Format for VP9">RTP Payload Format for VP9 | |||
Video</title> | Video</title> | |||
<seriesInfo name="RFC" value="9628"/> | ||||
<author fullname="Justin Uberti" initials="J." surname="Uberti"> | <author fullname="Justin Uberti" initials="J." surname="Uberti"> | |||
<organization abbrev="Google">Google, Inc.</organization> | <organization abbrev="Google">Google, Inc.</organization> | |||
<address> | <address> | |||
<postal> | <postal> | |||
<street>747 6th Street South</street> | <street>747 6th Street South</street> | |||
<city>Kirkland</city> | <city>Kirkland</city> | |||
<region>WA</region> | <region>WA</region> | |||
<code>98033</code> | <code>98033</code> | |||
<country>United States of America</country> | ||||
<country>USA</country> | ||||
</postal> | </postal> | |||
<email>justin@uberti.name</email> | <email>justin@uberti.name</email> | |||
</address> | </address> | |||
</author> | </author> | |||
<author fullname="Stefan Holmer" initials="S." surname="Holmer"> | <author fullname="Stefan Holmer" initials="S." surname="Holmer"> | |||
<organization abbrev="Google">Google, Inc.</organization> | <organization abbrev="Google">Google, Inc.</organization> | |||
<address> | <address> | |||
<postal> | <postal> | |||
<street>Kungsbron 2</street> | <street>Kungsbron 2</street> | |||
<code>111 22</code> | <code>111 22</code> | |||
<city>Stockholm</city> | <city>Stockholm</city> | |||
<country>Sweden</country> | <country>Sweden</country> | |||
</postal> | </postal> | |||
<email>holmer@google.com</email> | <email>holmer@google.com</email> | |||
</address> | </address> | |||
</author> | </author> | |||
<author fullname="Magnus Flodman" initials="M." surname="Flodman"> | <author fullname="Magnus Flodman" initials="M." surname="Flodman"> | |||
<organization abbrev="Google">Google, Inc.</organization> | <organization abbrev="Google">Google, Inc.</organization> | |||
<address> | <address> | |||
<postal> | <postal> | |||
<street>Kungsbron 2</street> | <street>Kungsbron 2</street> | |||
<code>111 22</code> | <code>111 22</code> | |||
<city>Stockholm</city> | <city>Stockholm</city> | |||
<country>Sweden</country> | <country>Sweden</country> | |||
</postal> | </postal> | |||
<email>mflodman@google.com</email> | <email>mflodman@google.com</email> | |||
</address> | </address> | |||
</author> | </author> | |||
<author fullname="Danny Hong" initials="D." surname="Hong"> | ||||
<author fullname="Danny Hong" initials="D." surname="Hong"> | ||||
<organization abbrev="Google">Google, Inc.</organization> | <organization abbrev="Google">Google, Inc.</organization> | |||
<address> | <address> | |||
<postal> | <postal> | |||
<street>1585 Charleston Road</street> | <street>1585 Charleston Road</street> | |||
<city>Mountain View</city> | <city>Mountain View</city> | |||
<region>CA</region> | <region>CA</region> | |||
<code>94043</code> | <code>94043</code> | |||
<country>United States of America</country> | ||||
<country>US</country> | ||||
</postal> | </postal> | |||
<email>dannyhong@google.com</email> | <email>dannyhong@google.com</email> | |||
</address> | </address> | |||
</author> | </author> | |||
<author fullname="Jonathan Lennox" initials="J." surname="Lennox"> | <author fullname="Jonathan Lennox" initials="J." surname="Lennox"> | |||
<organization abbrev="8x8 / Jitsi">8x8, Inc. / Jitsi</organization> | <organization abbrev="8x8 / Jitsi">8x8, Inc. / Jitsi</organization> | |||
<address> | <address> | |||
<postal> | <postal> | |||
<street/> | <street/> | |||
<city>Jersey City</city> | <city>Jersey City</city> | |||
<region>NJ</region> | <region>NJ</region> | |||
<code>07302</code> | <code>07302</code> | |||
<country>United States of America</country> | ||||
<country>US</country> | ||||
</postal> | </postal> | |||
<email>jonathan.lennox@8x8.com</email> | <email>jonathan.lennox@8x8.com</email> | |||
</address> | </address> | |||
</author> | </author> | |||
<date year="2024" month="October" /> | ||||
<date/> | ||||
<area>RAI</area> | <area>RAI</area> | |||
<workgroup>AVTCore Working Group</workgroup> | <workgroup>AVTCore Working Group</workgroup> | |||
<keyword>RFC</keyword> | ||||
<keyword>Request for Comments</keyword> | ||||
<keyword>RTP</keyword> | <keyword>RTP</keyword> | |||
<keyword>VP9</keyword> | <keyword>VP9</keyword> | |||
<keyword>WebM</keyword> | <keyword>WebM</keyword> | |||
<abstract> | <abstract> | |||
<t>This specification describes an RTP payload format for the VP9 video codec. | <t>This specification describes an RTP payload format for the VP9 video codec. | |||
The payload format has wide applicability, as it supports applications | The payload format has wide applicability as it supports applications | |||
from low bit-rate peer-to-peer usage, to high bit-rate video | from low bitrate peer-to-peer usage to high bitrate video | |||
conferences. It includes provisions for temporal and spatial scalability. </t> | conferences. It includes provisions for temporal and spatial scalability. </t> | |||
</abstract> | </abstract> | |||
</front> | </front> | |||
<middle> | <middle> | |||
<section anchor="intro" title="Introduction"> | <section anchor="intro" numbered="true" toc="default"> | |||
<t>This specification describes an <xref target="RFC3550">RTP</xref> paylo | <name>Introduction</name> | |||
ad specification applicable to the | ||||
transmission of video streams encoded using the VP9 video codec <xref | ||||
target="VP9-BITSTREAM"/>. The format described in this document can be use | ||||
d | ||||
both in peer-to-peer and video conferencing applications.</t> | ||||
<t>The VP9 video codec was developed by Google, and is the | <t>This document describes an <xref target="RFC3550" | |||
successor to its earlier <xref target="RFC6386">VP8</xref> | format="default">RTP</xref> payload specification applicable to the | |||
codec. Above the compression improvements and other general | transmission of video streams encoded using the VP9 video codec <xref | |||
enhancements above VP8, VP9 is also designed in a way that | target="VP9-BITSTREAM" format="default"/>. The format described in this | |||
allows spatially-scalable video encoding.</t> | document can be used both in peer-to-peer and video conferencing | |||
applications.</t> | ||||
<t>The VP9 video codec was developed by Google and is the successor to | ||||
its earlier <xref target="RFC6386" format="default">VP8</xref> codec. | ||||
Above the compression improvements and other general enhancements to | ||||
VP8, VP9 is also designed in a way that allows spatially scalable video | ||||
encoding.</t> | ||||
</section> | </section> | |||
<section anchor="conventions" | <section anchor="conventions" numbered="true" toc="default"> | |||
title="Conventions, Definitions and Acronyms"> | <name>Conventions</name> | |||
<t>The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", | <t> | |||
"SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT | The key words "<bcp14>MUST</bcp14>", "<bcp14>MUST NOT</bcp14>", | |||
RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be | "<bcp14>REQUIRED</bcp14>", "<bcp14>SHALL</bcp14>", "<bcp14>SHALL | |||
interpreted as described in BCP 14 <xref target="RFC2119"/> | NOT</bcp14>", "<bcp14>SHOULD</bcp14>", "<bcp14>SHOULD NOT</bcp14>", | |||
<xref target="RFC8174"/> when, and only when, | "<bcp14>RECOMMENDED</bcp14>", "<bcp14>NOT RECOMMENDED</bcp14>", | |||
they appear in all capitals, as shown here.</t> | "<bcp14>MAY</bcp14>", and "<bcp14>OPTIONAL</bcp14>" in this document are | |||
</section> | to be interpreted as described in BCP 14 <xref target="RFC2119"/> | |||
<xref target="RFC8174"/> when, and only when, they appear in all capitals, | ||||
as shown here. | ||||
</t> | ||||
<section anchor="mediaFormatDescription" title="Media Format Description"> | </section> | |||
<section anchor="mediaFormatDescription" numbered="true" toc="default"> | ||||
<name>Media Format Description</name> | ||||
<t>The VP9 codec can maintain up to eight reference frames, of | <t>The VP9 codec can maintain up to eight reference frames, of | |||
which up to three can be referenced by any new frame.</t> | which up to three can be referenced by any new frame.</t> | |||
<t>VP9 also allows a frame to use another frame of a different | <t>VP9 also allows a frame to use another frame of a different | |||
resolution as a reference frame. (Specifically, a frame may use | resolution as a reference frame. (Specifically, a frame may use | |||
any references whose width and height are between 1/16th that of | any references whose width and height are between 1/16th that of | |||
the current frame and twice that of the current frame, | the current frame and twice that of the current frame, | |||
inclusive.) This allows internal resolution changes without | inclusive.) This allows internal resolution changes without | |||
requiring the use of key frames.</t> | requiring the use of keyframes.</t> | |||
<t>These features together enable an encoder to | <t>These features together enable an encoder to | |||
implement various forms of coarse-grained scalability, | implement various forms of coarse-grained scalability, | |||
including temporal, spatial and quality scalability modes, as | including temporal, spatial, and quality scalability modes, as | |||
well as combinations of these, without the need for explicit | well as combinations of these, without the need for explicit | |||
scalable coding tools.</t> | scalable coding tools.</t> | |||
<t>Temporal layers define different frame rates of video; | <t>Temporal layers define different frame rates of video; | |||
spatial and quality layers define different and possibly dependent | spatial and quality layers define different and possibly dependent | |||
representations of a single input frame. Spatial layers allow | representations of a single input frame. Spatial layers allow | |||
a frame to be encoded at different resolutions, whereas | a frame to be encoded at different resolutions, whereas | |||
quality layers allow a frame to be encoded at the same | quality layers allow a frame to be encoded at the same | |||
resolution but at different qualities (and thus with different | resolution but at different qualities (and, thus, with different | |||
amounts of coding error). VP9 supports quality layers as | amounts of coding error). VP9 supports quality layers as | |||
spatial layers without any resolution changes; hereinafter, | spatial layers without any resolution changes; hereinafter, | |||
the term "spatial layer" is used to represent both spatial and | the term "spatial layer" is used to represent both spatial and | |||
quality layers.</t> | quality layers.</t> | |||
<t>This payload format specification defines how such | <t>This payload format specification defines how such | |||
temporal and spatial scalability layers can be described and | temporal and spatial scalability layers can be described and | |||
communicated.</t> | communicated.</t> | |||
<t>Temporal and spatial scalability layers are associated with | ||||
<t>Temporal and spatial scalability layers are associated with | ||||
non-negative integer IDs. The lowest layer of either type has an | non-negative integer IDs. The lowest layer of either type has an | |||
ID of 0, and is sometimes referred to as the "base" temporal or | ID of 0 and is sometimes referred to as the "base" temporal or | |||
spatial layer.</t> | spatial layer.</t> | |||
<t>Layers are designed, and <bcp14>MUST</bcp14> be encoded, such that if | ||||
<t>Layers are designed, and MUST be encoded, such that if | ||||
any layer, and all higher layers, are removed from the bitstream | any layer, and all higher layers, are removed from the bitstream | |||
along either the spatial or temporal dimension, the remaining bitstream is | along either the spatial or temporal dimension, the remaining bitstream is | |||
still correctly decodable.</t> | still correctly decodable.</t> | |||
<t>For terminology, this document uses the term "frame" to refer | <t>For terminology, this document uses the term "frame" to refer to a | |||
to a single encoded VP9 frame for a particular resolution/quality, and | single encoded VP9 frame for a particular resolution and/or quality, and | |||
"picture" to refer to all the representations (frames) at a single | "picture" to refer to all the representations (frames) at a single | |||
instant in time. A picture thus consists of one or more frames, | instant in time. Thus, a picture consists of one or more frames, | |||
encoding different spatial layers.</t> | encoding different spatial layers.</t> | |||
<t>Within a picture, a frame with spatial layer ID equal to SID, | <t>Within a picture, a frame with | |||
where SID > 0, can depend on a frame of the same picture with a lower spat | spatial-layer ID equal to S, where S > 0, can depend on a frame | |||
ial layer ID. This | of the same picture with a lower spatial-layer ID. This "inter-layer" | |||
"inter-layer" dependency can result in additional coding gain | dependency can result in additional coding gain compared to the case | |||
compared to the case where only | where only "inter-picture" dependency is used, where a frame | |||
traditional "inter-picture" dependency is used, where a frame depends on p | depends on a previously coded frame in time. For simplicity, this | |||
reviously | payload format assumes that, within a picture and if inter-layer | |||
coded frame in time. For simplicity, this payload format assumes that, | dependency is used, a spatial-layer S frame can depend only on the | |||
within a picture and if inter-layer dependency is used, a spatial layer SI | immediately previous spatial-layer S-1 frame, when S > 0. | |||
D frame | Additionally, if inter-picture dependency is used, a spatial-layer S | |||
can depend only on the immediately previous spatial layer SID-1 frame, whe | frame is assumed to only depend on a previously coded spatial-layer S | |||
n S > 0. Additionally, if | frame.</t> | |||
inter-picture dependency is used, a spatial layer SID frame is assumed to | ||||
only | ||||
depend on a previously coded spatial layer SID frame.</t> | ||||
<t>Given above simplifications for inter-layer and inter-picture | ||||
dependencies, a flag (the D bit described below) is used to indicate wheth | ||||
er a | ||||
spatial layer SID frame depends on the spatial layer SID-1 frame. Given t | ||||
he D bit, a receiver | ||||
only needs to additionally know the inter-picture dependency structure for | ||||
a given | ||||
spatial layer frame in order to determine its decodability. Two modes | ||||
of describing the inter-picture dependency structure are possible: | ||||
"flexible mode" and "non-flexible mode". An encoder can only switch | ||||
between the two on the first packet of a key frame with temporal | ||||
layer ID equal to 0.</t> | ||||
<t>In flexible mode, each packet can contain up to 3 reference | ||||
indices, which identify all frames referenced by the frame | ||||
transmitted in the current packet for inter-picture prediction. | ||||
This (along with the D bit) enables a receiver to identify if a frame | ||||
is decodable or not and helps it understand the temporal layer | ||||
structure. | ||||
Since this is signaled in | ||||
each packet it makes it possible to have very flexible temporal layer | ||||
hierarchies, and scalability structures which are changing dynamically.</t | ||||
> | ||||
<t>Given the above simplifications for inter-layer and inter-picture | ||||
dependencies, a flag (the D bit described below) is used to indicate | ||||
whether a spatial-layer SID frame depends on the spatial-layer SID-1 | ||||
frame. Given the D bit, a receiver only needs to additionally know the | ||||
inter-picture dependency structure for a given spatial-layer frame in | ||||
order to determine its decodability. Two modes of describing the | ||||
inter-picture dependency structure are possible: "flexible mode" and | ||||
"non-flexible mode". An encoder can only switch between the two on the | ||||
first packet of a keyframe with a temporal-layer ID equal to 0.</t> | ||||
<t>In flexible mode, each packet can contain up to three reference indices | ||||
, | ||||
which identify all frames referenced by the frame transmitted in the | ||||
current packet for inter-picture prediction. This (along with the D | ||||
bit) enables a receiver to identify if a frame is decodable or not and | ||||
helps it understand the temporal-layer structure. Since this is | ||||
signaled in each packet, it makes it possible to have very flexible | ||||
temporal-layer hierarchies and scalability structures, which are | ||||
changing dynamically.</t> | ||||
<t>In non-flexible mode, frames are encoded using a fixed, recurring pattern of dependencies; | <t>In non-flexible mode, frames are encoded using a fixed, recurring pattern of dependencies; | |||
the set of pictures that recur in this pattern is known as a Picture Group (PG). | the set of pictures that recur in this pattern is known as a "Picture Group" (or "PG"). | |||
In this mode, the inter-picture dependencies (the reference | In this mode, the inter-picture dependencies (the reference | |||
indices) of the Picture Group MUST be pre-specified as part of the | indices) of the PG <bcp14>MUST</bcp14> be pre-specified as part of the | |||
scalability structure (SS) data. | Scalability Structure (SS) data. | |||
Each | Each | |||
packet has an index to refer to one of the described pictures | packet has an index to refer to one of the described pictures | |||
in the PG, from which the pictures referenced by the picture transmitted in the current packet | in the PG from which the pictures referenced by the picture transmitted in the current packet | |||
for inter-picture prediction can be identified.</t> | for inter-picture prediction can be identified.</t> | |||
<t>(Note: A "Picture Group", as used in this document, | <aside> | |||
<t>Note: A "Picture Group" or "PG", as used in this document, | ||||
is not the same thing as the term "Group of Pictures" as | is not the same thing as the term "Group of Pictures" as | |||
it is traditionally used in video coding, i.e. to mean an | it is commonly used in video coding, i.e., to mean an | |||
independently-decoadable run of pictures beginning with a | independently decodable run of pictures beginning with a | |||
keyframe.)</t> | keyframe.</t> | |||
<t>The SS data can also be used to specify the resolution of each | <t>The SS data can also be used to specify the resolution of each | |||
spatial layer present in the VP9 stream for both flexible and non-flexible | spatial layer present in the VP9 stream for both flexible and non-flexible | |||
modes.</t> | modes.</t></aside> | |||
</section> | </section> | |||
<section anchor="payloadFormat" numbered="true" toc="default"> | ||||
<name>Payload Format</name> | ||||
<section anchor="payloadFormat" title="Payload Format"> | ||||
<t>This section describes how the encoded VP9 bitstream is encapsulated | <t>This section describes how the encoded VP9 bitstream is encapsulated | |||
in RTP. To handle network losses usage of RTP/AVPF <xref | in RTP. To handle network losses, usage of RTP/AVPF <xref target="RFC4585" | |||
target="RFC4585"/> is RECOMMENDED. All integer fields in the | format="default"/> is <bcp14>RECOMMENDED</bcp14>. All integer fields in this | |||
specifications are encoded as unsigned integers in network octet | specification are encoded as unsigned integers in network octet | |||
order.</t> | order.</t> | |||
<section anchor="RTPHeaderUsage" numbered="true" toc="default"> | ||||
<name>RTP Header Usage</name> | ||||
<t keepWithNext="true">The general RTP payload format for VP9 is depicte | ||||
d | ||||
below.</t> | ||||
<section anchor="RTPHeaderUsage" title="RTP Header Usage"> | <figure anchor="figureRTPHeader" title="General RTP Payload Format for | |||
<figure anchor="figureRTPHeader"> | VP"> | |||
<preamble>The general RTP payload format for VP9 is depicted | <artwork type="" align="left" alt=""><![CDATA[ | |||
below.</preamble> | ||||
<artwork><![CDATA[ | ||||
0 1 2 3 | 0 1 2 3 | |||
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |||
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |||
|V=2|P|X| CC |M| PT | sequence number | | |V=2|P|X| CC |M| PT | sequence number | | |||
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |||
| timestamp | | | timestamp | | |||
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |||
| synchronization source (SSRC) identifier | | | synchronization source (SSRC) identifier | | |||
+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+ | +=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+ | |||
| contributing source (CSRC) identifiers | | | contributing source (CSRC) identifiers | | |||
skipping to change at line 322 ¶ | skipping to change at line 253 ¶ | |||
| +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |||
| : | | | : | | |||
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | | |||
| | | | | | |||
+ | | + | | |||
: VP9 payload : | : VP9 payload : | |||
| | | | | | |||
| +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |||
| : OPTIONAL RTP padding | | | : OPTIONAL RTP padding | | |||
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |||
]]></artwork> | ]]></artwork> | |||
<postamble>The VP9 payload descriptor will be | ||||
described in <xref target="VP9payloadDescriptor"/>; the VP9 payload is | ||||
described | ||||
in <xref target="VP9-BITSTREAM"/>. | ||||
OPTIONAL RTP padding MUST NOT be included unless the P bi | ||||
t is set.</postamble> | ||||
</figure> | </figure> | |||
<t keepWithPrevious="true">See <xref target="VP9payloadDescriptor" forma | ||||
t="default"/> for more information on the VP9 payload descriptor; | ||||
the VP9 payload is described in <xref target="VP9-BITSTREAM" | ||||
format="default"/>. <bcp14>OPTIONAL</bcp14> RTP padding <bcp14>MUST | ||||
NOT</bcp14> be included unless the P bit is set.</t> | ||||
<t><list style="hanging"> | <dl newline="false" spacing="normal"> | |||
<t hangText="Marker bit (M):">MUST be set to 1 for the final packet | <dt>Marker bit (M):</dt> | |||
of the highest spatial layer frame (the final packet of the picture) | <dd>This bit <bcp14>MUST</bcp14> be set to 1 for the final packet | |||
, | of the highest spatial-layer frame (the final packet of the picture) | |||
and 0 otherwise. Unless spatial scalability is in use for this pict | ; otherwise, it is 0. Unless spatial scalability is in use for this picture, | |||
ure, | this bit will have the same value as the E bit described in <xref ta | |||
this will have the same value as the E bit described below. Note th | rget="VP9payloadDescriptor"/>. Note this bit | |||
is bit | <bcp14>MUST</bcp14> be set to 1 for the target spatial-layer frame | |||
MUST be set to 1 for the target spatial layer frame | if a stream is being rewritten to remove higher spatial layers.</dd> | |||
if a stream is being rewritten to remove higher spatial layers.</t> | <dt>Payload Type (PT):</dt> | |||
<dd>In line with the policy in <xref target="RFC3551" | ||||
<t hangText="Payload Type (PT):">In line with the policy | sectionFormat="of" section="3" format="default"/>, applications using | |||
in Section 3 of <xref target='RFC3551'/>, applications | the VP9 RTP payload profile <bcp14>MUST</bcp14> assign a dynamic | |||
using the VP9 RTP payload | payload type number to be used in each RTP session and provide a | |||
profile MUST assign a dynamic payload type number to be | mechanism to indicate the mapping. See <xref target="SDPParameters" | |||
used in each RTP session and provide a mechanism to | format="default"/> for the mechanism to be used with the <xref | |||
indicate the mapping. See <xref target="SDPParameters" | target="RFC8866" format="default">Session Description Protocol | |||
/> for the mechanism | (SDP)</xref>.</dd> | |||
to be used with the <xref target='RFC8866'>Session | <dt>Timestamp:</dt> | |||
Description Protocol (SDP)</xref>.</t> | <dd>The <xref target="RFC3550" format="default">RTP timestamp</xref> i | |||
ndicates the time when | ||||
<t hangText="Timestamp:">The <xref target="RFC3550">RTP timestamp</x | ||||
ref> indicates the time when | ||||
the input frame was sampled, at a clock rate of 90 kHz. If the | the input frame was sampled, at a clock rate of 90 kHz. If the | |||
input picture is encoded with multiple layer frames, all of the | input picture is encoded with multiple-layer frames, all of the | |||
frames of the picture MUST have the same timestamp.</t> | frames of the picture <bcp14>MUST</bcp14> have the same timestamp.</ | |||
dd> | ||||
<t>If a frame has the VP9 show_frame field set to 0 (i.e. | <dt/> | |||
, it is meant only to | <dd>If a frame has the VP9 show_frame field set to 0 (i.e., it is | |||
populate a reference buffer, without being output) its | meant only to populate a reference buffer without being output), its | |||
timestamp MAY alternatively be set | timestamp <bcp14>MAY</bcp14> alternatively be set to be the same as | |||
to be the same as the subsequent frame with show_frame | the subsequent frame with show_frame equal to 1. (This will be | |||
equal to 1. (This will | convenient for playing out pre-encoded content packaged with VP9 | |||
be convenient for playing out pre-encoded content packa | "superframes", which typically bundle show_frame==0 frames with a | |||
ged with VP9 "superframes", which | subsequent show_frame==1 frame.) Every frame with show_frame==1, | |||
typically bundle show_frame==0 frames with a subsequent | however, <bcp14>MUST</bcp14> have a unique timestamp modulo the 2<sup> | |||
show_frame==1 frame.) Every | 32</sup> | |||
frame with show_frame==1, however, MUST have a unique t | wrap of the field.</dd> | |||
imestamp modulo the 2^32 wrap of | </dl> | |||
the field.</t> | <t>The remaining RTP Fixed Header Fields (V, P, X, CC, sequence | |||
number, SSRC, and CSRC identifiers) are used as specified in <xref | ||||
</list></t> | target="RFC3550" sectionFormat="of" section="5.1" | |||
<t>The remaining RTP Fixed Header Fields (V, P, X, CC, | format="default"/>.</t> | |||
sequence number, SSRC and CSRC identifiers) are used as | ||||
specified in Section 5.1 of <xref | ||||
target="RFC3550"/>.</t> | ||||
</section> | </section> | |||
<section anchor="VP9payloadDescriptor" numbered="true" toc="default"> | ||||
<name>VP9 Payload Descriptor</name> | ||||
<section anchor="VP9payloadDescriptor" title="VP9 Payload Descriptor"> | <!--[rfced] Section 4.2: It seems the descriptions following Figure 3 | |||
<figure anchor="figureVP9payloadDescriptor"> | apply to both Figures 2 and 3. If that is so, might a note of this appear | |||
<preamble>In flexible mode (with the F bit below set to 1), the first | somewhere earlier in that section for the ease of the reader?--> | |||
octets | ||||
after the RTP header are the VP9 payload descriptor, with the followin | ||||
g | ||||
structure.</preamble> | ||||
<artwork><![CDATA[ | <t keepWithNext="true">In flexible mode (with the F bit below set to 1), | |||
the first octets | ||||
after the RTP header are the VP9 payload descriptor, with the followin | ||||
g | ||||
structure.</t> | ||||
<figure anchor="figureVP9payloadDescriptor" title="Flexible Mode Format | ||||
for VP9 Payload Descriptor"> | ||||
<artwork name="" type="" align="left" alt=""><![CDATA[ | ||||
0 1 2 3 4 5 6 7 | 0 1 2 3 4 5 6 7 | |||
+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+ | |||
|I|P|L|F|B|E|V|Z| (REQUIRED) | |I|P|L|F|B|E|V|Z| (REQUIRED) | |||
+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+ | |||
I: |M| PICTURE ID | (REQUIRED) | I: |M| PICTURE ID | (REQUIRED) | |||
+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+ | |||
M: | EXTENDED PID | (RECOMMENDED) | M: | EXTENDED PID | (RECOMMENDED) | |||
+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+ | |||
L: | TID |U| SID |D| (Conditionally RECOMMENDED) | L: | TID |U| SID |D| (Conditionally RECOMMENDED) | |||
+-+-+-+-+-+-+-+-+ -\ | +-+-+-+-+-+-+-+-+ -\ | |||
P,F: | P_DIFF |N| (Conditionally REQUIRED) - up to 3 times | P,F: | P_DIFF |N| (Conditionally REQUIRED) - up to 3 times | |||
+-+-+-+-+-+-+-+-+ -/ | +-+-+-+-+-+-+-+-+ -/ | |||
V: | SS | | V: | SS | | |||
| .. | | | .. | | |||
+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+ | |||
]]></artwork> | ||||
]]></artwork> | ||||
</figure> | </figure> | |||
<t keepWithNext="true">In non-flexible mode (with the F bit below set to | ||||
<figure anchor="figureVP9payloadDescriptorNonFlexible"> | 0), the first octets | |||
<preamble>In non-flexible mode (with the F bit below set to 0), the fi | ||||
rst octets | ||||
after the RTP header are the VP9 payload descriptor, with the followin g | after the RTP header are the VP9 payload descriptor, with the followin g | |||
structure.</preamble> | structure.</t> | |||
<figure anchor="figureVP9payloadDescriptorNonFlexible" title="Non-flexib | ||||
<artwork><![CDATA[ | le Mode Format for VP9 Payload Descriptor"> | |||
<artwork name="" type="" align="left" alt=""><![CDATA[ | ||||
0 1 2 3 4 5 6 7 | 0 1 2 3 4 5 6 7 | |||
+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+ | |||
|I|P|L|F|B|E|V|Z| (REQUIRED) | |I|P|L|F|B|E|V|Z| (REQUIRED) | |||
+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+ | |||
I: |M| PICTURE ID | (RECOMMENDED) | I: |M| PICTURE ID | (RECOMMENDED) | |||
+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+ | |||
M: | EXTENDED PID | (RECOMMENDED) | M: | EXTENDED PID | (RECOMMENDED) | |||
+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+ | |||
L: | TID |U| SID |D| (Conditionally RECOMMENDED) | L: | TID |U| SID |D| (Conditionally RECOMMENDED) | |||
+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+ | |||
| TL0PICIDX | (Conditionally REQUIRED) | | TL0PICIDX | (Conditionally REQUIRED) | |||
+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+ | |||
V: | SS | | V: | SS | | |||
| .. | | | .. | | |||
+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+ | |||
]]></artwork> | ||||
]]></artwork> | ||||
</figure> | </figure> | |||
<dl newline="false" spacing="normal"> | ||||
<dt>I:</dt> | ||||
<dd>Picture ID (PID) present. When set to 1, the | ||||
<bcp14>OPTIONAL</bcp14> PID <bcp14>MUST</bcp14> be present after the | ||||
mandatory first octet and specified as below. Otherwise, PID | ||||
<bcp14>MUST NOT</bcp14> be present. If the V bit was set in the | ||||
stream's most recent start of a keyframe (i.e., the SS field was | ||||
present) and the F bit is set to 0 (i.e., non-flexible scalability | ||||
mode is in use), then this bit <bcp14>MUST</bcp14> be set on every | ||||
packet.</dd> | ||||
<dt>P:</dt> | ||||
<dd>Inter-picture predicted frame. When set to 0, the frame does | ||||
not utilize inter-picture prediction. In this case, up-switching to | ||||
a current spatial layer's frame is possible from a directly lower | ||||
spatial-layer frame. P <bcp14>SHOULD</bcp14> also be set to 0 when | ||||
encoding a layer synchronization frame in response to a <xref target=" | ||||
RFC9627" format="default">Layer Refresh Request (LRR)</xref> | ||||
message (see <xref target="LRR" format="default"/>). When P is set | ||||
to 0, the TID field (described below) <bcp14>MUST</bcp14> also be | ||||
set to 0 (if present). Note that the P bit does not forbid | ||||
intra-picture, inter-layer prediction from earlier frames of the | ||||
same picture, if any.</dd> | ||||
<dt>L:</dt> | ||||
<t><list style="hanging"> | <dd>Layer indices present. When set to 1, the one or two octets | |||
<t hangText="I:">Picture ID (PID) present. When set to one, the | following the mandatory first octet and the PID (if present) are as | |||
OPTIONAL PID MUST be present after the mandatory first octet and | described by "Layer indices" below. If the F bit (described below) | |||
specified as below. Otherwise, PID MUST NOT be present. If the V bit | is set to 1 (indicating flexible mode), then only one octet is | |||
was set in the | present for the layer indices. Otherwise, if the F bit is set to 0 | |||
stream's most recent start of a keyframe (i.e. the SS field was presen | (indicating non-flexible mode), then two octets are present for the | |||
t) and the F bit | layer indices.</dd> | |||
is set to 0 (i.e. non-flexible scalability mode is in use), | <dt>F:</dt> | |||
then this bit MUST be set on every packet.</t> | <dd>Flexible mode. When set to 1, this indicates flexible mode; if th | |||
e | ||||
<t hangText="P:">Inter-picture predicted frame. When set to zero, the | P bit is also set to 1, then the octets following the mandatory | |||
frame does not utilize inter-picture prediction. In this case, | first octet, the PID, and layer indices (if present) are as | |||
up-switching to a current spatial layer's frame is possible from direc | described by "reference indices" below. This bit <bcp14>MUST</bcp14> | |||
tly | only be set to 1 if the I bit is also set to 1; if the I bit is | |||
lower spatial layer frame. P SHOULD also be set to zero when | set to 0, then this bit <bcp14>MUST</bcp14> also be set to 0 and | |||
encoding a layer synchronization frame in response to an <xref target= | ignored by receivers. (Flexible mode's reference indices are defined | |||
'I-D.ietf-avtext-lrr'>LRR</xref> message (see <xref target='LRR'/>). | as offsets from the Picture ID field, so they would have no meaning | |||
When P is set to zero, the TID field (described below) MUST also | if I were not set.) The value of the F bit <bcp14>MUST</bcp14> | |||
be set to 0 (if present). Note that the P bit does not | only change on the first packet of a key picture. A "key picture" is | |||
forbid intra-picture, inter-layer prediction from earlier | a picture whose base spatial-layer frame is a keyframe, and thus one w | |||
frames of the same picture, if any.</t> | hich | |||
completely resets the encoder state. This packet will have its | ||||
<t hangText="L:">Layer indices present. When set to one, | P bit equal to 0, SID or L bit (described below) equal to 0, | |||
the one or two octets following the mandatory first octet and the PID | and B bit (described below) equal to 1.</dd> | |||
(if present) are as described by "Layer indices" below. If the F bit ( | <dt>B:</dt> | |||
described below) | <dd>Start of a frame. This bit <bcp14>MUST</bcp14> be set to 1 if | |||
is set to 1 (indicating flexible mode), then only one octet is present | ||||
for the | ||||
layer indices. Otherwise if the F bit is set to 0 (indicating non-flex | ||||
ible mode), | ||||
then two octets are present for the layer indices.</t> | ||||
<t hangText="F:">Flexible mode. F set to one indicates | ||||
flexible mode and if the P bit is also set to one, then the octets fol | ||||
lowing | ||||
the mandatory first octet, the PID, and layer indices (if present) are | ||||
as described by "Reference indices" below. This MUST only be set to 1 | ||||
if the I | ||||
bit is also set to one; if the I bit is set to zero, then this MUST al | ||||
so be | ||||
set to zero and ignored by receivers. (Flexible mode's Reference indic | ||||
es are defined as offsets | ||||
from the Picture ID field, so they would have no meaning if I were not | ||||
set.) | ||||
The value of this F | ||||
bit MUST only change | ||||
on the first packet of a key picture. A key picture is a | ||||
picture whose base spatial layer frame is a key frame, and | ||||
which thus completely resets the encoder state. This | ||||
packet will have its P bit | ||||
equal to zero, SID or L bit (described below) equal to zero, and B bit | ||||
(described below) | ||||
equal to 1.</t> | ||||
<t hangText="B:">Start of a frame. MUST be set to 1 if | ||||
the first payload octet of the RTP packet is the beginning of a | the first payload octet of the RTP packet is the beginning of a | |||
new VP9 frame, and MUST NOT be 1 otherwise. Note that this | new VP9 frame; otherwise, it <bcp14>MUST NOT</bcp14> be 1. Note that t | |||
frame might not be the first frame of a picture.</t> | his | |||
frame might not be the first frame of a picture.</dd> | ||||
<t hangText="E:">End of a frame. MUST be set to 1 for the final | <dt>E:</dt> | |||
RTP packet of a VP9 frame, and 0 otherwise. This enables a | <dd>End of a frame. This bit <bcp14>MUST</bcp14> be set to 1 for the | |||
final | ||||
RTP packet of a VP9 frame; otherwise, it is 0. This enables a | ||||
decoder to finish decoding the frame, where it otherwise may need to | decoder to finish decoding the frame, where it otherwise may need to | |||
wait for the next packet to explicitly know that the frame is complete . | wait for the next packet to explicitly know that the frame is complete . | |||
Note that, if spatial scalability is in use, more frames from the | Note that, if spatial scalability is in use, more frames from the | |||
same picture may follow; see the description of the B bit above.</t> | same picture may follow; see the description of the B bit above.</dd> | |||
<dt>V:</dt> | ||||
<t hangText="V:">Scalability structure (SS) data present. When set | <dd>Scalability Structure (SS) data present. When set | |||
to one, the OPTIONAL SS data MUST be present in the payload descriptor | to 1, the <bcp14>OPTIONAL</bcp14> SS data <bcp14>MUST</bcp14> be prese | |||
. | nt in the payload descriptor. | |||
Otherwise, the SS data MUST NOT be present.</t> | Otherwise, the SS data <bcp14>MUST NOT</bcp14> be present.</dd> | |||
<dt>Z:</dt> | ||||
<t hangText="Z:">Not a reference frame for upper spatial | <dd>Not a reference frame for upper spatial layers. If set to 1, | |||
layers. If set to 1, indicates that frames with higher | indicates that frames with higher spatial layers SID+1 and greater | |||
spatial layers SID+1 and greater of the current and following pictures | of the current and following pictures do not depend on the current | |||
do not depend on the current spatial layer SID frame. This | spatial-layer SID frame. This enables a decoder that is targeting a | |||
enables a decoder which is targeting a higher spatial layer | higher spatial layer to know that it can safely discard this | |||
to know that it can safely discard this packet's frame | packet's frame without processing it, without having to wait for the | |||
without processing it, without having to wait for the "D" | D bit in the higher-layer frame (see below).</dd> | |||
bit in the higher-layer frame (see below).</t> | </dl> | |||
</list></t> | ||||
<t>The mandatory first octet is followed by the extension data fields th at | <t>The mandatory first octet is followed by the extension data fields th at | |||
are enabled:<list style="hanging"> | are enabled:</t> | |||
<t hangText="M:">The most significant bit of the first octet is an | <dl newline="false" spacing="normal"> | |||
extension flag. The field MUST be present if the I bit is equal to | <dt>M:</dt> | |||
one. If M is set, the PID field MUST contain 15 bits; otherwise, it MU | ||||
ST | ||||
contain 7 bits. See PID below.</t> | ||||
<t hangText="Picture ID (PID):">Picture ID represented in 7 or 15 bits | ||||
, | ||||
depending on the M bit. This is a running index of the pictures, where | ||||
the | ||||
sender increments the value by 1 for each picture it sends. (Note how | ||||
ever that | ||||
because a middlebox can discard pictures where permitted by the scalab | ||||
ility structure, Picture IDs | ||||
as received by a receiver might not be contiguous.) This | ||||
field MUST be present if the I bit is equal to one. If M is set to zer | ||||
o, | ||||
7 bits carry the PID; else if M is set to one, 15 bits carry | ||||
the PID in network byte order. | ||||
The sender may choose between a 7- or 15-bit index. The PID SHOULD sta | ||||
rt on a | ||||
random number, and MUST wrap after reaching the maximum ID (0x7f or 0x | ||||
7fff depending on | ||||
the index size chosen). The receiver | ||||
MUST NOT assume that the number of bits in PID stay the same through t | ||||
he | ||||
session. If this field transitions from 7-bits to 15-bits, the value | ||||
is zero-extended | ||||
(i.e. the value after 0x6e is 0x006f); if the field transitions from 1 | ||||
5 bits to 7 bits, | ||||
it is truncated (i.e. the value after 0x1bbe is 0x3f). | ||||
</t> | ||||
<t>In the non-flexible mode (when the F bit is set to 0), this PID is | ||||
used | ||||
as an index to the picture group (PG) specified in the SS data below. | ||||
In this mode, the | ||||
PID of the key frame corresponds to the first specified frame in the | ||||
PG. Then subsequent PIDs are mapped to subsequently specified frames | ||||
in | ||||
the PG (modulo N_G, specified in the SS data below), respectively.</t> | ||||
<t>All frames of the same picture MUST have the same PID value. | ||||
</t> | ||||
<t>Frames (and their corresponding pictures) with the VP9 show_ | ||||
frame field equal to 0 MUST | ||||
have distinct PID values from subsequent pictures with sh | ||||
ow_frame equal to 1. Thus, | ||||
a Picture as defined in this specification is different t | ||||
han a VP9 Superframe.</t> | ||||
<t>All frames of the same picture MUST have the same value for | ||||
show_frame.</t> | ||||
<t hangText="Layer indices:">This information is optional but RECOMMEN | ||||
DED | ||||
whenever encoding with layers. For both flexible and non-flexible mod | ||||
es, | ||||
one octet is used to specify a layer frame's temporal layer ID (TID) a | ||||
nd spatial layer ID (SID) | ||||
as shown both in <xref target="figureVP9payloadDescriptor"/> and <xref | ||||
target="figureVP9payloadDescriptorNonFlexible"/>. | ||||
Additionally, a bit (U) is used to indicate that the current frame is | ||||
a | ||||
"switching up point" frame. Another bit (D) is used to indicate wheth | ||||
er inter-layer | ||||
prediction is used for the current frame.</t> | ||||
<t>In the non-flexible mode (when the F bit is set to 0), another octe | ||||
t is used | ||||
to represent temporal layer 0 index (TL0PICIDX), as depicted in <xref | ||||
target="figureVP9payloadDescriptorNonFlexible"/>. | ||||
The TL0PICIDX is present so that all minimally required frames - the b | ||||
ase temporal layer frames - can be tracked.</t> | ||||
<t>The TID and SID fields indicate the temporal and spatial layers and | ||||
can help middleboxes and | ||||
endpoints quickly identify which layer a packet belongs to. | ||||
<list style="hanging"> | ||||
<t hangText="TID:">The temporal layer ID of current frame. In the c | ||||
ase of non-flexible mode, | ||||
if PID is mapped to a picture in a specified PG, then | ||||
the value of TID MUST match the corresponding TID value of the mappe | ||||
d picture in the PG.</t> | ||||
<t hangText="U:">Switching up point. If this bit is set to 1 for th | ||||
e current picture with temporal | ||||
layer ID equal to TID, then "switch up" to a higher frame rate is po | ||||
ssible as subsequent higher temporal | ||||
layer pictures will not depend on any picture before the current pic | ||||
ture (in coding order) with temporal layer | ||||
ID greater than TID.</t> | ||||
<t hangText="SID:">The spatial layer ID of current frame. Note that | <dd>The most significant bit of the first octet is an extension | |||
frames with spatial layer SID > 0 | flag. The field <bcp14>MUST</bcp14> be present if the I bit is equal | |||
may be dependent on decoded spatial layer SID-1 frame within the sam | to 1. If M is set, the PID field <bcp14>MUST</bcp14> contain 15 | |||
e picture. Different | bits; otherwise, it <bcp14>MUST</bcp14> contain 7 bits. See PID | |||
frames of the same picture MUST have distinct spatial lay | below.</dd> | |||
er IDs, and frames' spatial layers | <dt>Picture ID (PID):</dt> | |||
MUST appear in increasing order within the frame.</t> | <dd>Picture ID represented in 7 or 15 bits, depending on the M | |||
bit. This is a running index of the pictures, where the sender | ||||
increments the value by 1 for each picture it sends. (Note, | ||||
however, that because a middlebox can discard pictures where | ||||
permitted by the SS, Picture IDs as received by a | ||||
receiver might not be contiguous.) This field <bcp14>MUST</bcp14> | ||||
be present if the I bit is equal to 1. If M is set to 0, 7 bits | ||||
carry the PID; else, if M is set to 1, 15 bits carry the PID in | ||||
network byte order. The sender may choose between a 7- or 15-bit | ||||
index. The PID <bcp14>SHOULD</bcp14> start on a random number and | ||||
<bcp14>MUST</bcp14> wrap after reaching the maximum ID (0x7f or | ||||
0x7fff depending on the index size chosen). The receiver <bcp14>MUST | ||||
NOT</bcp14> assume that the number of bits in the PID stays the same | ||||
through the session. If this field transitions from 7 bits to 15 | ||||
bits, the value is zero-extended (i.e., the value after 0x6e is | ||||
0x006f); if the field transitions from 15 bits to 7 bits, it is | ||||
truncated (i.e., the value after 0x1bbe is 0x3f). | ||||
</dd> | ||||
<dt/> | ||||
<dd>In the non-flexible mode (when the F bit is set to 0), this PID | ||||
is used as an index to the PG specified in the SS | ||||
data below. In this mode, the PID of the keyframe corresponds to | ||||
the first specified frame in the PG. Then subsequent PIDs are | ||||
mapped to subsequently specified frames in the PG (modulo N_G, | ||||
specified in the SS data below), respectively.</dd> | ||||
<dt/> | ||||
<dd>All frames of the same picture <bcp14>MUST</bcp14> have the same | ||||
PID value.</dd> | ||||
<dt/> | ||||
<dd>Frames (and their corresponding pictures) with the VP9 | ||||
show_frame field equal to 0 <bcp14>MUST</bcp14> have distinct PID | ||||
values from subsequent pictures with show_frame equal to 1. Thus, a | ||||
picture (as defined in this specification) is different than a VP9 | ||||
superframe.</dd> | ||||
<dt/> | ||||
<dd>All frames of the same picture <bcp14>MUST</bcp14> have the same | ||||
value for show_frame.</dd> | ||||
<t hangText="D:">Inter-layer dependency used. MUST be set to one if | <dt>Layer indices:</dt> | |||
and only if the current spatial layer SID frame | <dd>This field is optional but <bcp14>RECOMMENDED</bcp14> | |||
depends on spatial layer SID-1 frame of the same picture, otherwise | whenever encoding with layers. For both flexible and non-flexible | |||
MUST be set to zero. For the base layer frame | modes, one octet is used to specify a layer frame's temporal-layer | |||
(with SID equal to 0), this D bit MUST be set to zero.</t> | ID (TID) and spatial-layer ID (SID) as shown both in <xref | |||
target="figureVP9payloadDescriptor" format="default"/> and <xref | ||||
target="figureVP9payloadDescriptorNonFlexible" format="default"/>. | ||||
Additionally, a bit (U) is used to indicate that the current frame | ||||
is a "switching up point" frame. Another bit (D) is used to | ||||
indicate whether inter-layer prediction is used for the current | ||||
frame.</dd> | ||||
<dt/> | ||||
<dd>In the non-flexible mode (when the F bit is set to 0), another | ||||
octet is used to represent Temporal Layer 0 Picture Index (8 bits) (TL | ||||
0PICIDX), as | ||||
depicted in <xref target="figureVP9payloadDescriptorNonFlexible" | ||||
format="default"/>. The TL0PICIDX is present so that all minimally | ||||
required frames (the base temporal-layer frames) can be | ||||
tracked.</dd> | ||||
<dt/> | ||||
<dd> | ||||
<t>The TID and SID fields indicate the temporal and spatial layers | ||||
and can help middleboxes and endpoints quickly identify which | ||||
layer a packet belongs to. | ||||
<t hangText="TL0PICIDX:">8 bits temporal layer zero index. TL0PICIDX | </t> | |||
is only present | <dl newline="false" spacing="normal"> | |||
in the non-flexible mode (F = 0). This is a running index for the t | <dt>TID:</dt> | |||
emporal | <dd>The temporal-layer ID of the current frame. In the case of | |||
base layer pictures, i.e., the pictures with TID set to 0. If TID i | non-flexible mode, if a PID is mapped to a picture in a specified | |||
s larger than 0, | PG, then the value of the TID <bcp14>MUST</bcp14> match the | |||
TL0PICIDX indicates which temporal base layer picture the current pi | corresponding TID value of the mapped picture in the PG.</dd> | |||
cture depends on. TL0PICIDX MUST be | <dt>U:</dt> | |||
incremented by 1 when TID is equal to 0. The index SHOULD start on | <dd>Switching up point. If this bit is set to 1 for the current | |||
a random number, and MUST restart | picture with a temporal-layer ID equal to value T, then "switching | |||
at 0 after reaching the maximum number 255.</t> | up" | |||
</list></t> | to a higher frame rate is possible as subsequent higher | |||
temporal-layer pictures will not depend on any picture before | ||||
the current picture (in coding order) with a temporal-layer ID | ||||
value greater than T.</dd> | ||||
<dt>SID:</dt> | ||||
<dd>The spatial-layer ID of the current frame. Note that frames | ||||
with spatial-layer SID > 0 may be dependent on decoded | ||||
spatial-layer SID-1 frame within the same picture. Different | ||||
frames of the same picture <bcp14>MUST</bcp14> have distinct | ||||
spatial-layer IDs, and frames' spatial layers | ||||
<bcp14>MUST</bcp14> appear in increasing order within the | ||||
frame.</dd> | ||||
<dt>D:</dt> | ||||
<dd>Inter-layer dependency is used. D <bcp14>MUST</bcp14> be | ||||
set to 1 if and only if the current spatial-layer SID frame | ||||
depends on spatial-layer SID-1 frame of the same picture; | ||||
otherwise, it <bcp14>MUST</bcp14> be set to 0. For the | ||||
base-layer frame (with SID equal to 0), the D bit | ||||
<bcp14>MUST</bcp14> be set to 0.</dd> | ||||
<dt>TL0PICIDX:</dt> | ||||
<dd>Temporal Layer 0 Picture Index (8 bits). TL0PICIDX is only pres | ||||
ent | ||||
in the non-flexible mode (F = 0). This is a running index for | ||||
the temporal base-layer pictures, i.e., the pictures with a TID | ||||
set to 0. If the TID is larger than 0, TL0PICIDX indicates which | ||||
temporal base-layer picture the current picture depends on. | ||||
TL0PICIDX <bcp14>MUST</bcp14> be incremented by 1 when the TID is | ||||
equal to 0. The index <bcp14>SHOULD</bcp14> start on a random | ||||
number and <bcp14>MUST</bcp14> restart at 0 after reaching the | ||||
maximum number 255.</dd> | ||||
</dl> | ||||
</dd> | ||||
<dt>Reference indices:</dt> | ||||
<dd> | ||||
<t>When P and F are both set to 1, indicating a non-keyframe in | ||||
flexible mode, then at least one reference index | ||||
<bcp14>MUST</bcp14> be specified as below. Additional reference | ||||
indices (a total of up to three reference indices are allowed) may b | ||||
e | ||||
specified using the N bit below. When either P or F is set to 0, | ||||
then no reference index is specified. | ||||
</t> | ||||
<dl newline="false" spacing="normal"> | ||||
<dt>P_DIFF:</dt> | ||||
<dd>The reference index (in 7 bits) specified as the relative | ||||
PID from the current picture. For example, P_DIFF=3 on a | ||||
packet containing the picture with PID 112 means that the | ||||
picture refers back to the picture with PID 109. This | ||||
calculation is done modulo the size of the PID field, i.e., | ||||
either 7 or 15 bits. A P_DIFF value of 0 is invalid.</dd> | ||||
<dt>N:</dt> | ||||
<dd>1 if there is an additional P_DIFF following the current P_DIFF.< | ||||
/dd> | ||||
</dl> | ||||
</dd> | ||||
</dl> | ||||
<t hangText="Reference indices:">When P and F are both set to one, ind | <section anchor="VP9payloadDescriptorSS" numbered="true" toc="default"> | |||
icating a non-key frame in | <name>Scalability Structure (SS)</name> | |||
flexible mode, then at least | <t>The SS data describes the resolution of | |||
one reference index MUST be specified as below. Additional reference | each frame within a picture as well as the inter-picture | |||
indices (total of up to | dependencies for a PG. If the VP9 payload | |||
3 reference indices are allowed) may be specified using the N bit belo | descriptor's V bit is set, the SS data is present in the position | |||
w. When either P or F is | indicated in Figures <xref format="counter" target="figureVP9payloadDe | |||
set to zero, then no reference index is specified. | scriptor"/> and <xref target="figureVP9payloadDescriptorNonFlexible" format="cou | |||
<list style="hanging"> | nter"/>.</t> | |||
<t hangText="P_DIFF:">The reference index (in 7 bits) specified as t | ||||
he | ||||
relative PID from the current picture. For example, P_DIFF=3 | ||||
on a packet containing the picture with PID 112 means | ||||
that the picture refers back to the picture with PID | ||||
109. This calculation is done modulo the size of the PID field, | ||||
i.e., either 7 or 15 bits. A P_DIFF value of 0 is invalid.</t> | ||||
<t hangText="N:">1 if there is an additional P_DIFF following the curre | ||||
nt P_DIFF.</t> | ||||
</list></t> | ||||
</list></t> | ||||
<section anchor="VP9payloadDescriptorSS" title="Scalability Structure (SS) | <figure anchor="figureVP9ScalabilityStructure" title="VP9 Scalability | |||
:"> | Structure"> | |||
<t>The scalability structure (SS) data describes the resolution of | <artwork name="" type="" align="left" alt=""><![CDATA[ | |||
each frame within a picture as well as the inter-picture dependencies | ||||
for a picture group (PG). If the VP9 payload descriptor's "V" | ||||
bit is set, the SS data is present in the position indicated in | ||||
<xref target="figureVP9payloadDescriptor"/> and <xref target="figureVP9p | ||||
ayloadDescriptorNonFlexible"/>.</t> | ||||
<figure anchor="figureVP9ScalabilityStructure"> | ||||
<artwork><![CDATA[ | ||||
+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+ | |||
V: | N_S |Y|G|-|-|-| | V: | N_S |Y|G|-|-|-| | |||
+-+-+-+-+-+-+-+-+ -\ | +-+-+-+-+-+-+-+-+ -\ | |||
Y: | WIDTH | (OPTIONAL) . | Y: | WIDTH | (OPTIONAL) . | |||
+ + . | + + . | |||
| | (OPTIONAL) . | | | (OPTIONAL) . | |||
+-+-+-+-+-+-+-+-+ . - N_S + 1 times | +-+-+-+-+-+-+-+-+ . - N_S + 1 times | |||
| HEIGHT | (OPTIONAL) . | | HEIGHT | (OPTIONAL) . | |||
+ + . | + + . | |||
| | (OPTIONAL) . | | | (OPTIONAL) . | |||
+-+-+-+-+-+-+-+-+ -/ | +-+-+-+-+-+-+-+-+ -/ | |||
G: | N_G | (OPTIONAL) | G: | N_G | (OPTIONAL) | |||
+-+-+-+-+-+-+-+-+ -\ | +-+-+-+-+-+-+-+-+ -\ | |||
N_G: | TID |U| R |-|-| (OPTIONAL) . | N_G: | TID |U| R |-|-| (OPTIONAL) . | |||
+-+-+-+-+-+-+-+-+ -\ . - N_G times | +-+-+-+-+-+-+-+-+ -\ . - N_G times | |||
| P_DIFF | (OPTIONAL) . - R times . | | P_DIFF | (OPTIONAL) . - R times . | |||
+-+-+-+-+-+-+-+-+ -/ -/ | +-+-+-+-+-+-+-+-+ -/ -/ | |||
]]></artwork> | ]]></artwork> | |||
</figure> | </figure> | |||
<t><list style="hanging"> | ||||
<t hangText="N_S:">N_S + 1 indicates the number of spatial | ||||
layers present in the VP9 stream.</t> | ||||
<t hangText="Y:">Each spatial layer's frame resolution present. | ||||
When set to one, the OPTIONAL WIDTH (2 octets) and HEIGHT | ||||
(2 octets) MUST be present for each layer frame. Otherwise, the | ||||
resolution MUST NOT be present.</t> | ||||
<t hangText="G:">PG description present flag.</t> | ||||
<t hangText="-:">Bit reserved for future use. MUST be set to | ||||
zero and MUST be ignored by the receiver.</t> | ||||
<t hangText="N_G:">N_G indicates the number of pictures in a | ||||
Picture Group (PG). | ||||
If N_G is greater than 0, then the SS data allows | ||||
the inter-picture dependency structure of the VP9 stream to | ||||
be pre-declared, rather than indicating it on the fly with | ||||
every packet. If N_G is greater than 0, then for N_G | ||||
pictures in the PG, each picture's temporal layer ID (TID), switch up | ||||
point (U), | ||||
and the Reference indices (P_DIFFs) are specified.</t> | ||||
<t>The first picture specified in the PG MUST have TID set to 0.</t> | ||||
<t>G set to 0 or N_G set to 0 indicates that either there is only one | ||||
temporal | ||||
layer (for non-flexible mode) or no fixed inter-picture dependency inf | ||||
ormation is present | ||||
(for flexible mode) going forward in the bitstream.</t> | ||||
<t>Note that for a given picture, all frames follow the | <!--[rfced] We note that not all fields that appear in Figure 4 are | |||
same inter-picture dependency structure. However, the frame rate | described following it. Please review and let us know if text | |||
of each spatial layer can be different from each other and this can | (or a pointer to where the reader can get more information on | |||
be described with the use of the D bit described above. The | these fields) should be added. | |||
specified dependency structure in the SS data MUST be for the highest | ||||
frame rate layer.</t> | ||||
</list></t> | ||||
<t>In a scalable stream sent with a fixed pattern, the SS data | --> | |||
SHOULD be included in the first packet of every key frame. This is a pac | <dl newline="false" spacing="normal"> | |||
ket | <dt>N_S:</dt> | |||
with P bit equal to zero, SID or L bit equal to zero, and B bit equal to | <dd>Number of Spatial Layers Minus 1. N_S + 1 indicates the number | |||
1. | of spatial | |||
The SS data MUST only be changed on the picture that corresponds to the | layers present in the VP9 stream.</dd> | |||
first picture specified in the previous SS data's PG | <dt>Y:</dt> | |||
(if the previous SS data's N_G was greater than 0).</t> | <dd>Each spatial layer's frame resolution is present. | |||
When set to 1, the <bcp14>OPTIONAL</bcp14> WIDTH (2 octets) and HEIGHT | ||||
(2 octets) <bcp14>MUST</bcp14> be present for each layer frame. Other | ||||
wise, the | ||||
resolution <bcp14>MUST NOT</bcp14> be present.</dd> | ||||
<dt>G:</dt> | ||||
<dd>The PG description present flag.</dd> | ||||
<dt>-:</dt> | ||||
<dd>A bit reserved for future use. It <bcp14>MUST</bcp14> be set | ||||
to 0 and <bcp14>MUST</bcp14> be ignored by the receiver.</dd> | ||||
<dt>N_G:</dt> | ||||
<dd>N_G indicates the number of pictures in a PG. | ||||
If N_G is greater than 0, then the SS data allows the | ||||
inter-picture dependency structure of the VP9 stream to be | ||||
pre-declared, rather than indicating it on the fly with every | ||||
packet. If N_G is greater than 0, then for N_G pictures in the | ||||
PG, each picture's temporal-layer ID (TID), switch up point (U), | ||||
and reference indices (P_DIFFs) are specified.</dd> | ||||
<dt/> | ||||
<dd>The first picture specified in the PG <bcp14>MUST</bcp14> have a | ||||
TID set to 0.</dd> | ||||
<dt/> | ||||
<dd>G set to 0 or N_G set to 0 indicates that either there is only | ||||
one temporal layer (for non-flexible mode) or no fixed | ||||
inter-picture dependency information is present (for flexible | ||||
mode) going forward in the bitstream.</dd> | ||||
<dt/> | ||||
<dd>Note that for a given picture, all frames follow the same | ||||
inter-picture dependency structure. However, the frame rate of | ||||
each spatial layer can be different from each other; this can | ||||
be described with the use of the D bit described above. The | ||||
specified dependency structure in the SS data <bcp14>MUST</bcp14> | ||||
be for the highest frame rate layer.</dd> | ||||
</dl> | ||||
<t>In a scalable stream sent with a fixed pattern, the SS data | ||||
<bcp14>SHOULD</bcp14> be included in the first packet of every key | ||||
frame. This is a packet with the P bit equal to 0, SID or L bit equal | ||||
to 0, and B bit equal to 1. The SS data <bcp14>MUST</bcp14> only | ||||
be changed on the picture that corresponds to the first picture | ||||
specified in the previous SS data's PG (if the previous SS data's | ||||
N_G was greater than 0).</t> | ||||
</section> | ||||
</section> | </section> | |||
</section> | <section numbered="true" toc="default"> | |||
<name>Frame Fragmentation</name> | ||||
<section title="Frame Fragmentation"> | <t>VP9 frames are fragmented into packets in RTP sequence number | |||
<t>VP9 frames are fragmented into packets, in RTP sequence | order: beginning with a packet with the B bit set and ending with a | |||
number order, beginning with a | packet with the E bit set. There is no mechanism for finer-grained | |||
packet with the B bit set, and ending with a packet with the | access to parts of a VP9 frame.</t> | |||
E bit set. There is no mechanism for finer-grained | ||||
access to parts of a VP9 frame.</t> | ||||
</section> | </section> | |||
<section numbered="true" toc="default"> | ||||
<section title="Scalable encoding considerations"> | <name>Scalable Encoding Considerations</name> | |||
<t>In addition to the use of reference frames, VP9 has several | ||||
<t>In addition to the use of reference frames, VP9 has several | ||||
additional forms of inter-frame dependencies, largely | additional forms of inter-frame dependencies, largely | |||
involving probability tables for the entropy and tree | involving probability tables for the entropy and tree | |||
encoders. In VP9 syntax, the syntax element | encoders. In VP9 syntax, the syntax element | |||
"error_resilient_mode" resets this additional inter-frame | "error_resilient_mode" resets this additional inter-frame | |||
data, allowing a frame's syntax to be decoded | data, allowing a frame's syntax to be decoded | |||
independently.</t> | independently.</t> | |||
<t>Due to the requirements of scalable streams, a VP9 encoder | ||||
<t>Due to the requirements of scalable streams, a VP9 encoder | ||||
producing a scalable stream needs to ensure that a frame does | producing a scalable stream needs to ensure that a frame does | |||
not depend on a previous frame (of the same or a previous | not depend on a previous frame (of the same or a previous | |||
picture) that can legitimately be removed from the stream. | picture) that can legitimately be removed from the stream. | |||
Thus, a frame that follows a frame that might be removed (in full decode | Thus, a frame that follows a frame that might be removed (in full decode | |||
order) MUST be encoded with "error_resilient_mode" set to | order) <bcp14>MUST</bcp14> be encoded with "error_resilient_mode" set to | |||
true.</t> | true.</t> | |||
<t>For spatially scalable streams, this means that | ||||
<t>For spatially-scalable streams, this means that | ||||
"error_resilient_mode" needs to be turned on for the base | "error_resilient_mode" needs to be turned on for the base | |||
spatial layer; it can however be turned off for higher spatial | spatial layer; however, it can be turned off for higher spatial | |||
layers, assuming they are sent with inter-layer dependency | layers, assuming they are sent with inter-layer dependency | |||
(i.e. with the "D" bit set). For streams that are only | (i.e., with the D bit set). For streams that are only | |||
temporally-scalable without spatial scalability, | temporally scalable without spatial scalability, | |||
"error_resilient_mode" can additionally be turned off for any | "error_resilient_mode" can additionally be turned off for any | |||
picture that immediately follows a temporal layer 0 frame.</t> | picture that immediately follows a temporal-layer 0 frame.</t> | |||
</section> | ||||
</section> | <section numbered="true" toc="default"> | |||
<name>Examples of VP9 RTP Stream</name> | ||||
<section title="Examples of VP9 RTP Stream"> | <section numbered="true" toc="default"> | |||
<section title="Reference picture use for scalable structure"> | <name>Reference Picture Use for Scalable Structure</name> | |||
<t>As discussed in <xref target="mediaFormatDescription" format="defau | ||||
<t>As discussed in <xref target="mediaFormatDescription"/>, the | lt"/>, the | |||
VP9 codec can maintain up to eight reference frames, of | VP9 codec can maintain up to eight reference frames, of | |||
which up to three can be referenced or updated by any new | which up to three can be referenced or updated by any new | |||
frame. This section illustrates one way that a scalable | frame. This section illustrates one way that a scalable | |||
structure (with three spatial layers and three temporal | structure (with three spatial layers and three temporal | |||
layers) can be constructed using these reference | layers) can be constructed using these reference | |||
frames.</t> | frames.</t> | |||
<table align="center"> | ||||
<texttable title="Example scalability structure"> | <name>Example Scalability Structure</name> | |||
<thead> | ||||
<ttcol align="center">Temporal</ttcol> | <tr> | |||
<ttcol align="center">Spatial</ttcol> | <th align="center">Temporal</th> | |||
<ttcol align="center">References</ttcol> | <th align="center">Spatial</th> | |||
<ttcol align="center">Updates</ttcol> | <th align="center">References</th> | |||
<c>0</c><c>0</c><c>0</c><c>0</c> | <th align="center">Updates</th> | |||
<c>0</c><c>1</c><c>0,1</c><c>1</c> | </tr> | |||
<c>0</c><c>2</c><c>1,2</c><c>2</c> | </thead> | |||
<c>2</c><c>0</c><c>0</c><c>6</c> | <tbody> | |||
<c>2</c><c>1</c><c>1,6</c><c>7</c> | <tr> | |||
<c>2</c><c>2</c><c>2,7</c><c>-</c> | <td align="center">0</td> | |||
<c>1</c><c>0</c><c>0</c><c>3</c> | <td align="center">0</td> | |||
<c>1</c><c>1</c><c>1,3</c><c>4</c> | <td align="center">0</td> | |||
<c>1</c><c>2</c><c>2,4</c><c>5</c> | <td align="center">0</td> | |||
<c>2</c><c>0</c><c>3</c><c>6</c> | </tr> | |||
<c>2</c><c>1</c><c>4,6</c><c>7</c> | <tr> | |||
<c>2</c><c>2</c><c>5,7</c><c>-</c> | <td align="center">0</td> | |||
<td align="center">1</td> | ||||
</texttable> | <td align="center">0,1</td> | |||
<td align="center">1</td> | ||||
<t>This structure is constructed such that the "U" bit can | </tr> | |||
<tr> | ||||
<td align="center">0</td> | ||||
<td align="center">2</td> | ||||
<td align="center">1,2</td> | ||||
<td align="center">2</td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">2</td> | ||||
<td align="center">0</td> | ||||
<td align="center">0</td> | ||||
<td align="center">6</td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">2</td> | ||||
<td align="center">1</td> | ||||
<td align="center">1,6</td> | ||||
<td align="center">7</td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">2</td> | ||||
<td align="center">2</td> | ||||
<td align="center">2,7</td> | ||||
<td align="center">-</td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">1</td> | ||||
<td align="center">0</td> | ||||
<td align="center">0</td> | ||||
<td align="center">3</td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">1</td> | ||||
<td align="center">1</td> | ||||
<td align="center">1,3</td> | ||||
<td align="center">4</td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">1</td> | ||||
<td align="center">2</td> | ||||
<td align="center">2,4</td> | ||||
<td align="center">5</td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">2</td> | ||||
<td align="center">0</td> | ||||
<td align="center">3</td> | ||||
<td align="center">6</td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">2</td> | ||||
<td align="center">1</td> | ||||
<td align="center">4,6</td> | ||||
<td align="center">7</td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">2</td> | ||||
<td align="center">2</td> | ||||
<td align="center">5,7</td> | ||||
<td align="center">-</td> | ||||
</tr> | ||||
</tbody> | ||||
</table> | ||||
<t>This structure is constructed such that the U bit can | ||||
always be set.</t> | always be set.</t> | |||
</section> | ||||
</section> | </section> | |||
</section> | ||||
</section> | </section> | |||
<section anchor="Feedback" title="Feedback Messages and Header Extensions"> | <section anchor="Feedback" numbered="true" toc="default"> | |||
<section anchor="RPSI" title="Reference Picture Selection Indication (RPSI | <name>Feedback Messages and Header Extensions</name> | |||
)"> | <section anchor="RPSI" numbered="true" toc="default"> | |||
<t>The reference picture selection index is a payload-specific | ||||
<name>Reference Picture Selection Indication (RPSI)</name> | ||||
<t>The RPSI is a payload-specific | ||||
feedback message defined within the RTCP-based feedback format. The | feedback message defined within the RTCP-based feedback format. The | |||
RPSI message is generated by a receiver and can be used in two ways. | RPSI message is generated by a receiver and can be used in two ways: | |||
Either it can signal a preferred reference picture when a loss has | either it can signal a preferred reference picture when a loss has | |||
been detected by the decoder -- preferably then a reference that the | been detected by the decoder (preferably a reference that the decoder | |||
decoder knows is perfect -- or, it can be used as positive feedback | knows is perfect) or it can be used as positive feedback information | |||
information to acknowledge correct decoding of certain reference | to acknowledge correct decoding of certain reference pictures. The | |||
pictures. The positive feedback method is useful for VP9 used for | positive feedback method is useful for VP9 used for point-to-point | |||
point to point (unicast) communication. The use of RPSI for VP9 is prefe | (unicast) communication. The use of RPSI for VP9 is preferably | |||
rably combined with a special | combined with a special update pattern of the codec's two special | |||
update pattern of the codec's two special reference frames -- the | reference frames -- the golden frame and the altref frame -- in which th | |||
golden frame and the altref frame -- in which they are updated in an | ey | |||
alternating leapfrog fashion. When a receiver has received and | are updated in an alternating leapfrog fashion. When a receiver has | |||
correctly decoded a golden or altref frame, and that frame had a | received and correctly decoded a golden or altref frame, and that | |||
Picture ID in the payload descriptor, the receiver can acknowledge this | frame had a Picture ID in the payload descriptor, the receiver can | |||
simply by sending an RPSI message back to the sender. The message body | acknowledge this simply by sending an RPSI message back to the | |||
(i.e., the "native RPSI bit string" in <xref target="RFC4585"/>) is | sender. The message body (i.e., the "native RPSI bit string" in <xref | |||
simply the (7 or 15 bit) Picture ID of the received frame.</t> | target="RFC4585" format="default"/>) is simply the (7- or 15-bit) | |||
Picture ID of the received frame.</t> | ||||
<t>Note: because all frames of the same picture must have the | <aside> | |||
<t>Note: because all frames of the same picture must have the | ||||
same inter-picture reference structure, there is no need for a | same inter-picture reference structure, there is no need for a | |||
message to specify which frame is being selected.</t> | message to specify which frame is being selected.</t></aside> | |||
</section> | </section> | |||
<section anchor="FIR" numbered="true" toc="default"> | ||||
<section title='Full Intra Request (FIR)' anchor="FIR"> | <name>Full Intra Request (FIR)</name> | |||
<t>The <xref target="RFC5104" format="default">Full Intra Request (FIR)< | ||||
<t>The <xref target='RFC5104'>Full Intra Request (FIR)</xref> | /xref> | |||
RTCP feedback message allows a receiver to request a full state refresh of an encoded stream.</t> | RTCP feedback message allows a receiver to request a full state refresh of an encoded stream.</t> | |||
<t>Upon receipt of a FIR request, a VP9 sender <bcp14>MUST</bcp14> | ||||
<t>Upon receipt of an FIR request, a VP9 sender MUST send a | send a picture with a keyframe for its spatial-layer 0 layer frame and | |||
picture with a keyframe for its spatial layer 0 layer | then send frames without inter-picture prediction (P=0) for any | |||
frame, and then send frames without inter-picture prediction | higher-layer frames.</t> | |||
(P=0) for any higher layer frames.</t> | </section> | |||
<section anchor="LRR" numbered="true" toc="default"> | ||||
</section> | <name>Layer Refresh Request (LRR)</name> | |||
<t>The <xref target="RFC9627" format="default">Layer Refresh Request | ||||
<section title="Layer Refresh Request (LRR)" anchor="LRR"> | (LRR)</xref> allows a receiver to request a single layer of a | |||
<t>The <xref target="I-D.ietf-avtext-lrr">Layer Refresh Request ( | spatially or temporally encoded stream to be refreshed without | |||
LRR)</xref> | necessarily affecting the stream's other layers.</t> | |||
allows a receiver to request a single layer of a spatially or | <figure anchor="figureLRRIndexFormat" title="LRR Index Format"> | |||
temporally encoded stream to be refreshed, without necessarily | <artwork name="" type="" align="left" alt=""><![CDATA[ | |||
affecting the stream's other layers.</t> | ||||
<figure anchor="figureLRRIndexFormat"> | ||||
<artwork><![CDATA[ | ||||
+---------------+---------------+ | +---------------+---------------+ | |||
|0|1|2|3|4|5|6|7|0|1|2|3|4|5|6|7| | |0|1|2|3|4|5|6|7|0|1|2|3|4|5|6|7| | |||
+---------------+---------+-----+ | +---------------+---------+-----+ | |||
| RES | TID | RES | SID | | | RES | TID | RES | SID | | |||
+---------------+---------+-----+ | +---------------+---------+-----+ | |||
]]></artwork> | ]]></artwork> | |||
</figure> | </figure> | |||
<t><xref target="figureLRRIndexFormat" format="default"/> shows the form | ||||
<t><xref target="figureLRRIndexFormat"/> shows the format | at | |||
of LRR's layer index fields for VP9 streams. The two "RES" | of an LRR's layer index fields for VP9 streams. The two "RES" | |||
fields MUST be set to 0 on transmission and ingnored on | fields <bcp14>MUST</bcp14> be set to 0 on transmission and ignore | |||
reception. See <xref target="VP9payloadDescriptor"/> for | d on | |||
reception. See <xref target="VP9payloadDescriptor" format="defau | ||||
lt"/> for | ||||
details on the TID and SID fields.</t> | details on the TID and SID fields.</t> | |||
<t>Identification of a layer refresh frame can be derived from the | <t>Identification of a layer refresh frame can be derived from | |||
reference IDs of each frame by backtracking the dependency chain | the reference IDs of each frame by backtracking the dependency | |||
until reaching a point where only decodable frames are being | chain until reaching a point where only decodable frames are | |||
referenced. Therefore it's recommended for both the | being referenced. Therefore, it's recommended for both the | |||
flexible and the non-flexible mode that, when switching up points are | flexible and the non-flexible mode that, when switching up | |||
being encoded in response to a LRR, those packets should contain | points are being encoded in response to an LRR, those packets | |||
layer indices and the reference field(s) so that the decoder or a | contain layer indices and the reference field or fields so | |||
<xref target='RFC7667'>selective forwarding | that the decoder or <xref target="RFC7667" | |||
middleboxes</xref> can make this derivation.</t> | format="default">selective forwarding middleboxes</xref> can | |||
make this derivation.</t> | ||||
<t>Example:</t> | <t>Example:</t> | |||
<t>LRR {1,0}, {2,1} is sent by an MCU when it is currently | <t>LRR {1,0}, {2,1} is sent by a Multipoint Control | |||
relaying {1,0} to a receiver and which wants to upgrade to | Unit (MCU) when it is currently | |||
{2,1}. In response the encoder should encode the next frames | relaying {1,0} to a receiver that wants to upgrade to | |||
{2,1}. In response, the encoder should encode the next frames | ||||
in layers {1,1} and {2,1} by only referring to frames in | in layers {1,1} and {2,1} by only referring to frames in | |||
{1,0}, or {0,0}.</t> | {1,0} or {0,0}.</t> | |||
<t>In the non-flexible mode, periodic upgrade frames can be defined by | ||||
<t>In the non-flexible mode, periodic upgrade frames can be | the layer structure of the SS; thus, periodic upgrade frames can be | |||
defined by the layer structure of the SS, thus periodic upgrade | automatically identified by the Picture ID.</t> | |||
frames can be automatically identified by the picture ID.</t> | </section> | |||
</section> | ||||
</section> | </section> | |||
<section anchor="payloadFormatParameters" | <section anchor="payloadFormatParameters" numbered="true" toc="default"> | |||
title="Payload Format Parameters"> | ||||
<t>This payload format has three optional parameters, "max-fr", "max-fs", | ||||
and "profile-id".</t> | ||||
<t>The max-fr and max-fs | <name>Payload Format Parameters</name> | |||
parameters are used to signal the capabilities of a receiver | <t>This payload format has three optional parameters: max-fr, | |||
implementation. If the implementation is willing to | max-fs, and profile-id.</t> | |||
receive media, both parameters MUST be provided. These parameters MU | <t>The max-fr and max-fs parameters are used to signal the capabilities | |||
ST | of a receiver implementation. If the implementation is willing to | |||
NOT be used for any other purpose. A media sender SHOULD NOT send | receive media, both parameters <bcp14>MUST</bcp14> be provided. These | |||
media with a frame rate or frame size exceeding the max-fr and max-f | parameters <bcp14>MUST NOT</bcp14> be used for any other purpose. A | |||
s | media sender <bcp14>SHOULD NOT</bcp14> send media with a frame rate or | |||
values signaled. (There may be scenarios, such as pre-encoded | frame size exceeding the max-fr and max-fs values signaled. (There may | |||
media or <xref target='RFC7667'>selective forwarding | be scenarios, such as pre-encoded media or <xref target="RFC7667" | |||
middleboxes</xref>, where a media sender does not have media availab | format="default">selective forwarding middleboxes</xref>, where a media | |||
le | sender does not have media available that fits within a receiver's | |||
that fits within a receivers max-fs and max-fr value; in such | max-fs and max-fr values; in such scenarios, a sender <bcp14>MAY</bcp14> | |||
scenarios, a sender MAY exceed the signaled values.) | exceed the signaled values.) | |||
<list style="hanging"> | </t> | |||
<t hangText="max-fr:">The value of max-fr is an integer | <dl newline="false" spacing="normal"> | |||
<dt>max-fr:</dt> | ||||
<dd>The value of max-fr is an integer | ||||
indicating the maximum frame rate in units of frames per | indicating the maximum frame rate in units of frames per | |||
second that the decoder is capable of decoding.</t> | second that the decoder is capable of decoding.</dd> | |||
<dt>max-fs:</dt> | ||||
<t hangText="max-fs:">The value of max-fs is an integer | <dd>The value of max-fs is an integer | |||
indicating the maximum frame size in units of macroblocks that | indicating the maximum frame size in units of macroblocks that | |||
the decoder is capable of decoding.</t> | the decoder is capable of decoding.</dd> | |||
<dt/> | ||||
<t>The decoder is capable of decoding this frame size as long | <dd>The decoder is capable of decoding this frame size as long | |||
as the width and height of the frame in macroblocks are less | as the width and height of the frame in macroblocks are each les | |||
than int(sqrt(max-fs * 8)) - for instance, a max-fs of 1200 | s | |||
than int(sqrt(max-fs * 8)); for instance, a max-fs of 1200 | ||||
(capable of supporting 640x480 resolution) will support widths | (capable of supporting 640x480 resolution) will support widths | |||
and heights up to 1552 pixels (97 macroblocks).</t> | and heights up to 1552 pixels (97 macroblocks).</dd> | |||
<dt>profile-id:</dt> | ||||
<t hangText="profile-id:">The value of profile-id is an integer | <dd>The value of profile-id is an integer indicating the default | |||
indicating the default coding profile, the subset of coding | coding profile (the subset of coding tools that may have been used to | |||
tools that may have been used to generate the stream or that the | generate the stream or that the receiver supports). <xref | |||
receiver supports). <xref target="TableOfProfileIds"/> lists all | target="TableOfProfileIds" format="default"/> lists all of the | |||
of the profiles defined in section 7.2 of <xref target="VP9-BITST | profiles defined in Section 7.2 of <xref target="VP9-BITSTREAM" | |||
REAM"/> | format="default"/> and the corresponding integer values to be | |||
and the corresponding integer values to be used.</t> | used.</dd> | |||
<dt/> | ||||
<t>If no profile-id is present, Profile 0 MUST be inferred. (The | <dd>If no profile-id is present, Profile 0 <bcp14>MUST</bcp14> be inferred. (The | |||
profile-id parameter was added relatively late in the development of this | profile-id parameter was added relatively late in the development of this | |||
specification, so some existing implementations may not send it.) | specification, so some existing implementations may not send it.) | |||
</t> | </dd> | |||
<dt/> | ||||
<t>Informative note: See <xref target="TableOfProfiles"/> for cap | <dd>Informative note: See <xref target="TableOfProfiles" | |||
abilities | format="default"/> for capabilities of coding profiles defined in Sectio | |||
of coding profiles defined in section 7.2 of <xref target="VP9-BI | n 7.2 of | |||
TSTREAM"/>.</t> | <xref target="VP9-BITSTREAM" format="default"/>.</dd> | |||
</list></t> | </dl> | |||
<t>A receiver <bcp14>MUST</bcp14> ignore any parameter unspecified in this | ||||
<t>A receiver MUST ignore any parameter unspecified in this | specification.</t> | |||
specification.</t> | ||||
<texttable anchor="TableOfProfileIds" title="Table of profile-id | ||||
integer values representing the VP9 profile corresponding to the set of | ||||
coding tools supported."> | ||||
<ttcol align="center">Profile</ttcol> | ||||
<ttcol align="center">profile-id</ttcol> | ||||
<c>0</c><c>0</c> | ||||
<c>1</c><c>1</c> | ||||
<c>2</c><c>2</c> | ||||
<c>3</c><c>3</c> | ||||
</texttable> | ||||
<texttable anchor="TableOfProfiles" title="Table of profile | ||||
capabilities."> | ||||
<ttcol align="center">Profile</ttcol> | ||||
<ttcol align="center">Bit Depth</ttcol> | ||||
<ttcol align="center">SRGB Colorspace</ttcol> | ||||
<ttcol align="center">Chroma Subsampling</ttcol> | ||||
<c>0</c><c>8</c><c>No</c><c>YUV 4:2:0</c> | ||||
<c>1</c><c>8</c><c>Yes</c><c>YUV 4:2:2,4:4:0 or 4:4:4</c> | ||||
<c>2</c><c>10 or 12</c><c>No</c><c>YUV 4:2:0</c> | ||||
<c>3</c><c>10 or 12</c><c>Yes</c><c>YUV 4:2:2,4:4:0 or 4:4:4</c> | ||||
</texttable> | ||||
<section anchor="SDPParameters" title="SDP Parameters"> | ||||
<section title="Mapping of Media Subtype Parameters to SDP"> | ||||
<t>The media type video/VP9 string is mapped to fields in the | ||||
Session Description Protocol (SDP) <xref target="RFC8866"/> as | ||||
follows: <list style="symbols"> | ||||
<t>The media name in the "m=" line of SDP MUST be video.</t> | ||||
<t>The encoding name in the "a=rtpmap" line of SDP MUST be VP9 | <table anchor="TableOfProfileIds" align="center"> | |||
(the media subtype).</t> | <name>Correspondence between profile-id Integer Values and VP9 Profiles</name> | |||
<thead> | ||||
<tr> | ||||
<th align="center">Profile</th> | ||||
<th align="center">profile-id</th> | ||||
</tr> | ||||
</thead> | ||||
<tbody> | ||||
<tr> | ||||
<td align="center">0</td> | ||||
<td align="center">0</td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">1</td> | ||||
<td align="center">1</td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">2</td> | ||||
<td align="center">2</td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">3</td> | ||||
<td align="center">3</td> | ||||
</tr> | ||||
</tbody> | ||||
</table> | ||||
<table anchor="TableOfProfiles" align="center"> | ||||
<name>Profile Capabilities</name> | ||||
<thead> | ||||
<tr> | ||||
<th align="center">Profile</th> | ||||
<th align="center">Bit Depth</th> | ||||
<th align="center">SRGB Colorspace</th> | ||||
<th align="center">Chroma Subsampling</th> | ||||
</tr> | ||||
</thead> | ||||
<tbody> | ||||
<tr> | ||||
<td align="center">0</td> | ||||
<td align="center">8</td> | ||||
<td align="center">No</td> | ||||
<td align="center">YUV 4:2:0</td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">1</td> | ||||
<td align="center">8</td> | ||||
<td align="center">Yes</td> | ||||
<td align="center">YUV 4:2:2,4:4:0 or 4:4:4</td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">2</td> | ||||
<td align="center">10 or 12</td> | ||||
<td align="center">No</td> | ||||
<td align="center">YUV 4:2:0</td> | ||||
</tr> | ||||
<tr> | ||||
<td align="center">3</td> | ||||
<td align="center">10 or 12</td> | ||||
<td align="center">Yes</td> | ||||
<td align="center">YUV 4:2:2,4:4:0 or 4:4:4</td> | ||||
</tr> | ||||
</tbody> | ||||
</table> | ||||
<aside><t keepWithPrevious="true">Note: SRGB (often sRGB) = Standard Red-G | ||||
reen-Blue</t></aside> | ||||
<t>The clock rate in the "a=rtpmap" line MUST be 90000.</t> | <section anchor="SDPParameters" numbered="true" toc="default"> | |||
<name>SDP Parameters</name> | ||||
<section numbered="true" toc="default"> | ||||
<name>Mapping of Media Subtype Parameters to SDP</name> | ||||
<t>The parameters "max-fr" and "max-fs" MUST be included in | <t>The media type video/vp9 string is mapped to fields in the | |||
Session Description Protocol (SDP) <xref target="RFC8866" format="defa | ||||
ult"/> as | ||||
follows: </t> | ||||
<ul spacing="normal"> | ||||
<li>The media name in the "m=" line of SDP <bcp14>MUST</bcp14> be vi | ||||
deo.</li> | ||||
<li>The encoding name in the "a=rtpmap" line of SDP | ||||
<bcp14>MUST</bcp14> be VP9 (the media subtype).</li> | ||||
<li>The clock rate in the "a=rtpmap" line <bcp14>MUST</bcp14> be 900 | ||||
00.</li> | ||||
<li>The parameters max-fr and max-fs <bcp14>MUST</bcp14> be included | ||||
in | ||||
the "a=fmtp" line of SDP if the receiver wishes to declare its receiver | the "a=fmtp" line of SDP if the receiver wishes to declare its receiver | |||
capabilities. These parameters are expressed as a media subtype | capabilities. These parameters are expressed as a media subtype | |||
string, in the form of a semicolon separated list of | string in the form of a semicolon-separated list of | |||
parameter=value pairs.</t> | parameter=value pairs.</li> | |||
<li>The <bcp14>OPTIONAL</bcp14> parameter profile-id, when present, | ||||
<t>The OPTIONAL parameter profile-id, when present, SHOULD be | <bcp14>SHOULD</bcp14> be | |||
included in the "a=fmtp" line of SDP. This parameter is expressed | included in the "a=fmtp" line of SDP. This parameter is expressed | |||
as a media subtype string, in the form of a parameter=value | as a media subtype string in the form of a parameter=value | |||
pair. When the parameter is not present, a value of 0 MUST be | pair. When the parameter is not present, a value of 0 <bcp14>MUST</ | |||
inferred for profile-id.</t> | bcp14> be | |||
</list></t> | inferred for profile-id.</li> | |||
</ul> | ||||
<section title="Example"> | <section numbered="true" toc="default"> | |||
<name>Example</name> | ||||
<t>An example of media representation in SDP is as follows:</t> | <t>An example of media representation in SDP is as follows:</t> | |||
<sourcecode type="sdp"><![CDATA[m=video 49170 RTP/AVPF 98 | ||||
<figure> | ||||
<artwork>m=video 49170 RTP/AVPF 98 | ||||
a=rtpmap:98 VP9/90000 | a=rtpmap:98 VP9/90000 | |||
a=fmtp:98 max-fr=30;max-fs=3600;profile-id=0 | a=fmtp:98 max-fr=30;max-fs=3600;profile-id=0 | |||
</artwork> | ]]></sourcecode> | |||
</figure> | ||||
</section> | </section> | |||
</section> | </section> | |||
<section numbered="true" toc="default"> | ||||
<section title="Offer/Answer Considerations"> | <name>Offer/Answer Considerations</name> | |||
<t>When VP9 is offered over RTP using SDP in an Offer/Answer model | <t>When VP9 is offered over RTP using SDP in an Offer/Answer model | |||
<xref target="RFC3264"/> for negotiation for unicast usage, the follow | <xref target="RFC3264" format="default"/> for negotiation for unicast | |||
ing | usage, the following | |||
limitations and rules apply: <list style="symbols"> | limitations and rules apply: </t> | |||
<t>The parameter identifying a media format configuration for VP9 i | <ul spacing="normal"> | |||
s | <li>The parameter identifying a media format configuration for VP9 i | |||
profile-id. This media format configuration parameter MUST be used | s | |||
symmetrically; that is, the answerer MUST either maintain this | profile-id. This media format configuration parameter <bcp14>MUST</ | |||
bcp14> be used | ||||
symmetrically; that is, the answerer <bcp14>MUST</bcp14> either mai | ||||
ntain this | ||||
configuration parameter or remove the media format (payload type) | configuration parameter or remove the media format (payload type) | |||
completely if it is not supported.</t> | completely if it is not supported.</li> | |||
<li>The max-fr and max-fs parameters are used declaratively to | ||||
<t>The max-fr and max-fs parameters are used declaratively to | ||||
describe receiver capabilities, even in the Offer/Answer model. | describe receiver capabilities, even in the Offer/Answer model. | |||
The values in an answer are used to describe the answerer's | The values in an answer are used to describe the answerer's | |||
capabilities, and thus their values are set independently of the | capabilities; thus, their values are set independently of the | |||
values in the offer.</t> | values in the offer.</li> | |||
<li>To simplify the handling and matching of these configurations, t | ||||
<t>To simplify the handling and matching of these configurations, | he | |||
the | same RTP payload type number used in the offer <bcp14>SHOULD</bcp1 | |||
same RTP payload type number used in the offer SHOULD also be used | 4> also be used | |||
in the answer and in a subsequent offer, as specified in <xref | in the answer and in a subsequent offer, as specified in <xref tar | |||
target="RFC3264"/>. An answer or subsequent offer | get="RFC3264" format="default"/>. An answer or subsequent offer | |||
MUST NOT contain the payload type number used in the offer unless t | <bcp14>MUST NOT</bcp14> contain the payload type number used in the | |||
he | offer unless the | |||
profile-id value is exactly the same as in the original offer. | profile-id value is exactly the same as in the original offer. | |||
However, max-fr and max-fs parameters MAY be changed in subsequent | However, max-fr and max-fs parameters <bcp14>MAY</bcp14> be changed in subsequent | |||
offers and answers, with the same payload type number, if an endpoint | offers and answers, with the same payload type number, if an endpoint | |||
wishes to change its declared receiver capabilities.</t> | wishes to change its declared receiver capabilities.</li> | |||
</list></t> | </ul> | |||
</section> | </section> | |||
</section> | </section> | |||
</section> | </section> | |||
<section anchor="mediaTypeRegistration" title="Media Type Definition"> | <section anchor="mediaTypeRegistration" numbered="true" toc="default"> | |||
<t>This registration is done using the template defined in <xref | <name>Media Type Definition</name> | |||
target="RFC6838"/> and following <xref target="RFC4855"/>. <list | <t>This registration uses the template defined in <xref target="RFC6838" f | |||
style="hanging"> | ormat="default"/> and following <xref target="RFC4855" format="default"/>. </t> | |||
<t hangText="Type name:">video</t> | ||||
<t hangText="Subtype name:">VP9</t> | ||||
<t hangText="Required parameters:">N/A.</t> | ||||
<t hangText="Optional parameters:"><vspace blankLines="0"/> | <!--[rfced] Please note that after AUTH48 concludes, we will | |||
There are three optional parameters, "max-fr", "max-fs", and "profil | communicate any changes to the media type template in Section 7 | |||
e-id". | to IANA for corresponding updates to | |||
See <xref target='payloadFormatParameters' /> for their definition. | https://www.iana.org/assignments/media-types/video/VP9 to be | |||
</t> | made.--> | |||
<t hangText="Encoding considerations:"><vspace blankLines="0"/> | <dl newline="false" spacing="normal"> | |||
<dt>Type name:</dt> | ||||
<dd>video</dd> | ||||
<dt>Subtype name:</dt> | ||||
<dd>VP9</dd> | ||||
<dt>Required parameters:</dt> | ||||
<dd>N/A</dd> | ||||
<dt>Optional parameters:</dt> | ||||
<dd> | ||||
There are three optional parameters: max-fr, max-fs, and profile-id. | ||||
See <xref target="payloadFormatParameters" format="default"/> for th | ||||
eir definition. | ||||
</dd> | ||||
<dt>Encoding considerations:</dt> | ||||
<dd> | ||||
This media type is framed in RTP and contains binary data; see | This media type is framed in RTP and contains binary data; see | |||
Section 4.8 of <xref target="RFC6838"/>.</t> | <xref target="RFC6838" sectionFormat="of" section="4.8" | |||
format="default"/>.</dd> | ||||
<t hangText="Security considerations:">See <xref | <dt>Security considerations:</dt> | |||
target="securityConsiderations"/> of RFC xxxx. <vspace | <dd> | |||
blankLines="0"/> [RFC Editor: Upon publication as an RFC, please | <t>See <xref target="securityConsiderations" format="default"/> of RFC | |||
replace "XXXX" with the number assigned to this document and | 9628. </t> | |||
remove this note.]</t> | ||||
<t hangText="Interoperability considerations:">None.</t> | ||||
<t hangText="Published specification:">VP9 bitstream format <xref | ||||
target="VP9-BITSTREAM"/> and RFC XXXX. <vspace blankLines="0"/> [RFC | ||||
Editor: Upon publication as an RFC, please replace "XXXX" with the | ||||
number assigned to this document and remove this note.] <vspace | ||||
blankLines="0"/></t> | ||||
<t hangText="Applications which use this media type:"><vspace | ||||
blankLines="0"/> For example: Video over IP, video | ||||
conferencing.</t> | ||||
<t hangText="Fragment identifier considerations:">N/A.</t | ||||
> | ||||
<t hangText="Additional information:">None.</t> | ||||
<t | </dd> | |||
hangText="Person & email address to contact for further informat | <dt>Interoperability considerations:</dt> | |||
ion:"><vspace | <dd>None</dd> | |||
blankLines="0"/> Jonathan Lennox <jonathan.lennox@8x8.com></t> | <dt>Published specification:</dt> | |||
<dd> | ||||
<t>VP9 bitstream format <xref target="VP9-BITSTREAM" format="default"/ | ||||
> and RFC 9628. </t> | ||||
<t hangText="Intended usage:">COMMON</t> | </dd> | |||
<dt>Applications that use this media type:</dt> | ||||
<dd> For example, video over IP, video | ||||
conferencing.</dd> | ||||
<dt>Fragment identifier considerations:</dt> | ||||
<dd>N/A</dd> | ||||
<dt>Additional information:</dt> | ||||
<dd>None</dd> | ||||
<dt>Person & email address to contact for further information:</dt> | ||||
<dd><t><contact fullname="Jonathan Lennox"/> <jonathan.lennox@8x8.com | ||||
></t></dd> | ||||
<dt>Intended usage:</dt> | ||||
<dd>COMMON</dd> | ||||
<dt>Restrictions on usage:</dt> | ||||
<dd> This media type depends on RTP framing; hence, it is only defined | ||||
for transfer via RTP <xref target="RFC3550" format="default"/>.</dd> | ||||
<dt>Author:</dt> | ||||
<dd><t><contact fullname="Jonathan Lennox"/> <jonathan.lennox@8x8.com | ||||
></t></dd> | ||||
<t hangText="Restrictions on usage:"><vspace blankLines="0"/> This | <!--[rfced] Please review the entry for "Change Controller" in Section | |||
media type depends on RTP framing, and hence is only defined for | 7. While we see similar text for the vp8 and vc2 entries, we want to | |||
transfer via RTP <xref target="RFC3550"/>.</t> | confirm that this entry has been reviewed with the following in | |||
mind from | ||||
https://www.iana.org/help/protocol-registration: | ||||
<t hangText="Author:">Jonathan Lennox <jonathan.lennox@8x8.com> | "The IESG shouldn't be listed as a change controller unless the | |||
;</t> | RFC that created the registry (e.g. port numbers, XML namespaces | |||
and schemas) requires it. The IETF should be named instead." | ||||
<t hangText="Change controller:"><vspace blankLines="0"/> IETF | --> | |||
AVTCore Working Group delegated from the IESG.</t> | ||||
</list></t> | ||||
</section> | ||||
<section anchor="securityConsiderations" title="Security Considerations" | <dt>Change controller:</dt> | |||
> | <dd> IETF | |||
AVTCore Working Group delegated from the IESG.</dd> | ||||
</dl> | ||||
</section> | ||||
<section anchor="securityConsiderations" numbered="true" toc="default"> | ||||
<name>Security Considerations</name> | ||||
<t>RTP packets using the payload format defined in this specification | <t>RTP packets using the payload format defined in this specification | |||
are subject to the security considerations discussed in the RTP | are subject to the security considerations discussed in the RTP | |||
specification <xref target="RFC3550"/>, and in any applicable RTP | specification <xref target="RFC3550" format="default"/>, and in any | |||
profile such | applicable RTP profile such as <xref target="RFC3551" | |||
as <xref target='RFC3551'>RTP/AVP</xref>, <xref target='RFC4585'>RTP/AVPF< | format="default">RTP/AVP</xref>, <xref target="RFC4585" | |||
/xref>, | format="default">RTP/AVPF</xref>, <xref target="RFC3711" | |||
<xref target='RFC3711'>RTP/SAVP</xref>, | format="default">RTP/SAVP</xref>, or <xref target="RFC5124" | |||
or <xref target='RFC5124'>RTP/SAVPF</xref>. | format="default">RTP/SAVPF</xref>. However, as "<xref target="RFC7202" fo | |||
However, as <xref target='RFC7202'>"Securing the RTP Protocol | rmat="title"/>" <xref target="RFC7202" | |||
Framework: Why RTP Does Not Mandate a Single Media | format="default"></xref> discusses, it is not an RTP | |||
Security Solution"</xref> discusses, it is not an RTP payload format's res | payload format's responsibility to discuss or mandate what solutions are | |||
ponsibility to | used to meet the basic security goals like confidentiality, integrity, | |||
discuss or mandate what solutions are used to meet the | and source authenticity for RTP in general. This responsibility lies with | |||
basic security goals like confidentiality, integrity and source | ||||
authenticity for RTP in general. This responsibility lays on | ||||
anyone using RTP in an application. They can find guidance on available | anyone using RTP in an application. They can find guidance on available | |||
security mechanisms in <xref target='RFC7201'>Options for Securing | security mechanisms in "<xref target="RFC7201" format="title"/>" <xref targ | |||
RTP Sessions</xref>. Applications SHOULD use one or more appropriate | et="RFC7201" format="default"></xref>. Applications <bcp14>SHOULD</bcp14> | |||
strong security mechanisms. The rest of this security | use one or more appropriate strong security mechanisms.</t> | |||
consideration section discusses the security impacting properties of the | <t>Implementations of this RTP payload format need to take appropriate | |||
payload format itself.</t> | security considerations into account. It is extremely important for the | |||
decoder to be robust against malicious or malformed payloads and ensure | ||||
<t>Implementations of this RTP payload format need to take appropriate sec | that they do not cause the decoder to overrun its allocated memory or | |||
urity | otherwise misbehave. An overrun in allocated memory could lead to | |||
considerations into account. It is extremely important for the decoder to | arbitrary code execution by an attacker. The same applies to the | |||
be | encoder, even though problems in encoders are (typically) rarer.</t> | |||
robust against malicious or malformed payloads and ensure that they do not | <t>This RTP payload format and its media decoder do not exhibit any | |||
cause the decoder | significant non-uniformity in the receiver-side computational complexity | |||
to overrun its allocated memory or otherwise mis-behave. An overrun in al | for packet processing; thus, they are unlikely to pose a denial-of-service | |||
located memory could lead to | threat due to the receipt of pathological data. Nor does the RTP payload | |||
arbitrary code execution by an attacker. The same applies to the encoder, | format contain any active content.</t> | |||
even | ||||
though problems in encoders are typically rarer.</t> | ||||
<t>This RTP payload | ||||
format and its media decoder do not exhibit any significant | ||||
non-uniformity in the receiver-side computational complexity for packet | ||||
processing, and thus are unlikely to pose a denial-of-service threat due | ||||
to the receipt of pathological data. Nor does the RTP payload format | ||||
contain any active content.</t> | ||||
</section> | </section> | |||
<section anchor="congestionControl" numbered="true" toc="default"> | ||||
<section anchor="congestionControl" title="Congestion Control"> | <name>Congestion Control</name> | |||
<t>Congestion control for RTP SHALL be used in accordance with RFC 3550 | <t>Congestion control for RTP <bcp14>SHALL</bcp14> be used in accordance | |||
<xref target="RFC3550"/>, and with any applicable RTP profile; e.g., RFC | with <xref target="RFC3550" format="default"/>, and with any | |||
3551 <xref target="RFC3551"/>. The congestion control mechanism can, in | applicable RTP profile, e.g., <xref target="RFC3551" | |||
a real-time encoding scenario, adapt the transmission rate by | format="default"/>. The congestion control mechanism can, in a real-time | |||
instructing the encoder to encode at a certain target rate. Media aware | encoding scenario, adapt the transmission rate by instructing the | |||
network elements MAY use the information in the VP9 payload descriptor | encoder to encode at a certain target rate. Media-aware network elements | |||
in <xref target="VP9payloadDescriptor"/> to identify non-reference | <bcp14>MAY</bcp14> use the information in the VP9 payload descriptor in | |||
frames and discard them in order to reduce network congestion. Note that | <xref target="VP9payloadDescriptor" format="default"/> to identify | |||
discarding of non-reference frames cannot be done if the stream is | non-reference frames and discard them in order to reduce network | |||
encrypted (because the non-reference marker is encrypted).</t> | congestion. Note that discarding of non-reference frames cannot be done | |||
if the stream is encrypted (because the non-reference marker is | ||||
encrypted).</t> | ||||
</section> | </section> | |||
<section anchor="IANAConsiderations" numbered="true" toc="default"> | ||||
<name>IANA Considerations</name> | ||||
<section anchor="IANAConsiderations" title="IANA Considerations"> | <t>IANA has registered the media type registration "video/vp9" | |||
<t>The IANA is requested to register the media type registration | as specified in <xref target="mediaTypeRegistration" format="default"/>. | |||
"video/vp9" as specified in <xref | The media type has also been added to the | |||
target="mediaTypeRegistration"/>. The media type is also | "RTP Payload Format Media Types" <eref | |||
requested to | target="https://www.iana.org/assignments/rtp-parameters" | |||
be added to the IANA registry for "RTP Payload Format MIME types" | brackets="angle"/> subregistry of the "Real-Time Transport Protocol (RTP) | |||
<http://www.iana.org/assignments/rtp-parameters>.</t> | Parameters" registry as follows.</t> | |||
</section> | ||||
<section title="Acknowledgments"> | <dl spacing="compact"> | |||
<t>Alex Eleftheriadis, Yuki Ito, Won Kap Jang, Sergio Garcia | <dt>Media Type:</dt><dd>video</dd> | |||
Murillo, Roi Sasson, Timothy Terriberry, Emircan Uysaler, and | <dt>Subtype:</dt><dd>VP9</dd> | |||
Thomas Volkert commented on the development of this document and | <dt>Clock Rate (Hz):</dt><dd>90000</dd> | |||
provided helpful comments and feedback.</t> | <dt>Reference:</dt><dd>RFC 9628</dd> | |||
</dl> | ||||
</section> | </section> | |||
</middle> | </middle> | |||
<back> | <back> | |||
<references title='Normative References'> | <references> | |||
<name>References</name> | ||||
<reference anchor='VP9-BITSTREAM' target='https://storage.googleapis.co | <references> | |||
m/downloads.webmproject.org/docs/vp9/vp9-bitstream-specification-v0.6-20160331-d | <name>Normative References</name> | |||
raft.pdf'> | ||||
<front> | ||||
<title>VP9 Bitstream & Decoding Process Specification</titl | ||||
e> | ||||
<author initials='A' surname='Grange' fullname='Adrian Grange'> | <reference anchor="VP9-BITSTREAM" target="https://storage.googleapis.com | |||
<organization>Google</organization> | /downloads.webmproject.org/docs/vp9/vp9-bitstream-specification-v0.6-20160331-dr | |||
</author> | aft.pdf"> | |||
<author initials='P' surname='de Rivaz' fullname='Peter de Riva | <front> | |||
z'> | <title>VP9 Bitstream & Decoding Process Specification</title> | |||
<organization>Argon Design</organization> | <author initials="A" surname="Grange" fullname="Adrian Grange"> | |||
</author> | <organization>Google</organization> | |||
<author initials='J' surname='Hunt' fullname='Jonathan Hunt'> | </author> | |||
<organization>Argon Design</organization> | <author initials="P" surname="de Rivaz" fullname="Peter de Rivaz"> | |||
</author> | <organization>Argon Design</organization> | |||
<date month='March' day='31' year='2016' /> | </author> | |||
<abstract> | <author initials="J" surname="Hunt" fullname="Jonathan Hunt"> | |||
<t> | <organization>Argon Design</organization> | |||
</author> | ||||
<date month="March" day="31" year="2016"/> | ||||
<abstract> | ||||
<t> | ||||
This document defines the bitstream format and decoding process for the | This document defines the bitstream format and decoding process for the | |||
Google VP9 video codec. | Google VP9 video codec. | |||
</t> | </t> | |||
</abstract> | </abstract> | |||
</front> | ||||
</front> | <seriesInfo name="Version" value="0.6"/> | |||
<seriesInfo name='Version' value='0.6' /> | </reference> | |||
</reference> | <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | |||
FC.2119.xml"/> | ||||
&rfc2119; | <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | |||
FC.8174.xml"/> | ||||
&rfc8174; | <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | |||
FC.4585.xml"/> | ||||
&rfc4585; | <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | |||
FC.3550.xml"/> | ||||
&rfc3550; | <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | |||
FC.8866.xml"/> | ||||
&rfc8866; | <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | |||
FC.6838.xml"/> | ||||
&rfc6838; | <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | |||
FC.4855.xml"/> | ||||
&rfc4855; | <xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | |||
FC.5104.xml"/> | ||||
&rfc5104; | ||||
&lrr; | <!-- [I-D.ietf-avtext-lrr] companion document RFC 9627 --> | |||
<reference anchor="RFC9627" target="https://www.rfc-editor.org/info/rfc9627"> | ||||
<front> | ||||
<title>The Layer Refresh Request (LRR) RTCP Feedback Message</title> | ||||
<author initials="J." surname="Lennox" fullname="Jonathan Lennox"> | ||||
<organization>Vidyo, Inc.</organization> | ||||
</author> | ||||
<author initials="D." surname="Hong" fullname="Danny Hong"> | ||||
<organization>Vidyo, Inc.</organization> | ||||
</author> | ||||
<author initials="J." surname="Uberti" fullname="Justin Uberti"> | ||||
<organization>Google, Inc.</organization> | ||||
</author> | ||||
<author initials="S." surname="Holmer" fullname="Stefan Holmer"> | ||||
<organization>Google, Inc.</organization> | ||||
</author> | ||||
<author initials="M." surname="Flodman" fullname="Magnus Flodman"> | ||||
<organization>Google, Inc.</organization> | ||||
</author> | ||||
<date month="August" year="2024" /> | ||||
</front> | ||||
<seriesInfo name="RFC" value="9627" /> | ||||
<seriesInfo name="DOI" value="10.17487/RFC9627"/> | ||||
&rfc3264; | </reference> | |||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.3264.xml"/> | ||||
</references> | ||||
<references> | ||||
<name>Informative References</name> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.3551.xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.5124.xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.6386.xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.7201.xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.7202.xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.7667.xml"/> | ||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml/reference.R | ||||
FC.3711.xml"/> | ||||
</references> | ||||
</references> | </references> | |||
<section numbered="false" toc="default"> | ||||
<references title='Informative References'> | <name>Acknowledgments</name> | |||
<t><contact fullname="Alex Eleftheriadis"/>, <contact fullname="Yuki | ||||
&rfc3551; | Ito"/>, <contact fullname="Won Kap Jang"/>, <contact fullname="Sergio | |||
Garcia Murillo"/>, <contact fullname="Roi | |||
&rfc5124; | Sasson"/>, <contact fullname="Timothy Terriberry"/>, <contact | |||
fullname="Emircan Uysaler"/>, and <contact fullname="Thomas Volkert"/> | ||||
&rfc6386; | commented on the development of this document and provided helpful | |||
feedback.</t> | ||||
&rfc7201; | </section> | |||
&rfc7202; | ||||
&rfc7667; | ||||
&rfc3711; | ||||
</references> | ||||
</back> | </back> | |||
</rfc> | </rfc> | |||
<!-- LocalWords: PictureID DCT Hadamard WHT SSRC CSRC pyld hdr FI VER RPSI | ||||
--> | ||||
<!-- LocalWords: stPartitionSize SDP AVPF SRTP IANA PID PICIDX TID | ||||
--> | ||||
End of changes. 161 change blocks. | ||||
979 lines changed or deleted | 1034 lines changed or added | |||
This html diff was produced by rfcdiff 1.48. |