rfc9768xml2.original.xml | rfc9768.xml | |||
---|---|---|---|---|
<?xml version="1.0" encoding="US-ASCII"?> | <?xml version='1.0' encoding='UTF-8'?> | |||
<!DOCTYPE rfc [ | <!DOCTYPE rfc [ | |||
<!ENTITY nbsp "&#160;"> | <!ENTITY nbsp "&#160;"> | |||
<!ENTITY Ouml "&#214;"> | ||||
<!ENTITY auml "&#228;"> | ||||
<!ENTITY uuml "&#252;"> | ||||
<!ENTITY zwsp "&#8203;"> | <!ENTITY zwsp "&#8203;"> | |||
<!ENTITY nbhy "&#8209;"> | <!ENTITY nbhy "&#8209;"> | |||
<!ENTITY mdash "&#8212;"> | ||||
<!ENTITY wj "&#8288;"> | <!ENTITY wj "&#8288;"> | |||
]> | ]> | |||
<?xml-stylesheet type='text/xsl' href='http://xml.resource.org/authoring/rfc2629 | ||||
.xslt' ?> | <rfc xmlns:xi="http://www.w3.org/2001/XInclude" category="std" consensus="true" | |||
<!-- Alterations to I-D/RFC boilerplate --> | docName="draft-ietf-tcpm-accurate-ecn-34" number="9768" submissionType="IETF" ip | |||
<?rfc private="" ?> | r="pre5378Trust200902" updates="3168" obsoletes="" tocInclude="true" tocDepth="4 | |||
<!-- Default private="" Produce an internal memo 2.5pp shorter than an I-D or RF | " symRefs="true" sortRefs="true" version="3" xml:lang="en"> | |||
C --> | ||||
<?rfc rfcprocack="yes" ?> | ||||
<!-- Default rfcprocack="no" add a short sentence acknowledging xml2rfc --> | ||||
<?rfc strict="no" ?> | ||||
<!-- Default strict="no" Don't check I-D nits --> | ||||
<?rfc rfcedstyle="yes" ?> | ||||
<!-- Default rfcedstyle="yes" attempt to closely follow finer details from the l | ||||
atest observable RFC-Editor style --> | ||||
<!-- IETF process --> | ||||
<?rfc iprnotified="no" ?> | ||||
<!-- Default iprnotified="no" I haven't disclosed existence of IPR to IETF --> | ||||
<!-- ToC format --> | ||||
<?rfc toc="yes" ?> | ||||
<!-- Default toc="no" No Table of Contents --> | ||||
<!-- ToC depth --> | ||||
<?rfc tocdepth="4" ?> | ||||
<!-- Default tocDepth="3" Exclude subsections of depth >3 from Table of Contents | ||||
--> | ||||
<!-- Cross referencing, footnotes, comments --> | ||||
<?rfc symrefs="yes"?> | ||||
<!-- Default symrefs="no" Don't use anchors, but use numbers for refs --> | ||||
<?rfc sortrefs="yes"?> | ||||
<!-- Default sortrefs="no" Don't sort references into order --> | ||||
<?rfc comments="yes" ?> | ||||
<!-- Default comments="no" Don't render comments --> | ||||
<?rfc inline="no" ?> | ||||
<!-- Default inline="no" if comments is "yes", then render comments inline; othe | ||||
rwise render them in an `Editorial Comments' section --> | ||||
<!-- Pagination control --> | ||||
<?rfc compact="yes"?> | ||||
<!-- Default compact="no" Start sections on new pages --> | ||||
<?rfc subcompact="no"?> | ||||
<!-- Default subcompact="(as compact setting)" yes/no is not quite as compact as | ||||
yes/yes --> | ||||
<!-- HTML formatting control --> | ||||
<?rfc emoticonic="yes" ?> | ||||
<!-- Default emoticonic="no" Doesn't prettify HTML format --> | ||||
<rfc category="std" consensus="yes" docName="draft-ietf-tcpm-accurate-ecn-34" su | ||||
bmissionType="IETF" | ||||
ipr="pre5378Trust200902" updates="3168" xmlns:xi="http://www.w3.org/2001/XI | ||||
nclude"> | ||||
<front> | <front> | |||
<title abbrev="Accurate TCP-ECN Feedback">More Accurate Explicit | <title abbrev="Accurate TCP-ECN Feedback">More Accurate Explicit | |||
Congestion Notification (AccECN) Feedback in TCP</title> | Congestion Notification (AccECN) Feedback in TCP</title> | |||
<seriesInfo name="RFC" value="9768"/> | ||||
<author fullname="Bob Briscoe" initials="B." surname="Briscoe"> | <author fullname="Bob Briscoe" initials="B." surname="Briscoe"> | |||
<organization>Independent</organization> | <organization>Independent</organization> | |||
<address> | <address> | |||
<postal> | <postal> | |||
<street/> | <country>United Kingdom</country> | |||
<city/> | ||||
<country>UK</country> | ||||
</postal> | </postal> | |||
<email>ietf@bobbriscoe.net</email> | <email>ietf@bobbriscoe.net</email> | |||
<uri>http://bobbriscoe.net/</uri> | <uri>http://bobbriscoe.net/</uri> | |||
</address> | </address> | |||
</author> | </author> | |||
<author fullname="Mirja Kühlewind" initials="M." surname="Kühlewind"> | ||||
<author fullname="Mirja Kühlewind" initials="M." | ||||
surname="Kühlewind"> | ||||
<organization>Ericsson</organization> | <organization>Ericsson</organization> | |||
<address> | <address> | |||
<postal> | <postal> | |||
<street/> | ||||
<country>Germany</country> | <country>Germany</country> | |||
</postal> | </postal> | |||
<email>ietf@kuehlewind.net</email> | <email>ietf@kuehlewind.net</email> | |||
</address> | </address> | |||
</author> | </author> | |||
<author fullname="Richard Scheffenegger" initials="R." surname="Scheffenegge | ||||
<author fullname="Richard Scheffenegger" initials="R." | r"> | |||
surname="Scheffenegger"> | ||||
<organization>NetApp</organization> | <organization>NetApp</organization> | |||
<address> | <address> | |||
<postal> | <postal> | |||
<street/> | ||||
<city>Vienna</city> | <city>Vienna</city> | |||
<region/> | ||||
<code/> | ||||
<country>Austria</country> | <country>Austria</country> | |||
</postal> | </postal> | |||
<email>Richard.Scheffenegger@netapp.com</email> | <email>Richard.Scheffenegger@netapp.com</email> | |||
</address> | </address> | |||
</author> | </author> | |||
<date year="2025" month="August"/> | ||||
<date year=""/> | <area>WIT</area> | |||
<workgroup>tcpm</workgroup> | ||||
<area>Transport</area> | ||||
<workgroup>TCP Maintenance &amp; Minor Extensions (tcpm)</workgroup> | ||||
<keyword>Congestion Control and Management</keyword> | <keyword>Congestion Control and Management</keyword> | |||
<keyword>Congestion Notification</keyword> | <keyword>Congestion Notification</keyword> | |||
<keyword>Feedback</keyword> | <keyword>Feedback</keyword> | |||
<keyword>Reliable</keyword> | <keyword>Reliable</keyword> | |||
<keyword>Ordered</keyword> | <keyword>Ordered</keyword> | |||
<keyword>Protocol</keyword> | <keyword>Protocol</keyword> | |||
<keyword>ECN</keyword> | <keyword>ECN</keyword> | |||
<abstract> | <abstract> | |||
<t>Explicit Congestion Notification (ECN) is a mechanism where network | <t>Explicit Congestion Notification (ECN) is a mechanism by which network | |||
nodes can mark IP packets instead of dropping them to indicate incipient | nodes can mark IP packets instead of dropping them to indicate incipient | |||
congestion to the endpoints. Receivers with an ECN-capable transport | congestion to the endpoints. Receivers with an ECN-capable transport | |||
protocol feed back this information to the sender. ECN was originally | protocol feed back this information to the sender. ECN was originally | |||
specified for TCP in such a way that only one feedback signal can be | specified for TCP in such a way that only one feedback signal can be | |||
transmitted per Round-Trip Time (RTT). Recent new TCP mechanisms like | transmitted per Round-Trip Time (RTT). Newer TCP mechanisms like | |||
Congestion Exposure (ConEx), Data Center TCP (DCTCP) or Low Latency, Low | Congestion Exposure (ConEx), Data Center TCP (DCTCP), or Low Latency, Low | |||
Loss, and Scalable Throughput (L4S) need more Accurate ECN (AccECN) feedback | Loss, and Scalable Throughput (L4S) need more Accurate ECN (AccECN) feedback | |||
information whenever more than one marking is received in one RTT. This | information whenever more than one marking is received in one RTT. This | |||
document updates the original ECN specification in RFC 3168 to specify a | document updates the original ECN specification defined in RFC 3168 by specifying a | |||
scheme that provides more than one feedback signal per RTT in the TCP | scheme that provides more than one feedback signal per RTT in the TCP | |||
header. Given TCP header space is scarce, it allocates a reserved header | header. Given TCP header space is scarce, it allocates a reserved header | |||
bit previously assigned to the ECN-Nonce. It also overloads the two | bit previously assigned to the ECN-nonce. It also overloads the two | |||
existing ECN flags in the TCP header. The resulting extra space is | existing ECN flags in the TCP header. The resulting extra space is | |||
additionally exploited to feed back the IP-ECN field received during | additionally exploited to feed back the IP-ECN field received during | |||
the TCP connection establishment. | the TCP connection establishment. | |||
Supplementary feedback information can optionally be | Supplementary feedback information can optionally be | |||
provided in two new TCP option alternatives, which are never used on the | provided in two new TCP option alternatives, which are never used on the | |||
TCP SYN. The document also specifies the treatment of this updated TCP | TCP SYN. The document also specifies the treatment of this updated TCP | |||
wire protocol by middleboxes.</t> | wire protocol by middleboxes.</t> | |||
</abstract> | </abstract> | |||
</front> | </front> | |||
<!-- ================================================================ --> | ||||
<middle> | <middle> | |||
<!-- ================================================================ --> | <section anchor="accecn_Introduction"> | |||
<name>Introduction</name> | ||||
<section anchor="accecn_Introduction" title="Introduction"> | ||||
<t>Explicit Congestion Notification (ECN) <xref target="RFC3168"/> is a | <t>Explicit Congestion Notification (ECN) <xref target="RFC3168"/> is a | |||
mechanism where network nodes can mark IP packets instead of dropping | mechanism by which network nodes can mark IP packets instead of dropping | |||
them to indicate incipient congestion to the endpoints. Receivers with | them to indicate incipient congestion to the endpoints. Receivers with | |||
an ECN-capable transport protocol feed back this information to the | an ECN-capable transport protocol feed back this information to the | |||
sender. In RFC 3168, ECN was specified for TCP in such a way that only | sender. In RFC 3168, ECN was specified for TCP in such a way that only | |||
one feedback signal could be transmitted per Round-Trip Time (RTT). | one feedback signal could be transmitted per Round-Trip Time (RTT). | |||
This is sufficient for congestion control scheme like Reno <xref | This is sufficient for congestion control schemes like Reno <xref target=" | |||
target="RFC6582"/> and Cubic <xref target="RFC9438"/>, as those schemes | RFC6582"/> and CUBIC <xref target="RFC9438"/>, as those schemes | |||
reduce their congestion window by a fixed factor if congestion occurs | reduce their congestion window by a fixed factor if congestion occurs | |||
within an RTT independent of the number of received congestion markings. | within an RTT independent of the number of received congestion markings. | |||
Recently, proposed mechanisms like Congestion Exposure (ConEx <xref | <!-- [rfced] Because these documents are defined in Informational RFCs, is "prop | |||
target="RFC7713"/>), DCTCP <xref target="RFC8257"/> or L4S <xref | osed" needed here? | |||
target="RFC9330"/> need to know when more than one marking is received | ||||
Original: | ||||
Recently, proposed mechanisms like Congestion Exposure (ConEx | ||||
[RFC7713]), DCTCP [RFC8257] or L4S [RFC9330] need to know when more | ||||
than one marking is received in one RTT, which is information that | ||||
cannot be provided by the feedback scheme as specified in [RFC3168]. | ||||
Perhaps: | ||||
Newer mechanisms like Congestion Exposure (ConEx | ||||
[RFC7713]), DCTCP [RFC8257], or L4S [RFC9330] ... | ||||
Or perhaps, "More recently defined mechanisms ..." | ||||
--> | ||||
Recently, proposed mechanisms like Congestion Exposure (ConEx <xref target | ||||
="RFC7713"/>), DCTCP <xref target="RFC8257"/>, and L4S <xref target="RFC9330"/> | ||||
need to know when more than one marking is received | ||||
in one RTT, which is information that cannot be provided by the feedback | in one RTT, which is information that cannot be provided by the feedback | |||
scheme as specified in <xref target="RFC3168"/>. This document specifies | scheme as specified in <xref target="RFC3168"/>. This document specifies | |||
an update to the ECN feedback scheme of RFC 3168 that provides more | an update to the ECN feedback scheme of RFC 3168 that provides more | |||
accurate information and could be used by these and potentially other | accurate information and could be used by these and potentially other | |||
future TCP extensions, while still also supporting the pre-existing TCP | future TCP extensions, while still also supporting the pre-existing TCP | |||
congestion controllers that use just one feedback signal per round. | congestion controllers that use just one feedback signal per round. | |||
Congestion control is the term the IETF uses to describe data rate | Congestion control is the term the IETF uses to describe data rate | |||
management. It is the algorithm that a sender uses to optimize its | management. It is the algorithm that a sender uses to optimize its | |||
sending rate so that it transmits data as fast as the network can | sending rate so that it transmits data as fast as the network can | |||
carry it, but no faster. A fuller treatment of the motivation for | carry it, but no faster. A fuller description of the motivation for | |||
this specification is given in the associated requirements document | this specification is given in the associated requirements document | |||
<xref target="RFC7560"/>.</t> | <xref target="RFC7560"/>.</t> | |||
<t>This document specifies a Standards Track scheme for ECN feedback in | ||||
<t>This document specifies a standards track scheme for ECN feedback in | the TCP header to provide more than one feedback signal per RTT. It is | |||
the TCP header to provide more than one feedback signal per RTT. It will | called the more "Accurate ECN" feedback scheme, or AccECN for short. | |||
be called the more Accurate ECN feedback scheme, or AccECN for short. | ||||
This document updates RFC 3168 with respect to negotiation and use of | This document updates RFC 3168 with respect to negotiation and use of | |||
the feedback scheme for TCP. All aspects of RFC 3168 other than the TCP | the feedback scheme for TCP. All aspects of RFC 3168 other than the TCP | |||
feedback scheme and its negotiation remain unchanged by this | feedback scheme and its negotiation remain unchanged by this | |||
specification. In particular the definition of ECN at the IP layer is | specification. In particular, the definition of ECN at the IP layer is | |||
unaffected. <xref target="accecn_3168_updates"/> gives a more detailed | unaffected. <xref target="accecn_3168_updates"/> details the aspects of RF | |||
specification of exactly which aspects of RFC 3168 this document | C 3168 that are updated by this document.</t> | |||
updates.</t> | <t>This document uses the term "Classic ECN feedback" when it needs to | |||
distinguish the TCP/ECN feedback scheme defined in <xref target="RFC3168"/ | ||||
<t>This document uses the term Classic ECN feedback when it needs to | > from the AccECN TCP feedback scheme. AccECN is | |||
distinguish the TCP/ECN feedback scheme defined in <xref | ||||
target="RFC3168"/> from the AccECN TCP feedback scheme. AccECN is | ||||
intended to offer a complete replacement for Classic TCP/ECN feedback, | intended to offer a complete replacement for Classic TCP/ECN feedback, | |||
not a fork in the design of TCP. AccECN feedback complements TCP's loss | not a fork in the design of TCP. AccECN feedback complements TCP's loss | |||
feedback and it can coexist alongside hosts using Classic TCP/ECN | feedback and it can coexist alongside hosts using Classic TCP/ECN | |||
feedback. So its applicability is intended to include the public Internet | feedback. So its applicability is intended to include the public Internet | |||
as well as private IP network such as data centres (and even any non-IP | as well as private IP networks such as data centres (and even any non-IP | |||
networks over which TCP is used), whether or not any nodes on the path | networks over which TCP is used), whether or not any nodes on the path | |||
support ECN, of whatever flavour.</t> | support ECN, of whatever flavour.</t> | |||
<t>AccECN feedback overloads the two existing ECN flags in the TCP | <t>AccECN feedback overloads the two existing ECN flags in the TCP | |||
header and allocates the currently reserved flag (previously called NS) | header and allocates the currently reserved flag (previously called NS) | |||
in the TCP header, to be used as one three-bit counter field for feeding | in the TCP header to be used as one 3-bit counter field for feeding | |||
back the number of packets marked as congestion experienced (CE). Given | back the number of packets marked as congestion experienced (CE). Given | |||
the new definitions of these three bits, both ends have to support the | the new definitions of these three bits, both ends have to support the | |||
new wire protocol before it can be used. Therefore, during the TCP | new wire protocol before it can be used. Therefore, during the TCP | |||
handshake, the two ends use these three bits in the TCP header to | handshake, the two ends use these three bits in the TCP header to | |||
negotiate the most advanced feedback protocol that they can both | negotiate the most advanced feedback protocol that they can both | |||
support, in a way that is backward compatible with <xref | support, in a way that is backward compatible with <xref target="RFC3168"/ | |||
target="RFC3168"/>.</t> | >.</t> | |||
<t>AccECN is solely a change to the TCP wire protocol; it covers the | <t>AccECN is solely a change to the TCP wire protocol; it covers the | |||
negotiation and signaling of more Accurate ECN feedback from a TCP Data | negotiation and signaling of more Accurate ECN feedback from a TCP Data | |||
Receiver to a Data Sender. It is completely independent of how TCP might | Receiver to a Data Sender. It is completely independent of how TCP might | |||
respond to congestion feedback, which is out of scope, but ultimately | respond to congestion feedback, which is out of scope, but ultimately | |||
the motivation for Accurate ECN feedback. Like Classic ECN feedback, | the motivation for Accurate ECN feedback. Like Classic ECN feedback, | |||
AccECN can be used by standard Reno or CUBIC congestion control <xref | AccECN can be used by standard Reno or CUBIC congestion control <xref targ | |||
target="RFC5681"/> <xref target="RFC9438"/> to respond | et="RFC5681"/> <xref target="RFC9438"/> to respond | |||
to the existence of at least one congestion notification within a round | to the existence of at least one congestion notification within a round | |||
trip. Or, unlike Reno or CUBIC, AccECN can be used to respond to the | trip. | |||
<!-- [rfced] We are having trouble parsing "extent of congestion notification". | ||||
Perhaps this means "indicate the amount of congestion over the round trip"? Pl | ||||
ease clarify. | ||||
Original: | ||||
Or, unlike Reno or | ||||
CUBIC, AccECN can be used to respond to the extent of congestion | ||||
notification over a round trip, as for example DCTCP does in | ||||
controlled environments [RFC8257]. | ||||
--> | ||||
Or, unlike Reno or CUBIC, AccECN can be used to respond to the | ||||
extent of congestion notification over a round trip, as for example | extent of congestion notification over a round trip, as for example | |||
DCTCP does in controlled environments <xref target="RFC8257"/>. For | DCTCP does in controlled environments <xref target="RFC8257"/>. For | |||
congestion response, this specification refers to the original ECN | congestion response, this specification refers to the original ECN | |||
specificiation adopted in 2001 <xref target="RFC3168"/>, as updated | specification adopted in 2001 <xref target="RFC3168"/>, as updated | |||
by the more relaxed rules introduced in 2018 to allow ECN experiments | by the more relaxed rules introduced in 2018 to allow ECN experiments | |||
<xref target="RFC8311"/>, namely: a TCP-based Low Latency | <xref target="RFC8311"/>, namely: a TCP-based Low Latency | |||
Low Loss Scalable (L4S) congestion control <xref target="RFC9330"/>; or | Low Loss Scalable (L4S) congestion control <xref target="RFC9330"/>; or | |||
Alternative Backoff with ECN (ABE) <xref target="RFC8511"/>.</t> | Alternative Backoff with ECN (ABE) <xref target="RFC8511"/>.</t> | |||
<t><xref target="accecn_Interaction_Other"/> explains how AccECN is | <t><xref target="accecn_Interaction_Other"/> explains how AccECN is | |||
compatible with current commonly used TCP options, and a number of | compatible with current commonly used TCP options, and a number of | |||
current experimental modifications to TCP, as well as SYN cookies.</t> | current experimental modifications to TCP, as well as SYN cookies.</t> | |||
<section> | ||||
<section title="Document Roadmap"> | <name>Document Roadmap</name> | |||
<t>The following introductory section outlines the goals of AccECN | <t>The following introductory section outlines the goals of AccECN | |||
(<xref target="accecn_Goals"/>). Then, terminology is defined (<xref | (<xref target="accecn_Goals"/>). Then, terminology is defined (<xref tar | |||
target="accecn_Terminology"/>) and a recap of existing prerequisite | get="accecn_Terminology"/>) and a recap of existing prerequisite | |||
technology is given (<xref target="accecn_Recap"/>).</t> | technology is given (<xref target="accecn_Recap"/>).</t> | |||
<t><xref target="accecn_Overview"/> gives an informative overview of | <t><xref target="accecn_Overview"/> gives an informative overview of | |||
the AccECN protocol. Then <xref target="accecn_Spec"/> gives the | the AccECN protocol. Then <xref target="accecn_Spec"/> gives the | |||
normative protocol specification, and <xref | normative protocol specification, and <xref target="accecn_Mbox_Operatio | |||
target="accecn_Mbox_Operation"/> collects together requirements for | n"/> collects requirements for | |||
proxies, offload engines and other middleboxes. <xref | proxies, offload engines, and other middleboxes. <xref target="accecn_31 | |||
target="accecn_3168_updates"/> clarifies which aspects of RFC 3168 are | 68_updates"/> clarifies which aspects of RFC 3168 are | |||
updated by AccECN. <xref target="accecn_Interact_Variants"/> assesses | updated by AccECN. <xref target="accecn_Interact_Variants"/> assesses | |||
the interaction of AccECN with commonly used variants of TCP, whether | the interaction of AccECN with commonly used variants of TCP, whether | |||
standardized or not. Then <xref target="accecn_Properties"/> | they are standardized or not. Then <xref target="accecn_Properties"/> | |||
summarizes the features and properties of AccECN.</t> | summarizes the features and properties of AccECN.</t> | |||
<t><xref target="accecn_IANA_Considerations"/> summarizes the protocol | <t><xref target="accecn_IANA_Considerations"/> summarizes the protocol | |||
fields and numbers that IANA will need to assign and <xref | fields and numbers that IANA assigned, and <xref target="accecn_Security | |||
target="accecn_Security_Considerations"/> points to the aspects of the | _Considerations"/> points to the aspects of the | |||
protocol that will be of interest to the security community.</t> | protocol that will be of interest to the security community.</t> | |||
<t><xref target="accecn_Algo_Examples"/> gives pseudocode examples for | <t><xref target="accecn_Algo_Examples"/> gives pseudocode examples for | |||
the various algorithms that AccECN uses and <xref | the various algorithms that AccECN uses, and <xref target="accecn_flags_ | |||
target="accecn_flags_rationale"/> explains why AccECN uses flags in | rationale"/> explains why AccECN uses flags in | |||
the main TCP header and quantifies the space left for future use.</t> | the main TCP header and quantifies the space left for future use.</t> | |||
</section> | </section> | |||
<section anchor="accecn_Goals"> | ||||
<section anchor="accecn_Goals" title="Goals"> | <name>Goals</name> | |||
<t><xref target="RFC7560"/> enumerates requirements that a candidate | <t><xref target="RFC7560"/> enumerates requirements that a candidate | |||
feedback scheme will need to satisfy, under the headings: resilience, | feedback scheme needs to satisfy, under the headings: resilience, | |||
timeliness, integrity, accuracy (including ordering and lack of bias), | timeliness, integrity, accuracy (including ordering and lack of bias), | |||
complexity, overhead and compatibility (both backward and forward). It | complexity, overhead, and compatibility (both backward and forward). It | |||
recognizes that a perfect scheme that fully satisfies all the | recognizes that a perfect scheme that fully satisfies all the | |||
requirements is unlikely and trade-offs between requirements are | requirements is unlikely and trade-offs between requirements are | |||
likely. <xref target="accecn_Properties"/> presents the properties of | likely. <xref target="accecn_Properties"/> considers the properties of | |||
AccECN against these requirements and discusses the trade-offs | AccECN against these requirements and discusses the trade-offs.</t> | |||
made.</t> | ||||
<t>The requirements document recognizes that a protocol as ubiquitous | <t>The requirements document recognizes that a protocol as ubiquitous | |||
as TCP needs to be able to serve as-yet-unspecified requirements. | as TCP needs to be able to serve as-yet-unspecified requirements. | |||
Therefore an AccECN receiver acts as a generic (mechanistic) reflector | Therefore, an AccECN receiver acts as a generic (mechanistic) reflector | |||
of congestion information with the aim that in future new sender | of congestion information with the aim that new sender | |||
behaviours can be deployed unilaterally (see <xref | behaviours can be deployed unilaterally (see <xref target="accecn_demb_r | |||
target="accecn_demb_reflector"/>).</t> | eflector"/>) in the future.</t> | |||
</section> | </section> | |||
<section anchor="accecn_Terminology"> | ||||
<section anchor="accecn_Terminology" title="Terminology"> | <name>Terminology</name> | |||
<t><list style="hanging"> | <dl newline="false" spacing="normal"> | |||
<t hangText="AccECN:">The more Accurate ECN feedback scheme will | <dt>AccECN:</dt> | |||
be called AccECN for short.</t> | <dd>The more Accurate ECN feedback scheme is | |||
called AccECN for short.</dd> | ||||
<t hangText="Classic ECN:">The ECN protocol specified in <xref | <dt>Classic ECN:</dt> | |||
target="RFC3168"/>.</t> | <dd>The ECN protocol specified in <xref target="RFC3168"/>.</dd> | |||
<dt>Classic ECN feedback:</dt> | ||||
<t hangText="Classic ECN feedback:">The feedback aspect of the ECN | <dd>The feedback aspect of the ECN | |||
protocol specified in <xref target="RFC3168"/>, including | protocol specified in <xref target="RFC3168"/>, including | |||
generation, encoding, transmission and decoding of feedback, but | generation, encoding, transmission and decoding of feedback, but | |||
not the Data Sender's subsequent response to that feedback.</t> | not the Data Sender's subsequent response to that feedback.</dd> | |||
<dt>ACK:</dt> | ||||
<t hangText="ACK:">A TCP acknowledgement, with or without a data | <dd>A TCP acknowledgement, with or without a data | |||
payload (ACK=1).</t> | payload (ACK=1).</dd> | |||
<dt>Pure ACK:</dt> | ||||
<t hangText="Pure ACK:">A TCP acknowledgement without a data | <dd>A TCP acknowledgement without a data | |||
payload.</t> | payload.</dd> | |||
<dt>Acceptable packet / segment:</dt> | ||||
<t hangText="Acceptable packet / segment:">A packet or segment | <dd>A packet or segment | |||
that passes the acceptability tests in <xref target="RFC9293"/> | that passes the acceptability tests in <xref target="RFC9293"/> | |||
and <xref target="RFC5961"/>, or that has passed other tests with | and <xref target="RFC5961"/>, or that has passed other tests with | |||
equivalent protection.</t> | equivalent protection.</dd> | |||
<dt>TCP Client:</dt> | ||||
<t hangText="TCP Client:">The TCP stack that originates a | <dd>The TCP stack that originates a | |||
connection (the initiator).</t> | connection (the initiator).</dd> | |||
<dt>TCP Server:</dt> | ||||
<t hangText="TCP Server:">The TCP stack that responds to a | <dd>The TCP stack that responds to a | |||
connection request (the listener).</t> | connection request (the listener).</dd> | |||
<dt>Three-way handshake:</dt> | ||||
<t hangText="Three-way handshake:">The procedure used to establish | <dd>The procedure used to establish | |||
a TCP connection as described in the TCP protocol specification | a TCP connection as described in the TCP protocol specification | |||
<xref target="RFC9293"/>.</t> | <xref target="RFC9293"/>.</dd> | |||
<dt>Data Receiver:</dt> | ||||
<t hangText="Data Receiver:">The endpoint of a TCP half-connection | <dd>The endpoint of a TCP half-connection | |||
that receives data and sends AccECN feedback.</t> | that receives data and sends AccECN feedback.</dd> | |||
<dt>Data Sender:</dt> | ||||
<t hangText="Data Sender:">The endpoint of a TCP half-connection | <dd>The endpoint of a TCP half-connection | |||
that sends data and receives AccECN feedback.</t> | that sends data and receives AccECN feedback.</dd> | |||
</list> In a mild abuse of terminology, this document sometimes | </dl> | |||
<t> In a mild abuse of terminology, this document sometimes | ||||
refers to 'TCP packets' instead of 'TCP segments'.</t> | refers to 'TCP packets' instead of 'TCP segments'.</t> | |||
<t>The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", | <t> | |||
"SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and | The key words "<bcp14>MUST</bcp14>", "<bcp14>MUST NOT</bcp14>", | |||
"OPTIONAL" in this document are to be interpreted as described in BCP | "<bcp14>REQUIRED</bcp14>", "<bcp14>SHALL</bcp14>", "<bcp14>SHALL NOT</bcp14> | |||
14 <xref target="RFC2119"/> <xref target="RFC8174"/> when, and only | ", | |||
when, they appear in all capitals, as shown here.</t> | "<bcp14>SHOULD</bcp14>", "<bcp14>SHOULD NOT</bcp14>", | |||
</section> | "<bcp14>RECOMMENDED</bcp14>", "<bcp14>NOT RECOMMENDED</bcp14>", | |||
"<bcp14>MAY</bcp14>", and "<bcp14>OPTIONAL</bcp14>" in this document are to | ||||
be | ||||
interpreted as described in BCP 14 <xref target="RFC2119"/> <xref | ||||
target="RFC8174"/> when, and only when, they appear in all capitals, as | ||||
shown here. | ||||
</t> | ||||
<section anchor="accecn_Recap" | </section> | |||
title="Recap of Existing ECN feedback in IP/TCP"> | <section anchor="accecn_Recap"> | |||
<name>Recap of Existing ECN Feedback in IP/TCP</name> | ||||
<t>Explicit Congestion Notification (ECN) <xref target="RFC3168"/> | <t>Explicit Congestion Notification (ECN) <xref target="RFC3168"/> | |||
can be split into two parts conceptionally. In the forward direction, | can be split into two parts conceptionally. In the forward direction, | |||
alongside the data stream, it uses a two-bit field in the IP header. | alongside the data stream, it uses a 2-bit field in the IP header. | |||
This is referred to as IP-ECN later on. This signal carried in the | This is referred to as IP-ECN later on. This signal carried in the | |||
IP (Layer 3) header is exposed to network devices and may be modified | IP (Layer 3) header is exposed to network devices and may be modified | |||
when such a device starts to experience congestion (see <xref | when such a device starts to experience congestion (see <xref target="ac | |||
target="accecn_Tab_ECN"/>). The second part is the feedback mechanism, | cecn_Tab_ECN"/>). The second part is the feedback mechanism, | |||
by which the original data sender is notified of the current congestion | by which the original data sender is notified of the current congestion | |||
state of the intermediate path. That returned signal is carried in a | state of the intermediate path. That returned signal is carried in a | |||
protocol specific manner, and is not to be modified by intermediate | protocol-specific manner, and is not to be modified by intermediate | |||
network devices. While ECN is in active use for protocols such as | network devices. While ECN is in active use for protocols such as | |||
QUIC <xref target="RFC9000"/>, SCTP <xref target="RFC9260"/>, | QUIC <xref target="RFC9000"/>, SCTP <xref target="RFC9260"/>, | |||
RTP <xref target="RFC6679"/> and Remote Direct Memory Access over | RTP <xref target="RFC6679"/>, and Remote Direct Memory Access over | |||
Converged Ethernet <xref target="RoCEv2"/>, this document only concerns | Converged Ethernet <xref target="RoCEv2"/>, this document only concerns | |||
itself with the specific implementation for the TCP protocol.</t> | itself with the specific implementation for the TCP protocol.</t> | |||
<t>Once ECN has been negotiated for a transport layer connection, the | <t>Once ECN has been negotiated for a transport layer connection, the | |||
Data Sender for either half-connection can set two possible codepoints | Data Sender for either half-connection can set two possible codepoints | |||
(ECT(0) or ECT(1)) in the IP header of a data packet to indicate an | (ECT(0) or ECT(1)) in the IP header of a data packet to indicate an | |||
ECN-capable transport (ECT). If the ECN codepoint is 0b00, the packet | ECN-capable transport (ECT). If the ECN codepoint is 0b00, the packet | |||
is considered to have been sent by a Not ECN-capable Transport | is considered to have been sent by a Not ECN-capable Transport | |||
(Not-ECT). When a network node experiences congestion, it will | (Not-ECT). When a network node experiences congestion, it will | |||
occasionally either drop or mark a packet, with the choice depending | occasionally either drop or mark a packet, with the choice depending | |||
on the packet's ECN codepoint. If the codepoint is Not-ECT, only drop | on the packet's ECN codepoint. If the codepoint is Not-ECT, only drop | |||
is appropriate. If the codepoint is ECT(0) or ECT(1), the node can | is appropriate. If the codepoint is ECT(0) or ECT(1), the node can | |||
mark the packet by setting the ECN codepoint to 0b11, which is termed | mark the packet by setting the ECN codepoint to 0b11, which is termed | |||
'Congestion Experienced' (CE), or loosely a 'congestion mark'. <xref | 'Congestion Experienced' (CE), or loosely a 'congestion mark'. <xref tar | |||
target="accecn_Tab_ECN"/> summarises these codepoints.</t> | get="accecn_Tab_ECN"/> summarises these codepoints.</t> | |||
<table anchor="accecn_Tab_ECN"> | ||||
<texttable anchor="accecn_Tab_ECN" | <name>The ECN Field in the IP Header</name> | |||
title="The ECN Field in the IP Header"> | <thead> | |||
<ttcol>IP-ECN codepoint</ttcol> | <tr> | |||
<th>IP-ECN codepoint</th> | ||||
<ttcol>Codepoint name</ttcol> | <th>Codepoint name</th> | |||
<th>Description</th> | ||||
<ttcol>Description</ttcol> | </tr> | |||
</thead> | ||||
<c>0b00</c> | <tbody> | |||
<tr> | ||||
<c>Not-ECT</c> | <td>0b00</td> | |||
<td>Not-ECT</td> | ||||
<c>Not ECN-Capable Transport</c> | <td>Not ECN-Capable Transport</td> | |||
</tr> | ||||
<c>0b01</c> | <tr> | |||
<td>0b01</td> | ||||
<c>ECT(1)</c> | <td>ECT(1)</td> | |||
<td>ECN-Capable Transport (1)</td> | ||||
<c>ECN-Capable Transport (1)</c> | </tr> | |||
<tr> | ||||
<c>0b10</c> | <td>0b10</td> | |||
<td>ECT(0)</td> | ||||
<c>ECT(0)</c> | <td>ECN-Capable Transport (0)</td> | |||
</tr> | ||||
<c>ECN-Capable Transport (0)</c> | <tr> | |||
<td>0b11</td> | ||||
<c>0b11</c> | <td>CE</td> | |||
<td>Congestion Experienced</td> | ||||
<c>CE</c> | </tr> | |||
</tbody> | ||||
<c>Congestion Experienced</c> | </table> | |||
</texttable> | <t>In the TCP header, the first two bits in byte 14 (the TCP header | |||
<t>In the TCP header the first two bits in byte 14 (the TCP header | ||||
flags at bit offsets 8 and 9 labelled Congestion Window Reduced (CWR) | flags at bit offsets 8 and 9 labelled Congestion Window Reduced (CWR) | |||
and Explicit Congestion Notification Echo (ECE) in <xref | and Explicit Congestion Notification Echo (ECE) in <xref target="accecn_ | |||
target="accecn_Fig_TCPHdr"/>) are defined as flags for the use of | Fig_TCPHdr"/>) are defined as flags for the use of | |||
Classic ECN <xref target="RFC3168"/>. A TCP Client indicates that it | Classic ECN <xref target="RFC3168"/>. A TCP Client indicates that it | |||
supports Classic ECN feedback by setting (CWR,ECE) = (1,1) in the SYN, | supports Classic ECN feedback by setting (CWR,ECE) = (1,1) in the SYN, | |||
and an ECN-enabled TCP Server confirms Classic ECN support by setting | and an ECN-enabled TCP Server confirms Classic ECN support by setting | |||
(CWR,ECE) = (0,1) in the SYN/ACK. On reception of a CE-marked packet | (CWR,ECE) = (0,1) in the SYN/ACK. On reception of a CE-marked packet | |||
at the IP layer, the Data Receiver for that half-connection starts to | at the IP layer, the Data Receiver for that half-connection starts to | |||
set the Echo Congestion Experienced (ECE) flag continuously in the TCP | set the Echo Congestion Experienced (ECE) flag continuously in the TCP | |||
header of ACKs, which gives the signal resilience to loss or | header of ACKs, which gives the signal resilience to loss or | |||
reordering of ACKs. The Data Sender for the same half-connection | reordering of ACKs. The Data Sender for the same half-connection | |||
confirms that it has received at least one ECE signal by responding | confirms that it has received at least one ECE signal by responding | |||
with the congestion window reduced (CWR) flag, which allows the Data | with the CWR flag, which allows the Data | |||
Receiver to stop repeating the ECN-Echo flag. This always leads to a | Receiver to stop repeating the ECN-Echo flag. This always leads to a | |||
full RTT of ACKs with ECE set. Thus Classic ECN cannot feed back any | full RTT of ACKs with ECE set. Thus Classic ECN cannot feed back any | |||
additional CE markings arriving within this RTT.</t> | additional CE markings arriving within this RTT.</t> | |||
<t>The last bit in byte 13 of the TCP header (the TCP header flag at | <t>The last bit in byte 13 of the TCP header (the TCP header flag at | |||
bit offset 7 in <xref target="accecn_Fig_TCPHdr"/>) was defined as the | bit offset 7 in <xref target="accecn_Fig_TCPHdr"/>) was defined as the | |||
Nonce Sum (NS) for the ECN Nonce <xref target="RFC3540"/>. In the | Nonce Sum (NS) for the ECN-nonce <xref target="RFC3540"/>. In the | |||
absence of widespread deployment RFC 3540 has been reclassified as | absence of widespread deployment, RFC 3540 was reclassified as | |||
historic <xref target="RFC8311"/> and the respective flag has been | Historic <xref target="RFC8311"/> and the respective flag was | |||
marked as "reserved", making this TCP flag available for use by AccECN | marked as "Reserved", which made this TCP flag available for use by AccE | |||
CN | ||||
instead.</t> | instead.</t> | |||
<figure anchor="accecn_Fig_TCPHdr"> | ||||
<?rfc needLines="8" ?> | <name>TCP Header Flags as Defined Before the Nonce Sum Flag Reverted t | |||
o Reserved</name> | ||||
<figure align="center" anchor="accecn_Fig_TCPHdr" | ||||
title="TCP header flags as defined before the Nonce Sum flag rev | ||||
erted to Reserved"> | ||||
<artwork align="center"><![CDATA[ | <artwork align="center"><![CDATA[ | |||
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |||
+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ | +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ | |||
| | | N | C | E | U | A | P | R | S | F | | | | | N | C | E | U | A | P | R | S | F | | |||
| Header Length | Reserved | S | W | C | R | C | S | S | Y | I | | | Header Length | Reserved | S | W | C | R | C | S | S | Y | I | | |||
| | | | R | E | G | K | H | T | N | N | | | | | | R | E | G | K | H | T | N | N | | |||
+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ | +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ | |||
]]></artwork> | ]]></artwork> | |||
</figure> | </figure> | |||
</section> | </section> | |||
</section> | </section> | |||
<!-- ================================================================ --> | <section anchor="accecn_Overview"> | |||
<name>AccECN Protocol Overview and Rationale</name> | ||||
<section anchor="accecn_Overview" | ||||
title="AccECN Protocol Overview and Rationale"> | ||||
<t>This section provides an informative overview of the AccECN protocol | <t>This section provides an informative overview of the AccECN protocol | |||
that will be normatively specified in <xref target="accecn_Spec"/></t> | that is normatively specified in <xref target="accecn_Spec"/>.</t> | |||
<t>Like the general TCP approach, the Data Receiver of each TCP | <t>Like the general TCP approach, the Data Receiver of each TCP | |||
half-connection sends AccECN feedback to the Data Sender on TCP | half-connection sends AccECN feedback to the Data Sender on TCP | |||
acknowledgements, reusing data packets of the other half-connection | acknowledgements, reusing data packets of the other half-connection | |||
whenever possible.</t> | whenever possible.</t> | |||
<t>The AccECN protocol has had to be designed in two parts:</t> | ||||
<t>The AccECN protocol has had to be designed in two parts:<list | <ul spacing="normal"> | |||
style="symbols"> | <li> | |||
<t>an essential feedback part that re-uses the TCP-ECN header bits for | <t>an essential feedback part that reuses the TCP-ECN header bits for | |||
the | the | |||
Data Receiver to feed back the number of packets arriving with CE in | Data Receiver to feed back the number of packets arriving with CE in | |||
the IP-ECN field. This provides more accuracy than Classic ECN | the IP-ECN field. This provides more accuracy than Classic ECN | |||
feedback, but limited resilience against ACK loss;</t> | feedback, but limited resilience against ACK loss;</t> | |||
</li> | ||||
<li> | ||||
<t>a supplementary feedback part using one of two new alternative AccECN TCP | <t>a supplementary feedback part using one of two new alternative AccECN TCP | |||
options that provide additional feedback on the number of payload bytes | options that provide additional feedback on the number of payload bytes | |||
that arrive marked with each of the three ECN codepoints in the IP-ECN | that arrive marked with each of the three ECN codepoints in the IP-ECN | |||
field (not just CE marks). See the BCP on Byte and Packet Congestion | field (not just CE marks). See the BCP on Byte and Packet Congestion | |||
Notification <xref target="RFC7141"/> for the rationale determining that | Notification <xref target="RFC7141"/> for the rationale determining that | |||
conveying congested payload bytes should be preferred over just | conveying congested payload bytes should be preferred over just | |||
providing feedback about congested packets. This also provides | providing feedback about congested packets. This also provides | |||
greater resilience against ACK loss than the essential feedback, | greater resilience against ACK loss than the essential feedback, | |||
but it is currently more likely to suffer from middlebox | but it is currently more likely to suffer from middlebox | |||
interference.</t> | interference.</t> | |||
</list>The two-part design was necessary, given limitations on the | </li> | |||
</ul> | ||||
<t>The two-part design was necessary, given limitations on the | ||||
space available for TCP options and given the possibility that certain | space available for TCP options and given the possibility that certain | |||
incorrectly designed middleboxes might prevent TCP using any new | incorrectly designed middleboxes might prevent TCP from using any new | |||
options.</t> | options.</t> | |||
<t>The essential feedback part overloads the previous definition of the three | <t>The essential feedback part overloads the previous definition of the three | |||
flags in the TCP header that had been assigned for use by Classic ECN. | flags in the TCP header that had been assigned for use by Classic ECN. | |||
This design choice deliberately allows AccECN peers to replace the | This design choice deliberately allows AccECN peers to replace the | |||
Classic ECN feedback protocol, rather than leaving Classic ECN feedback | Classic ECN feedback protocol, rather than leaving Classic ECN feedback | |||
intact and adding more accurate feedback separately because:<list | intact and adding more accurate feedback separately because:</t> | |||
style="symbols"> | <ul spacing="normal"> | |||
<li> | ||||
<t>this efficiently reuses scarce TCP header space, given TCP option | <t>this efficiently reuses scarce TCP header space, given TCP option | |||
space is approaching saturation;</t> | space is approaching saturation;</t> | |||
</li> | ||||
<li> | ||||
<t>a single upgrade path for the TCP protocol is preferable to a | <t>a single upgrade path for the TCP protocol is preferable to a | |||
fork in the design which modifies the TCP header to convey all | fork in the design that modifies the TCP header to convey all | |||
ECN feedback;</t> | ECN feedback;</t> | |||
</li> | ||||
<t>otherwise Classic and Accurate ECN feedback could give | <li> | |||
<t>otherwise, Classic and Accurate ECN feedback could give | ||||
conflicting feedback about the same segment, which could open up new | conflicting feedback about the same segment, which could open up new | |||
security concerns and make implementations unnecessarily | security concerns and make implementations unnecessarily | |||
complex;</t> | complex;</t> | |||
</li> | ||||
<li> | ||||
<t>middleboxes are more likely to faithfully forward the TCP ECN | <t>middleboxes are more likely to faithfully forward the TCP ECN | |||
flags than newly defined areas of the TCP header.</t> | flags than newly defined areas of the TCP header.</t> | |||
</list></t> | </li> | |||
</ul> | ||||
<t>AccECN is designed to work even if the supplementary feedback part is removed | <t>AccECN is designed to work even if the supplementary feedback part is removed | |||
or zeroed out, as long as the essential feedback part gets through.</t> | or zeroed out, as long as the essential feedback part gets through.</t> | |||
<section> | ||||
<section title="Capability Negotiation"> | <name>Capability Negotiation</name> | |||
<t>AccECN is a change to the wire protocol of the main TCP header, | <t>AccECN changes the wire protocol of the main TCP header; | |||
therefore it can only be used if both endpoints have been upgraded to | therefore, it can only be used if both endpoints have been upgraded to | |||
understand it. The TCP Client signals support for AccECN on the | understand it. The TCP Client signals support for AccECN on the | |||
initial SYN of a connection and the TCP Server signals whether it | initial SYN of a connection, and the TCP Server signals whether it | |||
supports AccECN on the SYN/ACK. The TCP flags on the SYN that the TCP | supports AccECN on the SYN/ACK. The TCP flags on the SYN that the TCP | |||
Client uses to signal AccECN support have been carefully chosen so | Client uses to signal AccECN support have been carefully chosen so | |||
that a TCP Server will interpret them as a request to support the most | that a TCP Server will interpret them as a request to support the most | |||
recent variant of ECN feedback that it supports. Then the TCP Client | recent variant of ECN feedback that it supports. Then the TCP Client | |||
falls back to the same variant of ECN feedback.</t> | falls back to the same variant of ECN feedback.</t> | |||
<t>An AccECN TCP Client does not send an AccECN Option on the SYN as | <t>An AccECN TCP Client does not send an AccECN Option on the SYN as | |||
SYN option space is limited. The TCP Server sends an AccECN Option on | SYN option space is limited. The TCP Server sends an AccECN Option on | |||
the SYN/ACK and the TCP Client sends one on the first ACK to test | the SYN/ACK, and the TCP Client sends one on the first ACK to test | |||
whether the network path forwards these options correctly.</t> | whether the network path forwards these options correctly.</t> | |||
</section> | </section> | |||
<section> | ||||
<section title="Feedback Mechanism"> | <name>Feedback Mechanism</name> | |||
<t>A Data Receiver maintains four counters initialized at the start of | <t>A Data Receiver maintains four counters initialized at the start of | |||
the half-connection. Three count the number of arriving payload bytes | the half-connection. Three count the number of arriving payload bytes | |||
marked CE, ECT(1) and ECT(0) in the IP-ECN field. These byte counters | marked CE, ECT(1), and ECT(0) in the IP-ECN field. These byte counters | |||
reflect only the TCP payload length, excluding the TCP header and TCP | reflect only the TCP payload length, excluding the TCP header and TCP | |||
options. The fourth counter counts the number of packets arriving | options. The fourth counter counts the number of packets arriving | |||
marked with a CE codepoint (including control packets without payload | marked with a CE codepoint (including control packets without payload | |||
if they are CE-marked).</t> | if they are CE-marked).</t> | |||
<t>The Data Sender maintains four equivalent counters for the | <t>The Data Sender maintains four equivalent counters for the | |||
half-connection, and the AccECN protocol is designed to ensure they will | half-connection, and the AccECN protocol is designed to ensure they will | |||
match the values in the Data Receiver's counters, albeit after a | match the values in the Data Receiver's counters, albeit after a | |||
little delay.</t> | little delay.</t> | |||
<t>Each ACK carries the three least significant bits (LSBs) of the | <t>Each ACK carries the three least significant bits (LSBs) of the | |||
packet-based CE counter using the ECN bits in the TCP header, now | packet-based CE counter using the ECN bits in the TCP header, now | |||
renamed the Accurate ECN (ACE) field (see <xref | renamed the Accurate ECN (ACE) field (see <xref target="accecn_Fig_ACE_A | |||
target="accecn_Fig_ACE_ACK"/> later). The 24 LSBs of some or all of | CK"/>). The 24 LSBs of some or all of | |||
the byte counters can be optionally carried in an AccECN Option. For | the byte counters can be optionally carried in an AccECN Option. For | |||
efficient use of limited option space, two alternative forms of AccECN | efficient use of limited option space, two alternative forms of the AccECN | |||
Option are specified with the fields in the opposite order to each | Option are specified with the fields in the opposite order to each | |||
other.</t> | other.</t> | |||
</section> | </section> | |||
<section> | ||||
<name>Delayed ACKs and Resilience Against ACK Loss</name> | ||||
<section title="Delayed ACKs and Resilience Against ACK Loss"> | ||||
<t>With both the ACE and the AccECN Option mechanisms, the Data | <t>With both the ACE and the AccECN Option mechanisms, the Data | |||
Receiver continually repeats the current LSBs of each of its | Receiver continually repeats the current LSBs of each of its | |||
respective counters. There is no need to acknowledge these continually | respective counters. There is no need to acknowledge these continually | |||
repeated counters, so the congestion window reduced (CWR) mechanism of | repeated counters, so the Congestion Window Reduced (CWR) mechanism of | |||
<xref target="RFC3168"/> is no longer used. Even if some ACKs are | <xref target="RFC3168"/> is no longer used. Even if some ACKs are | |||
lost, the Data Sender ought to be able to infer how much to increment | lost, the Data Sender ought to be able to infer how much to increment | |||
its own counters, even if the protocol field has wrapped.</t> | its own counters, even if the protocol field has wrapped.</t> | |||
<t>The 3-bit ACE field can wrap fairly frequently. Therefore, even if | <t>The 3-bit ACE field can wrap fairly frequently. Therefore, even if | |||
it appears to have incremented by one (say), the field might have | it appears to have incremented by one (say), the field might have | |||
actually cycled completely then incremented by one. The Data Receiver | actually cycled completely and then incremented by one. The Data Receiver | |||
is not allowed to delay sending an ACK to such an extent that the ACE | is not allowed to delay sending an ACK to such an extent that the ACE | |||
field would cycle. However ACKs received at the Data Sender could | field would cycle. However, ACKs received at the Data Sender could | |||
still cycle because a whole sequence of ACKs carrying intervening | still cycle because a whole sequence of ACKs carrying intervening | |||
values of the field might all be lost or delayed in transit.</t> | values of the field might all be lost or delayed in transit.</t> | |||
<t>The fields in an AccECN Option are larger, but they will increment | <t>The fields in an AccECN Option are larger, but they will increment | |||
in larger steps because they count bytes not packets. Nonetheless, | in larger steps because they count bytes not packets. Nonetheless, | |||
their size has been chosen such that a whole cycle of the field would | their size has been chosen such that a whole cycle of the field would | |||
never occur between ACKs unless there had been an infeasibly long | never occur between ACKs unless there has been an infeasibly long | |||
sequence of ACK losses. Therefore, provided that an AccECN Option is | sequence of ACK losses. Therefore, provided that an AccECN Option is | |||
available, it can be treated as a dependable feedback channel.</t> | available, it can be treated as a dependable feedback channel.</t> | |||
<t>If an AccECN Option is not available, e.g., it is being | ||||
<t>If an AccECN Option is not available, e.g., it is being | ||||
stripped by a middlebox, the AccECN protocol will only feed back | stripped by a middlebox, the AccECN protocol will only feed back | |||
information on CE markings (using the ACE field). Although not ideal, | information on CE markings (using the ACE field). Although not ideal, | |||
this will be sufficient, because it is envisaged that neither ECT(0) | this will be sufficient, because it is envisaged that neither ECT(0) | |||
nor ECT(1) will ever indicate more severe congestion than CE, even | nor ECT(1) will ever indicate more severe congestion than CE, even | |||
though future uses for ECT(0) or ECT(1) are still unclear <xref | though future uses for ECT(0) or ECT(1) are still unclear <xref target=" | |||
target="RFC8311"/>. Because the 3-bit ACE field is so small, when it | RFC8311"/>. Because the 3-bit ACE field is so small, when it | |||
is the only field available, the Data Sender has to interpret it | is the only field available, the Data Sender has to interpret it | |||
assuming the most likely wrap, but with a degree of conservatism.</t> | assuming the most likely wrap, but with a degree of conservatism.</t> | |||
<t>Certain specified events trigger the Data Receiver to include an | <t>Certain specified events trigger the Data Receiver to include an | |||
AccECN Option on an ACK. The rules are designed to ensure that the | AccECN Option on an ACK. The rules are designed to ensure that the | |||
order in which different markings arrive at the receiver is | order in which different markings arrive at the receiver is | |||
communicated to the sender (as long as options are reaching the sender | communicated to the sender (as long as options are reaching the sender | |||
and as long as there is no ACK loss). Implementations are encouraged | and as long as there is no ACK loss). Implementations are encouraged | |||
to send an AccECN Option more frequently, but this is left up to the | to send an AccECN Option more frequently, but this is left up to the | |||
implementer.</t> | implementer.</t> | |||
<!--As one ACK might acknowledge multiple data segments at the same time the | <!--As one ACK might acknowledge multiple data segments at the same time the | |||
proposed scheme providing accumulated information does not preserve the | proposed scheme providing accumulated information does not preserve the | |||
order at which the marking were received.This decision was taken | order at which the marking were received.This decision was taken | |||
deliberately to reduce complexity.--> | deliberately to reduce complexity.--> | |||
</section> | </section> | |||
<section> | ||||
<section title="Feedback Metrics"> | <name>Feedback Metrics</name> | |||
<t>The CE packet counter in the ACE field and the CE byte counter in | <t>The CE packet counter in the ACE field and the CE byte counter in | |||
AccECN Options both provide feedback on received CE-marks. The CE | AccECN Options both provide feedback on received CE marks. The CE | |||
packet counter includes control packets that do not have payload data, | packet counter includes control packets that do not have payload data, | |||
while the CE byte counter solely includes marked payload bytes. If | while the CE byte counter solely includes marked payload bytes. If | |||
both are present, the byte counter in an AccECN Option will provide | both are present, the byte counter in an AccECN Option will provide | |||
the more accurate information needed for modern congestion control and | the more accurate information needed for modern congestion control and | |||
policing schemes, such as L4S, DCTCP or ConEx. If AccECN Options are | policing schemes, such as L4S, DCTCP, or ConEx. If AccECN Options are | |||
stripped, a simple algorithm to estimate the number of marked bytes | stripped, a simple algorithm to estimate the number of marked bytes | |||
from the ACE field is given in <xref | from the ACE field is given in <xref target="accecn_Algo_ACE_Bytes"/>.</ | |||
target="accecn_Algo_ACE_Bytes"/>.</t> | t> | |||
<t>The AccECN design has been generalized so that it ought to be able | <t>The AccECN design has been generalized so that it ought to be able | |||
to support possible future uses of the experimental ECT(1) codepoint | to support possible future uses of the experimental ECT(1) codepoint | |||
other than the L4S experiment <xref target="RFC9330"/>, such as a | other than the L4S experiment <xref target="RFC9330"/>, such as a | |||
lower severity or a more instant congestion signal than CE.</t> | lower severity or a more instant congestion signal than CE.</t> | |||
<t>Feedback in bytes is provided to protect against the receiver or a | <t>Feedback in bytes is provided to protect against the receiver or a | |||
middlebox using attacks similar to 'ACK-Division' to artificially | middlebox using attacks similar to 'ACK-Division' to artificially | |||
inflate the congestion window, which is why <xref target="RFC5681"/> | inflate the congestion window, which is why <xref target="RFC5681"/> | |||
now recommends that TCP counts acknowledged bytes not packets.</t> | now recommends that TCP counts acknowledged bytes not packets.</t> | |||
</section> | </section> | |||
<section anchor="accecn_demb_reflector"> | ||||
<section anchor="accecn_demb_reflector" | <name>Generic (Mechanistic) Reflector</name> | |||
title="Generic (Mechanistic) Reflector"> | ||||
<t>The ACE field provides feedback about CE markings in the IP-ECN | <t>The ACE field provides feedback about CE markings in the IP-ECN | |||
field of both data and control packets. According to <xref | field of both data and control packets. According to <xref target="RFC31 | |||
target="RFC3168"/> the Data Sender is meant to set the IP-ECN field of | 68"/>, the Data Sender is meant to set the IP-ECN field of | |||
control packets to Not-ECT. However, mechanisms in certain private | control packets to Not-ECT. However, mechanisms in certain private | |||
networks (e.g., data centres) set control packets to be ECN | networks (e.g., data centres) set control packets to be ECN-capable beca | |||
capable because they are precisely the packets that performance | use they are precisely the packets that performance | |||
depends on most.</t> | depends on most.</t> | |||
<t>For this reason, AccECN is designed to be a generic reflector of | <t>For this reason, AccECN is designed to be a generic reflector of | |||
whatever ECN markings it sees, whether or not they are compliant with | whatever ECN markings it sees, whether or not they are compliant with | |||
a current standard. Then as standards evolve, Data Senders can upgrade | a current standard. Then as standards evolve, Data Senders can upgrade | |||
unilaterally without any need for receivers to upgrade too.</t> | unilaterally without any need for receivers to upgrade too.</t> | |||
<t>It is also useful to be able to rely on generic reflection | <t>It is also useful to be able to rely on generic reflection | |||
behaviour when senders need to test for unexpected interference with | behaviour when senders need to test for unexpected interference with | |||
markings (for instance <xref target="accecn_sec_ecn-mangling"/>, <xref | markings (for instance Sections <xref target="accecn_sec_ecn-mangling" f | |||
target="accecn_sec_ACE_init_invalid"/> and <xref | ormat="counter"/>, <xref target="accecn_sec_ACE_init_invalid" format="counter"/> | |||
target="accecn_Mbox_Interference"/> of the present document and | , and <xref target="accecn_Mbox_Interference" format="counter"/> of the present | |||
paragraph 2 of Section 20.2 of <xref target="RFC3168"/>).</t> | document and | |||
paragraph 2 of <xref target="RFC3168" sectionFormat="of" section="20.2"/ | ||||
>).</t> | ||||
<t>The initial SYN and SYN/ACK are the most critical control packets, | <t>The initial SYN and SYN/ACK are the most critical control packets, | |||
so AccECN feeds back their IP-ECN fields. Although RFC 3168 prohibits | so AccECN feeds back their IP-ECN fields. Although RFC 3168 prohibits | |||
ECN-capable SYNs and SYN/ACKs, providing feedback of ECN marking on | ECN-capable SYNs and SYN/ACKs, providing feedback of ECN marking on | |||
the SYN and SYN/ACK supports future scenarios in which SYNs might be | the SYN and SYN/ACK supports future scenarios in which SYNs might be | |||
ECN-enabled (without prejudging whether they ought to be). For | ECN-enabled (without prejudging whether they ought to be). For | |||
instance, <xref target="RFC8311"/> updates this aspect of RFC 3168 to | instance, <xref target="RFC8311"/> updates this aspect of RFC 3168 to | |||
allow experimentation with ECN-capable TCP control packets.</t> | allow experimentation with ECN-capable TCP control packets.</t> | |||
<t>Even if the TCP Client (or Server) has set the SYN (or SYN/ACK) to | <t>Even if the TCP Client (or Server) has set the SYN (or SYN/ACK) to | |||
not-ECT in compliance with RFC 3168, feedback on the state of the | Not-ECT in compliance with RFC 3168, feedback on the state of the | |||
IP-ECN field when it arrives at the receiver could still be useful, | IP-ECN field when it arrives at the receiver could still be useful, | |||
because middleboxes have been known to overwrite the IP-ECN field as | because middleboxes have been known to overwrite the IP-ECN field as | |||
if it is still part of the old Type of Service (ToS) field <xref | if it is still part of the old Type of Service (ToS) field <xref target= | |||
target="Mandalari18"/>. For example, if a TCP Client has set the SYN | "Mandalari18"/>. For example, if a TCP Client has set the SYN | |||
to Not-ECT, but receives feedback that the IP-ECN field on the SYN | to Not-ECT, but receives feedback that the IP-ECN field on the SYN | |||
arrived with a different codepoint, it can detect such middlebox | arrived with a different codepoint, it can detect such middlebox | |||
interference. Previously, neither end knew what IP-ECN field the other | interference. Previously, neither end knew what IP-ECN field the other | |||
had sent. So, if a TCP Server received ECT or CE on a SYN, it could | sent. So, if a TCP Server received ECT or CE on a SYN, it could | |||
not know whether it was invalid because only the TCP Client knew | not know whether it was invalid because only the TCP Client knew | |||
whether it originally marked the SYN as Not-ECT (or ECT). Therefore, | whether it originally marked the SYN as Not-ECT (or ECT). Therefore, | |||
prior to AccECN, the Server's only safe course of action in this | prior to AccECN, the Server's only safe course of action in this | |||
example was to disable ECN for the connection. Instead, the AccECN | example was to disable ECN for the connection. Instead, the AccECN | |||
protocol allows the Server and Client to feed back the ECN field | protocol allows the Server and Client to feed back the ECN field | |||
received on the SYN and SYN/ACK to their peer, which then has all the | received on the SYN and SYN/ACK to their peer, which now has all the | |||
information to decide whether the connection has to fall-back from | information to decide whether the connection has to fall back from | |||
supporting ECN (or not).</t> | supporting ECN (or not).</t> | |||
</section> | </section> | |||
</section> | </section> | |||
<!-- ================================================================ --> | <section anchor="accecn_Spec"> | |||
<name>AccECN Protocol Specification</name> | ||||
<section anchor="accecn_Spec" title="AccECN Protocol Specification"> | <section anchor="accecn_Negotiation"> | |||
<section anchor="accecn_Negotiation" title="Negotiating to use AccECN"> | <name>Negotiating to Use AccECN</name> | |||
<t/> | <section anchor="accecn_Negotiation_3WHS"> | |||
<name>Negotiation During the TCP Three-Way Handshake</name> | ||||
<section anchor="accecn_Negotiation_3WHS" | <t>Given the ECN-nonce <xref target="RFC3540"/> has been | |||
title="Negotiation during the TCP three-way handshake"> | reclassified as Historic <xref target="RFC8311"/>, the TCP flag that | |||
<t>Given the ECN Nonce <xref target="RFC3540"/> has been | ||||
reclassified as historic <xref target="RFC8311"/>, the TCP flag that | ||||
was previously called NS (Nonce Sum) is renamed as the AE (Accurate | was previously called NS (Nonce Sum) is renamed as the AE (Accurate | |||
ECN) flag (the TCP header flag at bit offset 7 in <xref | ECN) flag (the TCP header flag at bit offset 7 in <xref target="accecn | |||
target="accecn_Fig_TCPHdr_AE"/>). See the IANA Considerations in | _Fig_TCPHdr_AE"/>). See the IANA Considerations in | |||
<xref target="accecn_IANA_Considerations"/>.</t> | <xref target="accecn_IANA_Considerations"/>.</t> | |||
<figure anchor="accecn_Fig_TCPHdr_AE"> | ||||
<figure align="center" anchor="accecn_Fig_TCPHdr_AE" | <name>The New Definition of the TCP Header Flags During the TCP Thre | |||
title="The new definition of the TCP header flags during the T | e-Way Handshake</name> | |||
CP three-way handshake"> | ||||
<artwork align="center"><![CDATA[ | <artwork align="center"><![CDATA[ | |||
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |||
+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ | +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ | |||
| | | A | C | E | U | A | P | R | S | F | | | | | A | C | E | U | A | P | R | S | F | | |||
| Header Length | Reserved | E | W | C | R | C | S | S | Y | I | | | Header Length | Reserved | E | W | C | R | C | S | S | Y | I | | |||
| | | | R | E | G | K | H | T | N | N | | | | | | R | E | G | K | H | T | N | N | | |||
+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ | +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ | |||
]]></artwork> | ]]></artwork> | |||
</figure> | </figure> | |||
<t>During the TCP three-way handshake at the start of a connection, to request | <t>During the TCP three-way handshake at the start of a connection, to request | |||
more Accurate ECN feedback the TCP Client (host A) MUST set the TCP | more Accurate ECN feedback the TCP Client (host A) <bcp14>MUST</bcp14> set the TCP | |||
flags (AE,CWR,ECE) = (1,1,1) in the initial SYN segment.</t> | flags (AE,CWR,ECE) = (1,1,1) in the initial SYN segment.</t> | |||
<t>If a TCP Server (host B) that is AccECN-enabled receives a SYN with | <t>If a TCP Server (host B) that is AccECN-enabled receives a SYN with | |||
the above three flags set, it MUST set both its half connections | the above three flags set, it <bcp14>MUST</bcp14> set both its half co | |||
into AccECN mode. Then it MUST set the AE, CWR and ECE TCP flags on | nnections | |||
the SYN/ACK to the combination in the top block of <xref | into AccECN mode. Then it <bcp14>MUST</bcp14> set the AE, CWR, and ECE | |||
target="accecn_Tab_Negotiation"/> that feeds back the IP-ECN field | TCP flags on | |||
the SYN/ACK to the combination in the top block of <xref target="accec | ||||
n_Tab_Negotiation"/> that feeds back the IP-ECN field | ||||
that arrived on the SYN. This applies whether or not the Server | that arrived on the SYN. This applies whether or not the Server | |||
itself supports setting the IP-ECN field on a SYN or SYN/ACK (see | itself supports setting the IP-ECN field on a SYN or SYN/ACK (see | |||
<xref target="accecn_demb_reflector"/> for rationale).</t> | <xref target="accecn_demb_reflector"/> for rationale).</t> | |||
<t>When the TCP Server returns any of the four combinations in the top | ||||
<t>When the TCP Server returns any of the 4 combinations in the top | ||||
block of <xref target="accecn_Tab_Negotiation"/>, it confirms that | block of <xref target="accecn_Tab_Negotiation"/>, it confirms that | |||
it supports AccECN. The TCP Server MUST NOT set one of these 4 | it supports AccECN. The TCP Server <bcp14>MUST NOT</bcp14> set one of | |||
combination of flags on the SYN/ACK unless the preceding SYN | these four | |||
combinations of flags on the SYN/ACK unless the preceding SYN | ||||
requested support for AccECN as above.</t> | requested support for AccECN as above.</t> | |||
<t>Once a TCP Client (A) has sent the above SYN to declare that it | <t>Once a TCP Client (A) has sent the above SYN to declare that it | |||
supports AccECN, and once it has received the above SYN/ACK segment | supports AccECN, and once it has received the above SYN/ACK segment | |||
that confirms that the TCP Server supports AccECN, the TCP Client | that confirms that the TCP Server supports AccECN, the TCP Client | |||
MUST set both its half connections into AccECN mode. The TCP Client | <bcp14>MUST</bcp14> set both its half connections into AccECN mode. Th | |||
MUST NOT enter AccECN mode (or any feedback mode) before it has | e TCP Client | |||
<bcp14>MUST NOT</bcp14> enter AccECN mode (or any feedback mode) befor | ||||
e it has | ||||
received the first SYN/ACK.</t> | received the first SYN/ACK.</t> | |||
<!-- [rfced] Will "rights and obligations" be commonly understood in this context? We only see it used in RFC 3647, and it appears as part of quoted text there. | ||||
<t>Once in AccECN mode, a TCP Client or Server has the rights and | Section 3.1.1 original: | |||
obligations to participate in the ECN protocol defined in <xref | Once in AccECN mode, a TCP Client or Server has the rights and | |||
target="accecn_implications_accecn_mode"/>.</t> | obligations to participate in the ECN protocol defined in | |||
Section 3.1.5. | ||||
<t>The procedures to follow for retransmission of SYNs or SYN/ACKs | Section 3.1.5 original: | |||
are given in <xref | An implementation that supports AccECN has the rights and obligations | |||
target="accecn_sec_multiple_SYNs_or_SYN-ACKs"/>.</t> | concerning the use of ECN defined below, which update those in | |||
Section 6.1.1 of [RFC3168]. | ||||
--> | ||||
<t>It is RECOMMENDED that the AccECN protocol is implemented | <t>Once in AccECN mode, a TCP Client or Server has the rights and | |||
obligations to participate in the ECN protocol defined in <xref target | ||||
="accecn_implications_accecn_mode"/>.</t> | ||||
<t>The procedures for retransmission of SYNs or SYN/ACKs | ||||
are given in <xref target="accecn_sec_multiple_SYNs_or_SYN-ACKs"/>.</t | ||||
> | ||||
<t>It is <bcp14>RECOMMENDED</bcp14> that the AccECN protocol be implem | ||||
ented | ||||
alongside Selective Acknowledgement (SACK) <xref target="RFC2018"/>. | alongside Selective Acknowledgement (SACK) <xref target="RFC2018"/>. | |||
If SACK is implemented with AccECN, Duplicate Selective Acknowledgement | If SACK is implemented with AccECN, Duplicate Selective Acknowledgement | |||
(D-SACK) <xref target="RFC2883"/> MUST also be implemented.</t> | (D-SACK) <xref target="RFC2883"/> <bcp14>MUST</bcp14> also be implemen | |||
ted.</t> | ||||
</section> | </section> | |||
<section anchor="accecn_sec_backward_compat"> | ||||
<section anchor="accecn_sec_backward_compat" | <name>Backward Compatibility</name> | |||
title="Backward Compatibility"> | <t>The three flags set to 1 to indicate AccECN support on the SYN | |||
<t>The three flags set to 1 to indicate AccECN support on the SYN | ||||
have been carefully chosen to enable natural fall-back to prior | have been carefully chosen to enable natural fall-back to prior | |||
stages in the evolution of ECN. <xref | stages in the evolution of ECN. <xref target="accecn_Tab_Negotiation"/ | |||
target="accecn_Tab_Negotiation"/> tabulates all the negotiation | > tabulates all the negotiation | |||
possibilities for ECN-related capabilities that involve at least one | possibilities for ECN-related capabilities that involve at least one | |||
AccECN-capable host. The entries in the first two columns have been | AccECN-capable host. The entries in the first two columns have been | |||
abbreviated, as follows: <list hangIndent="4" style="hanging"> | abbreviated, as follows: </t> | |||
<t hangText="AccECN:">Supports more Accurate ECN Feedback (the | <dl newline="false" spacing="normal" indent="4"> | |||
present specification)</t> | <dt>AccECN:</dt> | |||
<dd>Supports more Accurate ECN feedback (the | ||||
<t hangText="Nonce:">Supports ECN Nonce feedback <xref | present specification)</dd> | |||
target="RFC3540"/></t> | <dt>Nonce:</dt> | |||
<dd>Supports ECN-nonce feedback <xref target="RFC3540"/></dd> | ||||
<t hangText="ECN:">Supports 'Classic' ECN feedback <xref | <dt>ECN:</dt> | |||
target="RFC3168"/></t> | <dd>Supports 'Classic' ECN feedback <xref target="RFC3168"/></dd> | |||
<dt>No ECN:</dt> | ||||
<t hangText="No ECN:">Not ECN-capable. Implicit congestion | <dd>Not ECN-capable. Implicit congestion | |||
notification using packet drop.</t> | notification using packet drop.</dd> | |||
</list></t> | </dl> | |||
<!-- <?rfc needLines="23" ?> --> | <!-- <?rfc needLines="23" ?> --> | |||
<table align="center" anchor="accecn_Tab_Negotiation"> | <table align="center" anchor="accecn_Tab_Negotiation"> | |||
<name>ECN capability negotiation between Client (A) and Server | <name>ECN Capability Negotiation Between Client (A) and Server | |||
(B)</name> | (B)</name> | |||
<thead> | <thead> | |||
<tr> | <tr> | |||
<th align="left">Host A</th> | <th align="left">Host A</th> | |||
<th align="left">Host B</th> | <th align="left">Host B</th> | |||
<th align="center">SYN<br/>A->B<br/>AE CWR ECE</th> | ||||
<th | <th align="center">SYN/ACK<br/>B->A<br/>AE CWR ECE</th> | |||
align="center">SYN<br/>A->B<br/>AE CWR ECE</th> | ||||
<th | ||||
align="center">SYN/ACK<br/>B->A<br/>AE CWR ECE</th> | ||||
<th align="left">Feedback Mode<br/>of Host A</th> | <th align="left">Feedback Mode<br/>of Host A</th> | |||
</tr> | </tr> | |||
</thead> | </thead> | |||
<tbody> | <tbody> | |||
<tr> | <tr> | |||
<td align="left">AccECN<br/>AccECN<br/>AccECN<br/>AccECN</td> | <td align="left">AccECN<br/>AccECN<br/>AccECN<br/>AccECN</td> | |||
<td align="left">AccECN<br/>AccECN<br/>AccECN<br/>AccECN</td> | <td align="left">AccECN<br/>AccECN<br/>AccECN<br/>AccECN</td> | |||
<td align="center">1 1 1<br/>1 1 | ||||
<td align="center">1 1 1<br/>1 1 | 1<br/>1 1 1<br/>1 1 1</td> | |||
1<br/>1 1 1<br/>1 1 1</td> | <td align="center">0 1 0<br/>0 1 | |||
1<br/>1 0 0<br/>1 1 0</td> | ||||
<td align="center">0 1 0<br/>0 1 | ||||
1<br/>1 0 0<br/>1 1 0</td> | ||||
<td align="left">AccECN (Not-ECT SYN)<br/>AccECN (ECT1 on | <td align="left">AccECN (Not-ECT SYN)<br/>AccECN (ECT1 on | |||
SYN)<br/>AccECN (ECT0 on SYN)<br/>AccECN (CE on SYN)</td> | SYN)<br/>AccECN (ECT0 on SYN)<br/>AccECN (CE on SYN)</td> | |||
</tr> | </tr> | |||
<tr> | <tr> | |||
<td align="left"/> | <td align="left"/> | |||
<td align="left"/> | <td align="left"/> | |||
<td align="center"/> | <td align="center"/> | |||
<td align="center"/> | <td align="center"/> | |||
<td align="left"/> | <td align="left"/> | |||
</tr> | </tr> | |||
<tr> | <tr> | |||
<td align="left">AccECN<br/>AccECN<br/>AccECN</td> | <td align="left">AccECN<br/>AccECN<br/>AccECN</td> | |||
<td align="left">Nonce<br/>ECN<br/>No ECN</td> | <td align="left">Nonce<br/>ECN<br/>No ECN</td> | |||
<td align="center">1 1 1<br/>1 1 | ||||
<td align="center">1 1 1<br/>1 1 | 1<br/>1 1 1</td> | |||
1<br/>1 1 1</td> | <td align="center">1 0 1<br/>0 0 | |||
1<br/>0 0 0</td> | ||||
<td align="center">1 0 1<br/>0 0 | ||||
1<br/>0 0 0</td> | ||||
<td align="left">(Reserved)<br/>Classic ECN<br/>Not ECN</td> | <td align="left">(Reserved)<br/>Classic ECN<br/>Not ECN</td> | |||
</tr> | </tr> | |||
<tr> | <tr> | |||
<td align="left"/> | <td align="left"/> | |||
<td align="left"/> | <td align="left"/> | |||
<td align="center"/> | <td align="center"/> | |||
<td align="center"/> | <td align="center"/> | |||
<td align="left"/> | <td align="left"/> | |||
</tr> | </tr> | |||
<tr> | <tr> | |||
<td align="left">Nonce<br/>ECN<br/>No ECN</td> | <td align="left">Nonce<br/>ECN<br/>No ECN</td> | |||
<td align="left">AccECN<br/>AccECN<br/>AccECN</td> | <td align="left">AccECN<br/>AccECN<br/>AccECN</td> | |||
<td align="center">0 1 1<br/>0 1 | ||||
<td align="center">0 1 1<br/>0 1 | 1<br/>0 0 0</td> | |||
1<br/>0 0 0</td> | <td align="center">0 0 1<br/>0 0 | |||
1<br/>0 0 0</td> | ||||
<td align="center">0 0 1<br/>0 0 | ||||
1<br/>0 0 0</td> | ||||
<td align="left">Classic ECN<br/>Classic ECN<br/>Not ECN</td> | <td align="left">Classic ECN<br/>Classic ECN<br/>Not ECN</td> | |||
</tr> | </tr> | |||
<tr> | <tr> | |||
<td align="left"/> | <td align="left"/> | |||
<td align="left"/> | <td align="left"/> | |||
<td align="center"/> | <td align="center"/> | |||
<td align="center"/> | <td align="center"/> | |||
<td align="left"/> | <td align="left"/> | |||
</tr> | </tr> | |||
<tr> | <tr> | |||
<td align="left">AccECN</td> | <td align="left">AccECN</td> | |||
<td align="left">Broken</td> | <td align="left">Broken</td> | |||
<td align="center">1 1 1</td> | ||||
<td align="center">1 1 1</td> | <td align="center">1 1 1</td> | |||
<td align="center">1 1 1</td> | ||||
<td align="left">Not ECN</td> | <td align="left">Not ECN</td> | |||
</tr> | </tr> | |||
</tbody> | </tbody> | |||
</table> | </table> | |||
<t><xref target="accecn_Tab_Negotiation"/> is divided into blocks, wit | ||||
<t><xref target="accecn_Tab_Negotiation"/> is divided into blocks | h | |||
each separated by an empty row.<list style="numbers"> | each block separated by an empty row.</t> | |||
<t>The top block shows the case already described in <xref | <ol spacing="normal" type="1"><li> | |||
target="accecn_Negotiation"/> where both endpoints support | <t>The top block shows the case already described in <xref target= | |||
"accecn_Negotiation"/> where both endpoints support | ||||
AccECN and how the TCP Server (B) indicates congestion | AccECN and how the TCP Server (B) indicates congestion | |||
feedback.</t> | feedback.</t> | |||
</li> | ||||
<li> | ||||
<t>The second block shows the cases where the TCP Client (A) | <t>The second block shows the cases where the TCP Client (A) | |||
supports AccECN but the TCP Server (B) supports some earlier | supports AccECN but the TCP Server (B) supports some earlier | |||
variant of TCP feedback, indicated in its SYN/ACK. Therefore, as | variant of TCP feedback, as indicated in its SYN/ACK. Therefore, as | |||
soon as an AccECN-capable TCP Client (A) receives the SYN/ACK | soon as an AccECN-capable TCP Client (A) receives the SYN/ACK | |||
shown it MUST set both its half connections into the feedback | shown, it <bcp14>MUST</bcp14> set both its half connections into the feedback | |||
mode shown in the rightmost column. If the TCP Client has set | mode shown in the rightmost column. If the TCP Client has set | |||
itself into Classic ECN feedback mode it MUST then comply with | itself into Classic ECN feedback mode, it <bcp14>MUST</bcp14> comp | |||
<xref target="RFC3168"/>.<vspace blankLines="1"/>An AccECN | ly with | |||
<xref target="RFC3168"/>.</t> | ||||
<t>An AccECN | ||||
implementation has no need to recognize or support the Server | implementation has no need to recognize or support the Server | |||
response labelled 'Nonce' or ECN Nonce feedback more generally | response labelled 'Nonce' or ECN-nonce feedback more generally | |||
<xref target="RFC3540"/>, which has been reclassified as | <xref target="RFC3540"/>, as RFC 3540 has been reclassified as | |||
historic <xref target="RFC8311"/>. AccECN is compatible with | Historic <xref target="RFC8311"/>. AccECN is compatible with | |||
alternative ECN feedback integrity approaches to the nonce (see | alternative ECN feedback integrity approaches to the nonce (see | |||
<xref target="accecn_Integrity"/>). The SYN/ACK labelled 'Nonce' | <xref target="accecn_Integrity"/>). The SYN/ACK labelled 'Nonce' | |||
with (AE,CWR,ECE) = (1,0,1) is reserved for future use. A TCP | with (AE,CWR,ECE) = (1,0,1) is reserved for future use. A TCP | |||
Client (A) that receives such a SYN/ACK follows the procedure | Client (A) that receives such a SYN/ACK follows the procedure | |||
for forward compatibility given in <xref | for forward compatibility given in <xref target="accecn_sec_forwar | |||
target="accecn_sec_forward_compat"/>.</t> | d_compat"/>.</t> | |||
</li> | ||||
<li> | ||||
<t>The third block shows the cases where the TCP Server (B) | <t>The third block shows the cases where the TCP Server (B) | |||
supports AccECN but the TCP Client (A) supports some earlier | supports AccECN but the TCP Client (A) supports some earlier | |||
variant of TCP feedback, indicated in its SYN.<vspace | variant of TCP feedback, as indicated in its SYN.</t> | |||
blankLines="1"/>When an AccECN-enabled TCP Server (B) receives a | <t>When an AccECN-enabled TCP Server (B) receives a | |||
SYN with (AE,CWR,ECE) = (0,1,1) it MUST do one of the | SYN with (AE,CWR,ECE) = (0,1,1), it <bcp14>MUST</bcp14> do one of | |||
following:<list style="symbols"> | the | |||
following:</t> | ||||
<ul spacing="normal"> | ||||
<li> | ||||
<t>set both its half connections into the Classic ECN | <t>set both its half connections into the Classic ECN | |||
feedback mode and return a SYN/ACK with (AE,CWR,ECE) = | feedback mode and return a SYN/ACK with (AE,CWR,ECE) = | |||
(0,0,1) as shown. Then it MUST comply with <xref | (0,0,1) as shown. Then it <bcp14>MUST</bcp14> comply with <xre | |||
target="RFC3168"/>.</t> | f target="RFC3168"/>.</t> | |||
</li> | ||||
<li> | ||||
<t>set both its half-connections into Not ECN mode and | <t>set both its half-connections into Not ECN mode and | |||
return a SYN/ACK with (AE,CWR,ECE) = (0,0,0), then continue | return a SYN/ACK with (AE,CWR,ECE) = (0,0,0), then continue | |||
with ECN disabled. This latter case is unlikely to be | with ECN disabled. This latter case is unlikely to be | |||
desirable, but it is allowed as a possibility, e.g., for | desirable, but it is allowed as a possibility, e.g., for | |||
minimal TCP implementations.</t> | minimal TCP implementations.</t> | |||
</list>When an AccECN-enabled TCP Server (B) receives a SYN | </li> | |||
with (AE,CWR,ECE) = (0,0,0) it MUST set both its half | </ul> | |||
<t>When an AccECN-enabled TCP Server (B) receives a SYN | ||||
with (AE,CWR,ECE) = (0,0,0), it <bcp14>MUST</bcp14> set both its h | ||||
alf | ||||
connections into the Not ECN feedback mode, return a SYN/ACK | connections into the Not ECN feedback mode, return a SYN/ACK | |||
with (AE,CWR,ECE) = (0,0,0) as shown and continue with ECN | with (AE,CWR,ECE) = (0,0,0) as shown, and continue with ECN | |||
disabled.</t> | disabled.</t> | |||
</li> | ||||
<t>The fourth block displays a combination labelled `Broken'. | <li> | |||
<t>The fourth block displays a combination labelled 'Broken'. | ||||
Some older TCP Server implementations incorrectly set the | Some older TCP Server implementations incorrectly set the | |||
TCP-ECN flags in the SYN/ACK by reflecting those in the SYN. | TCP-ECN flags in the SYN/ACK by reflecting those in the SYN. | |||
Such broken TCP Servers (B) cannot support ECN, so as soon as an | Such broken TCP Servers (B) cannot support ECN; so as soon as an | |||
AccECN-capable TCP Client (A) receives such a broken SYN/ACK it | AccECN-capable TCP Client (A) receives such a broken SYN/ACK, it | |||
MUST fall back to Not ECN mode for both its half connections and | <bcp14>MUST</bcp14> fall back to Not ECN mode for both its half co | |||
nnections and | ||||
continue with ECN disabled.</t> | continue with ECN disabled.</t> | |||
</list></t> | </li> | |||
</ol> | ||||
<t>The following additional rules do not fit the structure of the | <t>The following additional rules do not fit the structure of the | |||
table, but they complement it:<list style="hanging"> | table, but they complement it:</t> | |||
<t hangText="Simultaneous Open:">An originating AccECN Host (A), | <dl newline="false" spacing="normal"> | |||
<dt>Simultaneous Open:</dt> | ||||
<dd>An originating AccECN Host (A), | ||||
having sent a SYN with (AE,CWR,ECE) = (1,1,1), might receive | having sent a SYN with (AE,CWR,ECE) = (1,1,1), might receive | |||
another SYN from host B. Host A MUST then enter the same | another SYN from host B. Host A <bcp14>MUST</bcp14> then enter the same | |||
feedback mode as it would have entered had it been a responding | feedback mode as it would have entered had it been a responding | |||
host and received the same SYN. Then host A MUST send the same | host and received the same SYN. Then host A <bcp14>MUST</bcp14> se | |||
SYN/ACK as it would have sent had it been a responding host.</t> | nd the same | |||
SYN/ACK as it would have sent had it been a responding host.</dd> | ||||
<t hangText="In-window SYN during TIME-WAIT:">Many TCP | <dt>In-window SYN during TIME-WAIT:</dt> | |||
<dd>Many TCP | ||||
implementations create a new TCP connection if they receive an | implementations create a new TCP connection if they receive an | |||
in-window SYN packet during TIME-WAIT state. When a TCP host | in-window SYN packet during TIME-WAIT state. When a TCP host | |||
enters TIME-WAIT or CLOSED state, it ought to ignore any | enters TIME-WAIT or CLOSED state, it ought to ignore any | |||
previous state about the negotiation of AccECN for that | previous state about the negotiation of AccECN for that | |||
connection and renegotiate the feedback mode according to <xref | connection and renegotiate the feedback mode according to <xref ta | |||
target="accecn_Tab_Negotiation"/>.</t> | rget="accecn_Tab_Negotiation"/>.</dd> | |||
</list></t> | </dl> | |||
</section> | </section> | |||
<section anchor="accecn_sec_forward_compat"> | ||||
<section anchor="accecn_sec_forward_compat" | <name>Forward Compatibility</name> | |||
title="Forward Compatibility"> | ||||
<t>If a TCP Server that implements AccECN receives a SYN with the | <t>If a TCP Server that implements AccECN receives a SYN with the | |||
three TCP header flags (AE,CWR,ECE) set to any combination other | three TCP header flags (AE,CWR,ECE) set to any combination other | |||
than (0,0,0), (0,1,1) or (1,1,1) and it does not have logic specific | than (0,0,0), (0,1,1), or (1,1,1) and it does not have logic specific | |||
to such a combination, the Server MUST negotiate the use of AccECN | to such a combination, the Server <bcp14>MUST</bcp14> negotiate the us | |||
e of AccECN | ||||
as if the three flags had been set to (1,1,1). However, an AccECN | as if the three flags had been set to (1,1,1). However, an AccECN | |||
Client implementation MUST NOT send a SYN with any combination other | Client implementation <bcp14>MUST NOT</bcp14> send a SYN with any combination other | |||
than the three listed.</t> | than the three listed.</t> | |||
<t>If a TCP Client sent a SYN requesting AccECN feedback with | ||||
<t>If a TCP Client has sent a SYN requesting AccECN feedback with | (AE,CWR,ECE) = (1,1,1) and then receives a SYN/ACK with the currently | |||
(AE,CWR,ECE) = (1,1,1) then receives a SYN/ACK with the currently | ||||
reserved combination (AE,CWR,ECE) = (1,0,1) but it does not have | reserved combination (AE,CWR,ECE) = (1,0,1) but it does not have | |||
logic specific to such a combination, the Client MUST enable AccECN | logic specific to such a combination, the Client <bcp14>MUST</bcp14> enable AccECN | |||
mode as if the SYN/ACK confirmed that the Server supported AccECN | mode as if the SYN/ACK confirmed that the Server supported AccECN | |||
and as if it fed back that the IP-ECN field on the SYN had arrived | and as if it fed back that the IP-ECN field on the SYN had arrived | |||
unchanged. However, an AccECN Server implementation MUST NOT send a | unchanged. However, an AccECN Server implementation <bcp14>MUST NOT</bcp14> send a | |||
SYN/ACK with this combination (AE,CWR,ECE) = (1,0,1).</t> | SYN/ACK with this combination (AE,CWR,ECE) = (1,0,1).</t> | |||
<aside> | <aside> | |||
<t>For the avoidance of doubt, the behaviour described in the | <t>For the avoidance of doubt, the behaviour described in the | |||
present specification applies whether or not the three remaining | present specification applies whether or not the three remaining | |||
reserved TCP header flags are zero.</t> | reserved TCP header flags are zero.</t> | |||
</aside> | </aside> | |||
<!-- [rfced] Because "Reserved combination" is not used much, would it help the reader to add a pointer - perhaps to table 2? | ||||
<t>All these requirements ensure that future uses of all the | Original: | |||
All these requirements ensure that future uses of all the Reserved | ||||
combinations on a SYN or SYN/ACK can rely on consistent behaviour | ||||
from the installed base of AccECN implementations. See Appendix B.3 | ||||
for related discussion. | ||||
--> | ||||
<t>All of these requirements ensure that future uses of all the | ||||
Reserved combinations on a SYN or SYN/ACK can rely on consistent | Reserved combinations on a SYN or SYN/ACK can rely on consistent | |||
behaviour from the installed base of AccECN implementations. See | behaviour from the installed base of AccECN implementations. See | |||
<xref target="accecn_space_evolution"/> for related discussion.</t> | <xref target="accecn_space_evolution"/> for related discussion.</t> | |||
</section> | </section> | |||
<section anchor="accecn_sec_multiple_SYNs_or_SYN-ACKs"> | ||||
<section anchor="accecn_sec_multiple_SYNs_or_SYN-ACKs" | <name>Multiple SYNs or SYN/ACKs</name> | |||
title="Multiple SYNs or SYN/ACKs"> | <section anchor="accecn_sec_SYN_rexmt"> | |||
<t/> | <name>Retransmitted SYNs</name> | |||
<section anchor="accecn_sec_SYN_rexmt" title="Retransmitted SYNs"> | ||||
<t>If the sender of an AccECN SYN (the TCP Client) times out | <t>If the sender of an AccECN SYN (the TCP Client) times out | |||
before receiving the SYN/ACK, it SHOULD attempt to negotiate the | before receiving the SYN/ACK, it <bcp14>SHOULD</bcp14> attempt to ne gotiate the | |||
use of AccECN at least one more time by continuing to set all | use of AccECN at least one more time by continuing to set all | |||
three TCP ECN flags (AE,CWR,ECE) = (1,1,1) on the first | three TCP ECN flags (AE,CWR,ECE) = (1,1,1) on the first | |||
retransmitted SYN (using the usual retransmission time-outs). If | retransmitted SYN (using the usual retransmission timeouts). If | |||
this first retransmission also fails to be acknowledged, in | this first retransmission also fails to be acknowledged, in | |||
deployment scenarios where AccECN path traversal might be | deployment scenarios where AccECN path traversal might be | |||
problematic, the TCP Client SHOULD send subsequent retransmissions | problematic, the TCP Client <bcp14>SHOULD</bcp14> send subsequent re transmissions | |||
of the SYN with the three TCP-ECN flags cleared (AE,CWR,ECE) = | of the SYN with the three TCP-ECN flags cleared (AE,CWR,ECE) = | |||
(0,0,0). Such a retransmitted SYN MUST use the same initial | (0,0,0). Such a retransmitted SYN <bcp14>MUST</bcp14> use the same i nitial | |||
sequence number (ISN) as the original SYN.</t> | sequence number (ISN) as the original SYN.</t> | |||
<t>Retrying once before fall-back adds delay in the case where a | <t>Retrying once before fall-back adds delay in the case where a | |||
middlebox drops an AccECN (or ECN) SYN deliberately. However, | middlebox drops an AccECN (or ECN) SYN deliberately. However, | |||
recent measurements <xref target="Mandalari18"/> imply that a drop | recent measurements <xref target="Mandalari18"/> imply that a drop | |||
is less likely to be due to middlebox interference than other | is less likely to be due to middlebox interference than other | |||
intermittent causes of loss, e.g., congestion, wireless | intermittent causes of loss, e.g., congestion, wireless | |||
transmission loss, etc.</t> | transmission loss, etc.</t> | |||
<!-- [rfced] Should a second closing parens appear after "congestion)"? | ||||
<t>Implementers MAY use other fall-back strategies if they are | Original: | |||
found to be more effective (e.g., attempting to negotiate | Implementers MAY use other fall-back strategies if they are found to | |||
be more effective (e.g., attempting to negotiate AccECN on the SYN | ||||
only once or more than twice (most appropriate during high levels of | ||||
congestion). | ||||
--> | ||||
<!-- [rfced] We are unsure what "try it without" refers to here. Is it "advisabl | ||||
e to experiment without using the ECT on a SYN"? | ||||
Original (sentence prior included for context): | ||||
Further it might make sense to also remove any other new or | ||||
experimental fields or options on the SYN in case a middlebox might | ||||
be blocking them, although the required behaviour will depend on the | ||||
specification of the other option(s) and any attempt to co-ordinate | ||||
fall-back between different modules of the stack. For instance, even | ||||
if taking part in an [RFC8311] experiment that allows ECT on a SYN, | ||||
it would be advisable to try it without. | ||||
--> | ||||
<t>Implementers <bcp14>MAY</bcp14> use other fall-back strategies if | ||||
they are | ||||
found to be more effective (e.g., attempting to negotiate | ||||
AccECN on the SYN only once or more than twice (most appropriate | AccECN on the SYN only once or more than twice (most appropriate | |||
during high levels of congestion)).</t> | during high levels of congestion)).</t> | |||
<t>Further, it might make sense to also remove any other new or | <t>Further, it might make sense to also remove any other new or | |||
experimental fields or options on the SYN in case a middlebox | experimental fields or options on the SYN in case a middlebox | |||
might be blocking them, although the required behaviour will | might be blocking them, although the required behaviour will | |||
depend on the specification of the other option(s) and any attempt | depend on the specification of the other option(s) and any attempt | |||
to co-ordinate fall-back between different modules of the stack. | to coordinate fall-back between different modules of the stack. | |||
For instance, even if taking part in an <xref target="RFC8311"/> | For instance, even if taking part in an <xref target="RFC8311"/> | |||
experiment that allows ECT on a SYN, it would be advisable to try | experiment that allows ECT on a SYN, it would be advisable to try | |||
it without.</t> | it without.</t> | |||
<t>Whichever fall-back strategy is used, the TCP initiator <bcp14>SH | ||||
<t>Whichever fall-back strategy is used, the TCP initiator SHOULD | OULD</bcp14> | |||
cache failed connection attempts. If it does, it SHOULD NOT give | cache failed connection attempts. If it does, it <bcp14>SHOULD NOT</ | |||
bcp14> give | ||||
up attempting to negotiate AccECN on the SYN of subsequent | up attempting to negotiate AccECN on the SYN of subsequent | |||
connection attempts until it is clear that the blockage is | connection attempts until it is clear that the blockage is | |||
persistently and specifically due to AccECN. The cache needs to be | persistently and specifically due to AccECN. The cache needs to be | |||
arranged to expire so that the initiator will infrequently attempt | arranged to expire so that the initiator will infrequently attempt | |||
to check whether the problem has been resolved.</t> | to check whether the problem has been resolved.</t> | |||
<t>All fall-back strategies will need to follow all the normative | <t>All fall-back strategies will need to follow all the normative | |||
rules in <xref target="accecn_implications_accecn_mode"/>, which | rules in <xref target="accecn_implications_accecn_mode"/>, which | |||
concern behaviour when SYNs or SYN/ACKs negotiating different | concern behaviour when SYNs or SYN/ACKs negotiating different | |||
types of feedback have been sent within the same connection, | types of feedback have been sent within the same connection, | |||
including the possibility that they arrive out of order. As | including the possibility that they arrive out of order. As | |||
examples, the following non-normative bullets call out those rules | examples, the following non-normative bullets call out those rules | |||
from <xref target="accecn_implications_accecn_mode"/> that apply | from <xref target="accecn_implications_accecn_mode"/> that apply | |||
to the above fall-back strategies:<list style="symbols"> | to the above fall-back strategies:</t> | |||
<!-- [rfced] Throughout, some of the bulleted lists use a mix of periods and sem | ||||
icolons to close the item - some within the same list. Please consider whether | ||||
these may be updated for consistency. We recommend using terminating periods, u | ||||
nless the goal is to clarify an "and" or "or" connection between the list items. | ||||
Please review. | ||||
--> | ||||
<ul spacing="normal"> | ||||
<li> | ||||
<t>Once the TCP Client has sent SYNs with (AE,CWR,ECE) = | <t>Once the TCP Client has sent SYNs with (AE,CWR,ECE) = | |||
(1,1,1) and with (AE,CWR,ECE) = (0,0,0), it might eventually | (1,1,1) and with (AE,CWR,ECE) = (0,0,0), it might eventually | |||
receive a SYN/ACK from the Server in response to one, the | receive a SYN/ACK from the Server in response to one, the | |||
other, or both and possibly reordered;</t> | other, or both, and possibly reordered;</t> | |||
</li> | ||||
<li> | ||||
<t>Such a TCP Client enters the feedback mode appropriate to | <t>Such a TCP Client enters the feedback mode appropriate to | |||
the first SYN/ACK it receives according to <xref | the first SYN/ACK it receives according to <xref target="accecn_ | |||
target="accecn_Tab_Negotiation"/>, and it does not switch to a | Tab_Negotiation"/>, and it does not switch to a | |||
different mode, whatever other SYN/ACKs it might receive or | different mode, whatever other SYN/ACKs it might receive or | |||
send;</t> | send;</t> | |||
</li> | ||||
<li> | ||||
<t>If a TCP Client has entered AccECN mode but then | <t>If a TCP Client has entered AccECN mode but then | |||
subsequently sends a SYN or receives a SYN/ACK with | subsequently sends a SYN or receives a SYN/ACK with | |||
(AE,CWR,ECE) = (0,0,0), it is still allowed to set ECT on | (AE,CWR,ECE) = (0,0,0), it is still allowed to set ECT on | |||
packets for the rest of the connection. Note that this rule is | packets for the rest of the connection. Note that this rule is | |||
different to that of a Server in an equivalent position (<xref | different than that of a Server in an equivalent position (<xref | |||
target="accecn_implications_accecn_mode"/> explains).</t> | target="accecn_implications_accecn_mode"/> explains).</t> | |||
</li> | ||||
<li> | ||||
<t>Having entered AccECN mode, in general a TCP Client commits | <t>Having entered AccECN mode, in general a TCP Client commits | |||
to respond to any incoming congestion feedback, whether or not | to respond to any incoming congestion feedback, whether or not | |||
it sets ECT on outgoing packets (for rationale and some | it sets ECT on outgoing packets (for rationale and some | |||
exceptions see <xref target="accecn_sec_ecn-mangling"/>, <xref | exceptions see <xref target="accecn_sec_ecn-mangling"/>, <xref t | |||
target="accecn_sec_ACE_init_invalid"/>);</t> | arget="accecn_sec_ACE_init_invalid"/>);</t> | |||
</li> | ||||
<li> | ||||
<t>Having entered AccECN mode, a TCP Client commits to using | <t>Having entered AccECN mode, a TCP Client commits to using | |||
AccECN to feed back the IP-ECN field in incoming packets for | AccECN to feed back the IP-ECN field in incoming packets for | |||
the rest of the connection, as specified in <xref | the rest of the connection, as specified in <xref target="accecn | |||
target="accecn_feedback"/>, even if it is not itself setting | _feedback"/>, even if it is not itself setting | |||
ECT on outgoing packets.</t> | ECT on outgoing packets.</t> | |||
</list></t> | </li> | |||
</ul> | ||||
</section> | </section> | |||
<section anchor="accecn_sec_SYN-ACK_rexmt"> | ||||
<section anchor="accecn_sec_SYN-ACK_rexmt" | <name>Retransmitted SYN/ACKs</name> | |||
title="Retransmitted SYN/ACKs"> | ||||
<t>A TCP Server might send multiple SYN/ACKs indicating different | <t>A TCP Server might send multiple SYN/ACKs indicating different | |||
feedback modes. For instance, when falling back to sending a | feedback modes. For instance, when falling back to sending a | |||
SYN/ACK with (AE,CWR,ECE) = (0,0,0) after previous AccECN SYN/ACKs | SYN/ACK with (AE,CWR,ECE) = (0,0,0) after previous AccECN SYN/ACKs | |||
have timed out (<xref target="accecn_AccECN_Option_Loss"/>); or to | have timed out (<xref target="accecn_AccECN_Option_Loss"/>); or to | |||
acknowledge different retransmissions of the SYN (<xref | acknowledge different retransmissions of the SYN (<xref target="acce | |||
target="accecn_sec_SYN_rexmt"/>).</t> | cn_sec_SYN_rexmt"/>).</t> | |||
<t>All fall-back strategies will need to follow all the normative | <t>All fall-back strategies will need to follow all the normative | |||
rules in <xref target="accecn_implications_accecn_mode"/>, which | rules in <xref target="accecn_implications_accecn_mode"/>, which | |||
concern behaviour when SYNs or SYN/ACKs negotiating different | concern behaviour when SYNs or SYN/ACKs negotiating different | |||
types of feedback are sent within the same connection, including | types of feedback are sent within the same connection, including | |||
the possibility that they arrive out of order. As examples, the | the possibility that they arrive out of order. As examples, the | |||
following non-normative bullets call out those rules from <xref | following non-normative bullets call out those rules from <xref targ | |||
target="accecn_implications_accecn_mode"/> that apply to the above | et="accecn_implications_accecn_mode"/> that apply to the above | |||
fall-back strategies:<list style="symbols"> | fall-back strategies:</t> | |||
<ul spacing="normal"> | ||||
<li> | ||||
<t>An AccECN-capable TCP Server enters the feedback mode | <t>An AccECN-capable TCP Server enters the feedback mode | |||
appropriate to the first SYN it receives using <xref | appropriate to the first SYN it receives using <xref target="acc | |||
target="accecn_Tab_Negotiation"/>, and it does not switch to a | ecn_Tab_Negotiation"/>, and it does not switch to a | |||
different mode, whatever other SYNs it might receive and | different mode, whatever other SYNs it might receive and | |||
whatever SYN/ACKs it might send;</t> | whatever SYN/ACKs it might send;</t> | |||
</li> | ||||
<t>if a TCP Server in AccECN mode receives a SYN with | <li> | |||
<t>If a TCP Server in AccECN mode receives a SYN with | ||||
(AE,CWR,ECE) = (0,0,0), it preferably acknowledges it first | (AE,CWR,ECE) = (0,0,0), it preferably acknowledges it first | |||
using an AccECN SYN/ACK, but it can retry using a SYN/ACK with | using an AccECN SYN/ACK, but it can retry using a SYN/ACK with | |||
(AE,CWR,ECE) = (0,0,0);</t> | (AE,CWR,ECE) = (0,0,0);</t> | |||
</li> | ||||
<li> | ||||
<t>If a TCP Server in AccECN mode sends multiple AccECN | <t>If a TCP Server in AccECN mode sends multiple AccECN | |||
SYN/ACKs, it uses the TCP-ECN flags in each SYN/ACK to feed | SYN/ACKs, it uses the TCP-ECN flags in each SYN/ACK to feed | |||
back the IP-ECN field on the latest SYN to have arrived;</t> | back the IP-ECN field on the latest SYN to have arrived;</t> | |||
</li> | ||||
<t>If a TCP Server enters AccECN mode then subsequently sends | <li> | |||
<t>If a TCP Server enters AccECN mode and then subsequently send | ||||
s | ||||
a SYN/ACK or receives a SYN with (AE,CWR,ECE) = (0,0,0), it is | a SYN/ACK or receives a SYN with (AE,CWR,ECE) = (0,0,0), it is | |||
prohibited from setting ECT on any packet for the rest of the | prohibited from setting ECT on any packet for the rest of the | |||
connection;</t> | connection;</t> | |||
</li> | ||||
<li> | ||||
<t>Having entered AccECN mode, in general a TCP Server commits | <t>Having entered AccECN mode, in general a TCP Server commits | |||
to respond to any incoming congestion feedback, whether or not | to respond to any incoming congestion feedback, whether or not | |||
it sets ECT on outgoing packets (for rationale and some | it sets ECT on outgoing packets (for rationale and some | |||
exceptions see <xref target="accecn_sec_ecn-mangling"/>, <xref | exceptions see Sections <xref target="accecn_sec_ecn-mangling" f | |||
target="accecn_sec_ACE_init_invalid"/>);</t> | ormat="counter"/> and <xref target="accecn_sec_ACE_init_invalid" format="counter"/> | |||
);</t> | ||||
</li> | ||||
<li> | ||||
<t>Having entered AccECN mode, a TCP Server commits to using | <t>Having entered AccECN mode, a TCP Server commits to using | |||
AccECN to feed back the IP-ECN field in incoming packets for | AccECN to feed back the IP-ECN field in incoming packets for | |||
the rest of the connection, as specified in <xref | the rest of the connection, as specified in <xref target="accecn | |||
target="accecn_feedback"/>, even if it is not itself setting | _feedback"/>, even if it is not itself setting | |||
ECT on outgoing packets.</t> | ECT on outgoing packets.</t> | |||
</list></t> | </li> | |||
</ul> | ||||
</section> | </section> | |||
</section> | </section> | |||
<section anchor="accecn_implications_accecn_mode"> | ||||
<section anchor="accecn_implications_accecn_mode" | <name>Implications of AccECN Mode</name> | |||
title="Implications of AccECN Mode"> | ||||
<t><xref target="accecn_Negotiation_3WHS"/> describes the only ways | <t><xref target="accecn_Negotiation_3WHS"/> describes the only ways | |||
that a host can enter AccECN mode, whether as a Client or as a | that a host can enter AccECN mode, whether as a Client or as a | |||
Server.</t> | Server.</t> | |||
<t>An implementation that supports AccECN has the rights and | <t>An implementation that supports AccECN has the rights and | |||
obligations concerning the use of ECN defined below, which update | obligations concerning the use of ECN defined below, which update | |||
those in Section 6.1.1 of <xref target="RFC3168"/>. This section | those in <xref target="RFC3168" sectionFormat="of" section="6.1.1"/>. | |||
uses the following definitions:<list style="hanging"> | This section | |||
<t hangText="'During the handshake':">The connection states | uses the following definitions:</t> | |||
prior to synchronization;</t> | <dl newline="false" spacing="normal"> | |||
<dt>'During the handshake':</dt> | ||||
<t hangText="'Valid SYN':">A SYN that has the same port numbers | <dd>The connection states | |||
prior to synchronization;</dd> | ||||
<dt>'Valid SYN':</dt> | ||||
<dd>A SYN that has the same port numbers | ||||
and the same ISN as the SYN that first caused the Server to open | and the same ISN as the SYN that first caused the Server to open | |||
the connection. An 'Acceptable' packet is defined in <xref | the connection. An 'Acceptable' packet is defined in <xref target= | |||
target="accecn_Terminology"/>.</t> | "accecn_Terminology"/>.</dd> | |||
</list></t> | </dl> | |||
<t>Handling SYNs or SYN/ACKs of multiple types | <t>Handling SYNs or SYN/ACKs of multiple types | |||
(e.g., fall-back): <list style="symbols"> | (e.g., fall-back): </t> | |||
<t>Any implementation that supports AccECN:<list style="symbols"> | <ul spacing="normal"> | |||
<t>MUST NOT switch into a different feedback mode to the one | <li> | |||
it first entered according to <xref | <t>Any implementation that supports AccECN:</t> | |||
target="accecn_Tab_Negotiation"/>, no matter whether it | <ul spacing="normal"> | |||
<li> | ||||
<t><bcp14>MUST NOT</bcp14> switch into a different feedback mo | ||||
de than the one | ||||
it first entered according to <xref target="accecn_Tab_Negotia | ||||
tion"/>, no matter whether it | ||||
subsequently receives valid SYNs or Acceptable SYN/ACKs of | subsequently receives valid SYNs or Acceptable SYN/ACKs of | |||
different types.</t> | different types.</t> | |||
</li> | ||||
<t>SHOULD ignore the TCP-ECN flags in SYNs or SYN/ACKs that | <li> | |||
<t><bcp14>SHOULD</bcp14> ignore the TCP-ECN flags in SYNs or S | ||||
YN/ACKs that | ||||
are received after the implementation reaches the | are received after the implementation reaches the | |||
Established state, in line with the general TCP approach | Established state, in line with the general TCP approach | |||
<xref target="RFC9293"/>;<vspace blankLines="1"/>Reason: | <xref target="RFC9293"/>;</t> | |||
<t>Reason: | ||||
Reaching the Established state implies that at least one SYN and | Reaching the Established state implies that at least one SYN and | |||
one SYN/ACK have successfully been delivered. And all the | one SYN/ACK have successfully been delivered. And all the | |||
rules for handshake fall-back are designed to work based on | rules for handshake fall-back are designed to work based on | |||
those packets that successfully traverse the path, whatever | those packets that successfully traverse the path, whatever | |||
other handshake packets are lost or delayed.</t> | other handshake packets are lost or delayed.</t> | |||
</li> | ||||
<t>MUST NOT send a 'Classic' ECN-setup SYN <xref | <li> | |||
target="RFC3168"/> with (AE,CWR,ECE) = (0,1,1) and a SYN | <t><bcp14>MUST NOT</bcp14> send a 'Classic' ECN-setup SYN <xre | |||
f target="RFC3168"/> with (AE,CWR,ECE) = (0,1,1) and a SYN | ||||
with (AE,CWR,ECE) = (1,1,1) requesting AccECN feedback | with (AE,CWR,ECE) = (1,1,1) requesting AccECN feedback | |||
within the same connection;</t> | within the same connection;</t> | |||
</li> | ||||
<t>MUST NOT send a 'Classic' ECN-setup SYN/ACK <xref | <li> | |||
target="RFC3168"/> with (AE,CWR,ECE) = (0,0,1) and a SYN/ACK | <t><bcp14>MUST NOT</bcp14> send a 'Classic' ECN-setup SYN/ACK | |||
<xref target="RFC3168"/> with (AE,CWR,ECE) = (0,0,1) and a SYN/ACK | ||||
agreeing to use AccECN feedback within the same | agreeing to use AccECN feedback within the same | |||
connection;</t> | connection;</t> | |||
</li> | ||||
<t>MUST reset the connection with a RST packet, if it | <li> | |||
<t><bcp14>MUST</bcp14> reset the connection with a RST packet, | ||||
if it | ||||
receives a 'Classic' ECN-setup SYN with (AE,CWR,ECE) = | receives a 'Classic' ECN-setup SYN with (AE,CWR,ECE) = | |||
(0,1,1) and a SYN requesting AccECN feedback during the same | (0,1,1) and a SYN requesting AccECN feedback during the same | |||
handshake;</t> | handshake;</t> | |||
</li> | ||||
<t>MUST reset the connection with a RST packet, if it | <li> | |||
<t><bcp14>MUST</bcp14> reset the connection with a RST packet, | ||||
if it | ||||
receives a 'Classic' ECN-setup SYN/ACK with (AE,CWR,ECE) = | receives a 'Classic' ECN-setup SYN/ACK with (AE,CWR,ECE) = | |||
(0,0,1) and a SYN/ACK agreeing to use AccECN feedback during | (0,0,1) and a SYN/ACK agreeing to use AccECN feedback during | |||
the same handshake;</t> | the same handshake;</t> | |||
</list>The last four rules are necessary because, if one peer | </li> | |||
</ul> | ||||
<t>The last four rules are necessary because, if one peer | ||||
were to negotiate the feedback mode in two different types of | were to negotiate the feedback mode in two different types of | |||
handshake, it would not be possible for the other peer to know | handshake, it would not be possible for the other peer to know | |||
for certain which handshake packet(s) the other end had | for certain which handshake packet(s) the other end had | |||
eventually received or in which order it received them. So, in | eventually received or in which order it received them. So, in | |||
the absence of these rules, the two peers could end up using | the absence of these rules, the two peers could end up using | |||
different ECN feedback modes without knowing it.</t> | different ECN feedback modes without knowing it.</t> | |||
</li> | ||||
<li> | ||||
<t>A host in AccECN mode that is feeding back the IP-ECN field | <t>A host in AccECN mode that is feeding back the IP-ECN field | |||
on a SYN or SYN/ACK:<list style="symbols"> | on a SYN or SYN/ACK:</t> | |||
<t>MUST feed back the IP-ECN field on the latest valid SYN | <ul spacing="normal"> | |||
<li> | ||||
<t><bcp14>MUST</bcp14> feed back the IP-ECN field on the lates | ||||
t valid SYN | ||||
or acceptable SYN/ACK to arrive.</t> | or acceptable SYN/ACK to arrive.</t> | |||
</list></t> | </li> | |||
</ul> | ||||
<t>A TCP Server already in AccECN mode:<list style="symbols"> | </li> | |||
<t>SHOULD acknowledge a valid SYN arriving with (AE,CWR,ECE) | <li> | |||
<t>A TCP Server already in AccECN mode:</t> | ||||
<ul spacing="normal"> | ||||
<li> | ||||
<t><bcp14>SHOULD</bcp14> acknowledge a valid SYN arriving with | ||||
(AE,CWR,ECE) | ||||
= (0,0,0) by emitting an AccECN SYN/ACK (with the | = (0,0,0) by emitting an AccECN SYN/ACK (with the | |||
appropriate combination of TCP-ECN flags to feed back the | appropriate combination of TCP-ECN flags to feed back the | |||
IP-ECN field of this latest SYN);</t> | IP-ECN field of this latest SYN);</t> | |||
</li> | ||||
<t>MAY acknowledge a valid SYN arriving with (AE,CWR,ECE) = | <li> | |||
<t><bcp14>MAY</bcp14> acknowledge a valid SYN arriving with (A | ||||
E,CWR,ECE) = | ||||
(0,0,0) by sending a SYN/ACK with (AE,CWR,ECE) = | (0,0,0) by sending a SYN/ACK with (AE,CWR,ECE) = | |||
(0,0,0);</t> | (0,0,0);</t> | |||
</list>Rationale: When a SYN arrives with (AE,CWR,ECE) = | </li> | |||
</ul> | ||||
<t>Rationale: When a SYN arrives with (AE,CWR,ECE) = | ||||
(0,0,0) at a TCP Server that is already in AccECN mode, it | (0,0,0) at a TCP Server that is already in AccECN mode, it | |||
implies that the TCP Client had probably not received the | implies that the TCP Client had probably not received the | |||
previous AccECN SYN/ACK emitted by the TCP Server. Therefore, | previous AccECN SYN/ACK emitted by the TCP Server. Therefore, | |||
the first bullet recommends attempting at least one more AccECN | the first bullet recommends attempting at least one more AccECN | |||
SYN/ACK. Nonetheless, the second bullet recognizes that the | SYN/ACK. Nonetheless, the second bullet recognizes that the | |||
Server might eventually need to fall back to a non-ECN SYN/ACK. | Server might eventually need to fall back to a non-ECN SYN/ACK. | |||
In either case, the TCP Server remains in AccECN feedback mode | In either case, the TCP Server remains in AccECN feedback mode | |||
(according to the earlier requirement not to switch modes).</t> | (according to the earlier requirement not to switch modes).</t> | |||
</li> | ||||
<t>An AccECN-capable TCP Server already in Not ECN mode:<list | <li> | |||
style="symbols"> | <t>An AccECN-capable TCP Server already in Not ECN mode:</t> | |||
<t>SHOULD respond to any subsequent valid SYN using a | <ul spacing="normal"> | |||
<li> | ||||
<t><bcp14>SHOULD</bcp14> respond to any subsequent valid SYN u | ||||
sing a | ||||
SYN/ACK with (AE,CWR,ECE) = (0,0,0), even if the SYN is | SYN/ACK with (AE,CWR,ECE) = (0,0,0), even if the SYN is | |||
offering to negotiate Classic ECN or AccECN feedback | offering to negotiate Classic ECN or AccECN feedback | |||
mode;<vspace blankLines="1"/>Rationale: There would be no | mode;</t> | |||
<t>Rationale: There would be no | ||||
point in the Server offering any type of ECN feedback, | point in the Server offering any type of ECN feedback, | |||
because the Client will not be using ECN. However, there is | because the Client will not be using ECN. However, there is | |||
no interoperability reason to make this rule mandatory.</t> | no interoperability reason to make this rule mandatory.</t> | |||
</list></t> | </li> | |||
</list>If for any reason a host is not willing to provide ECN | </ul> | |||
feedback on a particular TCP connection, it SHOULD clear the AE, CWR | </li> | |||
</ul> | ||||
<t>If for any reason a host is not willing to provide ECN | ||||
feedback on a particular TCP connection, it <bcp14>SHOULD</bcp14> clea | ||||
r the AE, CWR, | ||||
and ECE flags in all SYN and/or SYN/ACK packets that it sends.</t> | and ECE flags in all SYN and/or SYN/ACK packets that it sends.</t> | |||
<t>Sending ECT:</t> | ||||
<t>Sending ECT:<list style="symbols"> | <ul spacing="normal"> | |||
<t>Any implementation that supports AccECN:<list style="symbols"> | <li> | |||
<t>MUST NOT set ECT if it is in Not ECN feedback mode.</t> | <t>Any implementation that supports AccECN:</t> | |||
</list>A Data Sender in AccECN mode:<list style="symbols"> | <ul spacing="normal"> | |||
<t>SHOULD set an ECT codepoint in the IP header of packets to | <li> | |||
<t><bcp14>MUST NOT</bcp14> set ECT if it is in Not ECN feedbac | ||||
k mode.</t> | ||||
</li> | ||||
</ul> | ||||
<t>A Data Sender in AccECN mode:</t> | ||||
<ul spacing="normal"> | ||||
<li> | ||||
<t><bcp14>SHOULD</bcp14> set an ECT codepoint in the IP header | ||||
of packets to | ||||
indicate to the network that the transport is capable and | indicate to the network that the transport is capable and | |||
willing to participate in ECN for this packet;</t> | willing to participate in ECN for this packet;</t> | |||
</li> | ||||
<t>MAY not set ECT on any packet (for instance if | <li> | |||
<t><bcp14>MAY</bcp14> not set ECT on any packet (for instance | ||||
if | ||||
it has reason to believe such a packet would be | it has reason to believe such a packet would be | |||
blocked);</t> | blocked);</t> | |||
</list>A TCP Server in AccECN mode:<list style="symbols"> | </li> | |||
<t>MUST NOT set ECT on any packet for the rest of the | </ul> | |||
<t>A TCP Server in AccECN mode:</t> | ||||
<ul spacing="normal"> | ||||
<li> | ||||
<t><bcp14>MUST NOT</bcp14> set ECT on any packet for the rest | ||||
of the | ||||
connection, if it has received or sent at least one valid | connection, if it has received or sent at least one valid | |||
SYN or Acceptable SYN/ACK with (AE,CWR,ECE) = (0,0,0) during | SYN or Acceptable SYN/ACK with (AE,CWR,ECE) = (0,0,0) during | |||
the handshake.<vspace blankLines="0"/>This rule solely | the handshake.</t> | |||
<t>This rule solely | ||||
applies to a Server because, when a Server enters AccECN | applies to a Server because, when a Server enters AccECN | |||
mode it doesn't know for sure whether the Client will end up | mode, it doesn't know for sure whether the Client will end up | |||
in AccECN mode. But when a Client enters AccECN mode, it can | in AccECN mode. But when a Client enters AccECN mode, it can | |||
be certain that the Server is already in AccECN feedback | be certain that the Server is already in AccECN feedback | |||
mode.</t> | mode.</t> | |||
</list></t> | </li> | |||
</list></t> | </ul> | |||
</li> | ||||
<t>Congestion response:<list style="symbols"> | </ul> | |||
<t>A host in AccECN mode:<list style="symbols"> | <t>Congestion response:</t> | |||
<ul spacing="normal"> | ||||
<li> | ||||
<t>A host in AccECN mode:</t> | ||||
<ul spacing="normal"> | ||||
<li> | ||||
<t>is obliged to respond appropriately to AccECN feedback | <t>is obliged to respond appropriately to AccECN feedback | |||
that indicates there were ECN marks on packets it had | that indicates there were ECN marks on packets it had | |||
previously sent, where 'appropriately' is defined in Section | previously sent, where 'appropriately' is defined in <xref | |||
6.1 of <xref target="RFC3168"/> and updated by Sections 2.1 | target="RFC3168" sectionFormat="of" section="6.1"/> and | |||
and 4.1 of <xref target="RFC8311"/>;</t> | updated by Sections <xref target="RFC8311" | |||
sectionFormat="bare" section="2.1"/> and <xref | ||||
target="RFC8311" sectionFormat="bare" section="4.1"/> of | ||||
<xref target="RFC8311"/>;</t> | ||||
</li> | ||||
<li> | ||||
<t>is still obliged to respond appropriately to congestion | <t>is still obliged to respond appropriately to congestion | |||
feedback, even when it is solely sending non-ECN-capable | feedback, even when it is solely sending non-ECN-capable | |||
packets (for rationale, some examples and some exceptions | packets (for rationale, some examples and some exceptions | |||
see <xref target="accecn_sec_ecn-mangling"/>, <xref | see Sections <xref target="accecn_sec_ecn-mangling" format="co | |||
target="accecn_sec_ACE_init_invalid"/>).</t> | unter"/> and <xref target="accecn_sec_ACE_init_invalid" format="counter"/>).</t> | |||
</li> | ||||
<li> | ||||
<t>is still obliged to respond appropriately to congestion | <t>is still obliged to respond appropriately to congestion | |||
feedback, even if it has sent or received a SYN or SYN/ACK | feedback, even if it has sent or received a SYN or SYN/ACK | |||
packet with (AE,CWR,ECE) = (0,0,0) during the handshake;</t> | packet with (AE,CWR,ECE) = (0,0,0) during the handshake;</t> | |||
</li> | ||||
<t>MUST NOT set CWR to indicate that it has received and | <li> | |||
responded to indications of congestion.<vspace | <t><bcp14>MUST NOT</bcp14> set CWR to indicate that it has rec | |||
blankLines="1"/>For the avoidance of doubt, this is unlike | eived and | |||
responded to indications of congestion.</t> | ||||
<t>For the avoidance of doubt, this is unlike | ||||
an RFC 3168 data sender and this does not preclude the Data | an RFC 3168 data sender and this does not preclude the Data | |||
Sender from setting the bits of the ACE counter field, which | Sender from setting the bits of the ACE counter field, which | |||
includes an overloaded use of the same bit.</t> | includes an overloaded use of the same bit.</t> | |||
</list></t> | </li> | |||
</list></t> | </ul> | |||
</li> | ||||
<t>Receiving ECT:<list style="symbols"> | </ul> | |||
<t>A host in AccECN mode:<list style="symbols"> | <t>Receiving ECT:</t> | |||
<t>MUST feed back the information in the IP-ECN field of | <ul spacing="normal"> | |||
<li> | ||||
<t>A host in AccECN mode:</t> | ||||
<ul spacing="normal"> | ||||
<li> | ||||
<t><bcp14>MUST</bcp14> feed back the information in the IP-ECN | ||||
field of | ||||
incoming packets using Accurate ECN feedback, as specified | incoming packets using Accurate ECN feedback, as specified | |||
in <xref target="accecn_feedback"/>.<vspace | in <xref target="accecn_feedback"/>.</t> | |||
blankLines="1"/>For the avoidance of doubt, this requirement | <t>For the avoidance of doubt, this requirement | |||
stands even if the AccECN host has also sent or received a | stands even if the AccECN host has also sent or received a | |||
SYN or SYN/ACK with (AE,CWR,ECE) = (0,0,0). Reason: Such a | SYN or SYN/ACK with (AE,CWR,ECE) = (0,0,0). Reason: Such a | |||
SYN or SYN/ACK implies some form of packet mangling might be | SYN or SYN/ACK implies some form of packet mangling might be | |||
present. Even if the remote peer is not setting ECT, it | present. Even if the remote peer is not setting ECT, it | |||
could still be set erroneously by packet mangling at the IP | could still be set erroneously by packet mangling at the IP | |||
layer (see <xref target="accecn_sec_ecn-mangling"/>). In | layer (see <xref target="accecn_sec_ecn-mangling"/>). In | |||
such cases, the Data Sender is best placed to decide whether | such cases, the Data Sender is best placed to decide whether | |||
ECN markings are valid, but it can only do that if the Data | ECN markings are valid, but it can only do that if the Data | |||
Receiver mechanistically feeds back any ECN markings. This | Receiver mechanistically feeds back any ECN markings. This | |||
approach will not lead to TCP Options being generated | approach will not lead to TCP Options being generated | |||
unnecessarily if the recommended simple scheme in <xref | unnecessarily if the recommended simple scheme in <xref target | |||
target="accecn_option_usage"/> is used, because no byte | ="accecn_option_usage"/> is used, because no byte | |||
counters will change if no packets are set to ECT.</t> | counters will change if no packets are set to ECT.</t> | |||
</li> | ||||
<t>MUST NOT use reception of packets with ECT set in the | <li> | |||
<t><bcp14>MUST NOT</bcp14> use reception of packets with ECT s | ||||
et in the | ||||
IP-ECN field as an implicit signal that the peer is | IP-ECN field as an implicit signal that the peer is | |||
ECN-capable.<vspace blankLines="1"/>Reason: ECT at the IP | ECN-capable.</t> | |||
<t>Reason: ECT at the IP | ||||
layer does not explicitly confirm the peer has the correct | layer does not explicitly confirm the peer has the correct | |||
ECN feedback logic, because the packets could have been | ECN feedback logic, because the packets could have been | |||
mangled at the IP layer.</t> | mangled at the IP layer.</t> | |||
</list></t> | </li> | |||
</list></t> | </ul> | |||
</li> | ||||
</ul> | ||||
</section> | </section> | |||
</section> | </section> | |||
<section anchor="accecn_feedback"> | ||||
<section anchor="accecn_feedback" title="AccECN Feedback"> | <name>AccECN Feedback</name> | |||
<t>Each Data Receiver of each half connection maintains four counters, | <t>Each Data Receiver of each half connection maintains four counters, | |||
r.cep, r.ceb, r.e0b and r.e1b:<list style="symbols"> | r.cep, r.ceb, r.e0b, and r.e1b:</t> | |||
<t>The Data Receiver MUST increment the CE packet counter (r.cep), | <ul spacing="normal"> | |||
<li> | ||||
<t>The Data Receiver <bcp14>MUST</bcp14> increment the CE packet cou | ||||
nter (r.cep), | ||||
for every Acceptable packet that it receives with the CE code | for every Acceptable packet that it receives with the CE code | |||
point in the IP ECN field, including CE marked control packets and retransmissions but | point in the IP-ECN field, including CE-marked control packets and retransmissions but | |||
excluding CE on SYN packets (SYN=1; ACK=0).</t> | excluding CE on SYN packets (SYN=1; ACK=0).</t> | |||
</li> | ||||
<li> | ||||
<t>A Data Receiver that supports sending of AccECN TCP Options | <t>A Data Receiver that supports sending of AccECN TCP Options | |||
MUST increment the r.ceb, r.e0b or r.e1b byte counters by the | <bcp14>MUST</bcp14> increment the r.ceb, r.e0b, or r.e1b byte counters by the | |||
number of TCP payload octets in Acceptable packets marked with the | number of TCP payload octets in Acceptable packets marked with the | |||
CE, ECT(0) and ECT(1) codepoint in their IP-ECN field, including | CE, ECT(0), and ECT(1) codepoint in their IP-ECN field, including | |||
any payload octets on control packets and retransmissions, but not including any | any payload octets on control packets and retransmissions, but not including any | |||
payload octets on SYN packets (SYN=1; ACK=0).</t> | payload octets on SYN packets (SYN=1; ACK=0).</t> | |||
</list></t> | </li> | |||
</ul> | ||||
<t>Each Data Sender of each half connection maintains four counters, | <t>Each Data Sender of each half connection maintains four counters, | |||
s.cep, s.ceb, s.e0b and s.e1b intended to track the equivalent | s.cep, s.ceb, s.e0b, and s.e1b, intended to track the equivalent | |||
counters at the Data Receiver.</t> | counters at the Data Receiver.</t> | |||
<t>A Data Receiver feeds back the CE packet counter using the Accurate | <t>A Data Receiver feeds back the CE packet counter using the Accurate | |||
ECN (ACE) field, as explained in <xref target="accecn_ACE"/>. And it | ECN (ACE) field, as explained in <xref target="accecn_ACE"/>. And it | |||
optionally feeds back all the byte counters using the AccECN TCP | optionally feeds back all the byte counters using the AccECN TCP | |||
Option, as specified in <xref target="accecn_option"/>.</t> | Option, as specified in <xref target="accecn_option"/>.</t> | |||
<t>Whenever a Data Receiver feeds back the value of any counter, it | <t>Whenever a Data Receiver feeds back the value of any counter, it | |||
MUST report the most recent value, no matter whether it is in a pure | <bcp14>MUST</bcp14> report the most recent value, no matter whether it is in a pure | |||
ACK, or an ACK piggybacked on a packet used by the other | ACK, or an ACK piggybacked on a packet used by the other | |||
half-connection, whether new payload data or a retransmission. | half-connection, whether new payload data or a retransmission. | |||
Therefore the feedback piggybacked on a retransmitted packet is | Therefore, the feedback piggybacked on a retransmitted packet is | |||
unlikely to be the same as the feedback on the original packet.</t> | unlikely to be the same as the feedback on the original packet.</t> | |||
<section anchor="accecn_init_counters"> | ||||
<section anchor="accecn_init_counters" | <name>Initialization of Feedback Counters</name> | |||
title="Initialization of Feedback Counters"> | ||||
<t>When a host first enters AccECN mode, in its role as a Data | <t>When a host first enters AccECN mode, in its role as a Data | |||
Receiver it initializes its counters to r.cep = 5, r.e0b = r.e1b = 1 | Receiver, it initializes its counters to r.cep = 5, r.e0b = r.e1b = 1, | |||
and r.ceb = 0.</t> | and r.ceb = 0.</t> | |||
<t>Non-zero initial values are used to support a stateless handshake | <t>Non-zero initial values are used to support a stateless handshake | |||
(see <xref target="accecn_Interaction_SYN_Cookies"/>) and to be | (see <xref target="accecn_Interaction_SYN_Cookies"/>) and to be | |||
distinct from cases where the fields are incorrectly zeroed | distinct from cases where the fields are incorrectly zeroed | |||
(e.g., by middleboxes - see <xref | (e.g., by middleboxes -- see <xref target="accecn_sec_zero_option"/>). | |||
target="accecn_sec_zero_option"/>).</t> | </t> | |||
<t>When a host enters AccECN mode, in its role as a Data Sender, it | ||||
<t>When a host enters AccECN mode, in its role as a Data Sender it | initializes its counters to s.cep = 5, s.e0b = s.e1b = 1, and s.ceb = | |||
initializes its counters to s.cep = 5, s.e0b = s.e1b = 1 and s.ceb = | ||||
0.</t> | 0.</t> | |||
</section> | </section> | |||
<section anchor="accecn_ACE"> | ||||
<section anchor="accecn_ACE" title="The ACE Field"> | <name>The ACE Field</name> | |||
<t>After AccECN has been negotiated on the SYN and SYN/ACK, both | <t>After AccECN has been negotiated on the SYN and SYN/ACK, both | |||
hosts overload the three TCP flags (AE, CWR and ECE) in the main TCP | hosts overload the three TCP flags (AE, CWR, and ECE) in the main TCP | |||
header as one 3-bit field. Then the field is given a new name, ACE, | header as one 3-bit field. Then the field is given a new name, ACE, | |||
as shown in <xref target="accecn_Fig_ACE_ACK"/>.</t> | as shown in <xref target="accecn_Fig_ACE_ACK"/>.</t> | |||
<!-- <?rfc needLines="9" ?> --> | <!-- <?rfc needLines="9" ?> --> | |||
<figure align="center" anchor="accecn_Fig_ACE_ACK" | <figure anchor="accecn_Fig_ACE_ACK"> | |||
title="Definition of the ACE field within bytes 13 and 14 of | <name>Definition of the ACE Field Within Bytes 13 and 14 of the TCP | |||
the TCP Header (when AccECN has been negotiated and SYN=0)."> | Header (When AccECN Has Been Negotiated and SYN=0).</name> | |||
<artwork align="center"><![CDATA[ | <artwork align="center"><![CDATA[ | |||
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | |||
+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ | +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ | |||
| | | | U | A | P | R | S | F | | | | | | U | A | P | R | S | F | | |||
| Header Length | Reserved | ACE | R | C | S | S | Y | I | | | Header Length | Reserved | ACE | R | C | S | S | Y | I | | |||
| | | | G | K | H | T | N | N | | | | | | G | K | H | T | N | N | | |||
+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ | +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ | |||
]]></artwork> | ]]></artwork> | |||
</figure> | </figure> | |||
<t>The original definition of these three flags in the TCP header, | <t>The original definition of these three flags in the TCP header, | |||
including the addition of support for the ECN Nonce, is shown for | including the addition of support for the ECN-nonce, is shown for | |||
comparison in <xref target="accecn_Fig_TCPHdr"/>. This specification | comparison in <xref target="accecn_Fig_TCPHdr"/>. This specification | |||
does not rename these three TCP flags to ACE unconditionally; it | does not rename these three TCP flags to ACE unconditionally; it | |||
merely overloads them with another name and definition once an | merely overloads them with another name and definition once an | |||
AccECN connection has been established.</t> | AccECN connection has been established.</t> | |||
<t>With one exception (<xref target="accecn_ACE_3rdACK"/>), a host | <t>With one exception (<xref target="accecn_ACE_3rdACK"/>), a host | |||
with both of its half-connections in AccECN mode MUST interpret the | with both of its half-connections in AccECN mode <bcp14>MUST</bcp14> i | |||
AE, CWR and ECE flags as the 3-bit ACE counter on a segment with the | nterpret the | |||
SYN flag cleared (SYN=0). On such a packet, a Data Receiver MUST | AE, CWR, and ECE flags as the 3-bit ACE counter on a segment with the | |||
encode the three least significant bits of its r.cep counter into | SYN flag cleared (SYN=0). On such a packet, a Data Receiver <bcp14>MUS | |||
T</bcp14> | ||||
encode the 3 least significant bits of its r.cep counter into | ||||
the ACE field that it feeds back to the Data Sender. The least | the ACE field that it feeds back to the Data Sender. The least | |||
significant bit is at bit offset 9 in <xref | significant bit is at bit offset 9 in <xref target="accecn_Fig_ACE_ACK | |||
target="accecn_Fig_ACE_ACK"/>. A host MUST NOT interpret the 3 flags | "/>. A host <bcp14>MUST NOT</bcp14> interpret the three flags | |||
as a 3-bit ACE field on any segment with SYN=1 (whether ACK is 0 or | as a 3-bit ACE field on any segment with SYN=1 (whether ACK is 0 or | |||
1), or if AccECN negotiation is incomplete or has not succeeded.</t> | 1), or if AccECN negotiation is incomplete or has not succeeded.</t> | |||
<t>Both parts of each of these conditions are equally important. For | <t>Both parts of each of these conditions are equally important. For | |||
instance, even if AccECN negotiation has been successful, the ACE | instance, even if AccECN negotiation has been successful, the ACE | |||
field is not defined on any segments with SYN=1 (e.g., a | field is not defined on any segments with SYN=1 (e.g., a | |||
retransmission of an unacknowledged SYN/ACK, or when both ends send | retransmission of an unacknowledged SYN/ACK, or when both ends send | |||
SYN/ACKs after AccECN support has been successfully negotiated | SYN/ACKs after AccECN support has been successfully negotiated | |||
during a simultaneous open).</t> | during a simultaneous open).</t> | |||
<section anchor="accecn_ACE_3rdACK"> | ||||
<name>ACE Field on the ACK of the SYN/ACK</name> | ||||
<!-- [rfced] For clarity, we'd like to add quotes to "handshake encoding". Plea | ||||
se confirm this is correct, as opposed to "handshake encoding of the ACE field". | ||||
<section anchor="accecn_ACE_3rdACK" | Original: | |||
title="ACE Field on the ACK of the SYN/ACK"> | This shall be called the handshake encoding of the ACE | |||
<t>A TCP Client (A) in AccECN mode MUST feed back which of the 4 | field, and it is the only exception to the rule that the ACE field | |||
carries the 3 least significant bits of the r.cep counter on packets | ||||
with SYN=0. | ||||
--> | ||||
<t>A TCP Client (A) in AccECN mode <bcp14>MUST</bcp14> feed back whi | ||||
ch of the 4 | ||||
possible values of the IP-ECN field was on the SYN/ACK by writing | possible values of the IP-ECN field was on the SYN/ACK by writing | |||
it into the ACE field of a pure ACK with no SACK blocks using the | it into the ACE field of a pure ACK with no SACK blocks using the | |||
binary encoding in <xref target="accecn_Tab_SYN-ACK_fb2"/> (which | binary encoding in <xref target="accecn_Tab_SYN-ACK_fb2"/> (which | |||
is the same as that used on the SYN/ACK in <xref | is the same as that used on the SYN/ACK in <xref target="accecn_Tab_ | |||
target="accecn_Tab_Negotiation"/>). This shall be called the | Negotiation"/>). This shall be called the | |||
handshake encoding of the ACE field, and it is the only exception | handshake encoding of the ACE field, and it is the only exception | |||
to the rule that the ACE field carries the 3 least significant | to the rule that the ACE field carries the 3 least significant | |||
bits of the r.cep counter on packets with SYN=0.</t> | bits of the r.cep counter on packets with SYN=0.</t> | |||
<t>Normally, a TCP Client acknowledges a SYN/ACK with an ACK that | <t>Normally, a TCP Client acknowledges a SYN/ACK with an ACK that | |||
satisfies the above conditions anyway (SYN=0, no data, no SACK | satisfies the above conditions anyway (SYN=0, no data, no SACK | |||
blocks). If an AccECN TCP Client intends to acknowledge the | blocks). If an AccECN TCP Client intends to acknowledge the | |||
SYN/ACK with a packet that does not satisfy these conditions | SYN/ACK with a packet that does not satisfy these conditions | |||
(e.g., it has data to include on the ACK), it SHOULD first | (e.g., it has data to include on the ACK), it <bcp14>SHOULD</bcp14> | |||
send a pure ACK that does satisfy these conditions (see <xref | first | |||
target="accecn_Interaction_Other"/>), so that it can feed back | send a pure ACK that does satisfy these conditions (see <xref target | |||
="accecn_Interaction_Other"/>), so that it can feed back | ||||
which of the four values of the IP-ECN field arrived on the | which of the four values of the IP-ECN field arrived on the | |||
SYN/ACK. A valid exception to this "SHOULD" would be where the | SYN/ACK. A valid exception to this "<bcp14>SHOULD</bcp14>" would be where the | |||
implementation will only be used in an environment where mangling | implementation will only be used in an environment where mangling | |||
of the ECN field is unlikely.</t> | of the ECN field is unlikely.</t> | |||
<t>The TCP Client <bcp14>MUST</bcp14> also use the handshake encodin | ||||
<t>The TCP Client MUST also use the handshake encoding for the | g for the | |||
pure ACK of any retransmitted SYN/ACK that confirms that the TCP | pure ACK of any retransmitted SYN/ACK that confirms that the TCP | |||
Server supports AccECN. The procedure for the TCP Server to follow | Server supports AccECN. If the final ACK of the handshake does not a | |||
if the final ACK of the handshake does not arrive before its | rrive before its | |||
retransmission timer expires is given in <xref | retransmission timer expires, the TCP Server is to follow the procedure | |||
target="accecn_sec_SYN-ACK_rexmt"/>.</t> | given in <xref target="accecn_sec_SYN-ACK_rexmt"/>.</t> | |||
<table anchor="accecn_Tab_SYN-ACK_fb2"> | ||||
<texttable anchor="accecn_Tab_SYN-ACK_fb2" | <name>The Encoding of the ACE Field in the ACK of the SYN-ACK to R | |||
title="The encoding of the ACE field in the ACK of the SY | eflect the SYN-ACK's IP-ECN Field</name> | |||
N-ACK to reflect the SYN-ACK's IP-ECN field"> | <thead> | |||
<ttcol>IP-ECN codepoint on SYN/ACK</ttcol> | <tr> | |||
<th>IP-ECN codepoint on SYN/ACK</th> | ||||
<ttcol>ACE on pure ACK of SYN/ACK</ttcol> | <th>ACE on pure ACK of SYN/ACK</th> | |||
<th>r.cep of TCP Client in AccECN mode</th> | ||||
<ttcol>r.cep of TCP Client in AccECN mode</ttcol> | </tr> | |||
</thead> | ||||
<c>Not-ECT</c> | <tbody> | |||
<tr> | ||||
<c>0b010</c> | <td>Not-ECT</td> | |||
<td>0b010</td> | ||||
<c>5</c> | <td>5</td> | |||
</tr> | ||||
<c>ECT(1)</c> | <tr> | |||
<td>ECT(1)</td> | ||||
<c>0b011</c> | <td>0b011</td> | |||
<td>5</td> | ||||
<c>5</c> | </tr> | |||
<tr> | ||||
<c>ECT(0)</c> | <td>ECT(0)</td> | |||
<td>0b100</td> | ||||
<c>0b100</c> | <td>5</td> | |||
</tr> | ||||
<c>5</c> | <tr> | |||
<td>CE</td> | ||||
<c>CE</c> | <td>0b110</td> | |||
<td>6</td> | ||||
</tr> | ||||
</tbody> | ||||
</table> | ||||
<!-- [rfced] For readability, may we break this text into two sentences? | ||||
<c>0b110</c> | Original: | |||
When an AccECN Server in SYN-RCVD state receives a pure ACK with | ||||
SYN=0 and no SACK blocks, instead of treating the ACE field as a | ||||
counter, it MUST infer the meaning of each possible value of the ACE | ||||
field from Table 4, which also shows the value that an AccECN Server | ||||
MUST set s.cep to as a result. | ||||
<c>6</c> | Perhaps: | |||
</texttable> | When an AccECN Server in SYN-RCVD state receives a pure ACK with | |||
SYN=0 and no SACK blocks, it MUST infer the meaning of each possible | ||||
value of the ACE field from Table 4 instead of treating the ACE field | ||||
as a counter. Table 4 also shows the value to which an AccECN Server | ||||
MUST set s.cep as a result. | ||||
--> | ||||
<t>When an AccECN Server in SYN-RCVD state receives a pure ACK | <t>When an AccECN Server in SYN-RCVD state receives a pure ACK | |||
with SYN=0 and no SACK blocks, instead of treating the ACE field | with SYN=0 and no SACK blocks, instead of treating the ACE field | |||
as a counter, it MUST infer the meaning of each possible value of | as a counter, it <bcp14>MUST</bcp14> infer the meaning of each possible value of | |||
the ACE field from <xref target="accecn_Tab_SYN-ACK_fb"/>, which | the ACE field from <xref target="accecn_Tab_SYN-ACK_fb"/>, which | |||
also shows the value that an AccECN Server MUST set s.cep to as a | also shows the value that an AccECN Server <bcp14>MUST</bcp14> set s.cep to as a | |||
result.</t> | result.</t> | |||
<!-- [rfced] We are unclear what "it" refers to in the following. Perhaps "it" | ||||
can be deleted? | ||||
Original: | ||||
Given this encoding of the ACE field on the ACK of a SYN/ACK is | ||||
exceptional, an AccECN Server using large receive offload (LRO) might | ||||
prefer to disable LRO until such an ACK has transitioned it out of | ||||
SYN-RCVD state. | ||||
--> | ||||
<t>Given this encoding of the ACE field on the ACK of a SYN/ACK is | <t>Given this encoding of the ACE field on the ACK of a SYN/ACK is | |||
exceptional, an AccECN Server using large receive offload (LRO) | exceptional, an AccECN Server using large receive offload (LRO) | |||
might prefer to disable LRO until such an ACK has transitioned it | might prefer to disable LRO until such an ACK has transitioned it | |||
out of SYN-RCVD state.</t> | out of SYN-RCVD state.</t> | |||
<table anchor="accecn_Tab_SYN-ACK_fb"> | ||||
<name>Meaning of the ACE Field on the ACK of the SYN/ACK</name> | ||||
<thead> | ||||
<tr> | ||||
<th>ACE on ACK of SYN/ACK</th> | ||||
<th>IP-ECN codepoint on SYN/ACK inferred by Server</th> | ||||
<th>s.cep of TCP Server in AccECN mode</th> | ||||
</tr> | ||||
</thead> | ||||
<tbody> | ||||
<tr> | ||||
<td>0b000</td> | ||||
<td>{Notes 1, 3}</td> | ||||
<td>Disable s.cep</td> | ||||
</tr> | ||||
<tr> | ||||
<td>0b001</td> | ||||
<td>{Notes 2, 3}</td> | ||||
<td>5</td> | ||||
</tr> | ||||
<tr> | ||||
<td>0b010</td> | ||||
<td>Not-ECT</td> | ||||
<td>5</td> | ||||
</tr> | ||||
<tr> | ||||
<td>0b011</td> | ||||
<td>ECT(1)</td> | ||||
<td>5</td> | ||||
</tr> | ||||
<tr> | ||||
<td>0b100</td> | ||||
<td>ECT(0)</td> | ||||
<td>5</td> | ||||
</tr> | ||||
<tr> | ||||
<td>0b101</td> | ||||
<td>Currently Unused {Note 2}</td> | ||||
<td>5</td> | ||||
</tr> | ||||
<tr> | ||||
<td>0b110</td> | ||||
<td>CE</td> | ||||
<td>6</td> | ||||
</tr> | ||||
<tr> | ||||
<td>0b111</td> | ||||
<td>Currently Unused {Note 2}</td> | ||||
<td>5</td> | ||||
</tr> | ||||
</tbody> | ||||
</table> | ||||
<!-- [rfced] We converted the notes following Table 4 into a list for clarity. | ||||
Please let us know if you have any concerns. | ||||
--> | ||||
<texttable anchor="accecn_Tab_SYN-ACK_fb" | <dl indent="9"><dt>Note 1:</dt><dd><t>If the Server is in AccECN mode and in SYN | |||
title="Meaning of the ACE field on the ACK of the SYN/ACK | -RCVD | |||
"> | ||||
<ttcol>ACE on ACK of SYN/ACK</ttcol> | ||||
<ttcol>IP-ECN codepoint on SYN/ACK inferred by Server</ttcol> | ||||
<ttcol>s.cep of TCP Server in AccECN mode</ttcol> | ||||
<c>0b000</c> | ||||
<c>{Notes 1, 3}</c> | ||||
<c>Disable s.cep</c> | ||||
<c>0b001</c> | ||||
<c>{Notes 2, 3}</c> | ||||
<c>5</c> | ||||
<c>0b010</c> | ||||
<c>Not-ECT</c> | ||||
<c>5</c> | ||||
<c>0b011</c> | ||||
<c>ECT(1)</c> | ||||
<c>5</c> | ||||
<c>0b100</c> | ||||
<c>ECT(0)</c> | ||||
<c>5</c> | ||||
<c>0b101</c> | ||||
<c>Currently Unused {Note 2}</c> | ||||
<c>5</c> | ||||
<c>0b110</c> | ||||
<c>CE</c> | ||||
<c>6</c> | ||||
<c>0b111</c> | ||||
<c>Currently Unused {Note 2}</c> | ||||
<c>5</c> | ||||
</texttable> | ||||
<t>{Note 1}: If the Server is in AccECN mode and in SYN-RCVD | ||||
state, and if it receives a value of zero on a pure ACK with SYN=0 | state, and if it receives a value of zero on a pure ACK with SYN=0 | |||
and no SACK blocks, for the rest of the connection the Server MUST | and no SACK blocks, for the rest of the connection the Server <bcp14 | |||
NOT set ECT on outgoing packets and MUST NOT respond to AccECN | >MUST | |||
feedback. Nonetheless, as a Data Receiver it MUST NOT disable | NOT</bcp14> set ECT on outgoing packets and <bcp14>MUST NOT</bcp14> | |||
respond to AccECN | ||||
feedback. Nonetheless, as a Data Receiver, it <bcp14>MUST NOT</bcp14 | ||||
> disable | ||||
AccECN feedback.</t> | AccECN feedback.</t> | |||
<t>Any of the circumstances below could cause a value of zero but, | <t>Any of the circumstances below could cause a value of zero but, | |||
whatever the cause, the actions above would be the appropriate | whatever the cause, the actions above would be the appropriate | |||
response:<list style="symbols"> | response:</t> | |||
<ul spacing="normal"> | ||||
<li> | ||||
<t>The TCP Client has somehow entered No ECN feedback mode | <t>The TCP Client has somehow entered No ECN feedback mode | |||
(most likely if the Server received a SYN or sent a SYN/ACK | (most likely if the Server received a SYN or sent a SYN/ACK | |||
with (AE,CWR,ECE) = (0,0,0) after entering AccECN mode, but | with (AE,CWR,ECE) = (0,0,0) after entering AccECN mode, but | |||
possible even if it didn't);</t> | possible even if it didn't);</t> | |||
</li> | ||||
<li> | ||||
<t>The TCP Client genuinely might be in AccECN mode, but its | <t>The TCP Client genuinely might be in AccECN mode, but its | |||
count of received CE marks might have caused the ACE field to | count of received CE marks might have caused the ACE field to | |||
wrap to zero. This is highly unlikely, but not impossible | wrap to zero. This is highly unlikely, but not impossible | |||
because the Server might have already sent multiple packets | because the Server might have already sent multiple packets | |||
while still in SYN-RCVD state, e.g., using TFO (see <xref | while still in SYN-RCVD state, e.g., using TFO (see <xref target | |||
target="accecn_Interaction_Other"/>) and some might have been | ="accecn_Interaction_Other"/>), and some might have been | |||
CE-marked. Then ACE on the first ACK seen by the Server might | CE-marked. Then ACE on the first ACK seen by the Server might | |||
be zero, due to previous ACKs experiencing an unfortunate | be zero, due to previous ACKs experiencing an unfortunate | |||
pattern of loss or delay.</t> | pattern of loss or delay.</t> | |||
</li> | ||||
<t>Some form of non-compliance at the TCP Client or on the | <li> | |||
<t>There is some form of non-compliance at the TCP Client or on | ||||
the | ||||
path (see <xref target="accecn_sec_ACE_init_invalid"/>).</t> | path (see <xref target="accecn_sec_ACE_init_invalid"/>).</t> | |||
</list></t> | </li> | |||
</ul></dd> | ||||
<t>{Note 2}: If the Server is in AccECN mode, these values are | <dt>Note 2:</dt><dd> If the Server is in AccECN mode, these values a | |||
re | ||||
Currently Unused but the AccECN Server's behaviour is still | Currently Unused but the AccECN Server's behaviour is still | |||
defined for forward compatibility. Then the designer of a future | defined for forward compatibility. Then the designer of a future | |||
protocol can know for certain what AccECN Servers will do with | protocol can know for certain what AccECN Servers will do with | |||
these codepoints.</t> | these codepoints.</dd> | |||
<dt>Note 3:</dt><dd> In the case where a Server that implements AccE | ||||
<t>{Note 3}: In the case where a Server that implements AccECN is | CN is | |||
also using a stateless handshake (termed a SYN cookie) it will not | also using a stateless handshake (termed a SYN cookie), it will not | |||
remember whether it entered AccECN mode. The values 0b000 or 0b001 | remember whether it entered AccECN mode. The values 0b000 or 0b001 | |||
will remind it that it did not enter AccECN mode, because AccECN | will remind it that it did not enter AccECN mode, because AccECN | |||
does not use them (see <xref | does not use them (see <xref target="accecn_Interaction_SYN_Cookies" | |||
target="accecn_Interaction_SYN_Cookies"/> for details). If a | /> for details). If a | |||
Server that uses a stateless handshake and implements AccECN | Server that uses a stateless handshake and implements AccECN | |||
receives either of these two values in the ACK, its action is | receives either of these two values in the ACK, its action is | |||
implementation-dependent and outside the scope of this document. It | implementation-dependent and outside the scope of this document. It | |||
will certainly not take the action in the third column because, | will certainly not take the action in the third column because, | |||
after it receives either of these values, it is not in AccECN | after it receives either of these values, it is not in AccECN | |||
mode. In example, it will not disable ECN (at least not just because | mode. For example, it will not disable ECN (at least not just becaus | |||
ACE | e ACE | |||
is 0b000) and it will not set s.cep.</t> | is 0b000) and it will not set s.cep.</dd></dl> | |||
</section> | </section> | |||
<section anchor="accecn_sec_ACE_feedback"> | ||||
<section anchor="accecn_sec_ACE_feedback" | <name>Encoding and Decoding Feedback in the ACE Field</name> | |||
title="Encoding and Decoding Feedback in the ACE Field"> | ||||
<t>Whenever the Data Receiver sends an ACK with SYN=0 (with or | <t>Whenever the Data Receiver sends an ACK with SYN=0 (with or | |||
without data), unless the handshake encoding in <xref | without data), unless the handshake encoding in <xref target="accecn | |||
target="accecn_ACE_3rdACK"/> applies, the Data Receiver MUST | _ACE_3rdACK"/> applies, the Data Receiver <bcp14>MUST</bcp14> | |||
encode the least significant 3 bits of its r.cep counter into the | encode the least significant 3 bits of its r.cep counter into the | |||
ACE field (see <xref target="accecn_Algo_ACE_Wrap"/>).</t> | ACE field (see <xref target="accecn_Algo_ACE_Wrap"/>).</t> | |||
<t>Whenever the Data Sender receives an ACK with SYN=0 (with or | <t>Whenever the Data Sender receives an ACK with SYN=0 (with or | |||
without data), it first checks whether it has already been | without data), it first checks whether it has already been | |||
superseded (defined in <xref target="accecn_Algo_Option_Coding"/>) | superseded (defined in <xref target="accecn_Algo_Option_Coding"/>) | |||
by another ACK in which case it ignores the ECN feedback. If the | by another ACK in which case it ignores the ECN feedback. If the | |||
ACK has not been superseded, and if the special handshake encoding | ACK has not been superseded, and if the special handshake encoding | |||
in <xref target="accecn_ACE_3rdACK"/> does not apply, the Data | in <xref target="accecn_ACE_3rdACK"/> does not apply, the Data | |||
Sender decodes the ACE field as follows (see <xref | Sender decodes the ACE field as follows (see <xref target="accecn_Al | |||
target="accecn_Algo_ACE_Wrap"/> for examples).<list | go_ACE_Wrap"/> for examples).</t> | |||
style="symbols"> | <ul spacing="normal"> | |||
<li> | ||||
<t>It takes the least significant 3 bits of its local s.cep | <t>It takes the least significant 3 bits of its local s.cep | |||
counter and subtracts them from the incoming ACE counter to | counter and subtracts them from the incoming ACE counter to | |||
work out the minimum positive increment it could apply to | work out the minimum positive increment it could apply to | |||
s.cep (assuming the ACE field only wrapped at most once).</t> | s.cep (assuming the ACE field only wrapped once at most).</t> | |||
</li> | ||||
<t>It then follows the safety procedures in <xref | <li> | |||
target="accecn_ACE_Safety_S"/> to calculate or estimate how | <t>It then follows the safety procedures in <xref target="accecn | |||
_ACE_Safety_S"/> to calculate or estimate how | ||||
many packets the ACK could have acknowledged under the | many packets the ACK could have acknowledged under the | |||
prevailing conditions to determine whether the ACE field might | prevailing conditions to determine whether the ACE field might | |||
have wrapped more than once.</t> | have wrapped more than once.</t> | |||
</list></t> | </li> | |||
</ul> | ||||
<t>The encode/decode procedures during the three-way handshake are | <t>The encode/decode procedures during the three-way handshake are | |||
exceptions to the general rules given so far, so they are spelled | exceptions to the general rules given so far, so they are spelled | |||
out step by step below for clarity:<list style="symbols"> | out step by step below for clarity:</t> | |||
<ul spacing="normal"> | ||||
<li> | ||||
<t>If a TCP Server in AccECN mode receives a CE mark in the | <t>If a TCP Server in AccECN mode receives a CE mark in the | |||
IP-ECN field of a SYN (SYN=1, ACK=0), it MUST NOT increment | IP-ECN field of a SYN (SYN=1, ACK=0), it <bcp14>MUST NOT</bcp14> | |||
r.cep (it remains at its initial value of 5). <vspace | increment | |||
blankLines="1"/>Reason: It would be redundant for the Server | r.cep (it remains at its initial value of 5). </t> | |||
<t>Reason: It would be redundant for the Server | ||||
to include CE-marked SYNs in its r.cep counter, because it | to include CE-marked SYNs in its r.cep counter, because it | |||
already reliably delivers feedback of any CE marking using the | already reliably delivers feedback of any CE marking using the | |||
encoding in the top block of <xref | encoding in the top block of <xref target="accecn_Tab_Negotiatio | |||
target="accecn_Tab_Negotiation"/> in the SYN/ACK. This also | n"/> in the SYN/ACK. This also | |||
ensures that, when the Server starts using the ACE field, it | ensures that, when the Server starts using the ACE field, it | |||
has not unnecessarily consumed more than one initial value, | has not unnecessarily consumed more than one initial value, | |||
given they can be used to negotiate variants of the AccECN | given they can be used to negotiate variants of the AccECN | |||
protocol (see <xref target="accecn_space_evolution"/>).</t> | protocol (see <xref target="accecn_space_evolution"/>).</t> | |||
</li> | ||||
<li> | ||||
<t>If a TCP Client in AccECN mode receives CE feedback in the | <t>If a TCP Client in AccECN mode receives CE feedback in the | |||
TCP flags of a SYN/ACK, it MUST NOT increment s.cep (it | TCP flags of a SYN/ACK, it <bcp14>MUST NOT</bcp14> increment s.c | |||
remains at its initial value of 5), so that it stays in step | ep (it | |||
remains at its initial value of 5) so that it stays in step | ||||
with r.cep on the Server. Nonetheless, the TCP Client still | with r.cep on the Server. Nonetheless, the TCP Client still | |||
triggers the congestion control actions necessary to respond | triggers the congestion control actions necessary to respond | |||
to the CE feedback.</t> | to the CE feedback.</t> | |||
</li> | ||||
<li> | ||||
<t>If a TCP Client in AccECN mode receives a CE mark in the | <t>If a TCP Client in AccECN mode receives a CE mark in the | |||
IP-ECN field of a SYN/ACK, it MUST increment r.cep, but no | IP-ECN field of a SYN/ACK, it <bcp14>MUST</bcp14> increment r.cep, but no | |||
more than once no matter how many CE-marked SYN/ACKs it | more than once no matter how many CE-marked SYN/ACKs it | |||
receives (i.e., incremented from 5 to 6, but no further). | receives (i.e., incremented from 5 to 6, but no further). | |||
<vspace blankLines="1"/>Reason: Incrementing r.cep ensures the | </t> | |||
<t>Reason: Incrementing r.cep ensures the | ||||
Client will eventually deliver any CE marking to the Server | Client will eventually deliver any CE marking to the Server | |||
reliably when it starts using the ACE field. Even though the | reliably when it starts using the ACE field. Even though the | |||
Client also feeds back any CE marking on the ACK of the | Client also feeds back any CE marking on the ACK of the | |||
SYN/ACK using the encoding in <xref | SYN/ACK using the encoding in <xref target="accecn_Tab_SYN-ACK_f | |||
target="accecn_Tab_SYN-ACK_fb2"/>, this ACK is not delivered | b2"/>, this ACK is not delivered | |||
reliably, so it can be considered as a timely notification | reliably, so it can be considered as a timely notification | |||
that is redundant but unreliable. The Client does not | that is redundant but unreliable. The Client does not | |||
increment r.cep more than once, because the Server can only | increment r.cep more than once, because the Server can only | |||
increment s.cep once (see next bullet). Also, this limits the | increment s.cep once (see next bullet). Also, this limits the | |||
unnecessarily consumed initial values of the ACE field to | unnecessarily consumed initial values of the ACE field to | |||
two.</t> | two.</t> | |||
</li> | ||||
<li> | ||||
<t>If a TCP Server in AccECN mode and in SYN-RCVD state | <t>If a TCP Server in AccECN mode and in SYN-RCVD state | |||
receives CE feedback in the TCP flags of a pure ACK with no | receives CE feedback in the TCP flags of a pure ACK with no | |||
SACK blocks, it MUST increment s.cep (from 5 to 6). The TCP | SACK blocks, it <bcp14>MUST</bcp14> increment s.cep (from 5 to 6). The TCP | |||
Server then triggers the congestion control actions necessary | Server then triggers the congestion control actions necessary | |||
to respond to the CE feedback.<vspace | to respond to the CE feedback.</t> | |||
blankLines="1"/>Reasoning: The TCP Server can only increment | <t>Reasoning: The TCP Server can only increment | |||
s.cep once, because the first ACK it receives will cause it to | s.cep once, because the first ACK it receives will cause it to | |||
transition out of SYN-RCVD state. The Server's congestion | transition out of SYN-RCVD state. The Server's congestion | |||
response would be no different even if it could receive | response would be no different, even if it could receive | |||
feedback of more than one CE-marked SYN/ACK.<vspace | feedback of more than one CE-marked SYN/ACK.</t> | |||
blankLines="1"/>Once the TCP Server transitions to ESTABLISHED | <t>Once the TCP Server transitions to ESTABLISHED | |||
state, it might later receive other pure ACK(s) with the | state, it might later receive other pure ACK(s) with the | |||
handshake encoding in the ACE field. A Server MAY implement a | handshake encoding in the ACE field. A Server <bcp14>MAY</bcp14> implement a | |||
test for such a case, but it is not required. Therefore, once | test for such a case, but it is not required. Therefore, once | |||
in the ESTABLISHED state, it will be sufficient for the Server | in the ESTABLISHED state, it will be sufficient for the Server | |||
to consider the ACE field to be encoded as the normal ACE | to consider the ACE field to be encoded as the normal ACE | |||
counter on all packets with SYN=0.<vspace | counter on all packets with SYN=0.</t> | |||
blankLines="1"/>Reasoning: Such ACKs will be quite unusual, | <t>Reasoning: Such ACKs will be quite unusual, | |||
e.g., a SYN/ACK (or ACK of the SYN/ACK) that is delayed | e.g., a SYN/ACK (or ACK of the SYN/ACK) that is delayed | |||
for longer than the Server's retransmission timeout; or packet | for longer than the Server's retransmission timeout; or packet | |||
duplication by the network. And the impact of any error in the | duplication by the network. And the impact of any error in the | |||
feedback on such ACKs will only be temporary.</t> | feedback on such ACKs will only be temporary.</t> | |||
</list></t> | </li> | |||
</ul> | ||||
</section> | </section> | |||
<section anchor="accecn_sec_ecn-mangling"> | ||||
<section anchor="accecn_sec_ecn-mangling" | <name>Testing for Mangling of the IP/ECN Field</name> | |||
title="Testing for Mangling of the IP/ECN Field"> | <ul spacing="normal"> | |||
<t><list style="symbols"> | <li> | |||
<t>TCP Client side:<vspace | <t>TCP Client side:</t> | |||
blankLines="1"/>The value of the TCP-ECN flags on the SYN/ACK indi | <t>The value of the TCP-ECN flags on the SYN/ACK indicates the | |||
cates the | ||||
value of the IP-ECN field when the SYN arrived at the Server. The | value of the IP-ECN field when the SYN arrived at the Server. The | |||
TCP Client can compare this with how it originally set the IP-ECN | TCP Client can compare this with how it originally set the IP-ECN | |||
field on the SYN. If this comparison implies an invalid transition | field on the SYN. If this comparison implies an invalid transition | |||
(defined below) of the IP-ECN field, for the remainder of the | (defined below) of the IP-ECN field, for the remainder of the | |||
half-connection the Client is advised to send non-ECN-capable | half-connection the Client is advised to send non-ECN-capable | |||
packets, but it still ought to respond to any feedback of CE | packets, but it still ought to respond to any feedback of CE | |||
markings (explained below). However, the TCP Client MUST remain in | markings (explained below). However, the TCP Client <bcp14>MUST</b | |||
the AccECN feedback mode and it MUST continue to feed back any ECN | cp14> remain in | |||
the AccECN feedback mode and it <bcp14>MUST</bcp14> continue to fe | ||||
ed back any ECN | ||||
markings on arriving packets (in its role as Data Receiver). <!--There is no need to say the following for forward compatibility: | markings on arriving packets (in its role as Data Receiver). <!--There is no need to say the following for forward compatibility: | |||
"If the server deliberately sends false feedback in the ACE field that implies an unsafe transition, | "If the server deliberately sends false feedback in the ACE field that implies an unsafe transition, | |||
it MUST continue the connection | it MUST continue the connection | |||
even if the client does not disable sending ECN-capable packets"--></t> | even if the client does not disable sending ECN-capable packets"--> | |||
</t> | ||||
<t>TCP Server side:<vspace | </li> | |||
blankLines="1"/>The value of the ACE field on the last ACK of the | <li> | |||
three-way handshake | <t>TCP Server side:</t> | |||
<t>The value of the ACE field on the last ACK of the three-way h | ||||
andshake | ||||
indicates the value of the IP-ECN field when the SYN/ACK arrived | indicates the value of the IP-ECN field when the SYN/ACK arrived | |||
at the TCP Client. The Server can compare this with how it | at the TCP Client. The Server can compare this with how it | |||
originally set the IP-ECN field on the SYN/ACK. If this comparison | originally set the IP-ECN field on the SYN/ACK. If this comparison | |||
implies an invalid transition of the IP-ECN field, for the | implies an invalid transition of the IP-ECN field, for the | |||
remainder of the half-connection the Server is advised to send | remainder of the half-connection the Server is advised to send | |||
non-ECN-capable packets, but it still ought to respond to any | non-ECN-capable packets, but it still ought to respond to any | |||
feedback of CE markings (explained below). However, the Server | feedback of CE markings (explained below). However, the Server | |||
MUST remain in the AccECN feedback mode and it MUST continue to | <bcp14>MUST</bcp14> remain in the AccECN feedback mode and it <bcp14>MUST</bcp14> continue to | |||
feed back any ECN markings on arriving packets (in its role as | feed back any ECN markings on arriving packets (in its role as | |||
Data Receiver).<!--There is no need to say the following for forward compatibility: | Data Receiver).<!--There is no need to say the following for forward compatibility: | |||
"If the client deliberately sends false feedback in the ACE field that implies an unsafe transition, | "If the client deliberately sends false feedback in the ACE field that implies an unsafe transition, | |||
it MUST continue the connection | it MUST continue the connection | |||
even if the server does not disable sending ECN-capable packets"--></t> | even if the server does not disable sending ECN-capable packets"--> | |||
</list></t> | </t> | |||
</li> | ||||
</ul> | ||||
<t>If a Data Sender in AccECN mode starts sending non-ECN-capable | <t>If a Data Sender in AccECN mode starts sending non-ECN-capable | |||
packets because it has detected mangling, it is still advised to | packets because it has detected mangling, it is still advised to | |||
respond to CE feedback. Reason: any CE-marking arriving at the | respond to CE feedback. Reason: Any CE marking arriving at the | |||
Data Receiver could be due to something early in the path mangling | Data Receiver could be due to something early in the path mangling | |||
the non-ECN-capable IP-ECN field into an ECN-capable codepoint and | the non-ECN-capable IP-ECN field into an ECN-capable codepoint and | |||
then, later in the path, a network bottleneck might be applying | then, later in the path, a network bottleneck might be applying | |||
CE-markings to indicate genuine congestion. This argument applies | CE markings to indicate genuine congestion. This argument applies | |||
whether the handshake packet originally sent by the TCP Client or | whether the handshake packet originally sent by the TCP Client or | |||
Server was non-ECN-capable or ECN-capable because, in either case, | Server was non-ECN-capable or ECN-capable because, in either case, | |||
an unsafe transition could imply that non-ECN-capable packets | an unsafe transition could imply that non-ECN-capable packets | |||
later in the connection might get mangled.</t> | later in the connection might get mangled.</t> | |||
<t>Once a Data Sender has entered AccECN mode, it is advised to | <t>Once a Data Sender has entered AccECN mode, it is advised to | |||
check whether it is receiving continuous feedback of CE. Specifying | check whether it is receiving continuous feedback of CE. Specifying | |||
exactly how to do this is beyond the scope of the present | exactly how to do this is beyond the scope of the present | |||
specification, but the sender might check whether the feedback for | specification, but the sender might check whether the feedback for | |||
every packet it sends for the first three or four rounds indicates | every packet it sends for the first three or four rounds indicates | |||
CE-marking. If continuous CE-marking is detected, for the | CE marking. If continuous CE marking is detected, for the | |||
remainder of the half-connection, the Data Sender ought to send | remainder of the half-connection, the Data Sender ought to send | |||
non-ECN-capable packets and it is advised not to respond to any | non-ECN-capable packets, and it is advised not to respond to any | |||
feedback of CE markings. The Data Sender might occasionally test | feedback of CE markings. The Data Sender might occasionally test | |||
whether it can resume sending ECN-capable packets.</t> | whether it can resume sending ECN-capable packets.</t> | |||
<t>The above advice on switching to sending non-ECN-capable | <t>The above advice on switching to sending non-ECN-capable | |||
packets but still responding to CE-markings unless they become | packets but still responding to CE markings unless they become | |||
continuous is not stated normatively (in capitals), because the | continuous is not stated normatively (in capitals), because the | |||
best strategy might depend on experience of the most likely types | best strategy might depend on experience of the most likely types | |||
of mangling, which can only be known at the time of | of mangling, which can only be known at the time of | |||
deployment. The same is true for other forms of mangling (or resumption | deployment. The same is true for other forms of mangling (or resumption | |||
of expected marking) during later stages of a connection.</t> | of expected marking) during later stages of a connection.</t> | |||
<t>As always, once a host has entered AccECN mode, it follows the | <t>As always, once a host has entered AccECN mode, it follows the | |||
general mandatory requirements (<xref | general mandatory requirements (<xref target="accecn_implications_ac | |||
target="accecn_implications_accecn_mode"/>) to remain in the same | cecn_mode"/>) to remain in the same | |||
feedback mode and to continue feeding back any ECN markings on | feedback mode and to continue feeding back any ECN markings on | |||
arriving packets using AccECN feedback. This follows the general | arriving packets using AccECN feedback. This follows the general | |||
approach where an AccECN Data Receiver mechanistically reflects | approach where an AccECN Data Receiver mechanistically reflects | |||
whatever it receives (<xref target="accecn_demb_reflector"/>).</t> | whatever it receives (<xref target="accecn_demb_reflector"/>).</t> | |||
<t>The ACK of the SYN/ACK is not reliably delivered (nonetheless, | <t>The ACK of the SYN/ACK is not reliably delivered (nonetheless, | |||
the count of CE marks is still eventually delivered reliably). If | the count of CE marks is still eventually delivered reliably). If | |||
this ACK does not arrive, the Server is advised to continue to | this ACK does not arrive, the Server is advised to continue to | |||
send ECN-capable packets without having tested for mangling of the | send ECN-capable packets without having tested for mangling of the | |||
IP-ECN field on the SYN/ACK.</t> | IP-ECN field on the SYN/ACK.</t> | |||
<t>All the fall-back behaviours in this section are necessary in | <t>All the fall-back behaviours in this section are necessary in | |||
case mangling of the IP-ECN field is asymmetric, which is | case mangling of the IP-ECN field is asymmetric, which is | |||
currently common over some mobile networks <xref | currently common over some mobile networks <xref target="Mandalari18 | |||
target="Mandalari18"/>. Then one end might see no unsafe | "/>. In this case, one end might see no unsafe | |||
transition and continue sending ECN-capable packets, while the | transition and continue sending ECN-capable packets, while the | |||
other end sees an unsafe transition and stops sending ECN-capable | other end sees an unsafe transition and stops sending ECN-capable | |||
packets.</t> | packets.</t> | |||
<t>Invalid transitions of the IP-ECN field are defined in Section | ||||
<t>Invalid transitions of the IP-ECN field are defined in section | <xref target="RFC3168" sectionFormat="bare" section="18"/> of the | |||
18 of the Classic ECN specification <xref target="RFC3168"/> and rep | Classic ECN specification <xref target="RFC3168"/> and repeated | |||
eated here for | here for convenience:</t> | |||
convenience:<list style="symbols"> | <ul spacing="normal"> | |||
<t>the not-ECT codepoint changes;</t> | <li> | |||
<t>the Not-ECT codepoint changes;</t> | ||||
<t>either ECT codepoint transitions to not-ECT;</t> | </li> | |||
<li> | ||||
<t>either ECT codepoint transitions to Not-ECT;</t> | ||||
</li> | ||||
<li> | ||||
<t>the CE codepoint changes.</t> | <t>the CE codepoint changes.</t> | |||
</list></t> | </li> | |||
</ul> | ||||
<t>RFC 3168 says that a router that changes ECT to not-ECT is | <t>RFC 3168 says that a router that changes ECT to Not-ECT is | |||
invalid but safe. However, from a host's viewpoint, this | invalid but safe. However, from a host's viewpoint, this | |||
transition is unsafe because it could be the result of two | transition is unsafe because it could be the result of two | |||
transitions at different routers on the path: ECT to CE (safe) | transitions at different routers on the path: ECT to CE (safe) | |||
then CE to not-ECT (unsafe). This scenario could well happen where | then CE to Not-ECT (unsafe). This scenario could well happen where | |||
an ECN-enabled home router congests its upstream mobile broadband | an ECN-enabled home router congests its upstream mobile broadband | |||
bottleneck link, then the ingress to the mobile network clears the | bottleneck link, then the ingress to the mobile network clears the | |||
ECN field <xref target="Mandalari18"/>.</t> | ECN field <xref target="Mandalari18"/>.</t> | |||
</section> | </section> | |||
<section anchor="accecn_sec_ACE_init_invalid"> | ||||
<section anchor="accecn_sec_ACE_init_invalid" | <name>Testing for Zeroing of the ACE Field</name> | |||
title="Testing for Zeroing of the ACE Field"> | ||||
<t><xref target="accecn_ACE"/> required the Data Receiver to | <t><xref target="accecn_ACE"/> required the Data Receiver to | |||
initialize the r.cep counter to a non-zero value. Therefore, in | initialize the r.cep counter to a non-zero value. Therefore, in | |||
either direction the initial value of the ACE counter ought to be | either direction the initial value of the ACE counter ought to be | |||
non-zero.</t> | non-zero.</t> | |||
<t>This section does not concern the case where the ACE field is | <t>This section does not concern the case where the ACE field is | |||
zero when the handshake encoding has been used on the ACK of the | zero when the handshake encoding has been used on the ACK of the | |||
SYN/ACK under the carefully worded conditions in <xref | SYN/ACK under the carefully worded conditions in <xref target="accec | |||
target="accecn_ACE_3rdACK"/>.</t> | n_ACE_3rdACK"/>.</t> | |||
<t>If AccECN has been successfully negotiated, the Data Sender <bcp1 | ||||
<t>If AccECN has been successfully negotiated, the Data Sender MAY | 4>MAY</bcp14> | |||
check the value of the ACE counter in the first feedback packet | check the value of the ACE counter in the first feedback packet | |||
(with or without data) that arrives after the three-way handshake. | (with or without data) that arrives after the three-way handshake. | |||
If the value of this ACE field is found to be zero (0b000), for the | If the value of this ACE field is found to be zero (0b000), for the | |||
remainder of the half-connection the Data Sender ought to send | remainder of the half-connection the Data Sender ought to send | |||
non-ECN-capable packets and it is advised not to respond to any | non-ECN-capable packets and it is advised not to respond to any | |||
feedback of CE markings.</t> | feedback of CE markings.</t> | |||
<t>Reason: The symptoms imply any or all of the following: <list | <t>Reason: The symptoms imply any or all of the following: </t> | |||
style="symbols"> | <ul spacing="normal"> | |||
<li> | ||||
<t>the remote peer has somehow entered Not ECN | <t>the remote peer has somehow entered Not ECN | |||
feedback mode;</t> | feedback mode;</t> | |||
</li> | ||||
<li> | ||||
<t>a broken remote TCP implementation;</t> | <t>a broken remote TCP implementation;</t> | |||
</li> | ||||
<li> | ||||
<t>potential mangling of the ECN fields in the TCP headers (although | <t>potential mangling of the ECN fields in the TCP headers (although | |||
unlikely given they clearly survived during the handshake).</t> | unlikely given they clearly survived during the handshake).</t> | |||
</list></t> | </li> | |||
</ul> | ||||
<!-- [rfced] We are having trouble parsing "depend on experience of the most lik | ||||
ely scenarios". Does it depend on how good the experience is, the outcome, etc? | ||||
Please consider whether this text can be clarified. | ||||
Original: | ||||
This advice is not stated normatively (in capitals), because the best | ||||
strategy might depend on experience of the most likely scenarios, | ||||
which can only be known at the time of deployment. | ||||
--> | ||||
<t>This advice is not stated normatively (in capitals), because the best | <t>This advice is not stated normatively (in capitals), because the best | |||
strategy might depend on experience of the most likely scenarios, | strategy might depend on experience of the most likely scenarios, | |||
which can only be known at the time of deployment.</t> | which can only be known at the time of deployment.</t> | |||
<t>Note that a host in AccECN mode <bcp14>MUST</bcp14> continue to p | ||||
<t>Note that a host in AccECN mode MUST continue to provide | rovide | |||
Accurate ECN feedback to its peer, even if it is no longer sending | Accurate ECN feedback to its peer, even if it is no longer sending | |||
ECT itself over the other half-connection. <!--There is no need to say the following for forward compatibility: | ECT itself over the other half-connection. <!--There is no need to say the following for forward compatibility: | |||
"If a data receiver negotiates AccECN but then zeros the ACE field in its first segment with SYN=0, | "If a data receiver negotiates AccECN but then zeros the ACE field in its first segment with SYN=0, | |||
it MUST continue the connection even if the data sender does not disable sending | it MUST continue the connection even if the data sender does not disable sending | |||
ECN-capable packets."--></t> | ECN-capable packets."--> | |||
</t> | ||||
<t>If reordering occurs, the first feedback packet that arrives will | <t>If reordering occurs, the first feedback packet that arrives will | |||
not necessarily be the same as the first packet in sequence order. | not necessarily be the same as the first packet in sequence order. | |||
The test has been specified loosely like this to simplify | The test has been specified loosely like this to simplify | |||
implementation, and because it would not have been any more | implementation, and because it would not have been any more | |||
precise to have specified the first packet in sequence order, | precise to have specified the first packet in sequence order, | |||
which would not necessarily be the first ACE counter that the Data | which would not necessarily be the first ACE counter that the Data | |||
Receiver fed back anyway, given it might have been a | Receiver fed back anyway, given it might have been a | |||
retransmission.</t> | retransmission.</t> | |||
<t>The possibility of reordering means that there is a small | ||||
<t>The possibility of reordering means that there is a small | |||
chance that the ACE field on the first packet to arrive is | chance that the ACE field on the first packet to arrive is | |||
genuinely zero (without middlebox interference). This would cause | genuinely zero (without middlebox interference). This would cause | |||
a host to unnecessarily disable ECN for a half-connection. | a host to unnecessarily disable ECN for a half-connection. | |||
Therefore, in environments where there is no evidence of the ACE | Therefore, in environments where there is no evidence of the ACE | |||
field being zeroed, implementations MAY skip this test.</t> | field being zeroed, implementations <bcp14>MAY</bcp14> skip this tes | |||
t.</t> | ||||
<t>Note that the Data Sender MUST NOT test whether the arriving | <t>Note that the Data Sender <bcp14>MUST NOT</bcp14> test whether th | |||
e arriving | ||||
counter in the initial ACE field has been initialized to a | counter in the initial ACE field has been initialized to a | |||
specific valid value - the above check solely tests whether the | specific valid value -- the above check solely tests whether the | |||
ACE fields have been incorrectly zeroed. This allows hosts to use | ACE fields have been incorrectly zeroed. This allows hosts to use | |||
different initial values as an additional signalling channel in | different initial values as an additional signalling channel in the | |||
future.</t> | future.</t> | |||
</section> | </section> | |||
<section anchor="accecn_ACE_Safety"> | ||||
<section anchor="accecn_ACE_Safety" | <name>Safety Against Ambiguity of the ACE Field</name> | |||
title="Safety against Ambiguity of the ACE Field"> | ||||
<t>If too many CE-marked segments are acknowledged at once, or if | <t>If too many CE-marked segments are acknowledged at once, or if | |||
a long run of ACKs is lost or thinned out, the 3-bit counter in | a long run of ACKs is lost or thinned out, the 3-bit counter in | |||
the ACE field might have cycled between two ACKs arriving at the | the ACE field might have cycled between two ACKs arriving at the | |||
Data Sender. The following safety procedures minimize this | Data Sender. The following safety procedures minimize this | |||
ambiguity.</t> | ambiguity.</t> | |||
<section anchor="accecn_ACE_Safety_R"> | ||||
<section anchor="accecn_ACE_Safety_R" | <name>Packet Receiver Safety Procedures</name> | |||
title="Packet Receiver Safety Procedures"> | ||||
<t>The following rules define when the receiver of a packet in Acc ECN | <t>The following rules define when the receiver of a packet in Acc ECN | |||
mode emits an ACK:<list style="hanging"> | mode emits an ACK:</t> | |||
<t hangText="Change-Triggered ACKs:">An AccECN Data Receiver | <dl newline="false" spacing="normal"> | |||
SHOULD emit an ACK whenever a data packet marked CE arrives | <dt>Change-Triggered ACKs:</dt> | |||
after the previous packet was not CE.<vspace | <dd> | |||
blankLines="1"/>Even though this rule is stated as a | <t>An AccECN Data Receiver | |||
"SHOULD", it is important for a transition to trigger an ACK | <bcp14>SHOULD</bcp14> emit an ACK whenever a data packet marke | |||
if at all possible. The only valid exception to this rule is | d CE arrives | |||
given below these bullets.<vspace blankLines="1"/>For the | after the previous packet was not CE.</t> | |||
<!-- [rfced] Where is "below these bullets", as we don't see a bulletized list i | ||||
n Section 3.2.2.5.1? If possible, we recommend adding a pointer for clarity. | ||||
Original: | ||||
Even though this rule is stated as a "SHOULD", it is important for | ||||
a transition to trigger an ACK if at all possible, The only valid | ||||
exception to this rule is given below these bullets. | ||||
--> | ||||
<t>Even though this rule is stated as a | ||||
"<bcp14>SHOULD</bcp14>", it is important for a transition to t | ||||
rigger an ACK | ||||
if at all possible. The only valid exception to this rule is | ||||
given below these bullets.</t> | ||||
<t>For the | ||||
avoidance of doubt, this rule is deliberately worded to | avoidance of doubt, this rule is deliberately worded to | |||
apply solely when <spanx style="emph">data</spanx> packets | apply solely when <em>data</em> packets | |||
arrive, but the comparison with the previous packet includes | arrive, but the comparison with the previous packet includes | |||
any packet, not just data packets.</t> | any packet, not just data packets.</t> | |||
</dd> | ||||
<t hangText="Increment-Triggered ACKs:">An AccECN receiver of | <dt>Increment-Triggered ACKs:</dt> | |||
a packet | <dd>An AccECN receiver of a packet | |||
MUST emit an ACK if 'n' CE marks have arrived since | <bcp14>MUST</bcp14> emit an ACK if 'n' CE marks have arrived s | |||
ince | ||||
the previous ACK. If there is unacknowledged data at the | the previous ACK. If there is unacknowledged data at the | |||
receiver, 'n' SHOULD be 2. If there is no unacknowledged data | receiver, 'n' <bcp14>SHOULD</bcp14> be 2. If there is no unack | |||
at the receiver, 'n' SHOULD be 3 and MUST be no less | nowledged data | |||
than 3. In either case, 'n' MUST be no greater than 7.</t> | at the receiver, 'n' <bcp14>SHOULD</bcp14> be 3 and <bcp14>MUS | |||
</list>The above rules for when to send an ACK are designed to | T</bcp14> be no less | |||
be complemented by those in <xref | than 3. In either case, 'n' <bcp14>MUST</bcp14> be no greater | |||
target="accecn_option_usage"/>, which concern whether an AccECN | than 7.</dd> | |||
</dl> | ||||
<t>The above rules for when to send an ACK are designed to | ||||
be complemented by those in <xref target="accecn_option_usage"/>, | ||||
which concern whether an AccECN | ||||
TCP Option ought to be included on ACKs.</t> | TCP Option ought to be included on ACKs.</t> | |||
<t>If the arrivals of a number of data packets are all processed | <t>If the arrivals of a number of data packets are all processed | |||
as one event, e.g., using large receive offload (LRO) or | as one event, e.g., using large receive offload (LRO) or | |||
generic receive offload (GRO), both the above rules SHOULD be | generic receive offload (GRO), both the above rules <bcp14>SHOULD< | |||
/bcp14> be | ||||
interpreted as requiring multiple ACKs to be emitted | interpreted as requiring multiple ACKs to be emitted | |||
back-to-back (for each transition and for each sequence of 'n' | back to back (for each transition and for each sequence of 'n' | |||
CE marks). If this is problematic for high performance, either | CE marks). If this is problematic for high performance, either | |||
rule can be interpreted as requiring just a single ACK at the | rule can be interpreted as requiring just a single ACK at the | |||
end of the whole receive event.</t> | end of the whole receive event.</t> | |||
<t>Even if a number of data packets do not arrive as one event, | <t>Even if a number of data packets do not arrive as one event, | |||
the 'Change-Triggered ACKs' rule could sometimes cause the ACK | the 'Change-Triggered ACKs' rule could sometimes cause the ACK | |||
rate to be problematic for high performance (although high | rate to be problematic for high performance (although high | |||
performance protocols such as DCTCP already successfully use | performance protocols such as DCTCP already successfully use | |||
change-triggered ACKs). The rationale for change-triggered ACKs | change-triggered ACKs). The rationale for change-triggered ACKs | |||
is so that the Data Sender can rely on them to detect queue | is so that the Data Sender can rely on them to detect queue | |||
growth as soon as possible, particularly at the start of a flow. | growth as soon as possible, particularly at the start of a flow. | |||
The approach can lead to some additional ACKs but it feeds back | The approach can lead to some additional ACKs but it feeds back | |||
the timing and the order in which ECN marks are received with | the timing and the order in which ECN marks are received with | |||
minimal additional complexity. If CE marks are infrequent, as is | minimal additional complexity. If CE marks are infrequent, as is | |||
the case for most Active Queue Managment (AQM) packet schedulers | the case for most Active Queue Management (AQM) packet schedulers | |||
at the time of writing, or there are | at the time of writing, or there are | |||
multiple marks in a row, the additional load will be low. | multiple marks in a row, the additional load will be low. | |||
However, marking patterns with numerous non-contiguous CE marks | However, marking patterns with numerous non-contiguous CE marks | |||
could increase the load significantly. One possible compromise | could increase the load significantly. One possible compromise | |||
would be for the receiver to heuristically detect whether the | would be for the receiver to heuristically detect whether the | |||
sender is in slow-start, then to implement change-triggered ACKs | sender is in slow-start, then to implement change-triggered ACKs | |||
while the sender is in slow-start, and offload otherwise.</t> | while the sender is in slow-start, and offload otherwise.</t> | |||
<t>In a scenario where both endpoints support AccECN, if host B | <t>In a scenario where both endpoints support AccECN, if host B | |||
has chosen to use ECN-capable pure ACKs (as | has chosen to use ECN-capable pure ACKs (as | |||
allowed in <xref target="RFC8311"/> experiments) and enough of | allowed in <xref target="RFC8311"/> experiments) and enough of | |||
these ACKs become CE-marked, then the 'Increment-Triggered ACKs' | these ACKs become CE marked, then the 'Increment-Triggered ACKs' | |||
rule ensures that its peer (host A) gives B sufficient | rule ensures that its peer (host A) gives B sufficient | |||
feedback about this congestion on the ACKs from B to A. | feedback about this congestion on the ACKs from B to A. | |||
Normally, for instance in a unidirectional data scenario from | Normally, for instance in a unidirectional data scenario from | |||
host A to B, the Data Sender (A) can piggyback that feedback on | host A to B, the Data Sender (A) can piggyback that feedback on | |||
its data. But if A stops sending data, the second part of the | its data. But if A stops sending data, the second part of the | |||
'Increment-Triggered ACKs' rule requires A to emit a pure ACK | 'Increment-Triggered ACKs' rule requires A to emit a pure ACK | |||
for at least every third CE-marked incoming ACK over the | for at least every third CE-marked incoming ACK over the | |||
subsequent round trip.</t> | subsequent round trip.</t> | |||
<t>Although TCP normally only ACKs data segments, in this | <t>Although TCP normally only ACKs data segments, in this | |||
case the increment-triggered ACK rule makes it mandatory for A | case the increment-triggered ACK rule makes it mandatory for A | |||
to emit ACKs of ACKs. This is justifiable because the ACKs in | to emit ACKs of ACKs. This is justifiable because the ACKs in | |||
this case are ECN-capable and so, even though the ACKs of these | this case are ECN-capable and so, even though the ACKs of these | |||
ACKs do not acknowledge new data, they feed back new congestion | ACKs do not acknowledge new data, they feed back new congestion | |||
state (useful in case B starts sending). The minimum of 3 for | state (useful in case B starts sending). The minimum of 3 for | |||
'n' in this case ensures that, even if A also uses ECN-capable | 'n' in this case ensures that, even if A also uses ECN-capable | |||
pure ACKs, and even if there is pathological congestion in both | pure ACKs, and even if there is pathological congestion in both | |||
directions, any resulting ping-pong of ACKs will be rapidly | directions, any resulting ping-pong of ACKs will be rapidly | |||
damped.</t> | damped.</t> | |||
skipping to change at line 1925 ¶ | skipping to change at line 1902 ¶ | |||
<t>Although TCP normally only ACKs data segments, in this | <t>Although TCP normally only ACKs data segments, in this | |||
case the increment-triggered ACK rule makes it mandatory for A | case the increment-triggered ACK rule makes it mandatory for A | |||
to emit ACKs of ACKs. This is justifiable because the ACKs in | to emit ACKs of ACKs. This is justifiable because the ACKs in | |||
this case are ECN-capable and so, even though the ACKs of these | this case are ECN-capable and so, even though the ACKs of these | |||
ACKs do not acknowledge new data, they feed back new congestion | ACKs do not acknowledge new data, they feed back new congestion | |||
state (useful in case B starts sending). The minimum of 3 for | state (useful in case B starts sending). The minimum of 3 for | |||
'n' in this case ensures that, even if A also uses ECN-capable | 'n' in this case ensures that, even if A also uses ECN-capable | |||
pure ACKs, and even if there is pathological congestion in both | pure ACKs, and even if there is pathological congestion in both | |||
directions, any resulting ping-pong of ACKs will be rapidly | directions, any resulting ping-pong of ACKs will be rapidly | |||
damped.</t> | damped.</t> | |||
<t>In the above bidirectional scenario, incoming ACKs of ACKs | <t>In the above bidirectional scenario, incoming ACKs of ACKs | |||
could be mistaken for duplicate ACKs. But ACKs of ACKs can be | could be mistaken for duplicate ACKs. But ACKs of ACKs can be | |||
distinguished from duplicate ACKs because they do not contain any | distinguished from duplicate ACKs because they do not contain any | |||
SACK blocks even when SACK has been negotiated. It is outside the | SACK blocks even when SACK has been negotiated. It is outside the | |||
scope of this AccECN specification to normatively specify this additional | scope of this AccECN specification to normatively specify this additional | |||
test for DupACKs, because ACKs of ACKs can only arise if the | test for DupACKs, because ACKs of ACKs can only arise if the | |||
original ACKs are ECN-capable. Instead any specification that allo | original ACKs are ECN-capable. Instead, any specification that all | |||
ws | ows | |||
ECN-capable pure ACKs MUST make sending ACKs of ACKs conditional | ECN-capable pure ACKs <bcp14>MUST</bcp14> make sending ACKs of ACK | |||
s conditional | ||||
on measures to distinguish ACKs of ACKs from DupACKs (see for | on measures to distinguish ACKs of ACKs from DupACKs (see for | |||
example <xref target="I-D.ietf-tcpm-generalized-ecn"/>). All that | example <xref target="I-D.ietf-tcpm-generalized-ecn"/>). All that | |||
is necessary here is to require that these ACKs of ACKs MUST NOT | is necessary here is to require that these ACKs of ACKs <bcp14>MUST NOT</bcp14> | |||
contain any SACK blocks (which would normally not happen | contain any SACK blocks (which would normally not happen | |||
anyway).</t> | anyway).</t> | |||
</section> | </section> | |||
<section anchor="accecn_ACE_Safety_S"> | ||||
<section anchor="accecn_ACE_Safety_S" | <name>Data Sender Safety Procedures</name> | |||
title="Data Sender Safety Procedures"> | ||||
<t>If the Data Sender has not received AccECN TCP Options to | <t>If the Data Sender has not received AccECN TCP Options to | |||
give it more dependable information, and it detects that the ACE | give it more dependable information, and it detects that the ACE | |||
field could have cycled, it SHOULD deem whether it cycled by | field could have cycled, it <bcp14>SHOULD</bcp14> deem whether it cycled by | |||
taking the safest likely case under the prevailing conditions. | taking the safest likely case under the prevailing conditions. | |||
It can detect if the counter could have cycled by using the jump | It can detect if the counter could have cycled by using the jump | |||
in the acknowledgement number since the last ACK to calculate or | in the acknowledgement number since the last ACK to calculate or | |||
estimate how many segments could have been acknowledged. An | estimate how many segments could have been acknowledged. An | |||
example algorithm to implement this policy is given in <xref | example algorithm to implement this policy is given in <xref targe | |||
target="accecn_Algo_ACE_Wrap"/>. An implementation MAY use an | t="accecn_Algo_ACE_Wrap"/>. An implementation <bcp14>MAY</bcp14> use an | |||
alternative algorithm as long as it satisfies the requirements | alternative algorithm as long as it satisfies the requirements | |||
in this subsection.</t> | in this subsection.</t> | |||
<t>If missing acknowledgement numbers arrive later (reordering) | <t>If missing acknowledgement numbers arrive later (reordering) | |||
and prove that the counter did not cycle, the Data Sender MAY | and prove that the counter did not cycle, the Data Sender <bcp14>MAY</bcp14> | |||
attempt to neutralize the effect of any action it took based on | attempt to neutralize the effect of any action it took based on | |||
a conservative assumption that it later found to be | a conservative assumption that it later found to be | |||
incorrect.</t> | incorrect.</t> | |||
<t>The Data Sender can estimate how many packets (of any | <t>The Data Sender can estimate how many packets (of any | |||
marking) an ACK acknowledges. If the ACE counter on an ACK seems | marking) an ACK acknowledges. If the ACE counter on an ACK seems | |||
to imply that the minimum number of newly CE-marked packets is | to imply that the minimum number of newly CE-marked packets is | |||
greater than the number of newly acknowledged packets, the Data | greater than the number of newly acknowledged packets, the Data | |||
Sender SHOULD consider the ACE counter to be correct (and its | Sender <bcp14>SHOULD</bcp14> consider the ACE counter to be correct (and its | |||
count of control packets to be incomplete), unless it can be | count of control packets to be incomplete), unless it can be | |||
sure that it is counting all control packets correctly.</t> | sure that it is counting all control packets correctly.</t> | |||
</section> | </section> | |||
</section> | </section> | |||
</section> | </section> | |||
<section anchor="accecn_option"> | ||||
<section anchor="accecn_option" title="The AccECN Option"> | <name>The AccECN Option</name> | |||
<t>Two alternative AccECN Options are defined as shown in <xref | <t>Two alternative AccECN Options are defined as shown in <xref target | |||
target="accecn_Fig_TCPopt"/>. The initial 'E' of each field name | ="accecn_Fig_TCPopt"/>. The initial 'E' of each field name | |||
stands for 'Echo'.</t> | stands for 'Echo'.</t> | |||
<figure anchor="accecn_Fig_TCPopt"> | ||||
<figure align="center" anchor="accecn_Fig_TCPopt" | <name>The Two Alternative AccECN TCP Options</name> | |||
title="The Two Alternative AccECN TCP Options"> | <artwork align="center"><![CDATA[ | |||
<artwork align="center"><![CDATA[ 0 1 | 0 1 2 3 | |||
2 3 | ||||
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |||
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |||
| Kind = 172 | Length = 11 | EE0B field | | | Kind = 172 | Length = 11 | EE0B field | | |||
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |||
| EE0B (cont'd) | ECEB field | | | EE0B (cont'd) | ECEB field | | |||
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |||
| EE1B field | Order 0 | | EE1B field | Order 0 | |||
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |||
0 1 2 3 | 0 1 2 3 | |||
skipping to change at line 2000 ¶ | skipping to change at line 1971 ¶ | |||
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | |||
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |||
| Kind = 174 | Length = 11 | EE1B field | | | Kind = 174 | Length = 11 | EE1B field | | |||
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |||
| EE1B (cont'd) | ECEB field | | | EE1B (cont'd) | ECEB field | | |||
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |||
| EE0B field | Order 1 | | EE0B field | Order 1 | |||
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |||
]]></artwork> | ]]></artwork> | |||
</figure> | </figure> | |||
<t><xref target="accecn_Fig_TCPopt"/> shows two option field orders: | <t><xref target="accecn_Fig_TCPopt"/> shows two option field orders: | |||
order 0 and order 1. They both consists of three 24-bit fields. | order 0 and order 1. They both consist of three 24-bit fields. | |||
Order 0 provides the 24 least significant bits of the r.e0b, r.ceb | Order 0 provides the 24 least significant bits of the r.e0b, r.ceb, | |||
and r.e1b counters, respectively. Order 1 provides the same fields, | and r.e1b counters, respectively. Order 1 provides the same fields, | |||
but in the opposite order. On each packet, the Data Receiver can use | but in the opposite order. On each packet, the Data Receiver can use | |||
whichever order is more efficient. In either case, the bytes within | whichever order is more efficient. In either case, the bytes within | |||
the fields are in network byte order (big-endian).</t> | the fields are in network byte order (big-endian).</t> | |||
<t>The choice to use three bytes (24 bits) fields in the options was made to | <t>The choice to use three bytes (24 bits) fields in the options was made to | |||
strike a balance between TCP option space usage, and the required | strike a balance between TCP option space usage, and the required | |||
fidelity of the counters to accomodate typical scenarios such as | fidelity of the counters to accommodate typical scenarios such as | |||
hardware TCP segmentation offloading (TSO), and periods where no optio | hardware TCP Segmentation Offloading (TSO), and periods during which n | |||
n may | o option may | |||
be transmitted (e.g., SACK loss recovery). Providing only 2 bytes | be transmitted (e.g., SACK loss recovery). Providing only 2 bytes (16 | |||
(16 bits) | bits) | |||
for these counters could easily roll over within a single TSO transmission | for these counters could easily roll over within a single TSO transmission | |||
or large/generic receive offload (LRO/GRO) event. Having two distinct | or large/generic receive offload (LRO/GRO) event. Having two distinct | |||
orderings further allows the transmission of the most pertinent changes | orderings further allows the transmission of the most pertinent changes | |||
in an abbreviated option (see below).</t> | in an abbreviated option (see below).</t> | |||
<t>When a Data Receiver sends an AccECN Option, it <bcp14>MUST</bcp14> | ||||
<t>When a Data Receiver sends an AccECN Option, it MUST set the Kind | set the Kind | |||
field to 172 if using Order 0, or to 174 if using Order 1. These two | field to 172 if using Order 0, or to 174 if using Order 1. These two | |||
new TCP Option Kinds are registered in <xref | new TCP Option Kinds are registered in <xref target="accecn_IANA_Consi | |||
target="accecn_IANA_Considerations"/> and called respectively | derations"/> and are called | |||
AccECN0 and AccECN1.</t> | AccECN0 and AccECN1, respectively.</t> | |||
<t>Note that there is no field to feed back Not-ECT bytes. | <t>Note that there is no field to feed back Not-ECT bytes. | |||
Nonetheless an algorithm for the Data Sender to calculate the number | Nonetheless, an algorithm for the Data Sender to calculate the number | |||
of payload bytes received as Not-ECT is given in <xref | of payload bytes received as Not-ECT is given in <xref target="accecn_ | |||
target="accecn_Algo_Not-ECT"/>.</t> | Algo_Not-ECT"/>.</t> | |||
<t>Whenever a Data Receiver sends an AccECN Option, the rules in | <t>Whenever a Data Receiver sends an AccECN Option, the rules in | |||
<xref target="accecn_option_usage"/> allow it to omit unchanged | <xref target="accecn_option_usage"/> allow it to omit unchanged | |||
fields from the tail of the option, to help cope with option space | fields from the tail of the option, to help cope with option space | |||
limitations, as long as it preserves the order of the remaining | limitations, as long as it preserves the order of the remaining | |||
fields and includes any field that has changed. The length field | fields and includes any field that has changed. The length field | |||
MUST indicate which fields are present as follows:</t> | <bcp14>MUST</bcp14> indicate which fields are present as follows:</t> | |||
<table anchor="accecn_Fig_TCPopttab"> | ||||
<texttable suppress-title="true" anchor="accecn_Fig_TCPopttab" | <name>Fields included in AccECN TCP Options of each length and order | |||
title="Fields included in AccECN TCP Options of each length | </name> | |||
and order"> | <thead> | |||
<ttcol>Length</ttcol> | <tr> | |||
<th>Length</th> | ||||
<ttcol>Order 0</ttcol> | <th>Order 0</th> | |||
<th>Order 1</th> | ||||
<ttcol>Order 1</ttcol> | </tr> | |||
</thead> | ||||
<c>11</c> | <tbody> | |||
<tr> | ||||
<c>EE0B, ECEB, EE1B</c> | <td>11</td> | |||
<td>EE0B, ECEB, EE1B</td> | ||||
<c>EE1B, ECEB, EE0B</c> | <td>EE1B, ECEB, EE0B</td> | |||
</tr> | ||||
<c>8</c> | <tr> | |||
<td>8</td> | ||||
<c>EE0B, ECEB</c> | <td>EE0B, ECEB</td> | |||
<td>EE1B, ECEB</td> | ||||
<c>EE1B, ECEB</c> | </tr> | |||
<tr> | ||||
<c>5</c> | <td>5</td> | |||
<td>EE0B</td> | ||||
<c>EE0B</c> | <td>EE1B</td> | |||
</tr> | ||||
<c>EE1B</c> | <tr> | |||
<td>2</td> | ||||
<c>2</c> | <td>(empty)</td> | |||
<td>(empty)</td> | ||||
<c>(empty)</c> | </tr> | |||
</tbody> | ||||
<c>(empty)</c> | </table> | |||
</texttable> | ||||
<t>The empty option of Length=2 is provided to allow for a case | <t>The empty option of Length=2 is provided to allow for a case | |||
where an AccECN Option has to be sent (e.g., on the SYN/ACK to | where an AccECN Option has to be sent (e.g., on the SYN/ACK to | |||
test the path), but there is very limited space for the option.</t> | test the path), but there is very limited space for the option.</t> | |||
<t>All implementations of a Data Sender that read any AccECN Option | <t>All implementations of a Data Sender that read any AccECN Option | |||
MUST be able to read AccECN Options of any of the above lengths. For | <bcp14>MUST</bcp14> be able to read AccECN Options of any of the above lengths. For | |||
forward compatibility, if the AccECN Option is of any other length, | forward compatibility, if the AccECN Option is of any other length, | |||
implementations MUST use those whole 3-octet fields that fit within | implementations <bcp14>MUST</bcp14> use those whole 3-octet fields that fit within | |||
the length and ignore the remainder of the option, treating it as | the length and ignore the remainder of the option, treating it as | |||
padding.</t> | padding.</t> | |||
<t>AccECN Options have to be optional to implement, because both | <t>AccECN Options have to be optional to implement, because both | |||
sender and receiver have to be able to cope without options anyway - | sender and receiver have to be able to cope without options anyway -- | |||
in cases where they do not traverse a network path. It is | in cases where they do not traverse a network path. It is | |||
RECOMMENDED to implement both sending and receiving of AccECN | <bcp14>RECOMMENDED</bcp14> to implement both sending and receiving of AccECN | |||
Options. Support for AccECN Options is particularly valuable over | Options. Support for AccECN Options is particularly valuable over | |||
paths that introduce a high degree of ACK filtering, where the 3-bit | paths that introduce a high degree of ACK filtering, where the 3-bit | |||
ACE counter alone might sometimes be insufficient, when it is | ACE counter alone might sometimes be insufficient, when it is | |||
ambiguous whether it has wrapped. If sending of AccECN Options is | ambiguous whether it has wrapped. If sending of AccECN Options is | |||
implemented, the fall-backs described in this document will need to | implemented, the fall-backs described in this document will need to | |||
be implemented as well (unless solely for a controlled environment | be implemented as well (unless solely for a controlled environment | |||
where path traversal is not considered a problem). Even if a | where path traversal is not considered a problem). Even if a | |||
developer does not implement logic to understand received AccECN | developer does not implement logic to understand received AccECN | |||
Options, it is RECOMMENDED that they implement logic to send AccECN | Options, it is <bcp14>RECOMMENDED</bcp14> that they implement logic to send AccECN | |||
Options. Otherwise, those remote peers that implement the receiving | Options. Otherwise, those remote peers that implement the receiving | |||
logic will still be excluded from congestion feedback that is robust | logic will still be excluded from congestion feedback that is robust | |||
against the increasingly aggressive ACK filtering in the Internet. | against the increasingly aggressive ACK filtering in the Internet. | |||
The logic to send AccECN Options is the simpler to implement of the | The logic to send AccECN Options is the simpler to implement of the | |||
two sides.</t> | two sides.</t> | |||
<t>If a Data Receiver intends to send an AccECN Option at any time | <t>If a Data Receiver intends to send an AccECN Option at any time | |||
during the rest of the connection it is RECOMMENDED to also test | during the rest of the connection, it is <bcp14>RECOMMENDED</bcp14> to | |||
path traversal of the AccECN Option as specified in <xref | also test | |||
target="accecn_Mbox_Interference"/>.</t> | path traversal of the AccECN Option as specified in <xref target="acce | |||
cn_Mbox_Interference"/>.</t> | ||||
<section title="Encoding and Decoding Feedback in the AccECN Option Fi | <section> | |||
elds"> | <name>Encoding and Decoding Feedback in the AccECN Option Fields</na | |||
me> | ||||
<t>Whenever the Data Receiver includes any of the counter fields | <t>Whenever the Data Receiver includes any of the counter fields | |||
(ECEB, EE0B, EE1B) in an AccECN Option, it MUST encode the 24 | (ECEB, EE0B, EE1B) in an AccECN Option, it <bcp14>MUST</bcp14> encode the 24 | |||
least significant bits of the current value of the associated | least significant bits of the current value of the associated | |||
counter into the field (respectively r.ceb, r.e0b, r.e1b).</t> | counter into the field (respectively r.ceb, r.e0b, r.e1b).</t> | |||
<t>Whenever the Data Sender receives an ACK carrying an AccECN | <t>Whenever the Data Sender receives an ACK carrying an AccECN | |||
Option, it first checks whether the ACK has already been | Option, it first checks whether the ACK has already been | |||
superseded by another ACK in which case it ignores the ECN | superseded by another ACK in which case it ignores the ECN | |||
feedback. If the ACK has not been superseded, the Data Sender | feedback. If the ACK has not been superseded, the Data Sender | |||
normally decodes the fields in the AccECN Option as follows. For | normally decodes the fields in the AccECN Option as follows. For | |||
each field, it takes the least significant 24 bits of its | each field, it takes the least significant 24 bits of its | |||
associated local counter (s.ceb, s.e0b or s.e1b) and subtracts | associated local counter (s.ceb, s.e0b, or s.e1b) and subtracts | |||
them from the counter in the associated field of the incoming | them from the counter in the associated field of the incoming | |||
AccECN Option (respectively ECEB, EE0B, EE1B), to work out the | AccECN Option (respectively ECEB, EE0B, EE1B), to work out the | |||
minimum positive increment it could apply to s.ceb, s.e0b or s.e1b | minimum positive increment it could apply to s.ceb, s.e0b, or s.e1b | |||
(assuming the field in the option only wrapped at most once).</t> | (assuming the field in the option only wrapped once at most).</t> | |||
<t><xref target="accecn_Algo_Option_Coding"/> gives an example | <t><xref target="accecn_Algo_Option_Coding"/> gives an example | |||
algorithm for the Data Receiver to encode its byte counters into | algorithm for the Data Receiver to encode its byte counters into | |||
an AccECN Option, and for the Data Sender to decode the AccECN | an AccECN Option, and for the Data Sender to decode the AccECN | |||
Option fields into its byte counters.</t> | Option fields into its byte counters.</t> | |||
<t>Note that, as specified in <xref target="accecn_feedback"/>, | <t>Note that, as specified in <xref target="accecn_feedback"/>, | |||
any data on the SYN (SYN=1, ACK=0) is not included in any of the | any data on the SYN (SYN=1, ACK=0) is not included in any of the | |||
byte counters held locally for each ECN marking nor in an AccECN | byte counters held locally for each ECN marking nor in an AccECN | |||
Option on the wire.</t> | Option on the wire.</t> | |||
</section> | </section> | |||
<section anchor="accecn_Mbox_Interference"> | ||||
<section anchor="accecn_Mbox_Interference" | <name>Path Traversal of the AccECN Option</name> | |||
title="Path Traversal of the AccECN Option"> | <section anchor="accecn_AccECN_Option_3WHS"> | |||
<t/> | <name>Testing the AccECN Option During the Handshake</name> | |||
<t>The TCP Client <bcp14>MUST NOT</bcp14> include an AccECN TCP Op | ||||
<section anchor="accecn_AccECN_Option_3WHS" | tion on the | |||
title="Testing the AccECN Option during the Handshake"> | SYN. If there is somehow an AccECN Option on a SYN, it <bcp14>MUST | |||
<t>The TCP Client MUST NOT include an AccECN TCP Option on the | </bcp14> be | |||
SYN. If there is somehow an AccECN Option on a SYN, it MUST be | ||||
ignored when forwarded or received.</t> | ignored when forwarded or received.</t> | |||
<t>A TCP Server that confirms its support for AccECN (in | <t>A TCP Server that confirms its support for AccECN (in | |||
response to an AccECN SYN from the Client as described in <xref | response to an AccECN SYN from the Client as described in <xref ta | |||
target="accecn_Negotiation"/>) SHOULD include an AccECN TCP | rget="accecn_Negotiation"/>) <bcp14>SHOULD</bcp14> include an AccECN TCP | |||
Option on the SYN/ACK.</t> | Option on the SYN/ACK.</t> | |||
<t>A TCP Client that has successfully negotiated AccECN <bcp14>SHO | ||||
<t>A TCP Client that has successfully negotiated AccECN SHOULD | ULD</bcp14> | |||
include an AccECN Option in the first ACK at the end of the | include an AccECN Option in the first ACK at the end of the | |||
three-way handshake. However, this first ACK is not delivered reliably, so the | three-way handshake. However, this first ACK is not delivered reliably, so the | |||
TCP Client SHOULD also include an AccECN Option on the first | TCP Client <bcp14>SHOULD</bcp14> also include an AccECN Option on the first | |||
data segment it sends (if it ever sends one).</t> | data segment it sends (if it ever sends one).</t> | |||
<t>A host <bcp14>MAY</bcp14> omit an AccECN Option in any of the a | ||||
<t>A host MAY omit an AccECN Option in any of the above three | bove three | |||
cases due to insufficient option space or if it has cached | cases because of insufficient option space or because it has cache | |||
d | ||||
knowledge that the packet would be likely to be blocked on the | knowledge that the packet would be likely to be blocked on the | |||
path to the other host if it included an AccECN Option.</t> | path to the other host if it included an AccECN Option.</t> | |||
</section> | </section> | |||
<section anchor="accecn_AccECN_Option_Loss"> | ||||
<section anchor="accecn_AccECN_Option_Loss" | <name>Testing for Loss of Packets Carrying the AccECN Option</name | |||
title="Testing for Loss of Packets Carrying the AccECN Opti | > | |||
on"> | ||||
<t>If the TCP Server has not received an ACK to acknowledge its | <t>If the TCP Server has not received an ACK to acknowledge its | |||
SYN/ACK after the normal TCP timeout or it receives a second SYN | SYN/ACK after the normal TCP timeout or if it receives a second SYN | |||
with a request for AccECN support, then either the SYN/ACK might | with a request for AccECN support, then either the SYN/ACK might | |||
just have been lost, e.g., due to congestion, or a middlebox | just have been lost, e.g., due to congestion, or a middlebox | |||
might be blocking AccECN Options. To expedite connection setup | might be blocking AccECN Options. To expedite connection setup | |||
in deployment scenarios where AccECN path traversal might be | in deployment scenarios where AccECN path traversal might be | |||
problematic, the TCP Server SHOULD retransmit the SYN/ACK, but | problematic, the TCP Server <bcp14>SHOULD</bcp14> retransmit the SYN/ACK, but | |||
with no AccECN Option. If this retransmission times out, to | with no AccECN Option. If this retransmission times out, to | |||
expedite connection setup, the TCP Server SHOULD retransmit the | expedite connection setup, the TCP Server <bcp14>SHOULD</bcp14> retransmit the | |||
SYN/ACK with (AE,CWR,ECE) = (0,0,0) and no AccECN Option, but it | SYN/ACK with (AE,CWR,ECE) = (0,0,0) and no AccECN Option, but it | |||
remains in AccECN feedback mode (per <xref | remains in AccECN feedback mode (per <xref target="accecn_implicat | |||
target="accecn_implications_accecn_mode"/>).</t> | ions_accecn_mode"/>).</t> | |||
<aside> | <aside> | |||
<t>Note that a retransmitted AccECN SYN/ACK will not | <t>Note that a retransmitted AccECN SYN/ACK will not | |||
necessarily have the same TCP-ECN flags as the original | necessarily have the same TCP-ECN flags as the original | |||
SYN/ACK, because it feeds back the IP-ECN field of the latest | SYN/ACK, because it feeds back the IP-ECN field of the latest | |||
SYN to have arrived (by the rule in <xref | SYN to have arrived (by the rule in <xref target="accecn_implica | |||
target="accecn_implications_accecn_mode"/>).</t> | tions_accecn_mode"/>).</t> | |||
</aside> | </aside> | |||
<t>The above fall-back approach limits any interference by | <t>The above fall-back approach limits any interference by | |||
middleboxes that might drop packets with unknown options, even | middleboxes that might drop packets with unknown options, even | |||
though it is more likely that SYN/ACK loss is due to congestion. | though it is more likely that SYN/ACK loss is due to congestion. | |||
The TCP Server MAY try to send another packet with an AccECN | The TCP Server <bcp14>MAY</bcp14> try to send another packet with an AccECN | |||
Option at a later point during the connection but it ought to | Option at a later point during the connection but it ought to | |||
monitor if that packet got lost as well, in which case it SHOULD | monitor if that packet got lost as well, in which case it <bcp14>SHOULD</bcp14> | |||
disable the sending of AccECN Options for this | disable the sending of AccECN Options for this | |||
half-connection.</t> | half-connection.</t> | |||
<t>Implementers <bcp14>MAY</bcp14> use other fall-back strategies | ||||
<t>Implementers MAY use other fall-back strategies if they are | if they are | |||
found to be more effective (e.g., retrying an AccECN Option | found to be more effective (e.g., retrying an AccECN Option | |||
for a second time before fall-back - most appropriate during | for a second time before fall-back -- most appropriate during | |||
high levels of congestion). However, other fall-back strategies | high levels of congestion). However, other fall-back strategies | |||
will need to follow all the rules in <xref | will need to follow all the rules in <xref target="accecn_implicat | |||
target="accecn_implications_accecn_mode"/>, which concern | ions_accecn_mode"/>, which concern | |||
behaviour when SYNs or SYN/ACKs negotiating different types of | behaviour when SYNs or SYN/ACKs negotiating different types of | |||
feedback have been sent within the same connection.</t> | feedback have been sent within the same connection.</t> | |||
<t>Further, it might make sense to also remove any other new or | <t>Further, it might make sense to also remove any other new or | |||
experimental fields or options on the SYN/ACK, although the | experimental fields or options on the SYN/ACK, although the | |||
required behaviour will depend on the specification of the other | required behaviour will depend on the specification of the other | |||
option(s) and on any attempt to co-ordinate fall-back between | option(s) and on any attempt to coordinate fall-back between | |||
different modules of the stack.</t> | different modules of the stack.</t> | |||
<t>If the TCP Client detects that the first data segment it sent | <t>If the TCP Client detects that the first data segment it sent | |||
with an AccECN Option was lost, in deployment scenarios where | with an AccECN Option was lost, in deployment scenarios where | |||
AccECN path traversal might be problematic, it SHOULD fall back | AccECN path traversal might be problematic, it <bcp14>SHOULD</bcp14> fall back | |||
to no AccECN Option on the retransmission. Again, implementers | to no AccECN Option on the retransmission. Again, implementers | |||
MAY use other fall-back strategies such as attempting to | <bcp14>MAY</bcp14> use other fall-back strategies such as attempting to | |||
retransmit a second segment with an AccECN Option before | retransmit a second segment with an AccECN Option before | |||
fall-back, and/or caching whether AccECN Options are blocked for | fall-back, and/or caching whether AccECN Options are blocked for | |||
subsequent connections. <xref target="RFC9040"/> further | subsequent connections. <xref target="RFC9040"/> further | |||
discusses caching of TCP parameters and status information.</t> | discusses caching of TCP parameters and status information.</t> | |||
<t>If a middlebox is dropping packets with options it does not | <t>If a middlebox is dropping packets with options it does not | |||
recognize, a host that is sending little or no data but mostly | recognize, a host that is sending little or no data but mostly | |||
pure ACKs will not inherently detect such losses. Such a host | pure ACKs will not inherently detect such losses. Such a host | |||
MAY detect loss of ACKs carrying the AccECN Option by detecting | <bcp14>MAY</bcp14> detect loss of ACKs carrying the AccECN Option by detecting | |||
whether the acknowledged data always reappears as a | whether the acknowledged data always reappears as a | |||
retransmission. In such cases, the host SHOULD disable the | retransmission. In such cases, the host <bcp14>SHOULD</bcp14> disable the | |||
sending of the AccECN Option for this half-connection.</t> | sending of the AccECN Option for this half-connection.</t> | |||
<t>If a host falls back to not sending AccECN Options, it will | <t>If a host falls back to not sending AccECN Options, it will | |||
continue to process any incoming AccECN Options as normal.</t> | continue to process any incoming AccECN Options as normal.</t> | |||
<t>Either host <bcp14>MAY</bcp14> include AccECN Options in one or | ||||
<t>Either host MAY include AccECN Options in a subsequent | more subsequent | |||
segment or segments to retest whether AccECN Options can | segments to retest whether AccECN Options can | |||
traverse the path.</t> | traverse the path.</t> | |||
<t>Similarly, an AccECN endpoint <bcp14>MAY</bcp14> separately mem | ||||
<t>Similarly, an AccECN endpoint MAY separately memorize which | orize which | |||
data packets carried an AccECN Option and disable the sending of | data packets carried an AccECN Option and disable the sending of | |||
AccECN Options if the loss probability of those packets is | AccECN Options if the loss probability of those packets is | |||
significantly higher than that of all other data packets in the | significantly higher than that of all other data packets in the | |||
same connection.</t> | same connection.</t> | |||
</section> | </section> | |||
<section> | ||||
<section title="Testing for Absence of the AccECN Option"> | <name>Testing for Absence of the AccECN Option</name> | |||
<t>If the TCP Client has successfully negotiated AccECN but does | <t>If the TCP Client has successfully negotiated AccECN but does | |||
not receive an AccECN Option on the SYN/ACK (e.g., because | not receive an AccECN Option on the SYN/ACK (e.g., because | |||
it has been stripped by a middlebox or not sent by the Server), | it has been stripped by a middlebox or not sent by the Server), | |||
the Client switches into a mode that assumes that the AccECN | the Client switches into a mode that assumes that the AccECN | |||
Option is not available for this half connection.</t> | Option is not available for this half connection.</t> | |||
<t>Similarly, if the TCP Server has successfully negotiated | <t>Similarly, if the TCP Server has successfully negotiated | |||
AccECN but does not receive an AccECN Option on the first | AccECN but does not receive an AccECN Option on the first | |||
segment that acknowledges sequence space at least covering the | segment that acknowledges sequence space at least covering the | |||
ISN, it switches into a mode that assumes that the AccECN Option | ISN, it switches into a mode that assumes that the AccECN Option | |||
is not available for this half connection.</t> | is not available for this half connection.</t> | |||
<t>While a host is in this mode that assumes incoming AccECN | <t>While a host is in this mode that assumes incoming AccECN | |||
Options are not available, it MUST adopt the conservative | Options are not available, it <bcp14>MUST</bcp14> adopt the conser | |||
interpretation of the ACE field discussed in <xref | vative | |||
target="accecn_ACE_Safety"/>. However, it cannot make any | interpretation of the ACE field discussed in <xref target="accecn_ | |||
ACE_Safety"/>. However, it cannot make any | ||||
assumption about support of outgoing AccECN Options on the other | assumption about support of outgoing AccECN Options on the other | |||
half connection, so it SHOULD continue to send AccECN Options | half connection, so it <bcp14>SHOULD</bcp14> continue to send AccECN Options | |||
itself (unless it has established that sending AccECN Options is | itself (unless it has established that sending AccECN Options is | |||
causing packets to be blocked as in <xref | causing packets to be blocked as in <xref target="accecn_AccECN_Op | |||
target="accecn_AccECN_Option_Loss"/>).</t> | tion_Loss"/>).</t> | |||
<t>If a host is in the mode that assumes incoming AccECN Options | <t>If a host is in the mode that assumes incoming AccECN Options | |||
are not available, but it receives an AccECN Option at any later | are not available, but it receives an AccECN Option at any later | |||
point during the connection, this clearly indicates that AccECN | point during the connection, this clearly indicates that AccECN | |||
Options are no longer blocked on the respective path, and the | Options are no longer blocked on the respective path, and the | |||
AccECN endpoint MAY switch out of the mode that assumes AccECN | AccECN endpoint <bcp14>MAY</bcp14> switch out of the mode that assumes AccECN | |||
Options are not available for this half connection.</t> | Options are not available for this half connection.</t> | |||
</section> | </section> | |||
<section anchor="accecn_sec_zero_option"> | ||||
<section anchor="accecn_sec_zero_option" | <name>Test for Zeroing of the AccECN Option</name> | |||
title="Test for Zeroing of the AccECN Option"> | ||||
<t>For a related test for invalid initialization of the ACE | <t>For a related test for invalid initialization of the ACE | |||
field, see <xref target="accecn_sec_ACE_init_invalid"/>.</t> | field, see <xref target="accecn_sec_ACE_init_invalid"/>.</t> | |||
<t><xref target="accecn_init_counters"/> required the Data | <t><xref target="accecn_init_counters"/> required the Data | |||
Receiver to initialize the r.e0b and r.e1b counters to a | Receiver to initialize the r.e0b and r.e1b counters to a | |||
non-zero value. Therefore, in either direction the initial value | non-zero value. Therefore, in either direction the initial value | |||
of the EE0B field or EE1B field in an AccECN Option (if one | of the EE0B field or EE1B field in an AccECN Option (if one | |||
exists) ought to be non-zero. If AccECN has been | exists) ought to be non-zero. If AccECN has been | |||
negotiated:<list style="symbols"> | negotiated:</t> | |||
<t>the TCP Server MAY check that the initial value of the | <ul spacing="normal"> | |||
<li> | ||||
<t>the TCP Server <bcp14>MAY</bcp14> check that the initial va | ||||
lue of the | ||||
EE0B field or the EE1B field is non-zero in the first | EE0B field or the EE1B field is non-zero in the first | |||
segment that acknowledges sequence space that at least | segment that acknowledges sequence space that at least | |||
covers the ISN plus 1. If it runs a test and either initial | covers the ISN plus 1. If it runs a test and either initial | |||
value is zero, the Server will switch into a mode that | value is zero, the Server will switch into a mode that | |||
ignores AccECN Options for this half connection.</t> | ignores AccECN Options for this half connection.</t> | |||
</li> | ||||
<t>the TCP Client MAY check the initial value of the EE0B | <li> | |||
<t>the TCP Client <bcp14>MAY</bcp14> check that the initial va | ||||
lue of the EE0B | ||||
field or the EE1B field is non-zero on the SYN/ACK. If it | field or the EE1B field is non-zero on the SYN/ACK. If it | |||
runs a test and either initial value is zero, the Client | runs a test and either initial value is zero, the Client | |||
will switch into a mode that ignores AccECN Options for this | will switch into a mode that ignores AccECN Options for this | |||
half connection.</t> | half connection.</t> | |||
</list></t> | </li> | |||
</ul> | ||||
<t>While a host is in the mode that ignores AccECN Options it | <t>While a host is in the mode that ignores AccECN Options, it | |||
MUST adopt the conservative interpretation of the ACE field | <bcp14>MUST</bcp14> adopt the conservative interpretation of the A | |||
CE field | ||||
discussed in <xref target="accecn_ACE_Safety"/>.</t> | discussed in <xref target="accecn_ACE_Safety"/>.</t> | |||
<t>Note that the Data Sender <bcp14>MUST NOT</bcp14> test whether | ||||
<t>Note that the Data Sender MUST NOT test whether the arriving | the arriving | |||
byte counters in an initial AccECN Option have been initialized | byte counters in an initial AccECN Option have been initialized | |||
to specific valid values - the above checks solely test whether | to specific valid values -- the above checks solely test whether | |||
these fields have been incorrectly zeroed. This allows hosts to | these fields have been incorrectly zeroed. This allows hosts to | |||
use different initial values as an additional signalling channel | use different initial values as an additional signalling channel | |||
in future. Also note that the initial value of either field | in the future. Also note that the initial value of either field | |||
might be greater than its expected initial value, because the | might be greater than its expected initial value, because the | |||
counters might already have been incremented. Nonetheless, the | counters might already have been incremented. Nonetheless, the | |||
initial values of the counters have been chosen so that they | initial values of the counters have been chosen so that they | |||
cannot wrap to zero on these initial segments.</t> | cannot wrap to zero on these initial segments.</t> | |||
</section> | </section> | |||
<section> | ||||
<section title="Consistency between AccECN Feedback Fields"> | <name>Consistency Between AccECN Feedback Fields</name> | |||
<t>When AccECN Options are available they ought to provide more | <t>When AccECN Options are available, they ought to provide more | |||
unambiguous feedback. However, they supplement but do not | unambiguous feedback. However, they supplement but do not | |||
replace the ACE field. An endpoint using AccECN feedback MUST | replace the ACE field. An endpoint using AccECN feedback <bcp14>MUST</bcp14> | |||
always reconcile the information provided in the ACE field with | always reconcile the information provided in the ACE field with | |||
that in any AccECN Option, so that the state of the ACE-related | that in any AccECN Option, so that the state of the ACE-related | |||
packet counter can be relied on if future feedback does not | packet counter can be relied on if future feedback does not | |||
carry an AccECN Option.</t> | carry an AccECN Option.</t> | |||
<t>If an AccECN Option is present, the s.cep counter might | <t>If an AccECN Option is present, the s.cep counter might | |||
increase more than expected from the increase of the s.ceb | increase more than expected from the increase of the s.ceb | |||
counter (e.g., due to a CE-marked control packet). The | counter (e.g., due to a CE-marked control packet). The | |||
sender's response to such a situation is out of scope, and needs | sender's response to such a situation is out of scope, and needs | |||
to be dealt with in a specification that uses ECN-capable | to be dealt with in a specification that uses ECN-capable | |||
control packets. Theoretically, this situation could also occur | control packets. Theoretically, this situation could also occur | |||
if a middlebox mangled an AccECN Option but not the ACE field. | if a middlebox mangled an AccECN Option but not the ACE field. | |||
However, the Data Sender has to assume that the integrity of | However, the Data Sender has to assume that the integrity of | |||
AccECN Options is sound, based on the above test of the | AccECN Options is sound, based on the above test of the | |||
well-known initial values and optionally other integrity tests | well-known initial values and optionally other integrity tests | |||
(<xref target="accecn_Integrity"/>).</t> | (<xref target="accecn_Integrity"/>).</t> | |||
<t>If either endpoint detects that the s.ceb counter has | <t>If either endpoint detects that the s.ceb counter has | |||
increased but the s.cep has not (and by testing ACK coverage it | increased but the s.cep has not (and by testing ACK coverage it | |||
is certain how much the ACE field has wrapped), and if there is | is certain how much the ACE field has wrapped), and if there is | |||
no explanation other than an invalid protocol transition due to | no explanation other than an invalid protocol transition due to | |||
some form of feedback mangling, the Data Sender MUST disable | some form of feedback mangling, the Data Sender <bcp14>MUST</bcp14> disable | |||
sending ECN-capable packets for the remainder of the | sending ECN-capable packets for the remainder of the | |||
half-connection by setting the IP-ECN field in all subsequent | half-connection by setting the IP-ECN field in all subsequent | |||
packets to Not-ECT.<!--There is no need to say the following for forward compatibility: | packets to Not-ECT.<!--There is no need to say the following for forward compatibility: | |||
"If a data receiver negotiates AccECN but then deliberately makes the counters inconsistent, | "If a data receiver negotiates AccECN but then deliberately makes the counters inconsistent, | |||
it MUST continue the connection | it MUST continue the connection | |||
even if the data sender does not disable sending ECN-capable packets."--></t> | even if the data sender does not disable sending ECN-capable packets."--> | |||
</t> | ||||
</section> | </section> | |||
</section> | </section> | |||
<section anchor="accecn_option_usage"> | ||||
<section anchor="accecn_option_usage" | <name>Usage of the AccECN TCP Option</name> | |||
title="Usage of the AccECN TCP Option"> | ||||
<t>If a Data Receiver in AccECN mode intends to use AccECN TCP | <t>If a Data Receiver in AccECN mode intends to use AccECN TCP | |||
Options to provide feedback, the rules below determine when it | Options to provide feedback, the rules below determine when to | |||
includes an AccECN TCP Option, and which fields to include, given | include an AccECN TCP Option, and which fields to include, given | |||
other options might be competing for limited option space:<list | other options might be competing for limited option space:</t> | |||
style="hanging"> | <dl newline="false" spacing="normal"> | |||
<t hangText="Importance of Congestion Control:">AccECN is for | <dt>Importance of Congestion Control:</dt> | |||
congestion control, which implementations SHOULD generally | <dd> | |||
<t>AccECN is for | ||||
congestion control, which implementations <bcp14>SHOULD</bcp14> | ||||
generally | ||||
prioritize over other TCP options when there is insufficient | prioritize over other TCP options when there is insufficient | |||
space for all the options in use.<vspace blankLines="1"/>If | space for all the options in use.</t> | |||
<t>If | ||||
SACK has been negotiated <xref target="RFC2018"/>, and the | SACK has been negotiated <xref target="RFC2018"/>, and the | |||
smallest recommended AccECN Option would leave insufficient | smallest recommended AccECN Option would leave insufficient | |||
space for two SACK blocks on a particular ACK, the Data | space for two SACK blocks on a particular ACK, the Data | |||
Receiver MUST give precedence to the SACK option (total 18 | Receiver <bcp14>MUST</bcp14> give precedence to the SACK option (total 18 | |||
octets), because loss feedback is more critical.</t> | octets), because loss feedback is more critical.</t> | |||
</dd> | ||||
<dt>Recommended Simple Scheme:</dt> | ||||
<dd> | ||||
<!-- [rfced] For ease of the reader, we suggest adding a pointer to the examples | ||||
. | ||||
<t hangText="Recommended Simple Scheme:">The Data Receiver | Original: | |||
SHOULD include an AccECN TCP Option on every scheduled ACK if | Recommended Simple Scheme: The Data Receiver SHOULD include an | |||
AccECN TCP Option on every scheduled ACK if any byte counter has | ||||
incremented since the last ACK. Whenever possible, it SHOULD | ||||
include a field for every byte counter that has changed at some | ||||
time during the connection (see examples later). | ||||
--> | ||||
<t>The Data Receiver | ||||
<bcp14>SHOULD</bcp14> include an AccECN TCP Option on every sche | ||||
duled ACK if | ||||
any byte counter has incremented since the last ACK. Whenever | any byte counter has incremented since the last ACK. Whenever | |||
possible, it SHOULD include a field for every byte counter | possible, it <bcp14>SHOULD</bcp14> include a field for every byte counter | |||
that has changed at some time during the connection (see | that has changed at some time during the connection (see | |||
examples later). <vspace blankLines="1"/>A scheduled ACK means | examples later). </t> | |||
<t>A scheduled ACK means | ||||
an ACK that the Data Receiver would send by its regular | an ACK that the Data Receiver would send by its regular | |||
delayed ACK rules. Recall that <xref | delayed ACK rules. Recall that <xref target="accecn_Terminology" | |||
target="accecn_Terminology"/> defines an 'ACK' as either with | /> defines an 'ACK' as either with | |||
data payload or without. But the above rule is worded so that, | data payload or without. But the above rule is worded so that, | |||
in the common case when most of the data is from a Server to a | in the common case when most of the data is from a Server to a | |||
Client, the Server only includes an AccECN TCP Option while it | Client, the Server only includes an AccECN TCP Option while it | |||
is acknowledging data from the Client.</t> | is acknowledging data from the Client.</t> | |||
</list>When available TCP option space is limited on particular | </dd> | |||
</dl> | ||||
<t>When available TCP option space is limited on particular | ||||
packets, the recommended scheme will need to include compromises. | packets, the recommended scheme will need to include compromises. | |||
To guide the implementer the rules below are ranked in order of | To guide the implementer, the rules below are ranked in order of | |||
importance, but the final decision has to be | importance, but the final decision has to be | |||
implementation-dependent, because tradeoffs will alter as new TCP | implementation-dependent, because tradeoffs will alter as new TCP | |||
options are defined and new use-cases arise.<list style="hanging"> | options are defined and new use-cases arise.</t> | |||
<t hangText="Necessary Option Length:">When TCP option space | <dl newline="false" spacing="normal"> | |||
is limited, an AccECN TCP option MAY be truncated to omit one | <dt>Necessary Option Length:</dt> | |||
<dd> | ||||
<t>When TCP option space | ||||
is limited, an AccECN TCP option <bcp14>MAY</bcp14> be truncated | ||||
to omit one | ||||
or two fields from the end of the option, as indicated by the | or two fields from the end of the option, as indicated by the | |||
permitted variants listed in | permitted variants listed in | |||
<xref target="accecn_Fig_TCPopttab"/>, provided that the | <xref target="accecn_Fig_TCPopttab"/>, provided that the | |||
counter(s) that have changed since the previous AccECN TCP | counter(s) that have changed since the previous AccECN TCP | |||
option are not omitted.<vspace blankLines="1"/> | option are not omitted.</t> | |||
<t> | ||||
If there is insufficient space to include an AccECN TCP | If there is insufficient space to include an AccECN TCP | |||
option containing the counter(s) that have changed since | option containing the counter(s) that have changed since | |||
the previous AccECN TCP option, then the entire AccECN | the previous AccECN TCP option, then the entire AccECN | |||
TCP option MUST be omitted (see <xref target="accecn_option"/>) | TCP option <bcp14>MUST</bcp14> be omitted (see <xref target="ac | |||
.</t> | cecn_option"/>).</t> | |||
</dd> | ||||
<t hangText="Change-Triggered AccECN TCP Options:">If an | <dt>Change-Triggered AccECN TCP Options:</dt> | |||
<dd> | ||||
<t>If an | ||||
arriving packet increments a different byte counter to that | arriving packet increments a different byte counter to that | |||
incremented by the previous packet, the Data Receiver SHOULD | incremented by the previous packet, the Data Receiver <bcp14>SHOULD</bcp14> | |||
feed it back in an AccECN Option on the next scheduled ACK. | feed it back in an AccECN Option on the next scheduled ACK. | |||
<vspace blankLines="1"/> | </t> | |||
<t> | ||||
For the avoidance of doubt, this rule | For the avoidance of doubt, this rule | |||
does not concern the arrival of control packets with no | does not concern the arrival of control packets with no | |||
payload, because they cannot alter any byte counters.</t> | payload, because they cannot alter any byte counters.</t> | |||
</dd> | ||||
<t hangText="Continual Repetition:">Otherwise, if arriving | <dt>Continual Repetition:</dt> | |||
packets continue to increment the same byte counter:<list | <dd> | |||
style="symbols"> | <t>Otherwise, if arriving | |||
<t>the Data Receiver SHOULD include a counter that has | packets continue to increment the same byte counter:</t> | |||
<ul spacing="normal"> | ||||
<li> | ||||
<t>the Data Receiver <bcp14>SHOULD</bcp14> include a counter | ||||
that has | ||||
continued to increment on the next scheduled ACK following | continued to increment on the next scheduled ACK following | |||
a change-triggered AccECN TCP Option;</t> | a change-triggered AccECN TCP Option;</t> | |||
</li> | ||||
<li> | ||||
<t>while the same counter continues to increment, it | <t>while the same counter continues to increment, it | |||
SHOULD include the counter every n ACKs as consistently as | <bcp14>SHOULD</bcp14> include the counter every n ACKs as consistently as | |||
possible, where n can be chosen by the implementer;</t> | possible, where n can be chosen by the implementer;</t> | |||
</li> | ||||
<t>It SHOULD always include an AccECN Option if the r.ceb | <li> | |||
counter is incrementing and it MAY include an AccECN | <t>It <bcp14>SHOULD</bcp14> always include an AccECN Option | |||
if the r.ceb | ||||
counter is incrementing and it <bcp14>MAY</bcp14> include an | ||||
AccECN | ||||
Option if r.e0b or r.e1b is incrementing;</t> | Option if r.e0b or r.e1b is incrementing;</t> | |||
</li> | ||||
<t>It SHOULD include each counter at least once for every | <li> | |||
<t>It <bcp14>SHOULD</bcp14> include each counter at least on | ||||
ce for every | ||||
2^22 bytes incremented to prevent overflow during | 2^22 bytes incremented to prevent overflow during | |||
continual repetition.</t> | continual repetition.</t> | |||
</list></t> | </li> | |||
</list></t> | </ul> | |||
</dd> | ||||
<t>The above rules complement those in <xref | </dl> | |||
target="accecn_ACE_Safety"/>, which determine when to generate an | <t>The above rules complement those in <xref target="accecn_ACE_Safe | |||
ty"/>, which determine when to generate an | ||||
ACK irrespective of whether an AccECN TCP Option is to be | ACK irrespective of whether an AccECN TCP Option is to be | |||
included.</t> | included.</t> | |||
<t>The recommended scheme is intended as a simple way to ensure | <t>The recommended scheme is intended as a simple way to ensure | |||
that all the relevant byte counters will be carried on any ACK | that all the relevant byte counters will be carried on any ACK | |||
that reaches the Data Sender, no matter how many pure ACKs are | that reaches the Data Sender, no matter how many pure ACKs are | |||
filtered or coalesced along the network path, and without | filtered or coalesced along the network path, and without | |||
consuming the space available for payload data with counter | consuming the space available for payload data with counter | |||
field(s) that have never changed.</t> | field(s) that have never changed.</t> | |||
<t>As an example of the recommended scheme, if ECT(0) is the only | <t>As an example of the recommended scheme, if ECT(0) is the only | |||
codepoint that has ever arrived in the IP-ECN field, the Data | codepoint that has ever arrived in the IP-ECN field, the Data | |||
Receiver will feed back an AccECN0 TCP Option with only the EE0B | Receiver will feed back an AccECN0 TCP Option with only the EE0B | |||
field on every packet that acknowledges new data. However, as soon | field on every packet that acknowledges new data. However, as soon | |||
as even one CE-marked packet arrives, on every packet that | as even one CE-marked packet arrives, on every packet that | |||
acknowledges new data it will start to include an option with two | acknowledges new data it will start to include an option with two | |||
fields, EE0B and ECEB. As a second example, if the first packet to | fields, EE0B and ECEB. As a second example, if the first packet to | |||
arrive happens to be CE-marked, the Data Receiver will have to | arrive happens to be CE marked, the Data Receiver will have to | |||
arbitrarily choose whether to precede the ECEB field with an EE0B | arbitrarily choose whether to precede the ECEB field with an EE0B | |||
field or an EE1B field. If it chooses, say, EE0B but it turns out | field or an EE1B field. If it chooses, say, EE0B but it turns out | |||
never to receive ECT(0), it can start sending EE1B and ECEB | never to receive ECT(0), it can start sending EE1B and ECEB | |||
instead - it does not have to include the EE0B field if the r.e0b | instead -- it does not have to include the EE0B field if the r.e0b | |||
counter has never changed during the connection.</t> | counter never changed during the connection.</t> | |||
<t>With the recommended scheme, if the data sending direction | <t>With the recommended scheme, if the data sending direction | |||
switches during a connection, there can be cases where the AccECN | switches during a connection, there can be cases where the AccECN | |||
TCP Option that is meant to feed back the counter values at the | TCP Option that is meant to feed back the counter values at the | |||
end of a volley in one direction never reaches the other peer, due | end of a volley in one direction never reaches the other peer due | |||
to packet loss. ACE feedback ought to be sufficient to fill this | to packet loss. ACE feedback ought to be sufficient to fill this | |||
gap, given accurate feedback becomes moot after data transmission | gap, given accurate feedback becomes moot after data transmission | |||
has paused.</t> | has paused.</t> | |||
<t><xref target="accecn_Algo_ACE_Bytes"/> gives an example | <t><xref target="accecn_Algo_ACE_Bytes"/> gives an example | |||
algorithm to estimate the number of marked bytes from the ACE | algorithm to estimate the number of marked bytes from the ACE | |||
field alone, if AccECN Options are not available.</t> | field alone, if AccECN Options are not available.</t> | |||
<t>If a host has determined that segments with AccECN Options | <t>If a host has determined that segments with AccECN Options | |||
always seem to be discarded somewhere along the path, it is no | always seem to be discarded somewhere along the path, it is no | |||
longer obliged to follow any of the rules in this section.</t> | longer obliged to follow any of the rules in this section.</t> | |||
</section> | </section> | |||
</section> | </section> | |||
</section> | </section> | |||
<section anchor="accecn_Mbox_Operation"> | ||||
<section anchor="accecn_Mbox_Operation" | <name>AccECN Compliance Requirements for TCP Proxies, Offload Engines, a | |||
title="AccECN Compliance Requirements for TCP Proxies, Offload En | nd Other Middleboxes</name> | |||
gines and other Middleboxes"> | ||||
<t>Given AccECN alters the TCP protocol on the wire, this section | <t>Given AccECN alters the TCP protocol on the wire, this section | |||
specifies new requirements on certain networking equipment that | specifies new requirements on certain networking equipment that | |||
forwards TCP and inspects TCP header information.</t> | forwards TCP and inspects TCP header information.</t> | |||
<section> | ||||
<section title="Requirements for TCP Proxies"> | <name>Requirements for TCP Proxies</name> | |||
<t>A large class of middleboxes split TCP connections. Such a | <t>A large class of middleboxes split TCP connections. Such a | |||
middlebox would be compliant with the AccECN protocol if the TCP | middlebox would be compliant with the AccECN protocol if the TCP | |||
implementation on each side complied with the present AccECN | implementation on each side complied with the present AccECN | |||
specification and each side negotiated AccECN independently of the | specification and each side negotiated AccECN independently of the | |||
other side.</t> | other side.</t> | |||
</section> | </section> | |||
<section anchor="accecn_middlebox_transparent_normalizers"> | ||||
<section anchor="accecn_middlebox_transparent_normalizers" | <name>Requirements for Transparent Middleboxes and TCP Normalizers</na | |||
title="Requirements for Transparent Middleboxes and TCP Normali | me> | |||
zers"> | ||||
<t>Another large class of middleboxes intervenes to some degree at | <t>Another large class of middleboxes intervenes to some degree at | |||
the transport layer, but attempts to be transparent (invisible) to | the transport layer, but attempts to be transparent (invisible) to | |||
the end-to-end connection. A subset of this class of middleboxes | the end-to-end connection. A subset of this class of middleboxes | |||
attempts to `normalize' the TCP wire protocol by checking that all | attempts to 'normalize' the TCP wire protocol by checking that all | |||
values in header fields comply with a rather narrow interpretation | values in header fields comply with a rather narrow interpretation | |||
of the TCP specifications that is also not always up to date.</t> | of the TCP specifications that is not always up to date.</t> | |||
<t>A middlebox that is not normalizing the TCP protocol and does not | <t>A middlebox that is not normalizing the TCP protocol and does not | |||
itself act as a back-to-back pair of TCP endpoints (i.e., a | itself act as a back-to-back pair of TCP endpoints (i.e., a | |||
middlebox that intends to be transparent or invisible at the | middlebox that intends to be transparent or invisible at the | |||
transport layer) ought to forward AccECN TCP Options unaltered, | transport layer) ought to forward AccECN TCP Options unaltered, | |||
whether or not the length value matches one of those specified in | whether or not the length value matches one of those specified in | |||
<xref target="accecn_option"/>, and whether or not the initial | <xref target="accecn_option"/>, and whether or not the initial | |||
values of the byte-counter fields match those in <xref | values of the byte-counter fields match those in <xref target="accecn_ | |||
target="accecn_init_counters"/>. This is because blocking apparently | init_counters"/>. This is because blocking apparently | |||
invalid values prevents the standardized set of values being | invalid values prevents the standardized set of values from being | |||
extended in future (such outdated normalizers would block updated | extended in the future (such outdated normalizers would block updated | |||
hosts from using the extended AccECN standard).</t> | hosts from using the extended AccECN standard).</t> | |||
<t>A TCP normalizer is likely to block or alter an AccECN TCP Option | <t>A TCP normalizer is likely to block or alter an AccECN TCP Option | |||
if the length value or the initial values of its byte-counter fields | if the length value or the initial values of its byte-counter fields | |||
do not match one of those specified in <xref | do not match one of those specified in Sections <xref target="accecn_o | |||
target="accecn_option"/> or <xref target="accecn_init_counters"/>. | ption" format="counter"/> or <xref target="accecn_init_counters" format="counter | |||
"/>. | ||||
However, to comply with the present AccECN specification, a | However, to comply with the present AccECN specification, a | |||
middlebox MUST NOT change the ACE field; or those fields of an | middlebox <bcp14>MUST NOT</bcp14> change the ACE field; or those field | |||
AccECN Option that are currently specified in <xref | s of an | |||
target="accecn_option"/>; or any AccECN field covered by integrity | AccECN Option that are currently specified in <xref target="accecn_opt | |||
protection (e.g., <xref target="RFC5925"/>).</t> | ion"/>; or any AccECN field covered by integrity | |||
protection (e.g., <xref target="RFC5925"/>).</t> | ||||
<!-- This includes the explicitly stated requirements to forward | <!-- This includes the explicitly stated requirements to forward | |||
Reserved (Rsvd) and Currently Unused (CU) values unaltered. | Reserved (Rsvd) and Currently Unused (CU) values unaltered. | |||
An 'ideal' TCP normalizer would not have to change to accommodate AccECN, because AccECN does not directly | An 'ideal' TCP normalizer would not have to change to accommodate AccECN, because AccECN does not directly | |||
contravene any existing TCP specifications, | contravene any existing TCP specifications, | |||
even though it uses existing TCP fields in unorthodox ways. | even though it uses existing TCP fields in unorthodox ways. | |||
--> | --> | |||
</section> | </section> | |||
<section> | ||||
<name>Requirements for TCP ACK Filtering</name> | ||||
<!-- [rfced] Mention of BCP 69 was removed so the HTML and PDF could link direc | ||||
tly to Section 5.2.1 of RFC 3449. Would you prefer that BCP 69 be included as t | ||||
he cite tag? | ||||
<section title="Requirements for TCP ACK Filtering"> | Original: | |||
<t>Section 5.2.1 of BCP 69 <xref target="RFC3449"/> gives best | Section 5.2.1 of BCP 69 [RFC3449] gives best current practice on | |||
current practice on filtering (aka. thinning or coalescing) of pure | filtering (aka. thinning or coalescing) of pure TCP ACKs. | |||
Perhaps: | ||||
Section 5.2.1 of RFC 3449 [BCP69] gives best current practice on | ||||
filtering (aka thinning or coalescing) of pure TCP ACKs. | ||||
--> | ||||
<t><xref target="RFC3449" sectionFormat="of" section="5.2.1"/> gives b | ||||
est | ||||
current practice on filtering (aka thinning or coalescing) of pure | ||||
TCP ACKs. It advises that filtering ACKs carrying ECN feedback ought | TCP ACKs. It advises that filtering ACKs carrying ECN feedback ought | |||
to preserve the correct operation of ECN feedback. As the present | to preserve the correct operation of ECN feedback. As the present | |||
specification updates the operation of ECN feedback, this section | specification updates the operation of ECN feedback, this section | |||
discusses how an ACK filter might preserve correct operation of | discusses how an ACK filter might preserve correct operation of | |||
AccECN feedback as well.</t> | AccECN feedback as well.</t> | |||
<t>The problem divides into two parts: determining if an ACK is part | <t>The problem divides into two parts: determining if an ACK is part | |||
of a connection that is using AccECN and then preserving the correct | of a connection that is using AccECN and then preserving the correct | |||
operation of AccECN feedback:<list style="symbols"> | operation of AccECN feedback:</t> | |||
<ul spacing="normal"> | ||||
<li> | ||||
<!-- [rfced] Does "even if it is" refer to using AccECN without ECN++ or with E | ||||
CN++? | ||||
Original: | ||||
However, it might omit some AccECN ACKs, because | ||||
AccECN can be used without ECN++ and even if it is, ECN++ does not | ||||
have to make pure ACKs ECN-capable - only deployment experience | ||||
will tell. | ||||
Perhaps: | ||||
However, it might omit some AccECN ACKs because | ||||
AccECN can be used without ECN++. Even if ECN++ is used, it does not | ||||
have to make pure ACKs ECN-capable - only deployment experience | ||||
will tell. | ||||
--> | ||||
<t>To determine whether a pure TCP ACK is part of an AccECN | <t>To determine whether a pure TCP ACK is part of an AccECN | |||
connection without resorting to connection tracking and per-flow | connection without resorting to connection tracking and per-flow | |||
state, a useful heuristic would be to check for a non-zero ECN | state, a useful heuristic would be to check for a non-zero ECN | |||
field at the IP layer (because the ECN++ experiment only allows | field at the IP layer (because the ECN++ experiment only allows | |||
TCP pure ACKs to be ECN-capable if AccECN has been negotiated | TCP pure ACKs to be ECN-capable if AccECN has been negotiated | |||
<xref target="I-D.ietf-tcpm-generalized-ecn"/>). This heuristic | <xref target="I-D.ietf-tcpm-generalized-ecn"/>). This heuristic | |||
is simple and stateless. However, it might omit some AccECN | is simple and stateless. However, it might omit some AccECN | |||
ACKs, because AccECN can be used without ECN++ and even if it | ACKs, because AccECN can be used without ECN++ and even if it | |||
is, ECN++ does not have to make pure ACKs ECN-capable - only | is, ECN++ does not have to make pure ACKs ECN-capable -- only | |||
deployment experience will tell. Also, TCP ACKs might be | deployment experience will tell. Also, TCP ACKs might be | |||
ECN-capable owing to some scheme other than AccECN, | ECN-capable owing to some scheme other than AccECN, | |||
e.g., <xref target="RFC5690"/> or some future standards | e.g., <xref target="RFC5690"/> or some future standards | |||
action. Again, only deployment experience will tell.</t> | action. Again, only deployment experience will tell.</t> | |||
</li> | ||||
<li> | ||||
<t>The main concern with preserving correct AccECN operation | <t>The main concern with preserving correct AccECN operation | |||
involves leaving enough ACKs for the Data Sender to work out | involves leaving enough ACKs for the Data Sender to work out | |||
whether the 3-bit ACE field has wrapped. In the worst case, in | whether the 3-bit ACE field has wrapped. In the worst case, in | |||
feedback about a run of received packets that were all | feedback about a run of received packets that were all | |||
ECN-marked, the ACE field will wrap every 8 acknowledged | ECN-marked, the ACE field will wrap every 8 acknowledged | |||
packets. ACE field wrap might be of less concern if packets also | packets. ACE field wrap might be of less concern if packets also | |||
carry AccECN TCP Options. However, note that logic to read an | carry AccECN TCP Options. However, note that logic to read an | |||
AccECN TCP Option is optional to implement (albeit recommended | AccECN TCP Option is optional to implement (albeit recommended | |||
— see <xref target="accecn_option"/>). So one end writing | -- see <xref target="accecn_option"/>). So one end writing | |||
an AccECN TCP Option into a packet does not necessarily imply | an AccECN TCP Option into a packet does not necessarily imply | |||
that the other end will read it.</t> | that the other end will read it.</t> | |||
</list></t> | </li> | |||
</ul> | ||||
<t>Note that the present specification of AccECN in TCP does not | <t>Note that the present specification of AccECN in TCP does not | |||
presume to rely on any of the above ACK filtering behaviour in the | presume to rely on any of the above ACK filtering behaviour in the | |||
network, because it has to be robust against pre-existing network | network, because it has to be robust against pre-existing network | |||
nodes that do not distinguish AccECN ACKs, and robust against ACK | nodes that do not distinguish AccECN ACKs, and robust against ACK | |||
loss during overload more generally.</t> | loss during overload more generally.</t> | |||
</section> | </section> | |||
<section> | ||||
<section title="Requirements for TCP Segmentation Offload and Large Rece | <name>Requirements for TCP Segmentation Offload and Large Receive Offl | |||
ive Offload"> | oad</name> | |||
<t>Hardware to offload certain TCP processing represents another | <t>Hardware to offload certain TCP processing represents another | |||
large class of middleboxes (even though it is often a function of a | large class of middleboxes (even though it is often a function of a | |||
host's network interface and rarely in its own 'box').</t> | host's network interface and rarely in its own 'box').</t> | |||
<t>Offloading can happen in the transmit path, usually referred to as | <t>Offloading can happen in the transmit path, usually referred to as | |||
TCP Segmentation Offload (TSO), and the receive path where it is called | TCP Segmentation Offload (TSO), and the receive path where it is called | |||
Large Receive Offload (LRO).</t> | Large Receive Offload (LRO).</t> | |||
<t>In the transmit direction, with AccECN, all segments created from | <t>In the transmit direction, with AccECN, all segments created from | |||
the same super-segment should retain the same ACE field, which should | the same super-segment should retain the same ACE field, which should | |||
make TSO straightforward.</t> | make TSO straightforward.</t> | |||
<t>However, with TSO hardware that supports <xref target="RFC3168"/>, | <t>However, with TSO hardware that supports <xref target="RFC3168"/>, | |||
the CWR bit is usually masked out on the middle and last segment. | the CWR bit is usually masked out on the middle and last segments. | |||
If applied to an AccECN segment, this would change the ACE field, and | If applied to an AccECN segment, this would change the ACE field, and | |||
would be interpreted as having received numerous CE marks in the | would be interpreted as having received numerous CE marks in the | |||
receive direction. Therefore, currently available TSO hardware with | receive direction. Therefore, currently available TSO hardware with | |||
<xref target="RFC3168"/> support may need some minor driver changes, | <xref target="RFC3168"/> support may need some minor driver changes, | |||
to adjust the bitmask for the first, middle and last segment processed | to adjust the bitmask for the first, middle, and last segments processed | |||
with TSO.</t> | with TSO.</t> | |||
<t>Initially, when Classic ECN <xref target="RFC3168"/> and Accurate ECN flows | <t>Initially, when Classic ECN <xref target="RFC3168"/> and Accurate ECN flows | |||
coexist on the same offloading engine, the host software may need to | coexist on the same offloading engine, the host software may need to | |||
work around incompatibilities (e.g., when only global configurable | work around incompatibilities (e.g., when only global configurable | |||
TSO TCP Flag bitmasks are available), otherwise this would cause some | TSO TCP Flag bitmasks are available), otherwise this would cause some | |||
issues.</t> | issues.</t> | |||
<!-- [rfced] Instead of using [RFC3168] as an adjective, may we update this text | ||||
to refer to "Classic ECN"? | ||||
Original: | ||||
One way around this could be to only negotiate for Accurate ECN, but | ||||
not offer a fall back to [RFC3168] ECN. | ||||
Perhaps: | ||||
One way around this could be to only negotiate for Accurate ECN, but | ||||
not offer a fall back to Classic ECN [RFC3168]. | ||||
Original: | ||||
For LRO in the receive direction, a different issue may get exposed | ||||
with [RFC3168] ECN supporting hardware. | ||||
Perhaps: | ||||
For LRO in the receive direction, a different issue may get exposed | ||||
with Classic-ECN [RFC3168] supporting hardware. | ||||
--> | ||||
<t>One way around this could be to only negotiate for Accurate ECN, | <t>One way around this could be to only negotiate for Accurate ECN, | |||
but not offer a fall back to <xref target="RFC3168"/> ECN. Another way | but not offer a fall back to <xref target="RFC3168"/> ECN. Another way | |||
could be to allow TSO only as long as the CWR flag in the TCP header | could be to allow TSO only as long as the CWR flag in the TCP header | |||
is not set - at the cost of more processing overhead while the ACE | is not set -- at the cost of more processing overhead while the ACE | |||
field has this bit set.</t> | field has this bit set.</t> | |||
<t>For LRO in the receive direction, a different issue may get | <t>For LRO in the receive direction, a different issue may get | |||
exposed with <xref target="RFC3168"/> ECN supporting hardware.</t> | exposed with <xref target="RFC3168"/> ECN supporting hardware.</t> | |||
<t>The ACE field changes with every received CE marking, so today's | <t>The ACE field changes with every received CE marking, so today's | |||
receive offloading could lead to many interrupts in high congestion | receive offloading could lead to many interrupts in high congestion | |||
situations. Although that would be useful (because congestion | situations. Although that would be useful (because congestion | |||
information is received sooner), it could also significantly | information is received sooner), it could also significantly | |||
increase processor load, particularly in scenarios such as DCTCP or | increase processor load, particularly in scenarios such as DCTCP or | |||
L4S where the marking rate is generally higher.</t> | L4S where the marking rate is generally higher.</t> | |||
<t>Current offload hardware ejects a segment from the coalescing | <t>Current offload hardware ejects a segment from the coalescing | |||
process whenever the TCP ECN flags change. In data centres it has | process whenever the TCP ECN flags change. In data centres, it has | |||
been fortunate for this offload hardware that DCTCP-style feedback | been fortunate for this offload hardware that DCTCP-style feedback | |||
changes less often when there are long sequences of CE marks, which | changes less often when there are long sequences of CE marks, which | |||
is more common with a step marking threshold (but less likely the | is more common with a step marking threshold (but less likely the | |||
more short flows are in the mix). The ACE counter approach has been | more short flows are in the mix). The ACE counter approach has been | |||
designed so that coalescing can continue over arbitrary patterns of | designed so that coalescing can continue over arbitrary patterns of | |||
marking and only needs to stop when the counter wraps. Nonetheless, | marking and only needs to stop when the counter wraps. Nonetheless, | |||
until the particular offload hardware in use implements this more | until the particular offload hardware in use implements this more | |||
efficient approach, it is likely to be more efficient for AccECN | efficient approach, it is likely to be more efficient for AccECN | |||
connections to implement this counter-style logic using software | connections to implement this counter-style logic using software | |||
segmentation offload.</t> | segmentation offload.</t> | |||
skipping to change at line 2625 ¶ | skipping to change at line 2608 ¶ | |||
been fortunate for this offload hardware that DCTCP-style feedback | been fortunate for this offload hardware that DCTCP-style feedback | |||
changes less often when there are long sequences of CE marks, which | changes less often when there are long sequences of CE marks, which | |||
is more common with a step marking threshold (but less likely the | is more common with a step marking threshold (but less likely the | |||
more short flows are in the mix). The ACE counter approach has been | more short flows are in the mix). The ACE counter approach has been | |||
designed so that coalescing can continue over arbitrary patterns of | designed so that coalescing can continue over arbitrary patterns of | |||
marking and only needs to stop when the counter wraps. Nonetheless, | marking and only needs to stop when the counter wraps. Nonetheless, | |||
until the particular offload hardware in use implements this more | until the particular offload hardware in use implements this more | |||
efficient approach, it is likely to be more efficient for AccECN | efficient approach, it is likely to be more efficient for AccECN | |||
connections to implement this counter-style logic using software | connections to implement this counter-style logic using software | |||
segmentation offload.</t> | segmentation offload.</t> | |||
<t>ECN encodes a varying signal in the ACK stream, so it is | <t>ECN encodes a varying signal in the ACK stream, so it is | |||
inevitable that offload hardware will ultimately need to handle any | inevitable that offload hardware will ultimately need to handle any | |||
form of ECN feedback exceptionally. The ACE field has been designed | form of ECN feedback exceptionally. The ACE field has been designed | |||
as a counter so that it is straightforward for offload hardware to | as a counter so that it is straightforward for offload hardware to | |||
pass on the highest counter, and to push a segment from its cache | pass on the highest counter, and to push a segment from its cache | |||
before the counter wraps. The purpose of working towards | before the counter wraps. The purpose of working towards | |||
standardized TCP ECN feedback is to reduce the risk for hardware | standardized TCP ECN feedback is to reduce the risk for hardware | |||
developers, who would otherwise have to guess which scheme is likely | developers, who would otherwise have to guess which scheme is likely | |||
to become dominant.</t> | to become dominant.</t> | |||
<t>The above process has been designed to enable a continuing | <t>The above process has been designed to enable a continuing | |||
incremental deployment path - to more highly dynamic congestion | incremental deployment path -- to more highly dynamic congestion | |||
control. Once offload hardware supports AccECN, it will be able to | control. Once offload hardware supports AccECN, it will be able to | |||
coalesce efficiently for any sequence of marks, instead of relying | coalesce efficiently for any sequence of marks, instead of relying | |||
for efficiency on the long marking sequences from step marking. In | on the long marking sequences from step marking for efficiency. In | |||
the next stage, marking can evolve from a step to a ramp function. | the next stage, marking can evolve from a step to a ramp function. | |||
That in turn will allow host congestion control algorithms to | That in turn will allow host congestion control algorithms to | |||
respond faster to dynamics, while being backwards compatible with | respond faster to dynamics, while being backwards compatible with | |||
existing host algorithms.</t> | existing host algorithms.</t> | |||
</section> | </section> | |||
</section> | </section> | |||
</section> | </section> | |||
<section anchor="accecn_3168_updates"> | ||||
<section anchor="accecn_3168_updates" title="Updates to RFC 3168"> | <name>Updates to RFC 3168</name> | |||
<t>This section clarifies which parts of RFC3168 are updated and maps | <t>This section clarifies which parts of RFC 3168 are updated and maps | |||
them to the sections of the present AccECN specification that update | them to the relevant updated sections of the present AccECN specification. | |||
them: <list style="symbols"> | </t> | |||
<t>The whole of "6.1.1 TCP Initialization" of <xref | <ul spacing="normal"> | |||
target="RFC3168"/> is updated by <xref target="accecn_Negotiation"/> | <li> | |||
<t>The whole of <xref target="RFC3168" sectionFormat="of" | ||||
section="6.1.1"/> is updated by <xref target="accecn_Negotiation"/> | ||||
of the present specification.</t> | of the present specification.</t> | |||
</li> | ||||
<!-- [rfced] Throughout: We have removed the section titles and linked the secti | ||||
on numbers directly to the section of the RFC specified. For example, the text | ||||
has been updated as follows: | ||||
<t>In "6.1.2. The TCP Sender" of <xref target="RFC3168"/>, all | Original: | |||
* The whole of "6.1.1 TCP Initialization" of [RFC3168] is updated by | ||||
Section 3.1 of the present specification. | ||||
Current: | ||||
* The whole of Section 6.1.1 of [RFC3168] is updated by Section 3.1 | ||||
of the present specification. | ||||
In the HTML and PDF files, "Section 6.1.1" links to Section 6.1.1 of RFC 3168. P | ||||
lease review and let us know if you prefer the section titles be included. | ||||
--> | ||||
<li> | ||||
<t>In <xref target="RFC3168" sectionFormat="of" section="6.1.2"/>, all | ||||
mentions of a congestion response to an ECN-Echo (ECE) ACK packet | mentions of a congestion response to an ECN-Echo (ECE) ACK packet | |||
are updated by <xref target="accecn_feedback"/> of the present | are updated by <xref target="accecn_feedback"/> of the present | |||
specification to mean an increment to the sender's count of | specification to mean an increment to the sender's count of | |||
CE-marked packets, s.cep. And the requirements to set the CWR flag | CE-marked packets, s.cep. And the requirements to set the CWR flag | |||
no longer apply, as specified in <xref | no longer apply, as specified in <xref target="accecn_implications_acc | |||
target="accecn_implications_accecn_mode"/> of the present | ecn_mode"/> of the present | |||
specification. Otherwise, the remaining requirements in "6.1.2. The | specification. Otherwise, the remaining requirements in | |||
TCP Sender" still stand.<vspace blankLines="1"/>It will be noted | <xref target="RFC3168" sectionFormat="of" section="6.1.2"/> still stan | |||
that RFC 8311 already updates, or potentially updates, a number of | d.</t> | |||
the requirements in "6.1.2. The TCP Sender". Section 6.1.2 of RFC | <!-- [rfced] We are unclear why "potentially updates" is mentioned here. Is it | |||
3168 extended standard TCP congestion control <xref | mentioned to cover implementations of RFC 3168 that have not been updated yet and/or | |||
target="RFC5681"/> to cover ECN marking as well as packet drop. | potential future updates? Otherwise, may it be cut? | |||
Whereas, RFC 8311 enables experimentation with alternative responses | ||||
to ECN marking, if specified for instance by an experimental RFC on | ||||
the IETF document stream. RFC 8311 also strengthened the statement | ||||
that "ECT(0) SHOULD be used" to a "MUST" (see <xref | ||||
target="RFC8311"/> for the details).</t> | ||||
<t>The whole of "6.1.3. The TCP Receiver" of <xref | Original: | |||
target="RFC3168"/> is updated by <xref target="accecn_feedback"/> of | It will be noted that RFC 8311 already updates, or potentially | |||
updates, a number of the requirements in "6.1.2. The TCP Sender". | ||||
--> | ||||
<t>It will be noted that <xref target="RFC8311"/> already updates, | ||||
or potentially updates, a number of the requirements in <xref | ||||
target="RFC3168" sectionFormat="of" section="6.1.2"/>. Section 6.1.2 o | ||||
f | ||||
RFC 3168 extended | ||||
standard TCP congestion control <xref target="RFC5681"/> to cover | ||||
ECN marking as well as packet drop. Whereas, <xref target="RFC8311"/> | ||||
enables | ||||
experimentation with alternative responses to ECN marking, if | ||||
specified for instance by an Experimental RFC produced by the IETF Str | ||||
eam. <xref target="RFC8311"/> also strengthened the statement that "ECT(0) | ||||
<bcp14>SHOULD</bcp14> be used" to a "<bcp14>MUST</bcp14>" (see <xref | ||||
target="RFC8311"/> for the details).</t> | ||||
</li> | ||||
<li> | ||||
<t>The whole of <xref target="RFC3168" sectionFormat="of" | ||||
section="6.1.3"/> is updated by <xref target="accecn_feedback"/> of | ||||
the present specification, with the exception of the last paragraph | the present specification, with the exception of the last paragraph | |||
(about congestion response to drop and ECN in the same round trip), | (about congestion response to drop and ECN in the same round trip), | |||
which still stands. Incidentally, this last paragraph is in the | which still stands. Incidentally, this last paragraph is in the | |||
wrong section, because it relates to "TCP Sender" behaviour.</t> | wrong section, because it relates to "TCP Sender" behaviour.</t> | |||
</li> | ||||
<t>The following text within "6.1.5. Retransmitted TCP packets": | <li> | |||
<list style="empty"> | <t>The following text within <xref target="RFC3168" sectionFormat="of" | |||
<t>"the TCP data receiver SHOULD ignore the ECN field on | section="6.1.5"/>:</t> | |||
arriving data packets that are outside of the receiver's current | <blockquote><t>the TCP data receiver <bcp14>SHOULD</bcp14> ignore | |||
window."</t> | the ECN field on arriving data packets that are outside of the | |||
</list> is updated by more stringent acceptability tests for any | receiver's current window.</t></blockquote> | |||
packet (not just data packets) in the present specification. | <t>is updated by more stringent acceptability tests for any packet | |||
Specifically, in the normative specification of AccECN (<xref | (not just data packets) in the present specification. Specifically, | |||
target="accecn_Spec"/>) only 'Acceptable' packets contribute to the | in the normative specification of AccECN (<xref | |||
target="accecn_Spec"/>), only 'Acceptable' packets contribute to the | ||||
ECN counters at the AccECN receiver and <xref | ECN counters at the AccECN receiver and <xref | |||
target="accecn_Terminology"/> defines an Acceptable packet as one | target="accecn_Terminology"/> defines an Acceptable packet as one | |||
that passes acceptability tests equivalent in strength to those in | that passes acceptability tests equivalent in strength to those in | |||
both <xref target="RFC9293"/> and <xref target="RFC5961"/>.</t> | both <xref target="RFC9293"/> and <xref target="RFC5961"/>.</t> | |||
</li> | ||||
<t>Sections 5.2, 6.1.1, 6.1.4, 6.1.5 and 6.1.6 of <xref | <li> | |||
target="RFC3168"/> prohibit use of ECN on TCP control packets and | <t>Sections <xref target="RFC3168" sectionFormat="bare" | |||
retransmissions. The present specification does not update that | section="5.2"/>, <xref target="RFC3168" sectionFormat="bare" | |||
aspect of RFC 3168, but it does say what feedback an AccECN Data | section="6.1.1"/>, <xref target="RFC3168" sectionFormat="bare" | |||
Receiver ought to provide if it receives an ECN-capable control | section="6.1.4"/>, <xref target="RFC3168" sectionFormat="bare" | |||
packet or retransmission. This ensures AccECN is forward compatible | section="6.1.5"/>, and <xref target="RFC3168" sectionFormat="bare" | |||
with any future scheme that allows ECN on these packets, as provided | section="6.1.6"/> of <xref target="RFC3168"/> prohibit use of ECN on | |||
for in section 4.3 of <xref target="RFC8311"/> and as proposed in | TCP control packets and retransmissions. The present specification | |||
<xref target="I-D.ietf-tcpm-generalized-ecn"/>.</t> | does not update that aspect of <xref target="RFC3168"/>, but it does | |||
</list></t> | say what feedback an AccECN Data Receiver ought to provide if it | |||
receives an ECN-capable control packet or retransmission. This | ||||
ensures AccECN is forward compatible with any future scheme that | ||||
allows ECN on these packets, as provided for in <xref | ||||
target="RFC8311" sectionFormat="of" section="4.3"/> and as proposed | ||||
in <xref target="I-D.ietf-tcpm-generalized-ecn"/>.</t> | ||||
</li> | ||||
</ul> | ||||
</section> | </section> | |||
<section anchor="accecn_Interact_Variants"> | ||||
<section anchor="accecn_Interact_Variants" | <name>Interaction with TCP Variants</name> | |||
title="Interaction with TCP Variants"> | ||||
<t>This section is informative, not normative.</t> | <t>This section is informative, not normative.</t> | |||
<section anchor="accecn_Interaction_SYN_Cookies"> | ||||
<section anchor="accecn_Interaction_SYN_Cookies" | <name>Compatibility with SYN Cookies</name> | |||
title="Compatibility with SYN Cookies"> | <t>A TCP Server can use SYN Cookies (see <xref section="A" target="RFC49 | |||
<t>A TCP Server can use SYN Cookies (see Appendix A of <xref | 87"/>) to protect itself from SYN flooding attacks. It | |||
target="RFC4987"/>) to protect itself from SYN flooding attacks. It | ||||
places minimal commonly used connection state in the SYN/ACK, and | places minimal commonly used connection state in the SYN/ACK, and | |||
deliberately does not hold any state while waiting for the subsequent | deliberately does not hold any state while waiting for the subsequent | |||
ACK (e.g., it closes the thread). Therefore it cannot record the | ACK (e.g., it closes the thread). Therefore, it cannot record the | |||
fact that it entered AccECN mode for both half-connections. Indeed, it | fact that it entered AccECN mode for both half-connections. Indeed, it | |||
cannot even remember whether it negotiated the use of Classic ECN | cannot even remember whether it negotiated the use of Classic ECN | |||
<xref target="RFC3168"/>.</t> | <xref target="RFC3168"/>.</t> | |||
<t>Nonetheless, such a Server can determine that it negotiated AccECN | <t>Nonetheless, such a Server can determine that it negotiated AccECN | |||
as follows. If a TCP Server using SYN Cookies supports AccECN and if | as follows. If a TCP Server using SYN Cookies supports AccECN and if | |||
it receives a pure ACK that acknowledges an ISN that is a valid SYN | it receives a pure ACK that acknowledges an ISN that is a valid SYN | |||
cookie, and if the ACK contains an ACE field with the value 0b010 to | cookie, and if the ACK contains an ACE field with the value 0b010 to | |||
0b111 (decimal 2 to 7), the Server can infer the first two stages of | 0b111 (decimal 2 to 7), the Server can infer the first two stages of | |||
the handshake:<list style="symbols"> | the handshake:</t> | |||
<ul spacing="normal"> | ||||
<li> | ||||
<t>the TCP Client has to have requested AccECN support on the | <t>the TCP Client has to have requested AccECN support on the | |||
SYN;</t> | SYN;</t> | |||
</li> | ||||
<li> | ||||
<t>then, even though the Server kept no state, it has to have | <t>then, even though the Server kept no state, it has to have | |||
confirmed that it supported AccECN.</t> | confirmed that it supported AccECN.</t> | |||
</list>Therefore the Server can switch itself into AccECN mode, and | </li> | |||
</ul> | ||||
<t>Therefore, the Server can switch itself into AccECN mode, and | ||||
continue as if it had never forgotten that it switched itself into | continue as if it had never forgotten that it switched itself into | |||
AccECN mode earlier.</t> | AccECN mode earlier.</t> | |||
<t>If the pure ACK that acknowledges a SYN cookie contains an ACE | <t>If the pure ACK that acknowledges a SYN cookie contains an ACE | |||
field with the value 0b000 or 0b001, these values indicate that the | field with the value 0b000 or 0b001, these values indicate that the | |||
TCP Client did not request support for AccECN and therefore the Server | TCP Client did not request support for AccECN; therefore, the Server | |||
does not enter AccECN mode for this connection. Further, 0b001 on the | does not enter AccECN mode for this connection. Further, 0b001 on the | |||
ACK implies that the Server sent an ECN-capable SYN/ACK, which was | ACK implies that the Server sent an ECN-capable SYN/ACK, which was | |||
marked CE in the network, and the non-AccECN TCP Client fed this back | marked CE in the network, and the non-AccECN TCP Client fed this back | |||
by setting ECE on the ACK of the SYN/ACK.</t> | by setting ECE on the ACK of the SYN/ACK.</t> | |||
</section> | </section> | |||
<section anchor="accecn_Interaction_Other"> | ||||
<section anchor="accecn_Interaction_Other" | <name>Compatibility with TCP Experiments and Common TCP Options</name> | |||
title="Compatibility with TCP Experiments and Common TCP Options" | ||||
> | ||||
<t>AccECN is compatible (at least on paper) with the most commonly | <t>AccECN is compatible (at least on paper) with the most commonly | |||
used TCP options: MSS, time-stamp, window scaling, SACK and TCP-AO. It | used TCP options: MSS, time-stamp, window scaling, SACK, and TCP-AO. It | |||
is also compatible with Multipath TCP (MPTCP <xref target="RFC8684"/>) | is also compatible with Multipath TCP (MPTCP <xref target="RFC8684"/>) | |||
and the experimental TCP option TCP Fast Open (TFO <xref | and the experimental TCP option TCP Fast Open (TFO <xref target="RFC7413 | |||
target="RFC7413"/>). AccECN is friendly to all these protocols, | "/>). AccECN is friendly to all these protocols, | |||
because space for TCP options is particularly scarce on the SYN, where | because space for TCP options is particularly scarce on the SYN, where | |||
AccECN consumes zero additional header space.</t> | AccECN consumes zero additional header space.</t> | |||
<!-- [rfced] As we believe "pressure" refers to options vying for limited space, perhaps this update would be more clear? | ||||
<t>When option space is under pressure from other options, <xref | Original: | |||
target="accecn_option_usage"/> provides guidance on how important it | When option space is under pressure from other options, | |||
Section 3.2.3.3 provides guidance on how important it is to send an | ||||
AccECN Option relative to other options, and which fields are more | ||||
important to include. | ||||
Perhaps: | ||||
Because option space is limited, Section 3.2.3.3 provides guidance on | ||||
how important it is to send an AccECN Option relative to other options | ||||
and specifies which fields are more important to include. | ||||
--> | ||||
<t>When option space is under pressure from other options, <xref target= | ||||
"accecn_option_usage"/> provides guidance on how important it | ||||
is to send an AccECN Option relative to other options, and which | is to send an AccECN Option relative to other options, and which | |||
fields are more important to include.</t> | fields are more important to include.</t> | |||
<t>Implementers of TFO need to take careful note of the recommendation | <t>Implementers of TFO need to take careful note of the recommendation | |||
in <xref target="accecn_ACE_3rdACK"/>. That section recommends that, | in <xref target="accecn_ACE_3rdACK"/>. That section recommends that, | |||
if the TCP Client has successfully negotiated AccECN, when | if the TCP Client has successfully negotiated AccECN, when | |||
acknowledging the SYN/ACK, even if it has data to send, it sends a | acknowledging the SYN/ACK, even if it has data to send, it sends a | |||
pure ACK immediately before the data. Then it can reflect the IP-ECN | pure ACK immediately before the data. Then it can reflect the IP-ECN | |||
field of the SYN/ACK on this pure ACK, which allows the Server to | field of the SYN/ACK on this pure ACK, which allows the Server to | |||
detect ECN mangling. Note that, as specified in <xref | detect ECN mangling. Note that, as specified in <xref target="accecn_fee | |||
target="accecn_feedback"/>, any data on the SYN (SYN=1, ACK=0) is not | dback"/>, any data on the SYN (SYN=1, ACK=0) is not | |||
included in any of the byte counters held locally for each ECN | included in any of the byte counters held locally for each ECN | |||
marking, nor in the AccECN Option on the wire.</t> | marking, nor in the AccECN Option on the wire.</t> | |||
<t>AccECN feedback is compatible with the ECN++ experiment <xref target= | ||||
<t>AccECN feedback is compatible with the ECN++ <xref | "I-D.ietf-tcpm-generalized-ecn"/>, which allows TCP | |||
target="I-D.ietf-tcpm-generalized-ecn"/> experiment, which allows TCP | control packets and retransmissions to be ECN-capable (<xref target="RFC | |||
control packets and retransmissions to be ECN-capable (<xref | 3168"/> was updated by <xref target="RFC8311"/> to permit | |||
target="RFC3168"/> was updated by <xref target="RFC8311"/> to permit | ||||
such experiments). AccECN is likely to inherently support any | such experiments). AccECN is likely to inherently support any | |||
experiment with ECN-capable packets, because it feeds back the | experiment with ECN-capable packets, because it feeds back the | |||
contents of the ECN field mechanistically, without judging whether a | contents of the ECN field mechanistically, without judging whether or no | |||
packet ought to use the ECN capability or not (<xref | t a | |||
target="accecn_demb_reflector"/>). This specification does not discuss | packet ought to use the ECN capability (<xref target="accecn_demb_reflec | |||
tor"/>). This specification does not discuss | ||||
implementing AccECN alongside <xref target="RFC5562"/>, which was an | implementing AccECN alongside <xref target="RFC5562"/>, which was an | |||
earlier experimental protocol with narrower scope than ECN++ and a | earlier experimental protocol with narrower scope than ECN++ and a | |||
5-way handshake.</t> | 5-way handshake.</t> | |||
</section> | </section> | |||
<section anchor="accecn_Integrity"> | ||||
<name>Compatibility with Feedback Integrity Mechanisms</name> | ||||
<section anchor="accecn_Integrity" | ||||
title="Compatibility with Feedback Integrity Mechanisms"> | ||||
<t>Three alternative mechanisms are available to assure the integrity | <t>Three alternative mechanisms are available to assure the integrity | |||
of ECN and/or loss signals. AccECN is compatible with any of these | of ECN and/or loss signals. AccECN is compatible with any of these | |||
approaches:<list style="symbols"> | approaches:</t> | |||
<ul spacing="normal"> | ||||
<li> | ||||
<t>The Data Sender can test the integrity of the receiver's ECN | <t>The Data Sender can test the integrity of the receiver's ECN | |||
(or loss) feedback by occasionally setting the IP-ECN field to a | (or loss) feedback by occasionally setting the IP-ECN field to a | |||
value normally only set by the network (and/or deliberately | value normally only set by the network (and/or deliberately | |||
leaving a sequence number gap). Then it can test whether the Data | leaving a sequence number gap). Then it can test whether the Data | |||
Receiver's feedback faithfully reports what it expects (similar to | Receiver's feedback faithfully reports what it expects (similar to | |||
paragraph 2 of Section 20.2 of <xref target="RFC3168"/>). Unlike | paragraph 2 of <xref target="RFC3168" sectionFormat="of" section="20 | |||
the ECN Nonce <xref target="RFC3540"/>, this approach does not | .2"/>). Unlike | |||
the ECN-nonce <xref target="RFC3540"/>, this approach does not | ||||
waste the ECT(1) codepoint in the IP header, it does not require | waste the ECT(1) codepoint in the IP header, it does not require | |||
standardization and it does not rely on misbehaving receivers | standardization, and it does not rely on misbehaving receivers | |||
volunteering to reveal feedback information that allows them to be | volunteering to reveal feedback information that allows them to be | |||
detected. However, setting the CE mark by the sender might conceal | detected. However, setting the CE mark by the sender might conceal | |||
actual congestion feedback from the network and therefore ought to | actual congestion feedback from the network and therefore ought to | |||
only be done sparingly.</t> | only be done sparingly.</t> | |||
</li> | ||||
<li> | ||||
<t>Networks generate congestion signals when they are becoming | <t>Networks generate congestion signals when they are becoming | |||
congested, so networks are more likely than Data Senders to be | congested, so networks are more likely than Data Senders to be | |||
concerned about the integrity of the receiver's feedback of these | concerned about the integrity of the receiver's feedback of these | |||
signals. A network can enforce a congestion response to its ECN | signals. A network can enforce a congestion response to its ECN | |||
markings (or packet losses) using congestion exposure (ConEx) | markings (or packet losses) using congestion exposure (ConEx) | |||
audit <xref target="RFC7713"/>. Whether the receiver or a | audit <xref target="RFC7713"/>. Whether the receiver or a | |||
downstream network is suppressing congestion feedback or the | downstream network is suppressing congestion feedback, or the | |||
sender is unresponsive to the feedback, or both, ConEx audit can | sender is unresponsive to the feedback, or both, ConEx audit can | |||
neutralize any advantage that any of these three parties would | neutralize any advantage that any of these three parties would | |||
otherwise gain. <vspace blankLines="1"/>ConEx is an experimental | otherwise gain. </t> | |||
<!-- [rfced] Please confirm "experimental" is correct here. We ask because RFC | ||||
7713 is an Informational RFC. | ||||
Original: | ||||
ConEx is an experimental change to the Data Sender that would be | ||||
most useful when combined with AccECN. | ||||
--> | ||||
<t>ConEx is an experimental | ||||
change to the Data Sender that would be most useful when combined | change to the Data Sender that would be most useful when combined | |||
with AccECN. Without AccECN, the ConEx behaviour of a Data Sender | with AccECN. Without AccECN, the ConEx behaviour of a Data Sender | |||
would have to be more conservative than would be necessary if it | would have to be more conservative than would be necessary if it | |||
had the accurate feedback of AccECN.</t> | had the accurate feedback of AccECN.</t> | |||
</li> | ||||
<t>The standards track TCP authentication option (TCP-AO <xref | <li> | |||
target="RFC5925"/>) can be used to detect any tampering with | <t>The Standards Track TCP authentication option (TCP-AO <xref targe | |||
t="RFC5925"/>) can be used to detect any tampering with | ||||
AccECN feedback between the Data Receiver and the Data Sender | AccECN feedback between the Data Receiver and the Data Sender | |||
(whether malicious or accidental). The AccECN fields are immutable | (whether malicious or accidental). The AccECN fields are immutable | |||
end-to-end, so they are amenable to TCP-AO protection, which | end to end, so they are amenable to TCP-AO protection, which | |||
covers TCP options by default. However, TCP-AO is often too | covers TCP options by default. However, TCP-AO is often too | |||
brittle to use on many end-to-end paths, where middleboxes can | brittle to use on many end-to-end paths, where middleboxes can | |||
make verification fail in their attempts to improve performance or | make verification fail in their attempts to improve performance or | |||
security, e.g., Network Address (and Port) Translation | security, e.g., Network Address Translation (NAT) and Network Addres | |||
(NAT/NAPT), resegmentation or shifting the sequence space.</t> | s Port Translation (NAPT), resegmentation, or shifting the sequence space.</t> | |||
</list></t> | </li> | |||
</ul> | ||||
</section> | </section> | |||
</section> | </section> | |||
<!-- ================================================================ --> | <section anchor="accecn_Properties"> | |||
<name>Summary: Protocol Properties</name> | ||||
<section anchor="accecn_Properties" title="Summary: Protocol Properties"> | <t>This section is informative, not normative. It describes how well the | |||
<t>This section is informative not normative. It describes how well the | ||||
protocol satisfies the agreed requirements for a more Accurate ECN | protocol satisfies the agreed requirements for a more Accurate ECN | |||
feedback protocol <xref target="RFC7560"/>.<list style="hanging"> | feedback protocol <xref target="RFC7560"/>.</t> | |||
<t hangText="Accuracy:">From each ACK, the Data Sender can infer the | <dl newline="false" spacing="normal"> | |||
number of new CE marked segments since the previous ACK. This | <dt>Accuracy:</dt> | |||
<dd>From each ACK, the Data Sender can infer the | ||||
number of new CE-marked segments since the previous ACK. This | ||||
provides better accuracy on CE feedback than Classic ECN. In | provides better accuracy on CE feedback than Classic ECN. In | |||
addition if an AccECN Option is present (not blocked by the network | addition, if an AccECN Option is present (not blocked by the network | |||
path) the number of bytes marked with CE, ECT(1) and ECT(0) are | path), the number of bytes marked with CE, ECT(1), and ECT(0) are | |||
provided.</t> | provided.</dd> | |||
<dt>Overhead:</dt> | ||||
<t hangText="Overhead:">The AccECN scheme is divided into two parts. | <dd>The AccECN scheme is divided into two parts. | |||
The essential feedback part reuses the 3 flags already assigned to ECN | The essential feedback part reuses the three flags already assigned to | |||
in the | ECN in the | |||
TCP header. The supplementary feedback part adds an additional TCP option | TCP header. The supplementary feedback part adds an additional TCP option | |||
consuming up to 11 bytes. However, no TCP option space is consumed | consuming up to 11 bytes. However, no TCP option space is consumed | |||
in the SYN.</t> | in the SYN.</dd> | |||
<dt>Ordering:</dt> | ||||
<t hangText="Ordering:">The order in which marks arrive at the Data | <dd>The order in which marks arrive at the Data | |||
Receiver is preserved in AccECN feedback, because the Data Receiver | Receiver is preserved in AccECN feedback, because the Data Receiver | |||
is expected to send an ACK immediately whenever a different mark | is expected to send an ACK immediately whenever a different mark | |||
arrives.</t> | arrives.</dd> | |||
<dt>Timeliness:</dt> | ||||
<t hangText="Timeliness:">While the same ECN markings are arriving | <dd>While the same ECN markings are arriving | |||
continually at the Data Receiver, it can defer ACKs as TCP does | continually at the Data Receiver, it can defer ACKs as TCP does | |||
normally, but it will immediately send an ACK as soon as a different | normally, but it will immediately send an ACK as soon as a different | |||
ECN marking arrives.</t> | ECN marking arrives.</dd> | |||
<dt>Timeliness vs Overhead:</dt> | ||||
<t hangText="Timeliness vs Overhead:">Change-Triggered ACKs are | <dd>Change-Triggered ACKs are | |||
intended to enable latency-sensitive uses of ECN feedback by | intended to enable latency-sensitive uses of ECN feedback by | |||
capturing the timing of transitions but not wasting resources while | capturing the timing of transitions but not wasting resources while | |||
the state of the signalling system is stable. Within the constraints | the state of the signalling system is stable. Within the constraints | |||
of the change-triggered ACK rules, the receiver can control how | of the change-triggered ACK rules, the receiver can control how | |||
frequently it sends AccECN TCP Options and therefore to some extent | frequently it sends AccECN TCP Options and therefore to some extent | |||
it can control the overhead induced by AccECN.</t> | it can control the overhead induced by AccECN.</dd> | |||
<dt>Resilience:</dt> | ||||
<t hangText="Resilience:">All information is provided based on | <dd>All information is provided based on | |||
counters. Therefore if ACKs are lost, the counters on the first ACK | counters. Therefore if ACKs are lost, the counters on the first ACK | |||
following the losses allows the Data Sender to immediately recover | following the losses allow the Data Sender to immediately recover | |||
the number of the ECN markings that it missed. And if data or ACKs | the number of the ECN markings that it missed. If data or ACKs | |||
are reordered, stale congestion information can be identified and | are reordered, stale congestion information can be identified and | |||
ignored.</t> | ignored.</dd> | |||
<dt>Resilience against Bias:</dt> | ||||
<t hangText="Resilience against Bias:">Because feedback is based on | <dd>Because feedback is based on | |||
repetition of counters, random losses do not remove any information, | repetition of counters, random losses do not remove any information, | |||
they only delay it. Therefore, even though some ACKs are | they only delay it. Therefore, even though some ACKs are | |||
change-triggered, random losses will not alter the proportions of | change-triggered, random losses will not alter the proportions of | |||
the different ECN markings in the feedback.</t> | the different ECN markings in the feedback.</dd> | |||
<dt>Resilience vs Overhead:</dt> | ||||
<t hangText="Resilience vs Overhead:">If space is limited in some | <dd>If space is limited in some | |||
segments (e.g., because more options are needed on some | segments (e.g., because more options are needed on some | |||
segments, such as the SACK option after loss), the Data Receiver can | segments, such as the SACK option after loss), the Data Receiver can | |||
send AccECN Options less frequently or truncate fields that have not | send AccECN Options less frequently or truncate fields that have not | |||
changed, usually down to as little as 5 bytes.</t> | changed, usually down to as little as 5 bytes.</dd> | |||
<dt>Resilience vs Timeliness and Ordering:</dt> | ||||
<t hangText="Resilience vs Timeliness and Ordering:">Ordering | <dd>Ordering | |||
information and the timing of transitions cannot be communicated in | information and the timing of transitions cannot be communicated in | |||
three cases: i) during ACK loss; ii) if something on the path strips | three cases: i) during ACK loss; ii) if something on the path strips | |||
AccECN Options; or iii) if the Data Receiver is unable to support | AccECN Options; or iii) if the Data Receiver is unable to support | |||
Change-Triggered ACKs. Following ACK reordering, the Data Sender can | Change-Triggered ACKs. Following ACK reordering, the Data Sender can | |||
reconstruct the order in which feedback was sent, but not until all | reconstruct the order in which feedback was sent, but not until all | |||
the missing feedback has arrived.</t> | the missing feedback has arrived.</dd> | |||
<dt>Complexity:</dt> | ||||
<t hangText="Complexity:">An AccECN implementation solely involves | <dd>An AccECN implementation solely involves | |||
simple counter increments, some modulo arithmetic to communicate the | simple counter increments, some modulo arithmetic to communicate the | |||
least significant bits and allow for wrap, and some heuristics for | least significant bits and allow for wrap, and some heuristics for | |||
safety against fields cycling due to prolonged periods of ACK loss. | safety against fields cycling due to prolonged periods of ACK loss. | |||
Each host needs to maintain eight additional counters. The hosts | Each host needs to maintain eight additional counters. The hosts | |||
have to apply some additional tests to detect tampering by | have to apply some additional tests to detect tampering by | |||
middleboxes, but in general the protocol is simple to understand, | middleboxes, but in general the protocol is simple to understand and | |||
simple to implement and requires few cycles per packet to | implement and requires few cycles per packet to | |||
execute.</t> | execute.</dd> | |||
<dt>Integrity:</dt> | ||||
<t hangText="Integrity:">AccECN is compatible with at least three | <dd>AccECN is compatible with at least three | |||
approaches that can assure the integrity of ECN feedback. If AccECN | approaches that can assure the integrity of ECN feedback. If AccECN | |||
Options are stripped the resolution of the feedback is degraded, but | Options are stripped, the resolution of the feedback is degraded, but | |||
the integrity of this degraded feedback can still be assured.</t> | the integrity of this degraded feedback can still be assured.</dd> | |||
<dt>Backward Compatibility:</dt> | ||||
<t hangText="Backward Compatibility:">If only one endpoint supports | <dd> | |||
the AccECN scheme, it will fall-back to the most advanced ECN | <t>If only one endpoint supports | |||
feedback scheme supported by the other end.<vspace | the AccECN scheme, it will fall back to the most advanced ECN | |||
blankLines="1"/>If AccECN Options are stripped by a middlebox, | feedback scheme supported by the other end.</t> | |||
<t>If AccECN Options are stripped by a middlebox, | ||||
AccECN still provides basic congestion feedback in the ACE field. | AccECN still provides basic congestion feedback in the ACE field. | |||
Further, AccECN can be used to detect mangling of the IP ECN field; | Further, AccECN can be used to detect mangling of the IP-ECN field; | |||
mangling of the TCP ECN flags; blocking of ECT-marked segments; and | mangling of the TCP ECN flags; blocking of ECT-marked segments; and | |||
blocking of segments carrying an AccECN Option. It can detect these | blocking of segments carrying an AccECN Option. It can detect these | |||
conditions during TCP's three-way handshake so that it can fall back to operation | conditions during TCP's three-way handshake so that it can fall back to operation | |||
without ECN and/or operation without AccECN Options.</t> | without ECN and/or operation without AccECN Options.</t> | |||
</dd> | ||||
<t hangText="Forward Compatibility:">The behaviour of endpoints and | <dt>Forward Compatibility:</dt> | |||
<dd>The behaviour of endpoints and | ||||
middleboxes is carefully defined for all reserved or currently | middleboxes is carefully defined for all reserved or currently | |||
unused codepoints in the scheme. Then, the designers of security | unused codepoints in the scheme. Then, the designers of security | |||
devices can understand which currently unused values might appear in | devices can understand which currently unused values might appear in the | |||
future. So, even if they choose to treat such values as anomalous | future. So, even if they choose to treat such values as anomalous | |||
while they are not widely used, any blocking will at least be under | while they are not widely used, any blocking will at least be under | |||
policy control not hard-coded. Then, if previously unused values | policy control and not hard-coded. Then, if previously unused values | |||
start to appear on the Internet (or in standards), such policies | start to appear on the Internet (or in standards), such policies | |||
could be quickly reversed.</t> | could be quickly reversed.</dd> | |||
</list></t> | </dl> | |||
</section> | </section> | |||
<!-- ================================================================ --> | <section anchor="accecn_IANA_Considerations"> | |||
<name>IANA Considerations</name> | ||||
<section anchor="accecn_IANA_Considerations" title="IANA Considerations"> | ||||
<t>This document reassigns the TCP header flag at bit offset 7 to the | <t>This document reassigns the TCP header flag at bit offset 7 to the | |||
AccECN protocol. This bit was previously called the Nonce Sum (NS) flag | AccECN protocol. This bit was previously called the Nonce Sum (NS) flag | |||
<xref target="RFC3540"/>, but RFC 3540 has been reclassified as historic | <xref target="RFC3540"/>, but RFC 3540 has been reclassified as Historic | |||
<xref target="RFC8311"/>. The flag will now be defined as the following | <xref target="RFC8311"/>. The flag is now defined as the following | |||
in the "TCP Header Flags" registry in the "Transmission Control Protocol | in the "TCP Header Flags" registry in the "Transmission Control Protocol | |||
(TCP) Parameters" registry group:</t> | (TCP) Parameters" registry group:</t> | |||
<table> | ||||
<texttable suppress-title="true" title="TCP header flag reassignment"> | <name>TCP Header Flag Reassignment</name> | |||
<ttcol>Bit</ttcol> | <thead> | |||
<tr> | ||||
<ttcol>Name</ttcol> | <th>Bit</th> | |||
<th>Name</th> | ||||
<ttcol>Reference</ttcol> | <th>Reference</th> | |||
<th>Assignment Notes</th> | ||||
<ttcol>Assignment Notes</ttcol> | </tr> | |||
</thead> | ||||
<c>7</c> | <tbody> | |||
<tr> | ||||
<c>AE (Accurate ECN)</c> | <td>7</td> | |||
<td>AE (Accurate ECN)</td> | ||||
<c>RFC XXXX</c> | <td>RFC 9768</td> | |||
<td>Previously used as NS (Nonce Sum) by <xref target="RFC3540"/>, w | ||||
<c>Previously used as NS (Nonce Sum) by [RFC3540], which is now | hich is now | |||
historic [RFC8311]</c> | Historic <xref target="RFC8311"/></td> | |||
</texttable> | </tr> | |||
</tbody> | ||||
<t>[TO BE REMOVED: IANA is requested to update the existing entry in the | </table> | |||
TCP Header Flags registry | <t>This document also defines two new TCP options for AccECN | |||
(https://www.iana.org/assignments/tcp-parameters/tcp-parameters.xhtml#tcp- | from the TCP option space. These values | |||
header-flags) | ||||
for Bit 7 to "AE (Accurate ECN)" and to change the reference to this | ||||
RFC-to-be instead of RFC8311. Also IANA is requested to change the | ||||
assignment note to "Previously used as NS (Nonce Sum) by [RFC3540], | ||||
which is now historic [RFC8311]."]</t> | ||||
<t>This document also defines two new TCP options for AccECN, assigned | ||||
values of 172 and 174 (decimal) from the TCP option space. These values | ||||
are defined as the following in the "TCP Option Kind Numbers" registry | are defined as the following in the "TCP Option Kind Numbers" registry | |||
in the "Transmission Control Protocol (TCP) Parameters" registry group:</t> | in the "Transmission Control Protocol (TCP) Parameters" registry group:</t> | |||
<table> | ||||
<name>New TCP Option assignments</name> | ||||
<thead> | ||||
<tr> | ||||
<th>Kind</th> | ||||
<th>Length</th> | ||||
<th>Meaning</th> | ||||
<th>Reference</th> | ||||
</tr> | ||||
</thead> | ||||
<tbody> | ||||
<tr> | ||||
<td>172</td> | ||||
<td>N</td> | ||||
<td>Accurate ECN Order 0 (AccECN0)</td> | ||||
<td>RFC 9768</td> | ||||
</tr> | ||||
<tr> | ||||
<td>174</td> | ||||
<td>N</td> | ||||
<td>Accurate ECN Order 1 (AccECN1)</td> | ||||
<td>RFC 9768</td> | ||||
</tr> | ||||
</tbody> | ||||
</table> | ||||
<!-- [rfced] We have updated the registry title per the note below from IANA. W | ||||
hile draft-ietf-tsvwg-udp-options has not yet been published, this title matches | ||||
what currently appears on the IANA site. Please let us know any concerns. | ||||
<texttable suppress-title="true" title="New TCP Option assignments"> | NOTE: The name of the registry called "TCP Experimental Option Experiment Identi | |||
<ttcol>Kind</ttcol> | fiers (TCP ExIDs)" in the IANA Considerations section has been changed to "TCP/U | |||
DP Experimental Option Experiment Identifiers (TCP/UDP ExIDs)," per draft-ietf-t | ||||
<ttcol>Length</ttcol> | svwg-udp-options-45. | |||
<ttcol>Meaning</ttcol> | ||||
<ttcol>Reference</ttcol> | ||||
<c>172</c> | ||||
<c>N</c> | ||||
<c>Accurate ECN Order 0 (AccECN0)</c> | ||||
<c>RFC XXXX</c> | ||||
<c>174</c> | ||||
<c>N</c> | ||||
<c>Accurate ECN Order 1 (AccECN1)</c> | ||||
<c>RFC XXXX</c> | ||||
</texttable> | ||||
<t>[TO BE REMOVED: These registrations have taken place using the early | Original: | |||
registration procedure, which may be temporary if this draft does not | Early experimental implementations of the two AccECN Options used | |||
proceed, at the following location: | experimental option 254 per [RFC6994] with the 16-bit magic numbers | |||
http://www.iana.org/assignments/tcp-parameters/tcp-parameters.xhtml#tcp-pa | 0xACC0 and 0xACC1 respectively for Order 0 and 1, as allocated in the | |||
rameters-1 | IANA "TCP Experimental Option Experiment Identifiers (TCP ExIDs)" | |||
]</t> | registry. | |||
--> | ||||
<t>Early experimental implementations of the two AccECN Options used | <t>Early experimental implementations of the two AccECN Options used | |||
experimental option 254 per <xref target="RFC6994"/> with the 16-bit | experimental option 254 per <xref target="RFC6994"/> with the 16-bit | |||
magic numbers 0xACC0 and 0xACC1 respectively for Order 0 and 1, as | magic numbers 0xACC0 and 0xACC1, respectively, for Order 0 and 1, as | |||
allocated in the IANA "TCP Experimental Option Experiment Identifiers | allocated in the IANA "TCP/UDP Experimental Option Experiment Identifiers | |||
(TCP ExIDs)" registry. Even earlier experimental implementations used | (TCP/UDP ExIDs)" registry. Even earlier experimental implementations used | |||
the single magic number 0xACCE (16 bits). Uses of these experimental | the single magic number 0xACCE (16 bits). Uses of these experimental | |||
options SHOULD migrate to use the new option kinds (172 & 174).</t> | options <bcp14>SHOULD</bcp14> migrate to use the new option kinds (172 and | |||
174).</t> | ||||
<t>[TO BE REMOVED: IANA is requested to replace the references for all | ||||
three of the above experimental options (0xACC0, 0xACC1 and 0xACCE) with | ||||
a reference to the present RFC XXXX.]</t> | ||||
<t>[TO BE REMOVED: If the early registrations, which may be temporary, | ||||
do not proceed, the three references to them in the TCP ExIDs registry | ||||
at the following location will also need to be edited out: | ||||
https://www.iana.org/assignments/tcp-parameters/tcp-parameters.xhtml#tcp-e | ||||
xids | ||||
]</t> | ||||
</section> | </section> | |||
<!-- ================================================================ --> | <section anchor="accecn_Security_Considerations"> | |||
<name>Security and Privacy Considerations</name> | ||||
<section anchor="accecn_Security_Considerations" | <t>If ever the supplementary feedback part of AccECN that is based on one | |||
title="Security and Privacy Considerations"> | of the new | |||
<t>If ever the supplementary feedback part of AccECN based on one of the n | ||||
ew | ||||
AccECN TCP Options is unusable (due for example to middlebox | AccECN TCP Options is unusable (due for example to middlebox | |||
interference) the essential feedback part of AccECN's congestion feedback | interference), the essential feedback part of AccECN's congestion feedback | |||
offers | offers | |||
only limited resilience to long runs of ACK loss (see <xref | only limited resilience to long runs of ACK loss (see <xref target="accecn | |||
target="accecn_ACE_Safety"/>). These problems are unlikely to be due to | _ACE_Safety"/>). These problems are unlikely to be due to | |||
malicious intervention (because if an attacker could strip a TCP option | malicious intervention (because if an attacker could strip a TCP option | |||
or discard a long run of ACKs it could wreak other arbitrary havoc). | or discard a long run of ACKs, it could wreak other arbitrary havoc). | |||
However, it would be of concern if AccECN's resilience could be | However, it would be of concern if AccECN's resilience could be | |||
indirectly compromised during a flooding attack. AccECN is still | indirectly compromised during a flooding attack. AccECN is still | |||
considered safe though, because if AccECN Options are not present, the | considered safe though, because if AccECN Options are not present, the | |||
AccECN Data Sender is then required to switch to more conservative | AccECN Data Sender is then required to switch to more conservative | |||
assumptions about wrap of congestion indication counters (see <xref | assumptions about wrap of congestion indication counters (see <xref target | |||
target="accecn_ACE_Safety"/> and <xref | ="accecn_ACE_Safety"/> and <xref target="accecn_Algo_ACE_Wrap"/>).</t> | |||
target="accecn_Algo_ACE_Wrap"/>).</t> | ||||
<t><xref target="accecn_Interaction_SYN_Cookies"/> describes how a TCP | <t><xref target="accecn_Interaction_SYN_Cookies"/> describes how a TCP | |||
Server can negotiate AccECN and use the SYN cookie method for mitigating | Server can negotiate AccECN and use the SYN cookie method for mitigating | |||
SYN flooding attacks.</t> | SYN flooding attacks.</t> | |||
<t>There is concern that ECN feedback could be altered or suppressed, | <t>There is concern that ECN feedback could be altered or suppressed, | |||
particularly because a misbehaving Data Receiver could increase its own | particularly because a misbehaving Data Receiver could increase its own | |||
throughput at the expense of others. AccECN is compatible with the three | throughput at the expense of others. AccECN is compatible with the three | |||
schemes known to assure the integrity of ECN feedback (see <xref | schemes known to assure the integrity of ECN feedback (see <xref target="a | |||
target="accecn_Integrity"/> for details). If AccECN Options are stripped | ccecn_Integrity"/> for details). If AccECN Options are stripped | |||
by an incorrectly implemented middlebox, the resolution of the feedback | by an incorrectly implemented middlebox, the resolution of the feedback | |||
will be degraded, but the integrity of this degraded information can | will be degraded, but the integrity of this degraded information can | |||
still be assured. Assuring that Data Senders respond appropriately to | still be assured. Assuring that Data Senders respond appropriately to | |||
ECN feedback is possible, but the scope of the present document is | ECN feedback is possible, but the scope of the present document is | |||
confined to the feedback protocol, and excludes the response to this | confined to the feedback protocol and excludes the response to this | |||
feedback.</t> | feedback.</t> | |||
<!-- [rfced] Please consider whether the placement of B at the end of the sentence is correct. | ||||
<t>In <xref target="accecn_option"/> a Data Sender is allowed to ignore | Original: | |||
This opens up a potential covert channel of up to 29B (40 - | ||||
(2+3*3)) B. | ||||
--> | ||||
<t>In <xref target="accecn_option"/>, a Data Sender is allowed to ignore | ||||
an unrecognized TCP AccECN Option length and read as many whole 3-octet | an unrecognized TCP AccECN Option length and read as many whole 3-octet | |||
fields from it as possible up to a maximum of 3, treating the remainder | fields from it as possible up to a maximum of 3, treating the remainder | |||
as padding. This opens up a potential covert channel of up to 29B (40 - | as padding. This opens up a potential covert channel of up to 29B (40 - | |||
(2+3*3)) B. However, it is really an overt channel (not hidden) and it | (2+3*3)) B. However, it is really an overt channel (not hidden) and it | |||
is no different to the use of unknown TCP options with unknown option | is no different than the use of unknown TCP options with unknown option | |||
lengths in general. Therefore, where this is of concern, it can already | lengths in general. Therefore, where this is of concern, it can already | |||
be adequately mitigated by regular TCP normalizer technology (see <xref | be adequately mitigated by regular TCP normalizer technology (see <xref ta | |||
target="accecn_middlebox_transparent_normalizers"/>).</t> | rget="accecn_middlebox_transparent_normalizers"/>).</t> | |||
<t>The AccECN protocol is not believed to introduce any new privacy | <t>The AccECN protocol is not believed to introduce any new privacy | |||
concerns, because it merely counts and feeds back signals at the | concerns, because it merely counts and feeds back signals at the | |||
transport layer that had already been visible at the IP layer. A covert | transport layer that had already been visible at the IP layer. A covert | |||
channel can be used to compromise privacy. However, as explained above, | channel can be used to compromise privacy. However, as explained above, | |||
undefined TCP options in general open up such channels and common | undefined TCP options in general open up such channels, and common | |||
techniques are available to close them off.</t> | techniques are available to close them off.</t> | |||
<!-- [rfced] This sentence reads a bit awkwardly. Perhaps this can be rephrased | ||||
? | ||||
Original: | ||||
No known way can yet be contrived for a receiver to take | ||||
advantage of this behaviour, which seems to always degrade its own | ||||
performance. | ||||
Perhaps: | ||||
Currently, there is no known way for a receiver to take | ||||
advantage of this behaviour, which seems to always degrade its own | ||||
performance. | ||||
--> | ||||
<t>There is a potential concern that a Data Receiver could deliberately | <t>There is a potential concern that a Data Receiver could deliberately | |||
omit AccECN Options pretending that they had been stripped by a | omit AccECN Options pretending that they had been stripped by a | |||
middlebox. No known way can yet be contrived for a receiver to take | middlebox. No known way can yet be contrived for a receiver to take | |||
advantage of this behaviour, which seems to always degrade its own | advantage of this behaviour, which seems to always degrade its own | |||
performance. However, the concern is mentioned here for | performance. However, the concern is mentioned here for | |||
completeness.</t> | completeness.</t> | |||
<!-- [rfced] Instead of "show up more easily", perhaps "be more easily identifie | ||||
d" would improve readability? | ||||
Original: | ||||
A generic privacy concern of any new protocol is that for a while it | ||||
will be used by a small population of hosts, and thus show up more | ||||
easily. | ||||
--> | ||||
<!-- [rfced] We have updated the text as shown below. Please let us know any co | ||||
ncerns. | ||||
Original: | ||||
However, it is expected that this option will become | ||||
available in operating systems over time, and eventually turned on by | ||||
default in them. | ||||
Current: | ||||
However, it is expected that AccECN will become | ||||
available in operating systems over time and that it will eventually | ||||
be turned on by default. | ||||
--> | ||||
<t>A generic privacy concern of any new protocol is that for a while | <t>A generic privacy concern of any new protocol is that for a while | |||
it will be used by a small population of hosts, and thus show up more | it will be used by a small population of hosts, and thus show up more | |||
easily. However, it is expected that this option will become available | easily. However, it is expected that AccECN will become available | |||
in operating systems over time, and eventually turned on by default | in operating systems over time and that it will eventually be turned on by | |||
in them. Thus a individual identification of a particular user is | default. Thus, an individual identification of a particular user is | |||
less of a concern than the fingerprinting of specific versions of | less of a concern than the fingerprinting of specific versions of | |||
operating systems. However, the latter can be done using different | operating systems. However, the latter can be done using different | |||
means independent of Accurate ECN.</t> | means independent of Accurate ECN.</t> | |||
<t>As Accurate ECN exposes more bits in the TCP header that could | ||||
<t>As Accurate ECN exposes more bits in the TCP header which could | ||||
be tampered with without interfering with the transport excessively, | be tampered with without interfering with the transport excessively, | |||
it may allow an additional way to identify specific | it may allow an additional way to identify specific | |||
data streams across a virtual private network (VPN) to an attacker which | data streams across a virtual private network (VPN) to an attacker that | |||
has access to the datastream before and after the VPN tunnel endpoints. | has access to the datastream before and after the VPN tunnel endpoints. | |||
This may be achieved by injecting or modifying the ACE field in specific | This may be achieved by injecting or modifying the ACE field in specific | |||
patters that can be recognized.</t> | patterns that can be recognized.</t> | |||
<t>Overall, Accurate ECN does not change the risk profile on privacy to | <t>Overall, Accurate ECN does not change the risk profile on privacy to | |||
a user dramatically beyond what is already possible using classic ECN. | a user dramatically beyond what is already possible using classic ECN. | |||
However, in order to prevent such attacks and means of easier identification | However, in order to prevent such attacks and means of easier identification | |||
of flows, it is adviseable for privacy conscious users behind VPNs to | of flows, it is advisable for privacy-conscious users behind VPNs to | |||
not enable the Accurate ECN, or Classic ECN for that matter.</t> | not enable the Accurate ECN, or Classic ECN for that matter.</t> | |||
</section> | </section> | |||
</middle> | </middle> | |||
<back> | <back> | |||
<!-- ================================================================ --> | ||||
<references title="Normative References"> | ||||
<xi:include href="http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referenc | <displayreference target="I-D.ietf-tcpm-generalized-ecn" to="ECN++"/> | |||
e.RFC.9293.xml"/> | ||||
<xi:include href="http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referenc | ||||
e.RFC.2018.xml"/> | ||||
<xi:include href="http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referenc | ||||
e.RFC.2119.xml"/> | ||||
<xi:include href="http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referenc | ||||
e.RFC.2883.xml"/> | ||||
<xi:include href="http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referenc | ||||
e.RFC.3168.xml"/> | ||||
<xi:include href="http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referenc | ||||
e.RFC.5961.xml"/> | ||||
<xi:include href="http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referenc | ||||
e.RFC.8174.xml"/> | ||||
</references> | ||||
<references title="Informative References"> | ||||
<xi:include href="http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referenc | ||||
e.RFC.3449.xml"/> | ||||
<xi:include href="http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referenc | ||||
e.RFC.3540.xml"/> | ||||
<xi:include href="http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referenc | ||||
e.RFC.4987.xml"/> | ||||
<xi:include href="http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referenc | ||||
e.RFC.5562.xml"/> | ||||
<xi:include href="http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referenc | ||||
e.RFC.5681.xml"/> | ||||
<xi:include href="http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referenc | ||||
e.RFC.5690.xml"/> | ||||
<xi:include href="http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referenc | ||||
e.RFC.5925.xml"/> | ||||
<xi:include href="http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referenc | ||||
e.RFC.8684.xml"/> | ||||
<xi:include href="http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referenc | ||||
e.RFC.6994.xml"/> | ||||
<xi:include href="http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referenc | ||||
e.RFC.6582.xml"/> | ||||
<xi:include href="http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referenc | ||||
e.RFC.7323.xml"/> | ||||
<xi:include href="http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referenc | ||||
e.RFC.7560.xml"/> | ||||
<xi:include href="http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referenc | ||||
e.RFC.7413.xml"/> | ||||
<xi:include href="http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referenc | ||||
e.RFC.7713.xml"/> | ||||
<xi:include href="http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referenc | ||||
e.RFC.8257.xml"/> | ||||
<xi:include href="http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referenc | <references> | |||
e.RFC.8311.xml"/> | <name>References</name> | |||
<references> | ||||
<name>Normative References</name> | ||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.9 | ||||
293.xml"/> | ||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.2 | ||||
018.xml"/> | ||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.2 | ||||
119.xml"/> | ||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.2 | ||||
883.xml"/> | ||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.3 | ||||
168.xml"/> | ||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.5 | ||||
961.xml"/> | ||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.8 | ||||
174.xml"/> | ||||
</references> | ||||
<references> | ||||
<name>Informative References</name> | ||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.3 | ||||
449.xml"/> | ||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.3 | ||||
540.xml"/> | ||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.4 | ||||
987.xml"/> | ||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.5 | ||||
562.xml"/> | ||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.5 | ||||
681.xml"/> | ||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.5 | ||||
690.xml"/> | ||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.5 | ||||
925.xml"/> | ||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.8 | ||||
684.xml"/> | ||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.6 | ||||
994.xml"/> | ||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.6 | ||||
582.xml"/> | ||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.7 | ||||
323.xml"/> | ||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.7 | ||||
560.xml"/> | ||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.7 | ||||
413.xml"/> | ||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.7 | ||||
713.xml"/> | ||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.8 | ||||
257.xml"/> | ||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.8 | ||||
311.xml"/> | ||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml3/reference.I-D.ie | <!-- [I-D.ietf-tcpm-generalized-ecn] | |||
tf-tcpm-generalized-ecn.xml"/> | draft-ietf-tcpm-generalized-ecn-17 | |||
IESG State: I-D Exists as of 04/25/25. | ||||
--> | ||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml3/reference.I-D. | ||||
ietf-tcpm-generalized-ecn.xml"/> | ||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.7 | ||||
141.xml"/> | ||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.9 | ||||
260.xml"/> | ||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.9 | ||||
000.xml"/> | ||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.6 | ||||
679.xml"/> | ||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.9 | ||||
438.xml"/> | ||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.9 | ||||
040.xml"/> | ||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.8 | ||||
511.xml"/> | ||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.9 | ||||
330.xml"/> | ||||
<xi:include href="http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referenc | <!-- [rfced] [RoCEv2] | |||
e.RFC.7141.xml"/> | Please review. We could not confirm the Volume or Release number for | |||
this reference. Note that there is information at the current URL which mentions | ||||
"Volume 1 Release 1.8" (see: https://www.infinibandta.org/wp-content/uploads/202 | ||||
4/09/IBTA-Overview-of-IBTA-Volume-1-Release-1.8.pdf). | ||||
<xi:include href="http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referenc | Would you like us to update this reference to Release 1.8, use a | |||
e.RFC.9260.xml"/> | version-less reference, or keep the Release 1.4 version of the reference? | |||
<xi:include href="http://xml2rfc.tools.ietf.org/public/rfc/bibxml/reference.RFC.9000.xml"/> | Current: | |||
<xi:include href="http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referenc | [RoCEv2] InfiniBand Trade Association, "InfiniBand Architecture | |||
e.RFC.6679.xml"/> | Specification", Volume 1, Release 1.4, 2020, | |||
<https://www.infinibandta.org/ibta-specification/>. | ||||
<xi:include href="http://xml2rfc.tools.ietf.org/public/rfc/bibxml/reference.RFC.9438.xml"/> | Perhaps: | |||
<xi:include href="http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referenc | [RoCEv2] InfiniBand Trade Association, "InfiniBand Architecture | |||
e.RFC.9040.xml"/> | Specification", | |||
<https://www.infinibandta.org/ibta-specification/>. | ||||
<xi:include href="http://xml2rfc.tools.ietf.org/public/rfc/bibxml/reference.RFC.8511.xml"/> | OR | |||
<xi:include href="http://xml2rfc.tools.ietf.org/public/rfc/bibxml/referenc | [RoCEv2] InfiniBand Trade Association, "InfiniBand Architecture | |||
e.RFC.9330.xml"/> | Specification", Volume 1, Release 1.8, July 2024, | |||
<https://www.infinibandta.org/ibta-specification/>. | ||||
<reference anchor="RoCEv2"> | --> | |||
<front> | <reference anchor="RoCEv2" target="https://www.infinibandta.org/ibta-spe | |||
<title>InfiniBand Architecture Specification Volume 1, Release 1.4</ti | cification/"> | |||
tle> | <front> | |||
<author> | <title>InfiniBand Architecture Specification</title> | |||
<organization>InfiniBand Trade Association</organization> | <author> | |||
</author> | <organization>InfiniBand Trade Association</organization> | |||
<date year="2020"/> | </author> | |||
</front> | <date year="2020"/> | |||
<format target="https://www.infinibandta.org/ibta-specification/" /> | </front> | |||
<!-- https://cw.infinibandta.org/document/dl/7781"/> --> | <refcontent>Volume 1, Release 1.4</refcontent> | |||
<!-- https://cw.infinibandta.org/document/dl/7781"/> --> | ||||
</reference> | </reference> | |||
<reference anchor="Mandalari18" target="http://www.it.uc3m.es/amandala/e | ||||
<reference anchor="Mandalari18"> | cn++/ecn_commag_2018.html"> | |||
<front> | <front> | |||
<title>Measuring ECN++: Good News for ++, Bad News for ECN over | <title>Measuring ECN++: Good News for ++, Bad News for ECN over | |||
Mobile</title> | Mobile</title> | |||
<author fullname="Anna Mandalari" initials="A." surname="Mandalari"> | ||||
<author fullname="Anna Mandalari" initials="A." surname="Mandalari"> | <organization>UC3M</organization> | |||
<organization>UC3M</organization> | </author> | |||
</author> | <author fullname="Andra Lutu" initials="A." surname="Lutu"> | |||
<organization>Simula</organization> | ||||
<author fullname="Andra Lutu" initials="A." surname="Lutu"> | <address> | |||
<organization>Simula</organization> | <postal> | |||
<street/> | ||||
<address> | <city/> | |||
<postal> | <region/> | |||
<street/> | <code/> | |||
<country/> | ||||
<city/> | </postal> | |||
<phone/> | ||||
<region/> | <email/> | |||
<uri/> | ||||
<code/> | </address> | |||
</author> | ||||
<country/> | <author fullname="Bob Briscoe" initials="B." surname="Briscoe"> | |||
</postal> | <organization>Simula</organization> | |||
<address> | ||||
<phone/> | <postal> | |||
<street/> | ||||
<facsimile/> | <city/> | |||
<region/> | ||||
<email/> | <code/> | |||
<country/> | ||||
<uri/> | </postal> | |||
</address> | <phone/> | |||
</author> | <email/> | |||
<uri/> | ||||
<author fullname="Bob Briscoe" initials="B." surname="Briscoe"> | </address> | |||
<organization>Simula</organization> | </author> | |||
<author fullname="Marcelo Bagnulo" initials="M." surname="Bagnulo"> | ||||
<address> | <organization>UC3M</organization> | |||
<postal> | <address> | |||
<street/> | <postal> | |||
<street/> | ||||
<city/> | <city/> | |||
<region/> | ||||
<region/> | <code/> | |||
<country/> | ||||
<code/> | </postal> | |||
<phone/> | ||||
<country/> | <email/> | |||
</postal> | <uri/> | |||
</address> | ||||
<phone/> | </author> | |||
<author fullname="Özgü Alay" initials="Ö." surname="Alay"> | ||||
<facsimile/> | <organization>Simula</organization> | |||
<address> | ||||
<email/> | <postal> | |||
<street/> | ||||
<uri/> | <city/> | |||
</address> | <region/> | |||
</author> | <code/> | |||
<country/> | ||||
<author fullname="Marcelo Bagnulo" initials="M." surname="Bagnulo"> | </postal> | |||
<organization>UC3M</organization> | <phone/> | |||
<email/> | ||||
<address> | <uri/> | |||
<postal> | </address> | |||
<street/> | </author> | |||
<date month="March" year="2018"/> | ||||
<city/> | </front> | |||
<seriesInfo name="IEEE Communications Magazine" value=""/> | ||||
<region/> | </reference> | |||
</references> | ||||
<code/> | ||||
<country/> | ||||
</postal> | ||||
<phone/> | ||||
<facsimile/> | ||||
<email/> | ||||
<uri/> | ||||
</address> | ||||
</author> | ||||
<author fullname="Özgü Alay" initials="Ö." | ||||
surname="Alay"> | ||||
<organization>Simula</organization> | ||||
<address> | ||||
<postal> | ||||
<street/> | ||||
<city/> | ||||
<region/> | ||||
<code/> | ||||
<country/> | ||||
</postal> | ||||
<phone/> | ||||
<facsimile/> | ||||
<email/> | ||||
<uri/> | ||||
</address> | ||||
</author> | ||||
<date month="March" year="2018"/> | ||||
</front> | ||||
<seriesInfo name="IEEE Communications Magazine" value=""/> | ||||
<format target="http://www.it.uc3m.es/amandala/ecn++/ecn_commag_2018.htm | ||||
l" | ||||
type="PDF"/> | ||||
</reference> | ||||
</references> | </references> | |||
<section anchor="accecn_Algo_Examples"> | ||||
<name>Example Algorithms</name> | ||||
<!-- [rfced] May we update "implement" to "satisfy" to clarify the text and avoi | ||||
d "implementers implement"? | ||||
<section anchor="accecn_Algo_Examples" title="Example Algorithms"> | Original: | |||
However, implementers are free to choose other ways | ||||
to implement the requirements. | ||||
--> | ||||
<t>This appendix is informative, not normative. It gives example | <t>This appendix is informative, not normative. It gives example | |||
algorithms that would satisfy the normative requirements of the AccECN | algorithms that would satisfy the normative requirements of the AccECN | |||
protocol. However, implementers are free to choose other ways to | protocol. However, implementers are free to choose other ways to | |||
implement the requirements.</t> | implement the requirements.</t> | |||
<!-- [rfced] The following note was included in the XML. | ||||
<!--ToDo: Note to RFC Editor: Pls change all bare <artwork> elements (with | ToDo: Note to RFC Editor: Pls change all bare <artwork> elements (without | |||
out any keywords like align) to <sourcecode>. | any keywords like align) to <sourcecode>. | |||
Reason My XML editor doesn't support the <sourcecode> element, so it mangles lin | Reason My XML editor doesn't support the <sourcecode> element, so it mangles lin | |||
e breaks within sourcecode, ignoring even CDATA protection.--> | e breaks within sourcecode, ignoring even CDATA protection. | |||
<section anchor="accecn_Algo_Option_Coding" | We have updated the XML file as noted. Please let us know how/if the "type" attr | |||
title="Example Algorithm to Encode/Decode the AccECN Option"> | ibute of each sourcecode element should be set. Perhaps some/all should be marke | |||
d as pseudocode? | ||||
If the current list of preferred values for "type" | ||||
(https://www.rfc-editor.org/rpc/wiki/doku.php?id=sourcecode-types) | ||||
does not contain an applicable type, then feel free to let us know. | ||||
Also, it is acceptable to leave the "type" attribute not set. | ||||
--> | ||||
<section anchor="accecn_Algo_Option_Coding"> | ||||
<name>Example Algorithm to Encode/Decode the AccECN Option</name> | ||||
<t><!--ToDo: Example code to check the AccECN Option fields are consistent with the ACE field.-->The | <t><!--ToDo: Example code to check the AccECN Option fields are consistent with the ACE field.-->The | |||
example algorithms below show how a Data Receiver in AccECN mode could | example algorithms below show how a Data Receiver in AccECN mode could | |||
encode its CE byte counter r.ceb into the ECEB field within an AccECN | encode its CE byte counter r.ceb into the ECEB field within an AccECN | |||
TCP Option, and how a Data Sender in AccECN mode could decode the ECEB | TCP Option, and how a Data Sender in AccECN mode could decode the ECEB | |||
field into its byte counter s.ceb. The other counters for bytes marked | field into its byte counter s.ceb. The other counters for bytes marked | |||
ECT(0) and ECT(1) in an AccECN Option would be similarly encoded and | ECT(0) and ECT(1) in an AccECN Option would be similarly encoded and | |||
decoded.</t> | decoded.</t> | |||
<t>It is assumed that each local byte counter is an unsigned integer | <t>It is assumed that each local byte counter is an unsigned integer | |||
greater than 24b (probably 32b), and that the following constant has | greater than 24b (probably 32b), and that the following constant has | |||
been assigned:</t> | been assigned:</t> | |||
<sourcecode><![CDATA[ DIVOPT = 2^24]]></sourcecode> | <sourcecode><![CDATA[ DIVOPT = 2^24]]></sourcecode> | |||
<t>Every time a CE-marked data segment arrives, the Data Receiver | ||||
<t>Every time a CE marked data segment arrives, the Data Receiver | ||||
increments its local value of r.ceb by the size of the TCP Data. | increments its local value of r.ceb by the size of the TCP Data. | |||
Whenever it sends an ACK with an AccECN Option, the value it writes | Whenever it sends an ACK with an AccECN Option, the value it writes | |||
into the ECEB field is</t> | into the ECEB field is</t> | |||
<sourcecode><![CDATA[ ECEB = r.ceb % DIVOPT]]></sourcecode> | <sourcecode><![CDATA[ ECEB = r.ceb % DIVOPT]]></sourcecode> | |||
<t>where '%' is the remainder operator.</t> | <t>where '%' is the remainder operator.</t> | |||
<t>On the arrival of an AccECN Option, the Data Sender first makes | <t>On the arrival of an AccECN Option, the Data Sender first makes | |||
sure the ACK has not been superseded in order to avoid winding the | sure the ACK has not been superseded in order to avoid winding the | |||
s.ceb counter backwards. It uses the TCP acknowledgement number and | s.ceb counter backwards. It uses the TCP acknowledgement number and | |||
any SACK options <xref target="RFC2018"/> to calculate newlyAckedB, | any SACK options <xref target="RFC2018"/> to calculate newlyAckedB, | |||
the amount of new data that the ACK acknowledges in bytes (newlyAckedB | the amount of new data that the ACK acknowledges in bytes (newlyAckedB | |||
can be zero but not negative). If newlyAckedB is zero, either the ACK | can be zero but not negative). If newlyAckedB is zero, either the ACK | |||
has been superseded or CE-marked packet(s) without data could have | has been superseded or CE-marked packet(s) without data could have | |||
arrived. To break the tie for the latter case, the Data Sender could | arrived. To break the tie for the latter case, the Data Sender could | |||
use timestamps <xref target="RFC7323"/> (if present) to work out | use timestamps <xref target="RFC7323"/> (if present) to work out | |||
newlyAckedT, the amount of new time that the ACK acknowledges. If the | newlyAckedT, the amount of new time that the ACK acknowledges. If the | |||
Data Sender determines that the ACK has been superseded it ignores the | Data Sender determines that the ACK has been superseded, it ignores the | |||
AccECN Option. Otherwise, the Data Sender calculates the minimum | AccECN Option. Otherwise, the Data Sender calculates the minimum | |||
non-negative difference d.ceb between the ECEB field and its local | non-negative difference d.ceb between the ECEB field and its local | |||
s.ceb counter, using modulo arithmetic as follows:</t> | s.ceb counter, using modulo arithmetic as follows:</t> | |||
<sourcecode><![CDATA[ if ((newlyAckedB > 0) || (newlyAckedT > 0)) { | ||||
<figure> | ||||
<sourcecode><![CDATA[ if ((newlyAckedB > 0) || (newlyAckedT > 0)) { | ||||
d.ceb = (ECEB + DIVOPT - (s.ceb % DIVOPT)) % DIVOPT | d.ceb = (ECEB + DIVOPT - (s.ceb % DIVOPT)) % DIVOPT | |||
s.ceb += d.ceb | s.ceb += d.ceb | |||
} | } | |||
]]></sourcecode> | ]]></sourcecode> | |||
</figure> | ||||
<t>For example, if s.ceb is 33,554,433 and ECEB is 1461 (both | <t>For example, if s.ceb is 33,554,433 and ECEB is 1461 (both | |||
decimal), then</t> | decimal), then</t> | |||
<sourcecode><![CDATA[ s.ceb % DIVOPT = 1 | ||||
<figure> | ||||
<sourcecode><![CDATA[ s.ceb % DIVOPT = 1 | ||||
d.ceb = (1461 + 2^24 - 1) % 2^24 | d.ceb = (1461 + 2^24 - 1) % 2^24 | |||
= 1460 | = 1460 | |||
s.ceb = 33,554,433 + 1460 | s.ceb = 33,554,433 + 1460 | |||
= 33,555,893 | = 33,555,893 | |||
]]></sourcecode> | ]]></sourcecode> | |||
</figure> | <t>In practice, an implementation might use heuristics to guess the | |||
feedback in missing ACKs. Then when it subsequently receives feedback, | ||||
<t>In practice an implementation might use heuristics to guess the | ||||
feedback in missing ACKs, then when it subsequently receives feedback | ||||
it might find that it needs to correct its earlier heuristics as part | it might find that it needs to correct its earlier heuristics as part | |||
of the decoding process. The above decoding process does not include | of the decoding process. The above decoding process does not include | |||
any such heuristics.</t> | any such heuristics.</t> | |||
</section> | </section> | |||
<section anchor="accecn_Algo_ACE_Wrap"> | ||||
<section anchor="accecn_Algo_ACE_Wrap" | <name>Example Algorithm for Safety Against Long Sequences of ACK Loss</n | |||
title="Example Algorithm for Safety Against Long Sequences of ACK | ame> | |||
Loss"> | ||||
<t>The example algorithms below show how a Data Receiver in AccECN | <t>The example algorithms below show how a Data Receiver in AccECN | |||
mode could encode its CE packet counter r.cep into the ACE field, and | mode could encode its CE packet counter r.cep into the ACE field, and | |||
how the Data Sender in AccECN mode could decode the ACE field into its | how the Data Sender in AccECN mode could decode the ACE field into its | |||
s.cep counter. The Data Sender's algorithm includes code to | s.cep counter. The Data Sender's algorithm includes code to | |||
heuristically detect a long enough unbroken string of ACK losses that | heuristically detect a long enough unbroken string of ACK losses that | |||
could have concealed a cycle of the congestion counter in the ACE | could have concealed a cycle of the congestion counter in the ACE | |||
field of the next ACK to arrive.</t> | field of the next ACK to arrive.</t> | |||
<t>Two variants of the algorithm are given: i) a more conservative | <t>Two variants of the algorithm are given: i) a more conservative | |||
variant for a Data Sender to use if it detects that AccECN Options are | variant for a Data Sender to use if it detects that AccECN Options are | |||
not available (see <xref target="accecn_ACE_Safety"/> and <xref | not available (see <xref target="accecn_ACE_Safety"/> and <xref target=" | |||
target="accecn_Mbox_Interference"/>); and ii) a less conservative | accecn_Mbox_Interference"/>); and ii) a less conservative | |||
variant that is feasible when complementary information is available | variant that is feasible when complementary information is available | |||
from AccECN Options.</t> | from AccECN Options.</t> | |||
<section> | ||||
<section title="Safety Algorithm without the AccECN Option"> | <name>Safety Algorithm Without the AccECN Option</name> | |||
<t>It is assumed that each local packet counter is a sufficiently | <t>It is assumed that each local packet counter is a sufficiently | |||
sized unsigned integer (probably 32b) and that the following | sized unsigned integer (probably 32b) and that the following | |||
constant has been assigned:</t> | constant has been assigned:</t> | |||
<sourcecode><![CDATA[ DIVACE = 2^3]]></sourcecode> | <sourcecode><![CDATA[ DIVACE = 2^3]]></sourcecode> | |||
<t>Every time an Acceptable CE marked packet arrives (<xref target="ac | ||||
<t>Every time an Acceptable CE marked packet arrives (<xref | cecn_sec_ACE_feedback"/>), the Data Receiver increments | |||
target="accecn_sec_ACE_feedback"/>), the Data Receiver increments | ||||
its local value of r.cep by 1. It repeats the same value of ACE in | its local value of r.cep by 1. It repeats the same value of ACE in | |||
every subsequent ACK until the next CE marking arrives, where</t> | every subsequent ACK until the next CE marking arrives, where</t> | |||
<sourcecode><![CDATA[ ACE = r.cep % DIVACE.]]></sourcecode> | <sourcecode><![CDATA[ ACE = r.cep % DIVACE.]]></sourcecode> | |||
<t>If the Data Sender received an earlier value of the counter that | <t>If the Data Sender received an earlier value of the counter that | |||
had been delayed due to ACK reordering, it might incorrectly | had been delayed due to ACK reordering, it might incorrectly | |||
calculate that the ACE field had wrapped. Therefore, on the arrival | calculate that the ACE field had wrapped. Therefore, on the arrival | |||
of every ACK, the Data Sender ensures the ACK has not been | of every ACK, the Data Sender ensures the ACK has not been | |||
superseded using the TCP acknowledgement number, any SACK options | superseded using the TCP acknowledgement number, any SACK options, | |||
and timestamps (if available) to calculate newlyAckedB, as in <xref | and timestamps (if available) to calculate newlyAckedB, as in <xref ta | |||
target="accecn_Algo_Option_Coding"/>. If the ACK has not been | rget="accecn_Algo_Option_Coding"/>. If the ACK has not been | |||
superseded, the Data Sender calculates the minimum difference d.cep | superseded, the Data Sender calculates the minimum difference d.cep | |||
between the ACE field and its local s.cep counter, using modulo | between the ACE field and its local s.cep counter, using modulo | |||
arithmetic as follows:</t> | arithmetic as follows:</t> | |||
<sourcecode><![CDATA[ if ((newlyAckedB > 0) || (newlyAckedT > 0)) | <sourcecode><![CDATA[ if ((newlyAckedB > 0) || (newlyAckedT > 0)) | |||
d.cep = (ACE + DIVACE - (s.cep % DIVACE)) % DIVACE | d.cep = (ACE + DIVACE - (s.cep % DIVACE)) % DIVACE | |||
]]></sourcecode> | ]]></sourcecode> | |||
<t><xref target="accecn_ACE_Safety"/> expects the Data Sender to | <t><xref target="accecn_ACE_Safety"/> expects the Data Sender to | |||
assume that the ACE field cycled if it is the safest likely case | assume that the ACE field cycled if it is the safest likely case | |||
under prevailing conditions. The 3-bit ACE field in an arriving ACK | under prevailing conditions. The 3-bit ACE field in an arriving ACK | |||
could have cycled and become ambiguous to the Data Sender if a | could have cycled and become ambiguous to the Data Sender if a | |||
sequence of ACKs goes missing that covers a stream of data long | sequence of ACKs goes missing that covers a stream of data long | |||
enough to contain 8 or more CE marks. We use the word `missing' | enough to contain 8 or more CE marks. We use the word 'missing' | |||
rather than `lost', because some or all the missing ACKs might | rather than 'lost', because some or all the missing ACKs might | |||
arrive eventually, but out of order. Even if some of the missing | arrive eventually, but out of order. Even if some of the missing | |||
ACKs were piggy-backed on data (i.e., not pure ACKs), | ACKs were piggy-backed on data (i.e., not pure ACKs), | |||
retransmissions will not repair the lost AccECN information, because | retransmissions will not repair the lost AccECN information, because | |||
AccECN requires retransmissions to carry the latest AccECN counters, | AccECN requires retransmissions to carry the latest AccECN counters, | |||
not the original ones.</t> | not the original ones.</t> | |||
<!-- [rfced] We are having trouble parsing this sentence. Where does the "which " statement end - after "full-sized"? Does "it" refer to the algorithm? | ||||
<t>The phrase `under prevailing conditions' allows for | Original: | |||
However, we shall start | ||||
with the simplest algorithm, which assumes segments are all full- | ||||
sized and ultra-conservatively it assumes that ECN marking was 100% | ||||
on the forward path when ACKs on the reverse path started to all be | ||||
dropped. | ||||
--> | ||||
<t>The phrase 'under prevailing conditions' allows for | ||||
implementation-dependent interpretation. A Data Sender might take | implementation-dependent interpretation. A Data Sender might take | |||
account of the prevailing size of data segments and the prevailing | account of the prevailing size of data segments and the prevailing | |||
CE marking rate just before the sequence of missing ACKs. However, | CE marking rate just before the sequence of missing ACKs. However, | |||
we shall start with the simplest algorithm, which assumes segments | we shall start with the simplest algorithm, which assumes segments | |||
are all full-sized and, ultra-conservatively, that ECN | are all full-sized and, ultra-conservatively, that ECN | |||
marking was 100% on the forward path when ACKs on the reverse path | marking was 100% on the forward path when ACKs on the reverse path | |||
all started to be dropped. Specifically, if newlyAckedB is the | all started to be dropped. Specifically, if newlyAckedB is the | |||
amount of data that an ACK acknowledges since the previous ACK, then | amount of data that an ACK acknowledges since the previous ACK, then | |||
the Data Sender could assume that this acknowledges newlyAckedPkt | the Data Sender could assume that this acknowledges newlyAckedPkt | |||
full-sized segments, where newlyAckedPkt = newlyAckedB/MSS. Then it | full-sized segments, where newlyAckedPkt = newlyAckedB/MSS. Then it | |||
could assume that the ACE field incremented by</t> | could assume that the ACE field incremented by</t> | |||
<sourcecode><![CDATA[ dSafer.cep = newlyAckedPkt - ((newlyAckedPkt | ||||
<sourcecode><![CDATA[ dSafer.cep = newlyAckedPkt - ((newlyAckedPkt | - d.cep) % DIVACE) | |||
- d.cep) % DIVACE), | ||||
]]></sourcecode> | ]]></sourcecode> | |||
d.cep) % <span class="insert">DIVACE)</span> | <!-- [rfced] May we change "works out" to "indicates" or "determines"? | |||
Original: | ||||
The above formula works out that it | ||||
would still be safe to assume 2 CE marks (because 9 - ((9-2) % 8) = | ||||
2). | ||||
--> | ||||
<t>For example, imagine an ACK acknowledges newlyAckedPkt=9 more | <t>For example, imagine an ACK acknowledges newlyAckedPkt=9 more | |||
full-size segments than any previous ACK, and that ACE increments by | full-size segments than any previous ACK, and that ACE increments by | |||
a minimum of 2 CE marks (d.cep=2). The above formula works out that | a minimum of 2 CE marks (d.cep=2). The above formula works out that | |||
it would still be safe to assume 2 CE marks (because 9 - ((9-2) % 8) | it would still be safe to assume 2 CE marks (because 9 - ((9-2) % 8) | |||
= 2). However, if ACE increases by a minimum of 2 but acknowledges | = 2). However, if ACE increases by a minimum of 2 but acknowledges | |||
10 full-sized segments, then it would be necessary to assume that | 10 full-sized segments, then it would be necessary to assume that | |||
there could have been 10 CE marks (because 10 - ((10-2) % 8) = | there could have been 10 CE marks (because 10 - ((10-2) % 8) = | |||
10).</t> | 10).</t> | |||
skipping to change at line 3487 ¶ | skipping to change at line 3479 ¶ | |||
a minimum of 2 CE marks (d.cep=2). The above formula works out that | a minimum of 2 CE marks (d.cep=2). The above formula works out that | |||
it would still be safe to assume 2 CE marks (because 9 - ((9-2) % 8) | it would still be safe to assume 2 CE marks (because 9 - ((9-2) % 8) | |||
= 2). However, if ACE increases by a minimum of 2 but acknowledges | = 2). However, if ACE increases by a minimum of 2 but acknowledges | |||
10 full-sized segments, then it would be necessary to assume that | 10 full-sized segments, then it would be necessary to assume that | |||
there could have been 10 CE marks (because 10 - ((10-2) % 8) = | there could have been 10 CE marks (because 10 - ((10-2) % 8) = | |||
10).</t> | 10).</t> | |||
<t>Note that checks would need to be added to the above pseudocode | <t>Note that checks would need to be added to the above pseudocode | |||
for (d.cep > newlyAckedPkt), which could occur if newlyAckedPkt | for (d.cep > newlyAckedPkt), which could occur if newlyAckedPkt | |||
had been wrongly estimated using an inappropriate packet size.</t> | had been wrongly estimated using an inappropriate packet size.</t> | |||
<t>ACKs that acknowledge a large stretch of packets might be common | <t>ACKs that acknowledge a large stretch of packets might be common | |||
in data centres to achieve a high packet rate or might be due to ACK | in data centres to achieve a high packet rate or might be due to ACK | |||
thinning by a middlebox. In these cases, cycling of the ACE field | thinning by a middlebox. In these cases, cycling of the ACE field | |||
would often appear to have been possible, so the above algorithm | would often appear to have been possible, so the above algorithm | |||
would be over-conservative, leading to a false high marking rate and | would be overly conservative, leading to a false high marking rate and | |||
poor performance. Therefore it would be reasonable to only use | poor performance. Therefore, it would be reasonable to only use | |||
dSafer.cep rather than d.cep if the moving average of newlyAckedPkt | dSafer.cep rather than d.cep if the moving average of newlyAckedPkt | |||
was well below 8.</t> | was well below 8.</t> | |||
<t>Implementers could build in more heuristics to estimate | <t>Implementers could build in more heuristics to estimate | |||
prevailing average segment size and prevailing ECN marking. For | a prevailing average segment size and prevailing ECN marking. For | |||
instance, newlyAckedPkt in the above formula could be replaced with | instance, newlyAckedPkt in the above formula could be replaced with | |||
newlyAckedPktHeur = newlyAckedPkt*p*MSS/s, where s is the prevailing | newlyAckedPktHeur = newlyAckedPkt*p*MSS/s, where s is the prevailing | |||
segment size and p is the prevailing ECN marking probability. | segment size and p is the prevailing ECN marking probability. | |||
However, ultimately, if TCP's ECN feedback becomes inaccurate it | However, ultimately, if TCP's ECN feedback becomes inaccurate, it | |||
still has loss detection to fall back on. Therefore, it would seem | still has loss detection to fall back on. Therefore, it would seem | |||
safe to implement a simple algorithm, rather than a perfect one.</t> | safe to implement a simple algorithm, rather than a perfect one.</t> | |||
<!-- [rfced] Does "5% of full-sized" mean segments are "5% of their full size"? | ||||
May we change "as long as" to "while" for readability? | ||||
Original: | ||||
The simple algorithm for dSafer.cep above requires no monitoring of | ||||
prevailing conditions and it would still be safe if, for example, | ||||
segments were on average at least 5% of full-sized as long as ECN | ||||
marking was 5% or less. | ||||
--> | ||||
<t>The simple algorithm for dSafer.cep above requires no monitoring | <t>The simple algorithm for dSafer.cep above requires no monitoring | |||
of prevailing conditions and it would still be safe if, for example, | of prevailing conditions and it would still be safe if, for example, | |||
segments were on average at least 5% of full-sized as long as ECN | segments were on average at least 5% of full-sized as long as ECN | |||
marking was 5% or less. Assuming it was used, the Data Sender would | marking was 5% or less. Assuming it was used, the Data Sender would | |||
increment its packet counter as follows:</t> | increment its packet counter as follows:</t> | |||
<sourcecode><![CDATA[ s.cep += dSafer.cep]]></sourcecode> | <sourcecode><![CDATA[ s.cep += dSafer.cep]]></sourcecode> | |||
<!-- [rfced] We updated the text to point directly to Section 3.2.2.5.2 (where t | ||||
he quoted text appears). Please let us know any concerns. | ||||
Original: | ||||
If missing acknowledgement numbers arrive later (due to reordering), | ||||
Section 3.2.2.5 says "the Data Sender MAY attempt to neutralize the | ||||
effect of any action it took based on a conservative assumption that | ||||
it later found to be incorrect". | ||||
--> | ||||
<t>If missing acknowledgement numbers arrive later (due to | <t>If missing acknowledgement numbers arrive later (due to | |||
reordering), <xref target="accecn_ACE_Safety"/> says "the Data | reordering), <xref target="accecn_ACE_Safety_S"/> says "the Data | |||
Sender MAY attempt to neutralize the effect of any action it took | Sender <bcp14>MAY</bcp14> attempt to neutralize the effect of any acti | |||
on it took | ||||
based on a conservative assumption that it later found to be | based on a conservative assumption that it later found to be | |||
incorrect". To do this, the Data Sender would have to store the | incorrect". To do this, the Data Sender would have to store the | |||
values of all the relevant variables whenever it made assumptions, | values of all the relevant variables whenever it made assumptions, | |||
so that it could re-evaluate them later. Given this could become | so that it could re-evaluate them later. Given this could become | |||
complex and it is not required, we do not attempt to provide an | complex and it is not required, we do not attempt to provide an | |||
example of how to do this.</t> | example of how to do this.</t> | |||
</section> | </section> | |||
<section> | ||||
<section title="Safety Algorithm with the AccECN Option"> | <name>Safety Algorithm with the AccECN Option</name> | |||
<!--ToDo: Ilpo says this algo is useless, 'cos (I think) you don't have the state of d.ceb and d.cep at the same time. | <!--ToDo: Ilpo says this algo is useless, 'cos (I think) you don't have the state of d.ceb and d.cep at the same time. | |||
See emails 3/1/20.--> | See emails 3/1/20.--> | |||
<t>When AccECN Options are available on the ACKs before and after | <t>When AccECN Options are available on the ACKs before and after | |||
the possible sequence of ACK losses, if the Data Sender only needs | the possible sequence of ACK losses, if the Data Sender only needs | |||
CE-marked bytes, it will have sufficient information in AccECN | CE-marked bytes, it will have sufficient information in AccECN | |||
Options without needing to process the ACE field. If, for some reason, | Options without needing to process the ACE field. If, for some reason, | |||
it needs CE-marked packets and dSafer.cep is different from d.cep, | it needs CE-marked packets and dSafer.cep is different from d.cep, | |||
it can determine whether d.cep is likely to be a safe enough | it can determine whether d.cep is likely to be a safe enough | |||
estimate by checking whether the average marked segment size (s = | estimate by checking whether the average marked segment size (s = | |||
d.ceb/d.cep) is less than the MSS (where d.ceb is the amount of | d.ceb/d.cep) is less than the MSS (where d.ceb is the amount of | |||
newly CE-marked bytes - see <xref | newly CE-marked bytes -- see <xref target="accecn_Algo_Option_Coding"/ | |||
target="accecn_Algo_Option_Coding"/>). Specifically, it could use | >). Specifically, it could use | |||
the following algorithm:</t> | the following algorithm:</t> | |||
<sourcecode><![CDATA[ SAFETY_FACTOR = 2 | ||||
<figure> | ||||
<sourcecode><![CDATA[ SAFETY_FACTOR = 2 | ||||
if (dSafer.cep > d.cep) { | if (dSafer.cep > d.cep) { | |||
if (d.ceb <= MSS * d.cep) { % Same as (s <= MSS), but no DBZ | if (d.ceb <= MSS * d.cep) { % Same as (s <= MSS), but no DBZ | |||
sSafer = d.ceb/dSafer.cep | sSafer = d.ceb/dSafer.cep | |||
if (sSafer < MSS/SAFETY_FACTOR) | if (sSafer < MSS/SAFETY_FACTOR) | |||
dSafer.cep = d.cep % d.cep is a safe enough estimate | dSafer.cep = d.cep % d.cep is a safe enough estimate | |||
} % else | } % else | |||
% No need for else; dSafer.cep is already correct, | % No need for else; dSafer.cep is already correct, | |||
% because d.cep must have been too small | % because d.cep must have been too small | |||
} | } | |||
]]></sourcecode> | ]]></sourcecode> | |||
</figure> | <!-- [rfced] We are having trouble parsing "will consider d.cep can replace". P | |||
lease clarify. | ||||
Original: | ||||
The chart below shows when the above algorithm will consider d.cep | ||||
can replace dSafer.cep as a safe enough estimate of the number of CE- | ||||
marked packets: | ||||
Perhaps: | ||||
The chart below shows when the above algorithm will consider the number | ||||
of CE-marked packets as a safe enough estimate to replace dsafer.cep | ||||
with d.cep. | ||||
--> | ||||
<t>The chart below shows when the above algorithm will consider that | <t>The chart below shows when the above algorithm will consider that | |||
d.cep can replace dSafer.cep as a safe enough estimate of the number | d.cep can replace dSafer.cep as a safe enough estimate of the number | |||
of CE-marked packets:</t> | of CE-marked packets:</t> | |||
<artwork align="left"><![CDATA[ | ||||
<figure align="left"> | ^ | |||
<artwork align="left"><![CDATA[ ^ | ||||
sSafer| | sSafer| | |||
| | | | |||
MSS+ | MSS+ | |||
| | | | |||
| dSafer.cep | | dSafer.cep | |||
| is | | is | |||
MSS/SAFETY_FACTOR+--------------+ safest | MSS/SAFETY_FACTOR+--------------+ safest | |||
| | | | | | |||
| d.cep is safe| | | d.cep is safe| | |||
| enough | | | enough | | |||
skipping to change at line 3573 ¶ | skipping to change at line 3585 ¶ | |||
MSS+ | MSS+ | |||
| | | | |||
| dSafer.cep | | dSafer.cep | |||
| is | | is | |||
MSS/SAFETY_FACTOR+--------------+ safest | MSS/SAFETY_FACTOR+--------------+ safest | |||
| | | | | | |||
| d.cep is safe| | | d.cep is safe| | |||
| enough | | | enough | | |||
+--------------------> | +--------------------> | |||
MSS s | MSS s | |||
]]></artwork> | ]]></artwork> | |||
</figure> | ||||
<t>The following examples give the reasoning behind the algorithm, | <t>The following examples give the reasoning behind the algorithm, | |||
assuming MSS=1460:<list style="symbols"> | assuming MSS=1460:</t> | |||
<t>if d.cep=0, dSafer.cep=8 and d.ceb=1460, then s=infinity and | <ul spacing="normal"> | |||
sSafer=182.5.<vspace blankLines="0"/>Therefore even though the | <li> | |||
<t>if d.cep=0, dSafer.cep=8, and d.ceb=1460, then s=infinity and | ||||
sSafer=182.5.</t> | ||||
<t>Therefore, even though the | ||||
average size of 8 data segments is unlikely to have been as | average size of 8 data segments is unlikely to have been as | |||
small as MSS/8, d.cep cannot have been correct, because it would | small as MSS/8, d.cep cannot have been correct, because it would | |||
imply an average segment size greater than the MSS.</t> | imply an average segment size greater than the MSS.</t> | |||
</li> | ||||
<t>if d.cep=2, dSafer.cep=10 and d.ceb=1460, then s=730 and | <li> | |||
sSafer=146.<vspace blankLines="0"/>Therefore d.cep is safe | <t>if d.cep=2, dSafer.cep=10, and d.ceb=1460, then s=730 and | |||
sSafer=146.</t> | ||||
<t>Therefore, d.cep is safe | ||||
enough, because the average size of 10 data segments is unlikely | enough, because the average size of 10 data segments is unlikely | |||
to have been as small as MSS/10.</t> | to have been as small as MSS/10.</t> | |||
</li> | ||||
<t>if d.cep=7, dSafer.cep=15 and d.ceb=10200, then s=1457 and | <li> | |||
sSafer=680.<vspace blankLines="0"/>Therefore d.cep is safe | <t>if d.cep=7, dSafer.cep=15, and d.ceb=10200, then s=1457 and | |||
sSafer=680.</t> | ||||
<t>Therefore, d.cep is safe | ||||
enough, because the average data segment size is more likely to | enough, because the average data segment size is more likely to | |||
have been just less than one MSS, rather than below MSS/2.</t> | have been just less than one MSS, rather than below MSS/2.</t> | |||
</list></t> | </li> | |||
</ul> | ||||
<t>If pure ACKs were allowed to be ECN-capable, missing ACKs would | <t>If pure ACKs were allowed to be ECN-capable, missing ACKs would | |||
be far less likely. However, because <xref target="RFC3168"/> | be far less likely. However, because <xref target="RFC3168"/> | |||
currently precludes this, the above algorithm assumes that pure ACKs | currently precludes this, the above algorithm assumes that pure ACKs | |||
are not ECN-capable.</t> | are not ECN-capable.</t> | |||
</section> | </section> | |||
</section> | </section> | |||
<section anchor="accecn_Algo_ACE_Bytes"> | ||||
<name>Example Algorithm to Estimate Marked Bytes from Marked Packets</na | ||||
me> | ||||
<!-- [rfced] To what does "this" refer - the ACK? The sentence prior is include | ||||
d for context. | ||||
Original: | ||||
If AccECN Options are not available, the Data Sender can only decode | ||||
CE-marking from the ACE field in packets. Every time an ACK arrives, | ||||
to convert this into an estimate of CE-marked bytes, it needs an | ||||
average of the segment size, s_ave. | ||||
--> | ||||
<section anchor="accecn_Algo_ACE_Bytes" | ||||
title="Example Algorithm to Estimate Marked Bytes from Marked Pac | ||||
kets"> | ||||
<t>If AccECN Options are not available, the Data Sender can only | <t>If AccECN Options are not available, the Data Sender can only | |||
decode CE-marking from the ACE field in packets. Every time an ACK | decode a CE marking from the ACE field in packets. Every time an ACK | |||
arrives, to convert this packet count into an estimate of CE-marked bytes, it needs | arrives, to convert this packet count into an estimate of CE-marked bytes, it needs | |||
an average of the segment size, s_ave. Then it can add or subtract | an average of the segment size, s_ave. Then it can add or subtract | |||
s_ave from the value of d.ceb as the value of d.cep increments or | s_ave from the value of d.ceb as the value of d.cep increments or | |||
decrements. Some possible ways to calculate s_ave are outlined below. | decrements. Some possible ways to calculate s_ave are outlined below. | |||
The precise details will depend on why an estimate of marked bytes is | The precise details will depend on why an estimate of marked bytes is | |||
needed.</t> | needed.</t> | |||
<t>The implementation could keep a record of the byte numbers of all | <t>The implementation could keep a record of the byte numbers of all | |||
the boundaries between packets in flight (including control packets), | the boundaries between packets in flight (including control packets), | |||
and recalculate s_ave on every ACK. However it would be simpler to | and recalculate s_ave on every ACK. However, it would be simpler to | |||
merely maintain a counter packets_in_flight for the number of packets | merely maintain a counter packets_in_flight for the number of packets | |||
in flight (including control packets), which is reset once per RTT. | in flight (including control packets), which is reset once per RTT. | |||
Either way, it would estimate s_ave as:</t> | Either way, it would estimate s_ave as:</t> | |||
<sourcecode><![CDATA[ s_ave ~= flightsize / packets_in_flight,]]></sourcecode> | <sourcecode><![CDATA[ s_ave ~= flightsize / packets_in_flight,]]></sourcecode> | |||
<t>where flightsize is the variable that TCP already maintains for the | <t>where flightsize is the variable that TCP already maintains for the | |||
number of bytes in flight and '~=' means 'approximately equal to'. To | number of bytes in flight and '~=' means 'approximately equal to'. To | |||
avoid floating point arithmetic, it could right-bit-shift by | avoid floating point arithmetic, it could right-bit-shift by | |||
lg(packets_in_flight), where lg() means log base 2.</t> | lg(packets_in_flight), where lg() means log base 2.</t> | |||
<t>An alternative would be to maintain an exponentially weighted | <t>An alternative would be to maintain an exponentially weighted | |||
moving average (EWMA) of the segment size:</t> | moving average (EWMA) of the segment size:</t> | |||
<sourcecode><![CDATA[ s_ave = a * s + (1-a) * s_ave,]]></sourcecode> | <sourcecode><![CDATA[ s_ave = a * s + (1-a) * s_ave,]]></sourcecode> | |||
<t>where a is the decay constant for the EWMA. However, then it is | <t>where a is the decay constant for the EWMA. However, then it is | |||
necessary to choose a good value for this constant, which ought to | necessary to choose a good value for this constant, which ought to | |||
depend on the number of packets in flight. Also, the decay constant | depend on the number of packets in flight. Also, the decay constant | |||
needs to be a power of two to avoid floating point arithmetic.</t> | needs to be a power of two to avoid floating point arithmetic.</t> | |||
</section> | </section> | |||
<section anchor="accecn_Algo_Not-ECT"> | ||||
<section anchor="accecn_Algo_Not-ECT" | <name>Example Algorithm to Count Not-ECT Bytes</name> | |||
title="Example Algorithm to Count Not-ECT Bytes"> | ||||
<t>A Data Sender in AccECN mode can infer the amount of TCP payload | <t>A Data Sender in AccECN mode can infer the amount of TCP payload | |||
data arriving at the receiver marked Not-ECT from the difference | data arriving at the receiver marked Not-ECT from the difference | |||
between the amount of newly ACKed data and the sum of the bytes with | between the amount of newly ACKed data and the sum of the bytes with | |||
the other three markings, d.ceb, d.e0b and d.e1b.</t> | the other three markings, d.ceb, d.e0b, and d.e1b.</t> | |||
<!--ToDo: write-up pseudocode, rather than just describe it.--> | <!--ToDo: write-up pseudocode, rather than just describe it.--> | |||
<t>For this approach to be precise, it has to be assumed that spurious | <t>For this approach to be precise, it has to be assumed that spurious | |||
(unnecessary) retransmissions do not lead to double counting. This | (unnecessary) retransmissions do not lead to double counting. This | |||
assumption is currently correct, given that RFC 3168 requires that the | assumption is currently correct, given that RFC 3168 requires that the | |||
Data Sender marks retransmitted segments as Not-ECT. However, the | Data Sender mark retransmitted segments as Not-ECT. However, the | |||
converse is not true; necessary retransmissions will result in | converse is not true; necessary retransmissions will result in | |||
under-counting.</t> | undercounting.</t> | |||
<t>However, such precision is unlikely to be necessary. The only known | <t>However, such precision is unlikely to be necessary. The only known | |||
use of a count of Not-ECT marked bytes is to test whether equipment on | use of a count of Not-ECT marked bytes is to test whether equipment on | |||
the path is clearing the ECN field (perhaps due to an outdated | the path is clearing the ECN field (perhaps due to an outdated | |||
attempt to clear, or bleach, what used to be the IPv4 ToS byte or the | attempt to clear, or bleach, what used to be the IPv4 ToS byte or the | |||
IPv6 Traffic Class field). To detect bleaching it will be sufficient | IPv6 Traffic Class field). To detect bleaching, it will be sufficient | |||
to detect whether nearly all bytes arrive marked as Not-ECT. Therefore | to detect whether nearly all bytes arrive marked as Not-ECT. Therefore, | |||
there ought to be no need to keep track of the details of | there ought to be no need to keep track of the details of | |||
retransmissions.</t> | retransmissions.</t> | |||
</section> | </section> | |||
</section> | </section> | |||
<section anchor="accecn_flags_rationale"> | ||||
<section anchor="accecn_flags_rationale" | <name>Rationale for Usage of TCP Header Flags</name> | |||
title="Rationale for Usage of TCP Header Flags"> | <section> | |||
<section title="Three TCP Header Flags in the SYN-SYN/ACK Handshake"> | <name>Three TCP Header Flags in the SYN-SYN/ACK Handshake</name> | |||
<t>AccECN uses a rather unorthodox approach to negotiate the highest | <t>AccECN uses a rather unorthodox approach to negotiate the highest | |||
version TCP ECN feedback scheme that both ends support, as justified | version TCP ECN feedback scheme that both ends support, as justified | |||
below. It follows from the original TCP ECN capability negotiation | below. It follows from the original TCP ECN capability negotiation | |||
<xref target="RFC3168"/>, in which the Client set the 2 least | <xref target="RFC3168"/>, in which the Client set the 2 least | |||
significant of the original reserved flags in the TCP header, and fell | significant of the original reserved flags in the TCP header, and fell | |||
back to no ECN support if the Server responded with the 2 flags | back to No ECN support if the Server responded with the 2 flags | |||
cleared, which had previously been the default.</t> | cleared, which had previously been the default.</t> | |||
<t>Classic ECN used header flags rather than a TCP option because it | <t>Classic ECN used header flags rather than a TCP option because it | |||
was considered more efficient to use a header flag for 1 bit of | was considered more efficient to use a header flag for 1 bit of | |||
feedback per ACK, and this bit could be overloaded to indicate support | feedback per ACK, and this bit could be overloaded to indicate support | |||
for Classic ECN during the handshake. During the development of ECN, 1 | for Classic ECN during the handshake. During the development of ECN, 1 | |||
bit crept up to 2, in order to deliver the feedback reliably and to | bit crept up to 2, in order to deliver the feedback reliably and to | |||
work round some broken hosts that reflected the reserved flags during | work round some broken hosts that reflected the reserved flags during | |||
the handshake.</t> | the handshake.</t> | |||
<t>In order to be backward compatible with RFC 3168, AccECN continues | <t>In order to be backward compatible with RFC 3168, AccECN continues | |||
this approach, using the 3rd least significant TCP header flag that | this approach, using the 3rd least significant TCP header flag that | |||
had previously been allocated for the ECN nonce (now historic). Then, | had previously been allocated for the ECN-nonce (now historic). Then, | |||
whatever form of Server an AccECN Client encounters, the connection | whatever form of Server an AccECN Client encounters, the connection | |||
can fall back to the highest version of feedback protocol that both | can fall back to the highest version of feedback protocol that both | |||
ends support, as explained in <xref target="accecn_Negotiation"/>.</t> | ends support, as explained in <xref target="accecn_Negotiation"/>.</t> | |||
<t>If AccECN capability negotiation had used the more orthodox | <t>If AccECN capability negotiation had used the more orthodox | |||
approach of a TCP option, it would still have had to set the two ECN | approach of a TCP option, it would still have had to set the two ECN | |||
flags in the main TCP header, in order to be able to fall back to | flags in the main TCP header, in order to be able to fall back to | |||
Classic RFC 3168 ECN, or to disable ECN support, without another round | Classic ECN <xref target="RFC3168"/>, or to disable ECN support, without another round | |||
of negotiation. Then AccECN would also have had to handle all the | of negotiation. Then AccECN would also have had to handle all the | |||
different ways that Servers currently respond to settings of the ECN | different ways that Servers currently respond to settings of the ECN | |||
flags in the main TCP header, including all the conflicting cases | flags in the main TCP header, including all of the conflicting cases | |||
where a Server might have said it supported one approach in the flags | where a Server might have said it supported one approach in the flags | |||
and another approach in a new TCP option. And AccECN would have had to | and another approach in a new TCP option. And AccECN would have had to | |||
deal with all the additional possibilities where a middlebox might | deal with all of the additional possibilities where a middlebox might | |||
have mangled the ECN flags, or removed TCP options. Thus, usage of the | have mangled the ECN flags, or removed TCP options. Thus, usage of the | |||
3rd reserved TCP header flag simplified the protocol.</t> | 3rd reserved TCP header flag simplified the protocol.</t> | |||
<t>The third flag was used in a way that could be distinguished from | <t>The third flag was used in a way that could be distinguished from | |||
the ECN nonce, in case any nonce deployment was encountered. Previous | the ECN-nonce, in case any nonce deployment was encountered. Previous | |||
usage of this flag for the ECN nonce was integrated into the original | usage of this flag for the ECN-nonce was integrated into the original | |||
ECN negotiation. This further justified the 3rd flag's use for AccECN, | ECN negotiation. This further justified the third flag's use for AccECN, | |||
because a non-ECN usage of this flag would have had to use it as a | because a non-ECN usage of this flag would have had to use it as a | |||
separate single bit, rather than in combination with the other 2 ECN | separate single bit, rather than in combination with the other 2 ECN | |||
flags.</t> | flags.</t> | |||
<t>Indeed, having overloaded the original uses of these three flags | <t>Indeed, having overloaded the original uses of these three flags | |||
for its handshake, AccECN overloads all three bits again as a 3-bit | for its handshake, AccECN overloads all three bits again as a 3-bit | |||
counter.</t> | counter.</t> | |||
</section> | </section> | |||
<section> | ||||
<section title="Four Codepoints in the SYN/ACK"> | <name>Four Codepoints in the SYN/ACK</name> | |||
<t>Of the 8 possible codepoints that the 3 TCP header flags can | <t>Of the eight possible codepoints that the three TCP header flags can | |||
indicate on the SYN/ACK, 4 already indicated earlier (or broken) | indicate on the SYN/ACK, four already indicated earlier (or broken) | |||
versions of ECN support, 1 now being historic. In the early design of | versions of ECN support, one now being Historic. In the early design of | |||
AccECN, an AccECN Server could use only 2 of the 4 remaining | AccECN, an AccECN Server could use only 2 of the 4 remaining | |||
codepoints. They both indicated AccECN support, but one fed back that | codepoints. They both indicated AccECN support, but one fed back that | |||
the SYN had arrived marked as CE. Even though ECN support on a SYN is | the SYN had arrived marked as CE. Even though ECN support on a SYN is | |||
not yet on the standards track, the idea is for either end to act as a | not yet on the Standards Track, the idea is for either end to act as a | |||
mechanistic reflector, so that future capabilities can be unilaterally | mechanistic reflector, so that future capabilities can be unilaterally | |||
deployed without requiring 2-ended deployment (justified in <xref | deployed without requiring 2-ended deployment (justified in <xref target | |||
target="accecn_demb_reflector"/>).</t> | ="accecn_demb_reflector"/>).</t> | |||
<!-- [rfced] Does "earlier versions" refer to earlier draft versions of this doc | ||||
ument? | ||||
<t>During traversal testing it was discovered that the IP-ECN field in | Original: | |||
the SYN was mangled on a non-negligible proportion of paths. Therefore | This development consumed the remaining 2 codepoints | |||
on the SYN/ACK that had been reserved for future use by AccECN in | ||||
earlier versions. | ||||
--> | ||||
<t>During traversal testing, it was discovered that the IP-ECN field in | ||||
the SYN was mangled on a non-negligible proportion of paths. Therefore, | ||||
it was necessary to allow the SYN/ACK to feed all four IP-ECN | it was necessary to allow the SYN/ACK to feed all four IP-ECN | |||
codepoints that the SYN could arrive with back to the Client. Without | codepoints that the SYN could arrive with back to the Client. Without | |||
this, the Client could not know whether to disable ECN for the | this, the Client could not know whether to disable ECN for the | |||
connection due to mangling of the IP-ECN field (also explained in | connection due to mangling of the IP-ECN field (also explained in | |||
<xref target="accecn_demb_reflector"/>). This development consumed the | <xref target="accecn_demb_reflector"/>). This development consumed the | |||
remaining 2 codepoints on the SYN/ACK that had been reserved for | remaining two codepoints on the SYN/ACK that had been reserved for | |||
future use by AccECN in earlier versions.</t> | future use by AccECN in earlier versions.</t> | |||
</section> | </section> | |||
<section anchor="accecn_space_evolution"> | ||||
<section anchor="accecn_space_evolution" | <name>Space for Future Evolution</name> | |||
title="Space for Future Evolution"> | ||||
<t>Despite availability of usable TCP header space being extremely | <t>Despite availability of usable TCP header space being extremely | |||
scarce, the AccECN protocol has taken all possible steps to ensure | scarce, the AccECN protocol has taken all possible steps to ensure | |||
that there is space to negotiate possible future variants of the | that there is space to negotiate possible future variants of the | |||
protocol, either if a variant of AccECN is required, or if a | protocol, either if a variant of AccECN is required, or if a | |||
completely different ECN feedback approach is needed:<list | completely different ECN feedback approach is needed.</t> | |||
style="hanging"> | <dl newline="false" spacing="normal"> | |||
<t hangText="Future AccECN variants:">When the AccECN capability | <dt>Future AccECN variants:</dt> | |||
is negotiated during TCP's three-way handshake, the rows in <xref | <dd> | |||
target="accecn_Tab_Negotiation"/> tagged as 'Nonce' and 'Broken' | <t>When the AccECN capability | |||
is negotiated during TCP's three-way handshake, the rows in <xref ta | ||||
rget="accecn_Tab_Negotiation"/> tagged as 'Nonce' and 'Broken' | ||||
in the column for the capability of node B are unused by any | in the column for the capability of node B are unused by any | |||
current protocol in the RFC series. These could be used by TCP | current protocol defined in the RFC series. These could be used by T | |||
Servers in future to indicate a variant of the AccECN protocol. In | CP | |||
Servers in the future to indicate a variant of the AccECN protocol. | ||||
In | ||||
recent measurement studies in which the response of large numbers | recent measurement studies in which the response of large numbers | |||
of Servers to an AccECN SYN has been tested, e.g., <xref | of Servers to an AccECN SYN has been tested, e.g., <xref target="Man | |||
target="Mandalari18"/>, a very small number of SYN/ACKs arrive | dalari18"/>, a very small number of SYN/ACKs arrive | |||
with the pattern tagged as 'Nonce', and a small but more | with the pattern tagged as 'Nonce', and a small but more | |||
significant number arrive with the pattern tagged as 'Broken'. The | significant number arrive with the pattern tagged as 'Broken'. The | |||
'Nonce' pattern could be a sign that a few Servers have | 'Nonce' pattern could be a sign that a few Servers have | |||
implemented the ECN Nonce <xref target="RFC3540"/>, which has now | implemented the ECN-nonce <xref target="RFC3540"/>, which has now | |||
been reclassified as historic <xref target="RFC8311"/>, or it | been reclassified as Historic <xref target="RFC8311"/>, or it | |||
could be the random result of some unknown middlebox behaviour. | could be the random result of some unknown middlebox behaviour. | |||
The greater prevalence of the 'Broken' pattern suggests that some | The greater prevalence of the 'Broken' pattern suggests that some | |||
instances still exist of the broken code that reflects the | instances still exist of the broken code that reflects the | |||
reserved flags on the SYN.<vspace blankLines="1"/>The requirement | reserved flags on the SYN.</t> | |||
<t>The requirement | ||||
not to reject unexpected initial values of the ACE counter (in the | not to reject unexpected initial values of the ACE counter (in the | |||
main TCP header) in the last paragraph of <xref | main TCP header) in the last paragraph of <xref target="accecn_sec_A | |||
target="accecn_sec_ACE_init_invalid"/> ensures that 3 unused | CE_init_invalid"/> ensures that three unused | |||
codepoints on the ACK of the SYN/ACK, 6 unused values on the first | codepoints on the ACK of the SYN/ACK, six unused values on the first | |||
SYN=0 data packet from the Client and 7 unused values on the first | SYN=0 data packet from the Client, and seven unused values on the fi | |||
rst | ||||
SYN=0 data packet from the Server could be used to declare future | SYN=0 data packet from the Server could be used to declare future | |||
variants of the AccECN protocol. The word 'declare' is used rather | variants of the AccECN protocol. The word 'declare' is used rather | |||
than 'negotiate' because, at this late stage in the three-way handsh ake, it would | than 'negotiate' because, at this late stage in the three-way handsh ake, it would | |||
be too late for a negotiation between the endpoints to be | be too late for a negotiation between the endpoints to be | |||
completed. A similar requirement not to reject unexpected initial | completed. A similar requirement not to reject unexpected initial | |||
values in AccECN TCP Options (<xref | values in AccECN TCP Options (<xref target="accecn_sec_zero_option"/ | |||
target="accecn_sec_zero_option"/>) is for the same purpose. If | >) is for the same purpose. If | |||
traversal of AccECN TCP Options were reliable, this would have | traversal of AccECN TCP Options were reliable, this would have | |||
enabled a far wider range of future variation of the whole AccECN | enabled a far wider range of future variation of the whole AccECN | |||
protocol. Nonetheless, it could be used to reliably negotiate a | protocol. Nonetheless, it could be used to reliably negotiate a | |||
wide range of variation in the semantics of the AccECN Option.</t> | wide range of variation in the semantics of the AccECN Option.</t> | |||
</dd> | ||||
<t hangText="Future non-AccECN variants:">Five codepoints out of | <dt>Future non-AccECN variants:</dt> | |||
the 8 possible in the 3 TCP header flags used by AccECN are unused | <dd> | |||
<t>Five codepoints out of | ||||
the eight possible in the three TCP header flags used by AccECN are | ||||
unused | ||||
on the initial SYN (in the order (AE,CWR,ECE)): (0,0,1), (0,1,0), | on the initial SYN (in the order (AE,CWR,ECE)): (0,0,1), (0,1,0), | |||
(1,0,0), (1,0,1), (1,1,0). <xref | (1,0,0), (1,0,1), (1,1,0). <xref target="accecn_sec_forward_compat"/ | |||
target="accecn_sec_forward_compat"/> ensures that the installed | > ensures that the installed | |||
base of AccECN Servers will all assume these are equivalent to | base of AccECN Servers will all assume these are equivalent to | |||
AccECN negotiation with (1,1,1) on the SYN. These codepoints would | AccECN negotiation with (1,1,1) on the SYN. These codepoints would | |||
not allow fall-back to Classic ECN support for a Server that did | not allow fall-back to Classic ECN support for a Server that did | |||
not understand them, but this approach ensures they are available | not understand them, but this approach ensures they are available | |||
in future, perhaps for uses other than ECN alongside the AccECN | in the future, perhaps for uses other than ECN alongside the AccECN | |||
scheme. All possible combinations of SYN/ACK could be used in | scheme. All possible combinations of SYN/ACK could be used in | |||
response except either (0,0,0) or reflection of the same values | response except either (0,0,0) or reflection of the same values | |||
sent on the SYN. <vspace blankLines="1"/>In order to extend AccECN | sent on the SYN. </t> | |||
or ECN in future, other ways could be resorted to, although their | <t>In order to extend AccECN | |||
or ECN in the future, other ways could be resorted to, although thei | ||||
r | ||||
traversal properties are likely to be inferior. They include a new | traversal properties are likely to be inferior. They include a new | |||
TCP option; using the remaining reserved flags in the main TCP | TCP option; using the remaining reserved flags in the main TCP | |||
header (preferably extending the 3-bit combinations used by AccECN | header (preferably extending the 3-bit combinations used by AccECN | |||
to 4-bit combinations, rather than burning one bit for just one | to 4-bit combinations, rather than burning one bit for just one | |||
state); a non-zero urgent pointer in combination with the URG flag | state); a non-zero urgent pointer in combination with the URG flag | |||
cleared; or some other unexpected combination of fields yet to be | cleared; or some other unexpected combination of fields yet to be | |||
invented.</t> | invented.</t> | |||
</list></t> | </dd> | |||
</dl> | ||||
</section> | </section> | |||
</section> | </section> | |||
<!-- ================================================================ --> | <section anchor="accecn_Acknowledgements" numbered="false"> | |||
<name>Acknowledgements</name> | ||||
<section anchor="accecn_Acknowledgements" numbered="false" | <t>We want to thank <contact fullname="Koen De Schepper"/>, <contact | |||
title="Acknowledgements"> | fullname="Praveen Balasubramanian"/>, <contact fullname="Michael | |||
<t>We want to thank Koen De Schepper, Praveen Balasubramanian, Michael | Welzl"/>, <contact fullname="Gorry Fairhurst"/>, <contact | |||
Welzl, Gorry Fairhurst, David Black, Spencer Dawkins, Michael Scharf, | fullname="David Black"/>, <contact fullname="Spencer Dawkins"/>, | |||
Michael Tüxen, Yuchung Cheng, Kenjiro Cho, Olivier Tilmans, Ilpo | <contact fullname="Michael Scharf"/>, <contact fullname="Michael | |||
Järvinen, Neal Cardwell, Yoshifumi Nishida, Martin Duke, Jonathan | Tüxen"/>, <contact fullname="Yuchung Cheng"/>, <contact | |||
Morton, Vidhi Goel, Alex Burr, Markku Kojo, Grenville Armitage and Wes | fullname="Kenjiro Cho"/>, <contact fullname="Olivier Tilmans"/>, | |||
Eddy for their input and discussion. The idea of using the three | <contact fullname="Ilpo Järvinen"/>, <contact fullname="Neal | |||
ECN-related TCP flags as one field for more accurate TCP-ECN feedback | Cardwell"/>, <contact fullname="Yoshifumi Nishida"/>, <contact | |||
was first introduced in the re-ECN protocol that was the ancestor of | fullname="Martin Duke"/>, <contact fullname="Jonathan Morton"/>, | |||
ConEx.</t> | <contact fullname="Vidhi Goel"/>, <contact fullname="Alex Burr"/>, | |||
<contact fullname="Markku Kojo"/>, <contact fullname="Grenville | ||||
Armitage"/>, and <contact fullname="Wes Eddy"/> for their input and | |||
discussion. The idea of using the three ECN-related TCP flags as one | ||||
field for more accurate TCP-ECN feedback was first introduced in the | ||||
re-ECN protocol that was the ancestor of ConEx.</t> | ||||
<t>The following contributed implementations of AccECN that validated | <t>The following contributed implementations of AccECN that validated | |||
and helped to improve this specification:<list style="hanging"> | and helped to improve this specification:</t> | |||
<t hangText="Linux:">Mirja Kühlewind, Ilpo Järvinen, Neal | <dl newline="false" spacing="normal"> | |||
Cardwell and Chia-Yu Chang;</t> | <dt>Linux:</dt> | |||
<dd><t><contact fullname="Mirja Kühlewind"/>, <contact fullname="Ilpo Jä | ||||
<t hangText="FreeBSD:">Richard Scheffenegger;</t> | rvinen"/>, <contact fullname="Neal | |||
Cardwell"/>, and <contact fullname="Chia-Yu Chang"/></t></dd> | ||||
<t hangText="Apple OSs:">Vidhi Goel.</t> | <dt>FreeBSD:</dt> | |||
</list></t> | <dd><t><contact fullname="Richard Scheffenegger"/></t></dd> | |||
<dt>Apple OSs:</dt> | ||||
<t>Bob Briscoe was part-funded by Apple Inc, the Comcast Innovation | <dd><t><contact fullname="Vidhi Goel"/></t></dd> | |||
</dl> | ||||
<t><contact fullname="Bob Briscoe"/> was part-funded by Apple Inc, the Com | ||||
cast Innovation | ||||
Fund, the European Community under its Seventh Framework Programme | Fund, the European Community under its Seventh Framework Programme | |||
through the Reducing Internet Transport Latency (RITE) project | through the Reducing Internet Transport Latency (RITE) project | |||
(ICT-317700) and through the Trilogy 2 project (ICT-317756), and the | (ICT-317700) and through the Trilogy 2 project (ICT-317756), and the | |||
Research Council of Norway through the TimeIn project. The views | Research Council of Norway through the TimeIn project. The views | |||
expressed here are solely those of the authors.</t> | expressed here are solely those of the authors.</t> | |||
<t><contact fullname="Mirja Kühlewind"/> was partly supported by the Europ | ||||
<t>Mirja Kühlewind was partly supported by the European Commission | ean Commission | |||
under Horizon 2020 grant agreement no. 688421 Measurement and | under Horizon 2020 grant agreement no. 688421 Measurement and | |||
Architecture for a Middleboxed Internet (MAMI), and by the Swiss State | Architecture for a Middleboxed Internet (MAMI), and by the Swiss State | |||
Secretariat for Education, Research, and Innovation under contract no. | Secretariat for Education, Research, and Innovation under contract no. | |||
15.0268. This support does not imply endorsement.</t> | 15.0268. This support does not imply endorsement.</t> | |||
</section> | </section> | |||
<!-- ================================================================ --> | ||||
<section anchor="accecn_Comments_Solicited" numbered="false" | ||||
removeInRFC="true" title="Comments Solicited"> | ||||
<t>Comments and questions are encouraged and very welcome. They can be | ||||
addressed to the IETF TCP maintenance and minor modifications working | ||||
group mailing list &lt;tcpm@ietf.org&gt;, and/or to the authors.</t> | |||
</section> | ||||
</back> | </back> | |||
<!-- [rfced] Please review the following terminology-related questions. | ||||
A) We updated the following to the form on the right. Please let us know if any | ||||
corrections are needed. | ||||
not-ECT vs Not-ECT | ||||
no ECN vs No ECN | ||||
ECN Nonce vs ECN-Nonce vs ECN nonce (to match RFC 3540) | ||||
Cubic vs CUBIC (to match RFC 9438) | ||||
IP ECN field vs IP-ECN field | ||||
ECN capable vs ECN-capable (to match RFC 3168, though we wonder if it should be | ||||
open (ECN capable) when not acting as an adjective appearing before the noun).
time-out vs timeout | ||||
CE mark* vs CE-mark* - updated to use the hyphen when acting as an adjective app | ||||
earing before the noun | ||||
B) Please review occurrences of the terms below and let us know if/how they may | ||||
be made consistent. | ||||
TCP Option vs TCP option (perhaps TCP Option when referring to a specific option | ||||
?) | ||||
Established state vs established state vs ESTABLISHED state | ||||
half connection vs half-connection | ||||
C) We note that "time-stamp" is used consistently. However, RFC 7323 uses "time | ||||
stamp". May we update the text for consistency? | ||||
--> | ||||
<!-- [rfced] Please review whether any of the notes in this document | ||||
should be in the <aside> element. It is defined as "a container for | ||||
content that is semantically less important or tangential to the | ||||
content that surrounds it" (https://authors.ietf.org/en/rfcxml-vocabulary#aside) | ||||
. | ||||
--> | ||||
<!-- [rfced] Some author comments are present in the XML. Please confirm that | ||||
no updates related to these comments are outstanding. Note that the | ||||
comments will be deleted prior to publication. | ||||
--> | ||||
<!-- [rfced] Please review the "Inclusive Language" portion of the online | ||||
Style Guide <https://www.rfc-editor.org/styleguide/part2/#inclusive_language> | ||||
and let us know if any changes are needed. Updates of this nature typically | ||||
result in more precise language, which is helpful for readers. | ||||
Note that our script did not flag any words in particular, but this should | ||||
still be reviewed as a best practice. | ||||
--> | ||||
</rfc> | </rfc> | |||
End of changes. 749 change blocks. | ||||
1986 lines changed or deleted | 2303 lines changed or added | |||
This html diff was produced by rfcdiff 1.48. |